xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision 8a0a413e)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/slab.h>
40 #include <linux/err.h>
41 #include <linux/string.h>
42 #include <linux/list.h>
43 #include <linux/in.h>
44 #include <linux/device.h>
45 #include <linux/pci.h>
46 #include <linux/sched.h>
47 #include <linux/wait.h>
48 
49 #include <asm/atomic.h>
50 
51 #include <rdma/ib_verbs.h>
52 #include <rdma/rdma_cm.h>
53 
54 #include "krping.h"
55 #include "getopt.h"
56 
57 extern int krping_debug;
58 #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x)
59 #define PRINTF(cb, x...) log(LOG_INFO, x)
60 #define BIND_INFO 1
61 
62 MODULE_AUTHOR("Steve Wise");
63 MODULE_DESCRIPTION("RDMA ping client/server");
64 MODULE_LICENSE("Dual BSD/GPL");
65 MODULE_VERSION(krping, 1);
66 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
67 
/*
 * Read the CPU time-stamp counter and return it as a 64-bit cycle
 * count.  Used only for coarse latency/bandwidth timing in the tests.
 *
 * NOTE: this uses the x86 RDTSC instruction directly, so this module
 * is x86-only as written.
 */
static __inline uint64_t
get_cycles(void)
{
	uint32_t low, high;

	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
	/* Use uint64_t consistently (the original mixed u_int64_t). */
	return (low | ((uint64_t)high << 32));
}
75 
76 typedef uint64_t cycles_t;
77 
/*
 * Memory registration strategy for the rdma buffers, selected with the
 * "mem_mode" option (see krping_opts below).
 */
enum mem_type {
	DMA = 1,	/* use the device's global DMA MR for everything */
	FASTREG = 2,	/* fast-register WRs; rebind with a new key per I/O */
	MW = 3,		/* bind a type-1 memory window over a plain MR */
	MR = 4		/* one registered MR per buffer */
};
84 
/*
 * Option table parsed by the local getopt implementation (getopt.h).
 * Terminated by a NULL entry.
 */
static const struct krping_option krping_opts[] = {
	{"count", OPT_INT, 'C'},	/* number of pings */
	{"size", OPT_INT, 'S'},		/* ping data size in bytes */
	{"addr", OPT_STRING, 'a'},	/* IPv4 destination address */
	{"addr6", OPT_STRING, 'A'},	/* IPv6 destination address */
	{"port", OPT_INT, 'p'},		/* destination port */
	{"verbose", OPT_NOPARAM, 'v'},
	{"validate", OPT_NOPARAM, 'V'},	/* verify ping payloads */
	{"server", OPT_NOPARAM, 's'},
	{"client", OPT_NOPARAM, 'c'},
	{"mem_mode", OPT_STRING, 'm'},	/* dma|fastreg|mw|mr */
	{"server_inv", OPT_NOPARAM, 'I'},
 	{"wlat", OPT_NOPARAM, 'l'},	/* write latency test */
 	{"rlat", OPT_NOPARAM, 'L'},	/* read latency test */
 	{"bw", OPT_NOPARAM, 'B'},	/* bandwidth test */
 	{"duplex", OPT_NOPARAM, 'd'},	/* full-duplex bandwidth test */
 	{"txdepth", OPT_INT, 'T'},	/* SQ depth */
 	{"poll", OPT_NOPARAM, 'P'},	/* poll instead of block (rlat) */
 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
 	{"read_inv", OPT_NOPARAM, 'R'},	/* use READ_WITH_INV */
 	{"fr", OPT_INT, 'f'},		/* fastreg test */
	{NULL, 0, 0}
};
108 
109 #define htonll(x) cpu_to_be64((x))
110 #define ntohll(x) cpu_to_be64((x))
111 
112 static struct mutex krping_mutex;
113 
114 /*
115  * List of running krping threads.
116  */
117 static LIST_HEAD(krping_cbs);
118 
119 /*
120  * krping "ping/pong" loop:
121  * 	client sends source rkey/addr/len
122  *	server receives source rkey/addr/len
123  *	server rdma reads "ping" data from source
124  * 	server sends "go ahead" on rdma read completion
125  *	client sends sink rkey/addr/len
126  * 	server receives sink rkey/addr/len
127  * 	server rdma writes "pong" data to sink
128  * 	server sends "go ahead" on rdma write completion
129  * 	<repeat loop>
130  */
131 
132 /*
133  * These states are used to signal events between the completion handler
134  * and the main client or server thread.
135  *
136  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
137  * and RDMA_WRITE_COMPLETE for each ping.
138  */
enum test_state {
	IDLE = 1,
	CONNECT_REQUEST,	/* server: child connect request arrived */
	ADDR_RESOLVED,		/* client: rdma_resolve_addr completed */
	ROUTE_RESOLVED,		/* client: rdma_resolve_route completed */
	CONNECTED,		/* connection established */
	RDMA_READ_ADV,		/* peer advertised its read source */
	RDMA_READ_COMPLETE,	/* our RDMA read finished */
	RDMA_WRITE_ADV,		/* peer advertised its write sink */
	RDMA_WRITE_COMPLETE,	/* our RDMA write finished */
	ERROR			/* terminal error; test aborts */
};
151 
/*
 * Control message exchanged over the send/recv channel to advertise an
 * rdma buffer to the peer.  All fields travel in network byte order
 * (see krping_format_send/server_recv).
 */
struct krping_rdma_info {
	uint64_t buf;		/* advertised buffer address */
	uint32_t rkey;		/* rkey granting access to that buffer */
	uint32_t size;		/* buffer length in bytes */
};
157 
158 /*
159  * Default max buffer size for IO...
160  */
161 #define RPING_BUFSIZE 128*1024
162 #define RPING_SQ_DEPTH 64
163 
164 /*
165  * Control block struct.
166  */
struct krping_cb {
	void *cookie;			/* opaque thread handle; NOTE(review): set by the thread creator outside this chunk */
	int server;			/* 0 iff client */
	struct ib_cq *cq;		/* single CQ for both SQ and RQ */
	struct ib_pd *pd;
	struct ib_qp *qp;

	enum mem_type mem;		/* buffer registration mode */
	struct ib_mr *dma_mr;		/* global DMA MR (mem == DMA) */

	/* FASTREG mode state. */
	struct ib_fast_reg_page_list *page_list;
	int page_list_len;
	struct ib_send_wr fastreg_wr;
	struct ib_send_wr invalidate_wr;
	struct ib_mr *fastreg_mr;
	int server_invalidate;		/* server sends SEND_WITH_INV */
	int read_inv;			/* server uses READ_WITH_INV */
	u8 key;				/* rolling fastreg key byte */

	/* MW mode state. */
	struct ib_mw *mw;
	struct ib_mw_bind bind_attr;

	struct ib_recv_wr rq_wr;	/* recv work request record */
	struct ib_sge recv_sgl;		/* recv single SGE */
	struct krping_rdma_info recv_buf;/* malloc'd buffer */
	u64 recv_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
	struct ib_mr *recv_mr;

	struct ib_send_wr sq_wr;	/* send work request record */
	struct ib_sge send_sgl;
	struct krping_rdma_info send_buf;/* single send buf */
	u64 send_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(send_mapping)
	struct ib_mr *send_mr;

	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
	struct ib_sge rdma_sgl;		/* rdma single SGE */
	char *rdma_buf;			/* used as rdma sink */
	u64  rdma_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
	struct ib_mr *rdma_mr;

	uint32_t remote_rkey;		/* remote guys RKEY */
	uint64_t remote_addr;		/* remote guys TO */
	uint32_t remote_len;		/* remote guys LEN */

	char *start_buf;		/* rdma read src */
	u64  start_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(start_mapping)
	struct ib_mr *start_mr;

	enum test_state state;		/* used for cond/signalling */
	wait_queue_head_t sem;		/* waited on by the main thread */
	struct krping_stats stats;	/* byte/message counters */

	uint16_t port;			/* dst port in NBO */
	union {
		struct in_addr v4;
		struct in6_addr v6;
	} addr;				/* dst addr in NBO */
	int addr_type;			/* AF_INET or AF_INET6 */
	char *addr_str;			/* dst addr string */
	int verbose;			/* verbose logging */
	int count;			/* ping count */
	int size;			/* ping data size */
	int validate;			/* validate ping data */
	int wlat;			/* run wlat test */
	int rlat;			/* run rlat test */
	int bw;				/* run bw test */
	int duplex;			/* run bw full duplex test */
	int poll;			/* poll or block for rlat test */
	int txdepth;			/* SQ depth */
	int local_dma_lkey;		/* use 0 for lkey */
	int frtest;			/* fastreg test */
	int testnum;

	/* CM stuff */
	struct rdma_cm_id *cm_id;	/* connection on client side,*/
					/* listener on server side. */
	struct rdma_cm_id *child_cm_id;	/* connection on server side */
	struct list_head list;		/* linkage on krping_cbs */
};
250 
251 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
252 				   struct rdma_cm_event *event)
253 {
254 	int ret;
255 	struct krping_cb *cb = cma_id->context;
256 
257 	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
258 	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
259 
260 	switch (event->event) {
261 	case RDMA_CM_EVENT_ADDR_RESOLVED:
262 		cb->state = ADDR_RESOLVED;
263 		ret = rdma_resolve_route(cma_id, 2000);
264 		if (ret) {
265 			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
266 			wake_up_interruptible(&cb->sem);
267 		}
268 		break;
269 
270 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
271 		cb->state = ROUTE_RESOLVED;
272 		cb->child_cm_id = cma_id;
273 		wake_up_interruptible(&cb->sem);
274 		break;
275 
276 	case RDMA_CM_EVENT_CONNECT_REQUEST:
277 		if (cb->state == IDLE) {
278 			cb->state = CONNECT_REQUEST;
279 			cb->child_cm_id = cma_id;
280 		} else {
281 			PRINTF(cb, "Received connection request in wrong state"
282 			    " (%d)\n", cb->state);
283 		}
284 		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
285 		wake_up_interruptible(&cb->sem);
286 		break;
287 
288 	case RDMA_CM_EVENT_ESTABLISHED:
289 		DEBUG_LOG(cb, "ESTABLISHED\n");
290 		if (!cb->server) {
291 			cb->state = CONNECTED;
292 		}
293 		wake_up_interruptible(&cb->sem);
294 		break;
295 
296 	case RDMA_CM_EVENT_ADDR_ERROR:
297 	case RDMA_CM_EVENT_ROUTE_ERROR:
298 	case RDMA_CM_EVENT_CONNECT_ERROR:
299 	case RDMA_CM_EVENT_UNREACHABLE:
300 	case RDMA_CM_EVENT_REJECTED:
301 		PRINTF(cb, "cma event %d, error %d\n", event->event,
302 		       event->status);
303 		cb->state = ERROR;
304 		wake_up_interruptible(&cb->sem);
305 		break;
306 
307 	case RDMA_CM_EVENT_DISCONNECTED:
308 		PRINTF(cb, "DISCONNECT EVENT...\n");
309 		cb->state = ERROR;
310 		wake_up_interruptible(&cb->sem);
311 		break;
312 
313 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
314 		PRINTF(cb, "cma detected device removal!!!!\n");
315 		break;
316 
317 	default:
318 		PRINTF(cb, "oof bad type!\n");
319 		wake_up_interruptible(&cb->sem);
320 		break;
321 	}
322 	return 0;
323 }
324 
325 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
326 {
327 	if (wc->byte_len != sizeof(cb->recv_buf)) {
328 		PRINTF(cb, "Received bogus data, size %d\n",
329 		       wc->byte_len);
330 		return -1;
331 	}
332 
333 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
334 	cb->remote_addr = ntohll(cb->recv_buf.buf);
335 	cb->remote_len  = ntohl(cb->recv_buf.size);
336 	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
337 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
338 		  cb->remote_len);
339 
340 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
341 		cb->state = RDMA_READ_ADV;
342 	else
343 		cb->state = RDMA_WRITE_ADV;
344 
345 	return 0;
346 }
347 
348 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
349 {
350 	if (wc->byte_len != sizeof(cb->recv_buf)) {
351 		PRINTF(cb, "Received bogus data, size %d\n",
352 		       wc->byte_len);
353 		return -1;
354 	}
355 
356 	if (cb->state == RDMA_READ_ADV)
357 		cb->state = RDMA_WRITE_ADV;
358 	else
359 		cb->state = RDMA_WRITE_COMPLETE;
360 
361 	return 0;
362 }
363 
/*
 * Completion queue callback.  Drains the CQ, updates statistics and the
 * test state machine, and wakes the main thread.  On any unexpected
 * completion status or opcode, the state is forced to ERROR.
 */
static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
	struct krping_cb *cb = ctx;
	struct ib_wc wc;
	struct ib_recv_wr *bad_wr;
	int ret;

	BUG_ON(cb->cq != cq);
	if (cb->state == ERROR) {
		PRINTF(cb, "cq completion in ERROR state\n");
		return;
	}
	/* The polling tests never re-arm the CQ; only the ping test does. */
	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
		if (wc.status) {
			/* Flush errors are expected during teardown. */
			if (wc.status == IB_WC_WR_FLUSH_ERR) {
				DEBUG_LOG(cb, "cq flushed\n");
				continue;
			} else {
				PRINTF(cb, "cq completion failed with "
				       "wr_id %jx status %d opcode %d vender_err %x\n",
					(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
				goto error;
			}
		}

		switch (wc.opcode) {
		case IB_WC_SEND:
			DEBUG_LOG(cb, "send completion\n");
			cb->stats.send_bytes += cb->send_sgl.length;
			cb->stats.send_msgs++;
			break;

		case IB_WC_RDMA_WRITE:
			DEBUG_LOG(cb, "rdma write completion\n");
			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
			cb->stats.write_msgs++;
			cb->state = RDMA_WRITE_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RDMA_READ:
			DEBUG_LOG(cb, "rdma read completion\n");
			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
			cb->stats.read_msgs++;
			cb->state = RDMA_READ_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RECV:
			DEBUG_LOG(cb, "recv completion\n");
			cb->stats.recv_bytes += sizeof(cb->recv_buf);
			cb->stats.recv_msgs++;
			/* Special tests always use the server decoder. */
			if (cb->wlat || cb->rlat || cb->bw || cb->frtest)
				ret = server_recv(cb, &wc);
			else
				ret = cb->server ? server_recv(cb, &wc) :
						   client_recv(cb, &wc);
			if (ret) {
				PRINTF(cb, "recv wc error: %d\n", ret);
				goto error;
			}

			/* Re-post the single receive WR for the next message. */
			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
			if (ret) {
				PRINTF(cb, "post recv error: %d\n",
				       ret);
				goto error;
			}
			wake_up_interruptible(&cb->sem);
			break;

		default:
			PRINTF(cb,
			       "%s:%d Unexpected opcode %d, Shutting down\n",
			       __func__, __LINE__, wc.opcode);
			goto error;
		}
	}
	/* Loop exits with ret == 0 (CQ empty) or < 0 (poll failure). */
	if (ret) {
		PRINTF(cb, "poll error %d\n", ret);
		goto error;
	}
	return;
error:
	cb->state = ERROR;
	wake_up_interruptible(&cb->sem);
}
453 
454 static int krping_accept(struct krping_cb *cb)
455 {
456 	struct rdma_conn_param conn_param;
457 	int ret;
458 
459 	DEBUG_LOG(cb, "accepting client connection request\n");
460 
461 	memset(&conn_param, 0, sizeof conn_param);
462 	conn_param.responder_resources = 1;
463 	conn_param.initiator_depth = 1;
464 
465 	ret = rdma_accept(cb->child_cm_id, &conn_param);
466 	if (ret) {
467 		PRINTF(cb, "rdma_accept error: %d\n", ret);
468 		return ret;
469 	}
470 
471 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
472 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
473 		if (cb->state == ERROR) {
474 			PRINTF(cb, "wait for CONNECTED state %d\n",
475 				cb->state);
476 			return -1;
477 		}
478 	}
479 	return 0;
480 }
481 
/*
 * Pre-build the static parts of all work requests: the recv WR, the
 * send WR, the rdma WR, and (per registration mode) the fastreg chain
 * or the MW bind attributes.  Per-iteration fields (rkeys, addresses,
 * lengths) are filled in later by the test loops and krping_rdma_rkey.
 */
static void krping_setup_wr(struct krping_cb *cb)
{
	cb->recv_sgl.addr = cb->recv_dma_addr;
	cb->recv_sgl.length = sizeof cb->recv_buf;
	/* lkey source depends on the registration mode chosen. */
	if (cb->local_dma_lkey)
		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
	else if (cb->mem == DMA)
		cb->recv_sgl.lkey = cb->dma_mr->lkey;
	else
		cb->recv_sgl.lkey = cb->recv_mr->lkey;
	cb->rq_wr.sg_list = &cb->recv_sgl;
	cb->rq_wr.num_sge = 1;

	cb->send_sgl.addr = cb->send_dma_addr;
	cb->send_sgl.length = sizeof cb->send_buf;
	if (cb->local_dma_lkey)
		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
	else if (cb->mem == DMA)
		cb->send_sgl.lkey = cb->dma_mr->lkey;
	else
		cb->send_sgl.lkey = cb->send_mr->lkey;

	cb->sq_wr.opcode = IB_WR_SEND;
	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
	cb->sq_wr.sg_list = &cb->send_sgl;
	cb->sq_wr.num_sge = 1;

	/* Only the side that issues RDMA ops needs the rdma WR. */
	if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
		cb->rdma_sgl.addr = cb->rdma_dma_addr;
		if (cb->mem == MR)
			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
		cb->rdma_sq_wr.num_sge = 1;
	}

	switch(cb->mem) {
	case FASTREG:

		/*
		 * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
		 * both unsignaled.  The client uses them to reregister
		 * the rdma buffers with a new key each iteration.
		 */
		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
		cb->fastreg_wr.wr.fast_reg.length = cb->size;
		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;

		cb->invalidate_wr.next = &cb->fastreg_wr;
		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
		break;
	case MW:
		/* Per-iteration access flags/MR/addr set in krping_rdma_rkey. */
		cb->bind_attr.wr_id = 0xabbaabba;
		cb->bind_attr.send_flags = 0; /* unsignaled */
#ifdef BIND_INFO
		cb->bind_attr.bind_info.length = cb->size;
#else
		cb->bind_attr.length = cb->size;
#endif
		break;
	default:
		break;
	}
}
548 
/*
 * Allocate and register all data buffers: the control-message send/recv
 * buffers, the rdma sink buffer, and (for clients and the special
 * tests) the rdma source buffer, using the registration mode in
 * cb->mem.  Finishes by pre-building the work requests.
 *
 * Returns 0 on success or a negative errno.  On failure, all MRs/MWs
 * and kmalloc'd buffers acquired so far are released.
 * NOTE(review): the bail path does not ib_dma_unmap_single() the DMA
 * mappings created above -- confirm whether the caller relies on
 * krping_free_buffers() never being reached after a failure here.
 */
static int krping_setup_buffers(struct krping_cb *cb)
{
	int ret;
	struct ib_phys_buf buf;
	u64 iovbase;

	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);

	/* Map the in-struct control buffers for DMA. */
	cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
				   &cb->recv_buf,
				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
	cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
					   &cb->send_buf, sizeof(cb->send_buf),
					   DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);

	if (cb->mem == DMA) {
		/* One MR covers everything in DMA mode. */
		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
					   IB_ACCESS_REMOTE_READ|
				           IB_ACCESS_REMOTE_WRITE);
		if (IS_ERR(cb->dma_mr)) {
			DEBUG_LOG(cb, "reg_dmamr failed\n");
			ret = PTR_ERR(cb->dma_mr);
			goto bail;
		}
	} else {
		/* Otherwise register each control buffer individually. */
		if (!cb->local_dma_lkey) {
			buf.addr = cb->recv_dma_addr;
			buf.size = sizeof cb->recv_buf;
			DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n",
			    (uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->recv_dma_addr;
			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						     IB_ACCESS_LOCAL_WRITE,
						     &iovbase);

			if (IS_ERR(cb->recv_mr)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->recv_mr);
				goto bail;
			}

			buf.addr = cb->send_dma_addr;
			buf.size = sizeof cb->send_buf;
			DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n",
			    (uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->send_dma_addr;
			/* Send buffer is only read locally: no access flags. */
			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						     0, &iovbase);

			if (IS_ERR(cb->send_mr)) {
				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
				ret = PTR_ERR(cb->send_mr);
				goto bail;
			}
		}
	}

	/* The rdma sink buffer, sized by the "size" option. */
	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
	if (!cb->rdma_buf) {
		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
		ret = -ENOMEM;
		goto bail;
	}

	cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device,
			       cb->rdma_buf, cb->size,
			       DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
	if (cb->mem != DMA) {
		switch (cb->mem) {
		case FASTREG:
			/* Number of pages spanned by a cb->size buffer. */
			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
				PAGE_SIZE) >> PAGE_SHIFT;
			cb->page_list = ib_alloc_fast_reg_page_list(
						cb->pd->device,
						cb->page_list_len);
			if (IS_ERR(cb->page_list)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->page_list);
				goto bail;
			}
			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
					cb->page_list->max_page_list_len);
			if (IS_ERR(cb->fastreg_mr)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->fastreg_mr);
				goto bail;
			}
			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
				" page_list_len %u\n", cb->fastreg_mr->rkey,
				cb->page_list, cb->page_list_len);
			break;
		case MW:
			cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
			if (IS_ERR(cb->mw)) {
				DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
				ret = PTR_ERR(cb->mw);
				goto bail;
			}
			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
			/* MW mode also needs the underlying MR below. */
			/*FALLTHROUGH*/
		case MR:
			buf.addr = cb->rdma_dma_addr;
			buf.size = cb->size;
			iovbase = cb->rdma_dma_addr;
			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						IB_ACCESS_LOCAL_WRITE|
					     IB_ACCESS_REMOTE_READ|
					     IB_ACCESS_REMOTE_WRITE,
					     &iovbase);
			if (IS_ERR(cb->rdma_mr)) {
				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
				ret = PTR_ERR(cb->rdma_mr);
				goto bail;
			}
			DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n",
				(uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey);
			break;
		default:
			ret = -EINVAL;
			goto bail;
			break;
		}
	}

	/* The rdma source buffer: clients always, servers for special tests. */
	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {

		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
		if (!cb->start_buf) {
			DEBUG_LOG(cb, "start_buf malloc failed\n");
			ret = -ENOMEM;
			goto bail;
		}

		cb->start_dma_addr = ib_dma_map_single(cb->pd->device,
						   cb->start_buf, cb->size,
						   DMA_BIDIRECTIONAL);
		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);

		if (cb->mem == MR || cb->mem == MW) {
			unsigned flags = IB_ACCESS_REMOTE_READ;

			/* Special tests also write this buffer remotely. */
			if (cb->wlat || cb->rlat || cb->bw || cb->frtest) {
				flags |= IB_ACCESS_LOCAL_WRITE |
					IB_ACCESS_REMOTE_WRITE;
			}

			buf.addr = cb->start_dma_addr;
			buf.size = cb->size;
			DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n",
				(uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->start_dma_addr;
			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
					     flags,
					     &iovbase);

			if (IS_ERR(cb->start_mr)) {
				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
				ret = PTR_ERR(cb->start_mr);
				goto bail;
			}
		}
	}

	krping_setup_wr(cb);
	DEBUG_LOG(cb, "allocated & registered buffers...\n");
	return 0;
bail:
	/* Undo anything that was successfully allocated above. */
	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
		ib_dereg_mr(cb->fastreg_mr);
	if (cb->mw && !IS_ERR(cb->mw))
		ib_dealloc_mw(cb->mw);
	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
		ib_dereg_mr(cb->rdma_mr);
	if (cb->page_list && !IS_ERR(cb->page_list))
		ib_free_fast_reg_page_list(cb->page_list);
	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
		ib_dereg_mr(cb->dma_mr);
	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
		ib_dereg_mr(cb->recv_mr);
	if (cb->send_mr && !IS_ERR(cb->send_mr))
		ib_dereg_mr(cb->send_mr);
	if (cb->rdma_buf)
		kfree(cb->rdma_buf);
	if (cb->start_buf)
		kfree(cb->start_buf);
	return ret;
}
739 
/*
 * Release everything krping_setup_buffers() created: deregister all
 * MRs/MWs, unmap the DMA mappings, and free the kmalloc'd buffers.
 * Safe to call with any subset of the optional resources NULL.
 */
static void krping_free_buffers(struct krping_cb *cb)
{
	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);

	if (cb->dma_mr)
		ib_dereg_mr(cb->dma_mr);
	if (cb->send_mr)
		ib_dereg_mr(cb->send_mr);
	if (cb->recv_mr)
		ib_dereg_mr(cb->recv_mr);
	if (cb->rdma_mr)
		ib_dereg_mr(cb->rdma_mr);
	if (cb->start_mr)
		ib_dereg_mr(cb->start_mr);
	if (cb->fastreg_mr)
		ib_dereg_mr(cb->fastreg_mr);
	if (cb->mw)
		ib_dealloc_mw(cb->mw);

	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, recv_mapping),
			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, send_mapping),
			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, rdma_mapping),
			 cb->size, DMA_BIDIRECTIONAL);
	kfree(cb->rdma_buf);
	/* start_buf exists only for clients and the special tests. */
	if (cb->start_buf) {
		dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, start_mapping),
			 cb->size, DMA_BIDIRECTIONAL);
		kfree(cb->start_buf);
	}
}
776 
777 static int krping_create_qp(struct krping_cb *cb)
778 {
779 	struct ib_qp_init_attr init_attr;
780 	int ret;
781 
782 	memset(&init_attr, 0, sizeof(init_attr));
783 	init_attr.cap.max_send_wr = cb->txdepth;
784 	init_attr.cap.max_recv_wr = 2;
785 	init_attr.cap.max_recv_sge = 1;
786 	init_attr.cap.max_send_sge = 1;
787 	init_attr.qp_type = IB_QPT_RC;
788 	init_attr.send_cq = cb->cq;
789 	init_attr.recv_cq = cb->cq;
790 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
791 
792 	if (cb->server) {
793 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
794 		if (!ret)
795 			cb->qp = cb->child_cm_id->qp;
796 	} else {
797 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
798 		if (!ret)
799 			cb->qp = cb->cm_id->qp;
800 	}
801 
802 	return ret;
803 }
804 
/*
 * Tear down the verbs resources created by krping_setup_qp().
 * Destruction order matters: the QP references the CQ, and both
 * reference the PD, so it must be QP, then CQ, then PD.
 */
static void krping_free_qp(struct krping_cb *cb)
{
	ib_destroy_qp(cb->qp);
	ib_destroy_cq(cb->cq);
	ib_dealloc_pd(cb->pd);
}
811 
812 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
813 {
814 	int ret;
815 	cb->pd = ib_alloc_pd(cm_id->device);
816 	if (IS_ERR(cb->pd)) {
817 		PRINTF(cb, "ib_alloc_pd failed\n");
818 		return PTR_ERR(cb->pd);
819 	}
820 	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
821 
822 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
823 
824 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
825 			      cb, cb->txdepth * 2, 0);
826 	if (IS_ERR(cb->cq)) {
827 		PRINTF(cb, "ib_create_cq failed\n");
828 		ret = PTR_ERR(cb->cq);
829 		goto err1;
830 	}
831 	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
832 
833 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
834 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
835 		if (ret) {
836 			PRINTF(cb, "ib_create_cq failed\n");
837 			goto err2;
838 		}
839 	}
840 
841 	ret = krping_create_qp(cb);
842 	if (ret) {
843 		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
844 		goto err2;
845 	}
846 	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
847 	return 0;
848 err2:
849 	ib_destroy_cq(cb->cq);
850 err1:
851 	ib_dealloc_pd(cb->pd);
852 	return ret;
853 }
854 
855 /*
856  * return the (possibly rebound) rkey for the rdma buffer.
857  * FASTREG mode: invalidate and rebind via fastreg wr.
858  * MW mode: rebind the MW.
859  * other modes: just return the mr rkey.
860  */
/*
 * return the (possibly rebound) rkey for the rdma buffer.
 * FASTREG mode: invalidate and rebind via fastreg wr.
 * MW mode: rebind the MW.
 * other modes: just return the mr rkey.
 *
 * @buf      DMA address being advertised; comparing against
 *           start_dma_addr decides source (REMOTE_READ) vs sink
 *           (REMOTE_WRITE) access rights.
 * @post_inv FASTREG only: nonzero posts the LOCAL_INV + FAST_REG_MR
 *           chain; zero posts the fastreg WR alone (the peer will
 *           invalidate, e.g. server_invalidate/read_inv).
 * On failure cb->state is set to ERROR; returns 0xffffffff if no mode
 * matched.
 */
static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
{
	u32 rkey = 0xffffffff;
	u64 p;
	struct ib_send_wr *bad_wr;
	int i;
	int ret;

	switch (cb->mem) {
	case FASTREG:
		/* Invalidate the old key before re-registering. */
		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;

		/*
		 * Update the fastreg key.
		 */
		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;

		/*
		 * Update the fastreg WR with new buf info.
		 */
		if (buf == (u64)cb->start_dma_addr)
			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
		else
			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
		/* Fill the page list with the page-aligned frames of buf. */
		p = (u64)(buf & PAGE_MASK);
		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
		     i++, p += PAGE_SIZE) {
			cb->page_list->page_list[i] = p;
			DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p);
		}

		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
			" iova_start %jx page_list_len %u\n",
			post_inv,
			cb->fastreg_wr.wr.fast_reg.rkey,
			cb->fastreg_wr.wr.fast_reg.page_shift,
			(unsigned)cb->fastreg_wr.wr.fast_reg.length,
			(uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start,
			cb->fastreg_wr.wr.fast_reg.page_list_len);

		/* invalidate_wr chains to fastreg_wr (see krping_setup_wr). */
		if (post_inv)
			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
		else
			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			cb->state = ERROR;
		}
		rkey = cb->fastreg_mr->rkey;
		break;
	case MW:
		/*
		 * Update the MW with new buf info.
		 */
		if (buf == (u64)cb->start_dma_addr) {
#ifdef BIND_INFO
			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
			cb->bind_attr.bind_info.mr = cb->start_mr;
#else
			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
			cb->bind_attr.mr = cb->start_mr;
#endif
		} else {
#ifdef BIND_INFO
			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
			cb->bind_attr.bind_info.mr = cb->rdma_mr;
#else
			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
			cb->bind_attr.mr = cb->rdma_mr;
#endif
		}
#ifdef BIND_INFO
		cb->bind_attr.bind_info.addr = buf;
#else
		cb->bind_attr.addr = buf;
#endif
		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n",
#ifdef BIND_INFO
			cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey);
#else
			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
#endif
		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
		if (ret) {
			PRINTF(cb, "bind mw error %d\n", ret);
			cb->state = ERROR;
		} else
			rkey = cb->mw->rkey;
		break;
	case MR:
		/* Static MRs: just pick the right one for this buffer. */
		if (buf == (u64)cb->start_dma_addr)
			rkey = cb->start_mr->rkey;
		else
			rkey = cb->rdma_mr->rkey;
		break;
	case DMA:
		rkey = cb->dma_mr->rkey;
		break;
	default:
		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
		cb->state = ERROR;
		break;
	}
	return rkey;
}
968 
969 static void krping_format_send(struct krping_cb *cb, u64 buf)
970 {
971 	struct krping_rdma_info *info = &cb->send_buf;
972 	u32 rkey;
973 
974 	/*
975 	 * Client side will do fastreg or mw bind before
976 	 * advertising the rdma buffer.  Server side
977 	 * sends have no data.
978 	 */
979 	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
980 		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
981 		info->buf = htonll(buf);
982 		info->rkey = htonl(rkey);
983 		info->size = htonl(cb->size);
984 		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
985 			  (unsigned long long)buf, rkey, cb->size);
986 	}
987 }
988 
/*
 * Server side of the default krping ping/pong test.
 *
 * Loop until cb->state deviates from the expected sequence (error or
 * disconnect):
 *   1. Wait for the client's source STAG/TO/Len advertisement.
 *   2. RDMA READ the ping data from the client (optionally as
 *      READ_WITH_INV, or followed by a fenced LOCAL_INV when using
 *      fastreg MRs).
 *   3. Send a "go ahead" to the client (optionally SEND_WITH_INV).
 *   4. Wait for the client's sink STAG/TO/Len advertisement.
 *   5. RDMA WRITE the data back and send another "go ahead" so the
 *      client starts the next iteration.
 */
static void krping_test_server(struct krping_cb *cb)
{
	struct ib_send_wr *bad_wr, inv;
	int ret;

	while (1) {
		/* Wait for client's Start STAG/TO/Len */
		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
		if (cb->state != RDMA_READ_ADV) {
			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
				cb->state);
			break;
		}

		DEBUG_LOG(cb, "server received sink adv\n");

		/* Aim the RDMA READ at the client's advertised buffer. */
		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);

		/* Issue RDMA Read. */
		if (cb->read_inv)
			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
		else {

			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
			if (cb->mem == FASTREG) {
				/*
				 * Immediately follow the read with a
				 * fenced LOCAL_INV.
				 */
				cb->rdma_sq_wr.next = &inv;
				memset(&inv, 0, sizeof inv);
				inv.opcode = IB_WR_LOCAL_INV;
				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
				inv.send_flags = IB_SEND_FENCE;
			}
		}

		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}
		/* Unchain the (stack-allocated) LOCAL_INV for next round. */
		cb->rdma_sq_wr.next = NULL;

		DEBUG_LOG(cb, "server posted rdma read req \n");

		/* Wait for read completion */
		wait_event_interruptible(cb->sem,
					 cb->state >= RDMA_READ_COMPLETE);
		if (cb->state != RDMA_READ_COMPLETE) {
			PRINTF(cb,
			       "wait for RDMA_READ_COMPLETE state %d\n",
			       cb->state);
			break;
		}
		DEBUG_LOG(cb, "server received read complete\n");

		/* Display data in recv buf */
		if (cb->verbose) {
			/* Truncate long payloads to keep log lines bounded. */
			if (strlen(cb->rdma_buf) > 128) {
				char msgbuf[128];

				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
				PRINTF(cb, "server ping data stripped: %s\n",
				       msgbuf);
			} else
				PRINTF(cb, "server ping data: %s\n",
				       cb->rdma_buf);
		}

		/* Tell client to continue */
		if (cb->server && cb->server_invalidate) {
			/* Piggy-back an invalidate of the client's rkey. */
			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
		}
		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}
		DEBUG_LOG(cb, "server posted go ahead\n");

		/* Wait for client's RDMA STAG/TO/Len */
		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
		if (cb->state != RDMA_WRITE_ADV) {
			PRINTF(cb,
			       "wait for RDMA_WRITE_ADV state %d\n",
			       cb->state);
			break;
		}
		DEBUG_LOG(cb, "server received sink adv\n");

		/* RDMA Write echo data */
		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
		/* Echo only the string that was read, plus its NUL. */
		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
		if (cb->local_dma_lkey)
			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
		else
			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);

		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
			  cb->rdma_sq_wr.sg_list->lkey,
			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
			  cb->rdma_sq_wr.sg_list->length);

		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}

		/* Wait for completion */
		ret = wait_event_interruptible(cb->sem, cb->state >=
							 RDMA_WRITE_COMPLETE);
		if (cb->state != RDMA_WRITE_COMPLETE) {
			PRINTF(cb,
			       "wait for RDMA_WRITE_COMPLETE state %d\n",
			       cb->state);
			break;
		}
		DEBUG_LOG(cb, "server rdma write complete \n");

		/* Re-arm for the next ping/pong round. */
		cb->state = CONNECTED;

		/* Tell client to begin again */
		if (cb->server && cb->server_invalidate) {
			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
		}
		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}
		DEBUG_LOG(cb, "server posted go ahead\n");
	}
}
1133 
/*
 * Read latency test: post cb->count sequential RDMA READs of cb->size
 * bytes against the peer's advertised buffer and report the total
 * elapsed time.  Completions are reaped either by polling the CQ
 * directly (cb->poll) or by waiting on cb->sem for the CQ event
 * handler to advance cb->state past RDMA_READ_ADV.
 */
static void rlat_test(struct krping_cb *cb)
{
	int scnt;
	int iters = cb->count;
	struct timeval start_tv, stop_tv;
	int ret;
	struct ib_wc wc;
	struct ib_send_wr *bad_wr;
	int ne;

	scnt = 0;
	/* One reusable READ WR aimed at the peer's advertised region. */
	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
	cb->rdma_sq_wr.sg_list->length = cb->size;

	microtime(&start_tv);
	if (!cb->poll) {
		/* Event mode: arm the CQ for completion notifications. */
		cb->state = RDMA_READ_ADV;
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	}
	while (scnt < iters) {

		/* cb->state doubles as the per-iteration completion flag. */
		cb->state = RDMA_READ_ADV;
		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb,
				"Couldn't post send: ret=%d scnt %d\n",
				ret, scnt);
			return;
		}

		do {
			if (!cb->poll) {
				wait_event_interruptible(cb->sem,
					cb->state != RDMA_READ_ADV);
				if (cb->state == RDMA_READ_COMPLETE) {
					/* Re-arm for the next completion. */
					ne = 1;
					ib_req_notify_cq(cb->cq,
						IB_CQ_NEXT_COMP);
				} else {
					ne = -1;
				}
			} else
				ne = ib_poll_cq(cb->cq, 1, &wc);
			if (cb->state == ERROR) {
				PRINTF(cb,
					"state == ERROR...bailing scnt %d\n",
					scnt);
				return;
			}
		} while (ne == 0);

		if (ne < 0) {
			PRINTF(cb, "poll CQ failed %d\n", ne);
			return;
		}
		/* wc is only valid when we actually polled the CQ. */
		if (cb->poll && wc.status != IB_WC_SUCCESS) {
			PRINTF(cb, "Completion wth error at %s:\n",
				cb->server ? "server" : "client");
			PRINTF(cb, "Failed status %d: wr_id %d\n",
				wc.status, (int) wc.wr_id);
			return;
		}
		++scnt;
	}
	microtime(&stop_tv);

        if (stop_tv.tv_usec < start_tv.tv_usec) {
                stop_tv.tv_usec += 1000000;
                stop_tv.tv_sec  -= 1;
        }

	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
		scnt, cb->size);
}
1212 
1213 static void wlat_test(struct krping_cb *cb)
1214 {
1215 	int ccnt, scnt, rcnt;
1216 	int iters=cb->count;
1217 	volatile char *poll_buf = (char *) cb->start_buf;
1218 	char *buf = (char *)cb->rdma_buf;
1219 	struct timeval start_tv, stop_tv;
1220 	cycles_t *post_cycles_start, *post_cycles_stop;
1221 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1222 	cycles_t *last_poll_cycles_start;
1223 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1224 	int i;
1225 	int cycle_iters = 1000;
1226 
1227 	ccnt = 0;
1228 	scnt = 0;
1229 	rcnt = 0;
1230 
1231 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1232 	if (!post_cycles_start) {
1233 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1234 		return;
1235 	}
1236 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1237 	if (!post_cycles_stop) {
1238 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1239 		return;
1240 	}
1241 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1242 	if (!poll_cycles_start) {
1243 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1244 		return;
1245 	}
1246 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1247 	if (!poll_cycles_stop) {
1248 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1249 		return;
1250 	}
1251 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1252 		GFP_KERNEL);
1253 	if (!last_poll_cycles_start) {
1254 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1255 		return;
1256 	}
1257 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1258 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1259 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1260 	cb->rdma_sq_wr.sg_list->length = cb->size;
1261 
1262 	if (cycle_iters > iters)
1263 		cycle_iters = iters;
1264 	microtime(&start_tv);
1265 	while (scnt < iters || ccnt < iters || rcnt < iters) {
1266 
1267 		/* Wait till buffer changes. */
1268 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1269 			++rcnt;
1270 			while (*poll_buf != (char)rcnt) {
1271 				if (cb->state == ERROR) {
1272 					PRINTF(cb,
1273 						"state = ERROR, bailing\n");
1274 					return;
1275 				}
1276 			}
1277 		}
1278 
1279 		if (scnt < iters) {
1280 			struct ib_send_wr *bad_wr;
1281 
1282 			*buf = (char)scnt+1;
1283 			if (scnt < cycle_iters)
1284 				post_cycles_start[scnt] = get_cycles();
1285 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1286 				PRINTF(cb,
1287 					"Couldn't post send: scnt=%d\n",
1288 					scnt);
1289 				return;
1290 			}
1291 			if (scnt < cycle_iters)
1292 				post_cycles_stop[scnt] = get_cycles();
1293 			scnt++;
1294 		}
1295 
1296 		if (ccnt < iters) {
1297 			struct ib_wc wc;
1298 			int ne;
1299 
1300 			if (ccnt < cycle_iters)
1301 				poll_cycles_start[ccnt] = get_cycles();
1302 			do {
1303 				if (ccnt < cycle_iters)
1304 					last_poll_cycles_start[ccnt] =
1305 						get_cycles();
1306 				ne = ib_poll_cq(cb->cq, 1, &wc);
1307 			} while (ne == 0);
1308 			if (ccnt < cycle_iters)
1309 				poll_cycles_stop[ccnt] = get_cycles();
1310 			++ccnt;
1311 
1312 			if (ne < 0) {
1313 				PRINTF(cb, "poll CQ failed %d\n", ne);
1314 				return;
1315 			}
1316 			if (wc.status != IB_WC_SUCCESS) {
1317 				PRINTF(cb,
1318 					"Completion wth error at %s:\n",
1319 					cb->server ? "server" : "client");
1320 				PRINTF(cb,
1321 					"Failed status %d: wr_id %d\n",
1322 					wc.status, (int) wc.wr_id);
1323 				PRINTF(cb,
1324 					"scnt=%d, rcnt=%d, ccnt=%d\n",
1325 					scnt, rcnt, ccnt);
1326 				return;
1327 			}
1328 		}
1329 	}
1330 	microtime(&stop_tv);
1331 
1332         if (stop_tv.tv_usec < start_tv.tv_usec) {
1333                 stop_tv.tv_usec += 1000000;
1334                 stop_tv.tv_sec  -= 1;
1335         }
1336 
1337 	for (i=0; i < cycle_iters; i++) {
1338 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1339 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1340 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1341 	}
1342 	PRINTF(cb,
1343 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1344 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1345 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1346 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1347 		scnt, cb->size, cycle_iters,
1348 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1349 		(unsigned long long)sum_last_poll);
1350 	kfree(post_cycles_start);
1351 	kfree(post_cycles_stop);
1352 	kfree(poll_cycles_start);
1353 	kfree(poll_cycles_stop);
1354 	kfree(last_poll_cycles_start);
1355 }
1356 
1357 static void bw_test(struct krping_cb *cb)
1358 {
1359 	int ccnt, scnt, rcnt;
1360 	int iters=cb->count;
1361 	struct timeval start_tv, stop_tv;
1362 	cycles_t *post_cycles_start, *post_cycles_stop;
1363 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1364 	cycles_t *last_poll_cycles_start;
1365 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1366 	int i;
1367 	int cycle_iters = 1000;
1368 
1369 	ccnt = 0;
1370 	scnt = 0;
1371 	rcnt = 0;
1372 
1373 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1374 	if (!post_cycles_start) {
1375 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1376 		return;
1377 	}
1378 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1379 	if (!post_cycles_stop) {
1380 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1381 		return;
1382 	}
1383 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1384 	if (!poll_cycles_start) {
1385 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1386 		return;
1387 	}
1388 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1389 	if (!poll_cycles_stop) {
1390 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1391 		return;
1392 	}
1393 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1394 		GFP_KERNEL);
1395 	if (!last_poll_cycles_start) {
1396 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1397 		return;
1398 	}
1399 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1400 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1401 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1402 	cb->rdma_sq_wr.sg_list->length = cb->size;
1403 
1404 	if (cycle_iters > iters)
1405 		cycle_iters = iters;
1406 	microtime(&start_tv);
1407 	while (scnt < iters || ccnt < iters) {
1408 
1409 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1410 			struct ib_send_wr *bad_wr;
1411 
1412 			if (scnt < cycle_iters)
1413 				post_cycles_start[scnt] = get_cycles();
1414 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1415 				PRINTF(cb,
1416 					"Couldn't post send: scnt=%d\n",
1417 					scnt);
1418 				return;
1419 			}
1420 			if (scnt < cycle_iters)
1421 				post_cycles_stop[scnt] = get_cycles();
1422 			++scnt;
1423 		}
1424 
1425 		if (ccnt < iters) {
1426 			int ne;
1427 			struct ib_wc wc;
1428 
1429 			if (ccnt < cycle_iters)
1430 				poll_cycles_start[ccnt] = get_cycles();
1431 			do {
1432 				if (ccnt < cycle_iters)
1433 					last_poll_cycles_start[ccnt] =
1434 						get_cycles();
1435 				ne = ib_poll_cq(cb->cq, 1, &wc);
1436 			} while (ne == 0);
1437 			if (ccnt < cycle_iters)
1438 				poll_cycles_stop[ccnt] = get_cycles();
1439 			ccnt += 1;
1440 
1441 			if (ne < 0) {
1442 				PRINTF(cb, "poll CQ failed %d\n", ne);
1443 				return;
1444 			}
1445 			if (wc.status != IB_WC_SUCCESS) {
1446 				PRINTF(cb,
1447 					"Completion wth error at %s:\n",
1448 					cb->server ? "server" : "client");
1449 				PRINTF(cb,
1450 					"Failed status %d: wr_id %d\n",
1451 					wc.status, (int) wc.wr_id);
1452 				return;
1453 			}
1454 		}
1455 	}
1456 	microtime(&stop_tv);
1457 
1458         if (stop_tv.tv_usec < start_tv.tv_usec) {
1459                 stop_tv.tv_usec += 1000000;
1460                 stop_tv.tv_sec  -= 1;
1461         }
1462 
1463 	for (i=0; i < cycle_iters; i++) {
1464 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1465 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1466 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1467 	}
1468 	PRINTF(cb,
1469 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1470 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1471 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1472 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1473 		scnt, cb->size, cycle_iters,
1474 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1475 		(unsigned long long)sum_last_poll);
1476 	kfree(post_cycles_start);
1477 	kfree(post_cycles_stop);
1478 	kfree(poll_cycles_start);
1479 	kfree(poll_cycles_stop);
1480 	kfree(last_poll_cycles_start);
1481 }
1482 
1483 static void krping_rlat_test_server(struct krping_cb *cb)
1484 {
1485 	struct ib_send_wr *bad_wr;
1486 	struct ib_wc wc;
1487 	int ret;
1488 
1489 	/* Spin waiting for client's Start STAG/TO/Len */
1490 	while (cb->state < RDMA_READ_ADV) {
1491 		krping_cq_event_handler(cb->cq, cb);
1492 	}
1493 
1494 	/* Send STAG/TO/Len to client */
1495 	krping_format_send(cb, cb->start_dma_addr);
1496 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1497 	if (ret) {
1498 		PRINTF(cb, "post send error %d\n", ret);
1499 		return;
1500 	}
1501 
1502 	/* Spin waiting for send completion */
1503 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1504 	if (ret < 0) {
1505 		PRINTF(cb, "poll error %d\n", ret);
1506 		return;
1507 	}
1508 	if (wc.status) {
1509 		PRINTF(cb, "send completiong error %d\n", wc.status);
1510 		return;
1511 	}
1512 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1513 }
1514 
1515 static void krping_wlat_test_server(struct krping_cb *cb)
1516 {
1517 	struct ib_send_wr *bad_wr;
1518 	struct ib_wc wc;
1519 	int ret;
1520 
1521 	/* Spin waiting for client's Start STAG/TO/Len */
1522 	while (cb->state < RDMA_READ_ADV) {
1523 		krping_cq_event_handler(cb->cq, cb);
1524 	}
1525 
1526 	/* Send STAG/TO/Len to client */
1527 	krping_format_send(cb, cb->start_dma_addr);
1528 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1529 	if (ret) {
1530 		PRINTF(cb, "post send error %d\n", ret);
1531 		return;
1532 	}
1533 
1534 	/* Spin waiting for send completion */
1535 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1536 	if (ret < 0) {
1537 		PRINTF(cb, "poll error %d\n", ret);
1538 		return;
1539 	}
1540 	if (wc.status) {
1541 		PRINTF(cb, "send completiong error %d\n", wc.status);
1542 		return;
1543 	}
1544 
1545 	wlat_test(cb);
1546 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1547 }
1548 
1549 static void krping_bw_test_server(struct krping_cb *cb)
1550 {
1551 	struct ib_send_wr *bad_wr;
1552 	struct ib_wc wc;
1553 	int ret;
1554 
1555 	/* Spin waiting for client's Start STAG/TO/Len */
1556 	while (cb->state < RDMA_READ_ADV) {
1557 		krping_cq_event_handler(cb->cq, cb);
1558 	}
1559 
1560 	/* Send STAG/TO/Len to client */
1561 	krping_format_send(cb, cb->start_dma_addr);
1562 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1563 	if (ret) {
1564 		PRINTF(cb, "post send error %d\n", ret);
1565 		return;
1566 	}
1567 
1568 	/* Spin waiting for send completion */
1569 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1570 	if (ret < 0) {
1571 		PRINTF(cb, "poll error %d\n", ret);
1572 		return;
1573 	}
1574 	if (wc.status) {
1575 		PRINTF(cb, "send completiong error %d\n", wc.status);
1576 		return;
1577 	}
1578 
1579 	if (cb->duplex)
1580 		bw_test(cb);
1581 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1582 }
1583 
1584 static int fastreg_supported(struct krping_cb *cb, int server)
1585 {
1586 	struct ib_device *dev = server?cb->child_cm_id->device:
1587 					cb->cm_id->device;
1588 	struct ib_device_attr attr;
1589 	int ret;
1590 
1591 	ret = ib_query_device(dev, &attr);
1592 	if (ret) {
1593 		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1594 		return 0;
1595 	}
1596 	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1597 		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n",
1598 		    (unsigned long long)attr.device_cap_flags);
1599 		return 0;
1600 	}
1601 	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n",
1602 		(uintmax_t)attr.device_cap_flags);
1603 	return 1;
1604 }
1605 
1606 static int krping_bind_server(struct krping_cb *cb)
1607 {
1608 	union {
1609 		struct sockaddr_in v4;
1610 		struct sockaddr_in6 v6;
1611 	} sin;
1612 	int ret;
1613 
1614 	memset(&sin, 0, sizeof(sin));
1615 
1616 	switch (cb->addr_type) {
1617 	case AF_INET:
1618 		sin.v4.sin_len = sizeof sin.v4;
1619 		sin.v4.sin_family = AF_INET;
1620 		sin.v4.sin_addr = cb->addr.v4;
1621 		sin.v4.sin_port = cb->port;
1622 		break;
1623 	case AF_INET6:
1624 		sin.v6.sin6_len = sizeof sin.v6;
1625 		sin.v6.sin6_family = AF_INET6;
1626 		sin.v6.sin6_addr = cb->addr.v6;
1627 		sin.v6.sin6_port = cb->port;
1628 		break;
1629 	default:
1630 		return (-EINVAL);
1631 	}
1632 
1633 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1634 	if (ret) {
1635 		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1636 		return ret;
1637 	}
1638 	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1639 
1640 	DEBUG_LOG(cb, "rdma_listen\n");
1641 	ret = rdma_listen(cb->cm_id, 3);
1642 	if (ret) {
1643 		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1644 		return ret;
1645 	}
1646 
1647 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1648 	if (cb->state != CONNECT_REQUEST) {
1649 		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1650 			cb->state);
1651 		return -1;
1652 	}
1653 
1654 	if (cb->mem == FASTREG && !fastreg_supported(cb, 1))
1655 		return -EINVAL;
1656 
1657 	return 0;
1658 }
1659 
1660 /*
1661  * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads
1662  * complete.
1663  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
1664  */
1665 static void krping_fr_test5(struct krping_cb *cb)
1666 {
1667 	struct ib_fast_reg_page_list **pl;
1668 	struct ib_send_wr *fr, *read, *bad;
1669 	struct ib_wc wc;
1670 	struct ib_sge *sgl;
1671 	u8 key = 0;
1672 	struct ib_mr **mr;
1673 	u8 **buf;
1674 	dma_addr_t *dma_addr;
1675 	int i;
1676 	int ret;
1677 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1678 	time_t start;
1679 	int count = 0;
1680 	int scnt;
1681 	int depth = cb->txdepth >> 1;
1682 
1683 	if (!depth) {
1684 		PRINTF(cb, "txdepth must be > 1 for this test!\n");
1685 		return;
1686 	}
1687 
1688 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
1689 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
1690 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
1691 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
1692 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
1693 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
1694 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
1695 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
1696 	read = kzalloc(sizeof *read * depth, GFP_KERNEL);
1697 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth);
1698 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
1699 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
1700 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
1701 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
1702 	if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) {
1703 		PRINTF(cb, "kzalloc failed\n");
1704 		goto err1;
1705 	}
1706 
1707 	for (scnt = 0; scnt < depth; scnt++) {
1708 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1709 		if (IS_ERR(pl[scnt])) {
1710 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
1711 			       PTR_ERR(pl[scnt]));
1712 			goto err2;
1713 		}
1714 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
1715 
1716 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
1717 		if (IS_ERR(mr[scnt])) {
1718 			PRINTF(cb, "alloc_fr failed %ld\n",
1719 			       PTR_ERR(mr[scnt]));
1720 			goto err2;
1721 		}
1722 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
1723 		ib_update_fast_reg_key(mr[scnt], ++key);
1724 
1725 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
1726 		if (!buf[scnt]) {
1727 			PRINTF(cb, "kmalloc failed\n");
1728 			ret = -ENOMEM;
1729 			goto err2;
1730 		}
1731 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
1732 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
1733 						   buf[scnt], cb->size,
1734 						   DMA_BIDIRECTIONAL);
1735 		if (dma_mapping_error(cb->pd->device->dma_device,
1736 		    dma_addr[scnt])) {
1737 			PRINTF(cb, "dma_map failed\n");
1738 			ret = -ENOMEM;
1739 			goto err2;
1740 		}
1741 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
1742 		for (i=0; i<plen; i++) {
1743 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
1744 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
1745 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
1746 		}
1747 
1748 		sgl[scnt].lkey = mr[scnt]->rkey;
1749 		sgl[scnt].length = cb->size;
1750 		sgl[scnt].addr = (u64)buf[scnt];
1751 		DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n",
1752 			  __func__, scnt,  sgl[scnt].lkey, sgl[scnt].length,
1753 			  (uintmax_t)sgl[scnt].addr);
1754 
1755 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
1756 		fr[scnt].wr_id = scnt;
1757 		fr[scnt].send_flags = 0;
1758 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
1759 		fr[scnt].wr.fast_reg.length = cb->size;
1760 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
1761 		fr[scnt].wr.fast_reg.page_list_len = plen;
1762 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
1763 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1764 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
1765 		fr[scnt].next = &read[scnt];
1766 		read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV;
1767 		read[scnt].wr_id = scnt;
1768 		read[scnt].send_flags = IB_SEND_SIGNALED;
1769 		read[scnt].wr.rdma.rkey = cb->remote_rkey;
1770 		read[scnt].wr.rdma.remote_addr = cb->remote_addr;
1771 		read[scnt].num_sge = 1;
1772 		read[scnt].sg_list = &sgl[scnt];
1773 		ret = ib_post_send(cb->qp, &fr[scnt], &bad);
1774 		if (ret) {
1775 			PRINTF(cb, "ib_post_send failed %d\n", ret);
1776 			goto err2;
1777 		}
1778 	}
1779 
1780 	start = time_uptime;
1781 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
1782 	while (!cb->count || cb->server || count < cb->count) {
1783 		if ((time_uptime - start) >= 9) {
1784 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
1785 				  count);
1786 			wait_event_interruptible_timeout(cb->sem,
1787 							 cb->state == ERROR,
1788 							 1);
1789 			if (cb->state == ERROR)
1790 				break;
1791 			start = time_uptime;
1792 		}
1793 		do {
1794 			ret = ib_poll_cq(cb->cq, 1, &wc);
1795 			if (ret < 0) {
1796 				PRINTF(cb, "ib_poll_cq failed %d\n",
1797 				       ret);
1798 				goto err2;
1799 			}
1800 			if (ret == 1) {
1801 				if (wc.status) {
1802 					PRINTF(cb,
1803 					       "completion error %u wr_id %ju "
1804 					       "opcode %d\n", wc.status,
1805 					       (uintmax_t)wc.wr_id, wc.opcode);
1806 					goto err2;
1807 				}
1808 				count++;
1809 				if (count == cb->count)
1810 					break;
1811 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
1812 				fr[wc.wr_id].wr.fast_reg.rkey =
1813 					mr[wc.wr_id]->rkey;
1814 				sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey;
1815 				ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad);
1816 				if (ret) {
1817 					PRINTF(cb,
1818 					       "ib_post_send failed %d\n", ret);
1819 					goto err2;
1820 				}
1821 			} else if (krping_sigpending()) {
1822 				PRINTF(cb, "signal!\n");
1823 				goto err2;
1824 			}
1825 		} while (ret == 1);
1826 	}
1827 	DEBUG_LOG(cb, "%s done!\n", __func__);
1828 err2:
1829 	DEBUG_LOG(cb, "sleeping 1 second\n");
1830 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1831 	DEBUG_LOG(cb, "draining the cq...\n");
1832 	do {
1833 		ret = ib_poll_cq(cb->cq, 1, &wc);
1834 		if (ret < 0) {
1835 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1836 			break;
1837 		}
1838 		if (ret == 1) {
1839 			if (wc.status) {
1840 				PRINTF(cb, "completion error %u "
1841 				       "opcode %u\n", wc.status, wc.opcode);
1842 			}
1843 		}
1844 	} while (ret == 1);
1845 
1846 	DEBUG_LOG(cb, "destroying fr mrs!\n");
1847 	for (scnt = 0; scnt < depth; scnt++) {
1848 		if (mr[scnt]) {
1849 			ib_dereg_mr(mr[scnt]);
1850 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
1851 		}
1852 	}
1853 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
1854 	for (scnt = 0; scnt < depth; scnt++) {
1855 		if (buf[scnt]) {
1856 			dma_unmap_single(cb->pd->device->dma_device,
1857 					 dma_addr[scnt], cb->size,
1858 					 DMA_BIDIRECTIONAL);
1859 			kfree(buf[scnt]);
1860 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
1861 		}
1862 	}
1863 	DEBUG_LOG(cb, "destroying fr page lists!\n");
1864 	for (scnt = 0; scnt < depth; scnt++) {
1865 		if (pl[scnt]) {
1866 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
1867 			ib_free_fast_reg_page_list(pl[scnt]);
1868 		}
1869 	}
1870 err1:
1871 	if (pl)
1872 		kfree(pl);
1873 	if (mr)
1874 		kfree(mr);
1875 	if (fr)
1876 		kfree(fr);
1877 	if (read)
1878 		kfree(read);
1879 	if (sgl)
1880 		kfree(sgl);
1881 	if (buf)
1882 		kfree(buf);
1883 	if (dma_addr)
1884 		kfree(dma_addr);
1885 }
/*
 * Server side of the client-driven fastreg tests: nothing to do here
 * but block until the connection transitions to ERROR (disconnect).
 */
static void krping_fr_test_server(struct krping_cb *cb)
{
	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
	wait_event_interruptible(cb->sem, cb->state == ERROR);
}
1891 
1892 static void krping_fr_test5_server(struct krping_cb *cb)
1893 {
1894 	struct ib_send_wr *bad_wr;
1895 	struct ib_wc wc;
1896 	int ret;
1897 
1898 	/* Spin waiting for client's Start STAG/TO/Len */
1899 	while (cb->state < RDMA_READ_ADV) {
1900 		krping_cq_event_handler(cb->cq, cb);
1901 	}
1902 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
1903 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
1904 
1905 	/* Send STAG/TO/Len to client */
1906 	krping_format_send(cb, cb->start_dma_addr);
1907 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1908 	if (ret) {
1909 		PRINTF(cb, "post send error %d\n", ret);
1910 		return;
1911 	}
1912 
1913 	/* Spin waiting for send completion */
1914 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1915 	if (ret < 0) {
1916 		PRINTF(cb, "poll error %d\n", ret);
1917 		return;
1918 	}
1919 	if (wc.status) {
1920 		PRINTF(cb, "send completiong error %d\n", wc.status);
1921 		return;
1922 	}
1923 
1924 	if (cb->duplex)
1925 		krping_fr_test5(cb);
1926 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
1927 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1928 }
1929 
1930 static void krping_fr_test5_client(struct krping_cb *cb)
1931 {
1932 	struct ib_send_wr *bad;
1933 	struct ib_wc wc;
1934 	int ret;
1935 
1936 	cb->state = RDMA_READ_ADV;
1937 
1938 	/* Send STAG/TO/Len to server */
1939 	krping_format_send(cb, cb->start_dma_addr);
1940 	if (cb->state == ERROR) {
1941 		PRINTF(cb, "krping_format_send failed\n");
1942 		return;
1943 	}
1944 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
1945 	if (ret) {
1946 		PRINTF(cb, "post send error %d\n", ret);
1947 		return;
1948 	}
1949 
1950 	/* Spin waiting for send completion */
1951 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1952 	if (ret < 0) {
1953 		PRINTF(cb, "poll error %d\n", ret);
1954 		return;
1955 	}
1956 	if (wc.status) {
1957 		PRINTF(cb, "send completion error %d\n", wc.status);
1958 		return;
1959 	}
1960 
1961 	/* Spin waiting for server's Start STAG/TO/Len */
1962 	while (cb->state < RDMA_WRITE_ADV) {
1963 		krping_cq_event_handler(cb->cq, cb);
1964 	}
1965 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
1966 	    (uintmax_t)cb->remote_addr);
1967 
1968 	return krping_fr_test5(cb);
1969 }
1970 
1971 /*
1972  * sq-depth worth of write + fastreg + inv, reposting them as the invs
1973  * complete.
1974  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
1975  * If a count is given, then the last IO will have a bogus lkey in the
1976  * write work request.  This reproduces a fw bug where the connection
1977  * will get stuck if a fastreg is processed while the ulptx is failing
1978  * the bad write.
1979  */
1980 static void krping_fr_test6(struct krping_cb *cb)
1981 {
1982 	struct ib_fast_reg_page_list **pl;
1983 	struct ib_send_wr *fr, *write, *inv, *bad;
1984 	struct ib_wc wc;
1985 	struct ib_sge *sgl;
1986 	u8 key = 0;
1987 	struct ib_mr **mr;
1988 	u8 **buf;
1989 	dma_addr_t *dma_addr;
1990 	int i;
1991 	int ret;
1992 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1993 	unsigned long start;
1994 	int count = 0;
1995 	int scnt;
1996 	int depth = cb->txdepth  / 3;
1997 
1998 	if (!depth) {
1999 		PRINTF(cb, "txdepth must be > 3 for this test!\n");
2000 		return;
2001 	}
2002 
2003 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
2004 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
2005 
2006 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
2007 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
2008 
2009 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
2010 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
2011 
2012 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
2013 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
2014 
2015 	write = kzalloc(sizeof *write * depth, GFP_KERNEL);
2016 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth);
2017 
2018 	inv = kzalloc(sizeof *inv * depth, GFP_KERNEL);
2019 	DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth);
2020 
2021 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
2022 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
2023 
2024 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
2025 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
2026 
2027 	if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) {
2028 		PRINTF(cb, "kzalloc failed\n");
2029 		goto err1;
2030 	}
2031 
2032 	for (scnt = 0; scnt < depth; scnt++) {
2033 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2034 		if (IS_ERR(pl[scnt])) {
2035 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
2036 			       PTR_ERR(pl[scnt]));
2037 			goto err2;
2038 		}
2039 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
2040 
2041 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
2042 		if (IS_ERR(mr[scnt])) {
2043 			PRINTF(cb, "alloc_fr failed %ld\n",
2044 			       PTR_ERR(mr[scnt]));
2045 			goto err2;
2046 		}
2047 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
2048 		ib_update_fast_reg_key(mr[scnt], ++key);
2049 
2050 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
2051 		if (!buf[scnt]) {
2052 			PRINTF(cb, "kmalloc failed\n");
2053 			ret = -ENOMEM;
2054 			goto err2;
2055 		}
2056 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
2057 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
2058 						   buf[scnt], cb->size,
2059 						   DMA_BIDIRECTIONAL);
2060 		if (dma_mapping_error(cb->pd->device->dma_device,
2061 		    dma_addr[scnt])) {
2062 			PRINTF(cb, "dma_map failed\n");
2063 			ret = -ENOMEM;
2064 			goto err2;
2065 		}
2066 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
2067 		for (i=0; i<plen; i++) {
2068 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
2069 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
2070 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
2071 		}
2072 
2073 		write[scnt].opcode = IB_WR_RDMA_WRITE;
2074 		write[scnt].wr_id = scnt;
2075 		write[scnt].wr.rdma.rkey = cb->remote_rkey;
2076 		write[scnt].wr.rdma.remote_addr = cb->remote_addr;
2077 		write[scnt].num_sge = 1;
2078 		write[scnt].sg_list = &cb->rdma_sgl;
2079 		write[scnt].sg_list->length = cb->size;
2080 		write[scnt].next = &fr[scnt];
2081 
2082 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
2083 		fr[scnt].wr_id = scnt;
2084 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
2085 		fr[scnt].wr.fast_reg.length = cb->size;
2086 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
2087 		fr[scnt].wr.fast_reg.page_list_len = plen;
2088 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
2089 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2090 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
2091 		fr[scnt].next = &inv[scnt];
2092 
2093 		inv[scnt].opcode = IB_WR_LOCAL_INV;
2094 		inv[scnt].send_flags = IB_SEND_SIGNALED;
2095 		inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey;
2096 
2097 		ret = ib_post_send(cb->qp, &write[scnt], &bad);
2098 		if (ret) {
2099 			PRINTF(cb, "ib_post_send failed %d\n", ret);
2100 			goto err2;
2101 		}
2102 	}
2103 
2104 	start = time_uptime;
2105 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
2106 	while (!cb->count || cb->server || count < cb->count) {
2107 		if ((time_uptime - start) >= 9) {
2108 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
2109 				  count);
2110 			wait_event_interruptible_timeout(cb->sem,
2111 							 cb->state == ERROR,
2112 							 1);
2113 			if (cb->state == ERROR)
2114 				break;
2115 			start = time_uptime;
2116 		}
2117 		do {
2118 			ret = ib_poll_cq(cb->cq, 1, &wc);
2119 			if (ret < 0) {
2120 				PRINTF(cb, "ib_poll_cq failed %d\n",
2121 				       ret);
2122 				goto err2;
2123 			}
2124 			if (ret == 1) {
2125 				if (wc.status) {
2126 					PRINTF(cb,
2127 					       "completion error %u wr_id %ju "
2128 					       "opcode %d\n", wc.status,
2129 					       (uintmax_t)wc.wr_id, wc.opcode);
2130 					goto err2;
2131 				}
2132 				count++;
2133 				if (count == (cb->count -1))
2134 					cb->rdma_sgl.lkey = 0x00dead;
2135 				if (count == cb->count)
2136 					break;
2137 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
2138 				fr[wc.wr_id].wr.fast_reg.rkey =
2139 					mr[wc.wr_id]->rkey;
2140 				inv[wc.wr_id].ex.invalidate_rkey =
2141 					mr[wc.wr_id]->rkey;
2142 				ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad);
2143 				if (ret) {
2144 					PRINTF(cb,
2145 					       "ib_post_send failed %d\n", ret);
2146 					goto err2;
2147 				}
2148 			} else if (krping_sigpending()){
2149 				PRINTF(cb, "signal!\n");
2150 				goto err2;
2151 			}
2152 		} while (ret == 1);
2153 	}
2154 	DEBUG_LOG(cb, "%s done!\n", __func__);
2155 err2:
2156 	DEBUG_LOG(cb, "sleeping 1 second\n");
2157 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2158 	DEBUG_LOG(cb, "draining the cq...\n");
2159 	do {
2160 		ret = ib_poll_cq(cb->cq, 1, &wc);
2161 		if (ret < 0) {
2162 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2163 			break;
2164 		}
2165 		if (ret == 1) {
2166 			if (wc.status) {
2167 				PRINTF(cb, "completion error %u "
2168 				       "opcode %u\n", wc.status, wc.opcode);
2169 			}
2170 		}
2171 	} while (ret == 1);
2172 
2173 	DEBUG_LOG(cb, "destroying fr mrs!\n");
2174 	for (scnt = 0; scnt < depth; scnt++) {
2175 		if (mr[scnt]) {
2176 			ib_dereg_mr(mr[scnt]);
2177 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
2178 		}
2179 	}
2180 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
2181 	for (scnt = 0; scnt < depth; scnt++) {
2182 		if (buf[scnt]) {
2183 			dma_unmap_single(cb->pd->device->dma_device,
2184 					 dma_addr[scnt], cb->size,
2185 					 DMA_BIDIRECTIONAL);
2186 			kfree(buf[scnt]);
2187 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
2188 		}
2189 	}
2190 	DEBUG_LOG(cb, "destroying fr page lists!\n");
2191 	for (scnt = 0; scnt < depth; scnt++) {
2192 		if (pl[scnt]) {
2193 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
2194 			ib_free_fast_reg_page_list(pl[scnt]);
2195 		}
2196 	}
2197 err1:
2198 	if (pl)
2199 		kfree(pl);
2200 	if (mr)
2201 		kfree(mr);
2202 	if (fr)
2203 		kfree(fr);
2204 	if (write)
2205 		kfree(write);
2206 	if (inv)
2207 		kfree(inv);
2208 	if (sgl)
2209 		kfree(sgl);
2210 	if (buf)
2211 		kfree(buf);
2212 	if (dma_addr)
2213 		kfree(dma_addr);
2214 }
2215 
2216 static void krping_fr_test6_server(struct krping_cb *cb)
2217 {
2218 	struct ib_send_wr *bad_wr;
2219 	struct ib_wc wc;
2220 	int ret;
2221 
2222 	/* Spin waiting for client's Start STAG/TO/Len */
2223 	while (cb->state < RDMA_READ_ADV) {
2224 		krping_cq_event_handler(cb->cq, cb);
2225 	}
2226 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
2227 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
2228 
2229 	/* Send STAG/TO/Len to client */
2230 	krping_format_send(cb, cb->start_dma_addr);
2231 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2232 	if (ret) {
2233 		PRINTF(cb, "post send error %d\n", ret);
2234 		return;
2235 	}
2236 
2237 	/* Spin waiting for send completion */
2238 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2239 	if (ret < 0) {
2240 		PRINTF(cb, "poll error %d\n", ret);
2241 		return;
2242 	}
2243 	if (wc.status) {
2244 		PRINTF(cb, "send completiong error %d\n", wc.status);
2245 		return;
2246 	}
2247 
2248 	if (cb->duplex)
2249 		krping_fr_test6(cb);
2250 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
2251 	wait_event_interruptible(cb->sem, cb->state == ERROR);
2252 }
2253 
2254 static void krping_fr_test6_client(struct krping_cb *cb)
2255 {
2256 	struct ib_send_wr *bad;
2257 	struct ib_wc wc;
2258 	int ret;
2259 
2260 	cb->state = RDMA_READ_ADV;
2261 
2262 	/* Send STAG/TO/Len to server */
2263 	krping_format_send(cb, cb->start_dma_addr);
2264 	if (cb->state == ERROR) {
2265 		PRINTF(cb, "krping_format_send failed\n");
2266 		return;
2267 	}
2268 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
2269 	if (ret) {
2270 		PRINTF(cb, "post send error %d\n", ret);
2271 		return;
2272 	}
2273 
2274 	/* Spin waiting for send completion */
2275 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2276 	if (ret < 0) {
2277 		PRINTF(cb, "poll error %d\n", ret);
2278 		return;
2279 	}
2280 	if (wc.status) {
2281 		PRINTF(cb, "send completion error %d\n", wc.status);
2282 		return;
2283 	}
2284 
2285 	/* Spin waiting for server's Start STAG/TO/Len */
2286 	while (cb->state < RDMA_WRITE_ADV) {
2287 		krping_cq_event_handler(cb->cq, cb);
2288 	}
2289 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
2290 	    (uintmax_t)cb->remote_addr);
2291 
2292 	return krping_fr_test6(cb);
2293 }
2294 
2295 static void krping_run_server(struct krping_cb *cb)
2296 {
2297 	struct ib_recv_wr *bad_wr;
2298 	int ret;
2299 
2300 	ret = krping_bind_server(cb);
2301 	if (ret)
2302 		return;
2303 
2304 	ret = krping_setup_qp(cb, cb->child_cm_id);
2305 	if (ret) {
2306 		PRINTF(cb, "setup_qp failed: %d\n", ret);
2307 		goto err0;
2308 	}
2309 
2310 	ret = krping_setup_buffers(cb);
2311 	if (ret) {
2312 		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2313 		goto err1;
2314 	}
2315 
2316 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2317 	if (ret) {
2318 		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2319 		goto err2;
2320 	}
2321 
2322 	ret = krping_accept(cb);
2323 	if (ret) {
2324 		PRINTF(cb, "connect error %d\n", ret);
2325 		goto err2;
2326 	}
2327 
2328 	if (cb->wlat)
2329 		krping_wlat_test_server(cb);
2330 	else if (cb->rlat)
2331 		krping_rlat_test_server(cb);
2332 	else if (cb->bw)
2333 		krping_bw_test_server(cb);
2334 	else if (cb->frtest) {
2335 		switch (cb->testnum) {
2336 		case 1:
2337 		case 2:
2338 		case 3:
2339 		case 4:
2340 			krping_fr_test_server(cb);
2341 			break;
2342 		case 5:
2343 			krping_fr_test5_server(cb);
2344 			break;
2345 		case 6:
2346 			krping_fr_test6_server(cb);
2347 			break;
2348 		default:
2349 			PRINTF(cb, "unknown fr test %d\n", cb->testnum);
2350 			goto err2;
2351 			break;
2352 		}
2353 	} else
2354 		krping_test_server(cb);
2355 	rdma_disconnect(cb->child_cm_id);
2356 err2:
2357 	krping_free_buffers(cb);
2358 err1:
2359 	krping_free_qp(cb);
2360 err0:
2361 	rdma_destroy_id(cb->child_cm_id);
2362 }
2363 
2364 static void krping_test_client(struct krping_cb *cb)
2365 {
2366 	int ping, start, cc, i, ret;
2367 	struct ib_send_wr *bad_wr;
2368 	unsigned char c;
2369 
2370 	start = 65;
2371 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
2372 		cb->state = RDMA_READ_ADV;
2373 
2374 		/* Put some ascii text in the buffer. */
2375 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
2376 		for (i = cc, c = start; i < cb->size; i++) {
2377 			cb->start_buf[i] = c;
2378 			c++;
2379 			if (c > 122)
2380 				c = 65;
2381 		}
2382 		start++;
2383 		if (start > 122)
2384 			start = 65;
2385 		cb->start_buf[cb->size - 1] = 0;
2386 
2387 		krping_format_send(cb, cb->start_dma_addr);
2388 		if (cb->state == ERROR) {
2389 			PRINTF(cb, "krping_format_send failed\n");
2390 			break;
2391 		}
2392 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2393 		if (ret) {
2394 			PRINTF(cb, "post send error %d\n", ret);
2395 			break;
2396 		}
2397 
2398 		/* Wait for server to ACK */
2399 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
2400 		if (cb->state != RDMA_WRITE_ADV) {
2401 			PRINTF(cb,
2402 			       "wait for RDMA_WRITE_ADV state %d\n",
2403 			       cb->state);
2404 			break;
2405 		}
2406 
2407 		krping_format_send(cb, cb->rdma_dma_addr);
2408 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2409 		if (ret) {
2410 			PRINTF(cb, "post send error %d\n", ret);
2411 			break;
2412 		}
2413 
2414 		/* Wait for the server to say the RDMA Write is complete. */
2415 		wait_event_interruptible(cb->sem,
2416 					 cb->state >= RDMA_WRITE_COMPLETE);
2417 		if (cb->state != RDMA_WRITE_COMPLETE) {
2418 			PRINTF(cb,
2419 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
2420 			       cb->state);
2421 			break;
2422 		}
2423 
2424 		if (cb->validate)
2425 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
2426 				PRINTF(cb, "data mismatch!\n");
2427 				break;
2428 			}
2429 
2430 		if (cb->verbose) {
2431 			if (strlen(cb->rdma_buf) > 128) {
2432 				char msgbuf[128];
2433 
2434 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
2435 				PRINTF(cb, "ping data stripped: %s\n",
2436 				       msgbuf);
2437 			} else
2438 				PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
2439 		}
2440 #ifdef SLOW_KRPING
2441 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2442 #endif
2443 	}
2444 }
2445 
2446 static void krping_rlat_test_client(struct krping_cb *cb)
2447 {
2448 	struct ib_send_wr *bad_wr;
2449 	struct ib_wc wc;
2450 	int ret;
2451 
2452 	cb->state = RDMA_READ_ADV;
2453 
2454 	/* Send STAG/TO/Len to client */
2455 	krping_format_send(cb, cb->start_dma_addr);
2456 	if (cb->state == ERROR) {
2457 		PRINTF(cb, "krping_format_send failed\n");
2458 		return;
2459 	}
2460 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2461 	if (ret) {
2462 		PRINTF(cb, "post send error %d\n", ret);
2463 		return;
2464 	}
2465 
2466 	/* Spin waiting for send completion */
2467 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2468 	if (ret < 0) {
2469 		PRINTF(cb, "poll error %d\n", ret);
2470 		return;
2471 	}
2472 	if (wc.status) {
2473 		PRINTF(cb, "send completion error %d\n", wc.status);
2474 		return;
2475 	}
2476 
2477 	/* Spin waiting for server's Start STAG/TO/Len */
2478 	while (cb->state < RDMA_WRITE_ADV) {
2479 		krping_cq_event_handler(cb->cq, cb);
2480 	}
2481 
2482 #if 0
2483 {
2484 	int i;
2485 	struct timeval start, stop;
2486 	time_t sec;
2487 	suseconds_t usec;
2488 	unsigned long long elapsed;
2489 	struct ib_wc wc;
2490 	struct ib_send_wr *bad_wr;
2491 	int ne;
2492 
2493 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
2494 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
2495 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
2496 	cb->rdma_sq_wr.sg_list->length = 0;
2497 	cb->rdma_sq_wr.num_sge = 0;
2498 
2499 	microtime(&start);
2500 	for (i=0; i < 100000; i++) {
2501 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
2502 			PRINTF(cb, "Couldn't post send\n");
2503 			return;
2504 		}
2505 		do {
2506 			ne = ib_poll_cq(cb->cq, 1, &wc);
2507 		} while (ne == 0);
2508 		if (ne < 0) {
2509 			PRINTF(cb, "poll CQ failed %d\n", ne);
2510 			return;
2511 		}
2512 		if (wc.status != IB_WC_SUCCESS) {
2513 			PRINTF(cb, "Completion wth error at %s:\n",
2514 				cb->server ? "server" : "client");
2515 			PRINTF(cb, "Failed status %d: wr_id %d\n",
2516 				wc.status, (int) wc.wr_id);
2517 			return;
2518 		}
2519 	}
2520 	microtime(&stop);
2521 
2522 	if (stop.tv_usec < start.tv_usec) {
2523 		stop.tv_usec += 1000000;
2524 		stop.tv_sec  -= 1;
2525 	}
2526 	sec     = stop.tv_sec - start.tv_sec;
2527 	usec    = stop.tv_usec - start.tv_usec;
2528 	elapsed = sec * 1000000 + usec;
2529 	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
2530 }
2531 #endif
2532 
2533 	rlat_test(cb);
2534 }
2535 
2536 static void krping_wlat_test_client(struct krping_cb *cb)
2537 {
2538 	struct ib_send_wr *bad_wr;
2539 	struct ib_wc wc;
2540 	int ret;
2541 
2542 	cb->state = RDMA_READ_ADV;
2543 
2544 	/* Send STAG/TO/Len to client */
2545 	krping_format_send(cb, cb->start_dma_addr);
2546 	if (cb->state == ERROR) {
2547 		PRINTF(cb, "krping_format_send failed\n");
2548 		return;
2549 	}
2550 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2551 	if (ret) {
2552 		PRINTF(cb, "post send error %d\n", ret);
2553 		return;
2554 	}
2555 
2556 	/* Spin waiting for send completion */
2557 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2558 	if (ret < 0) {
2559 		PRINTF(cb, "poll error %d\n", ret);
2560 		return;
2561 	}
2562 	if (wc.status) {
2563 		PRINTF(cb, "send completion error %d\n", wc.status);
2564 		return;
2565 	}
2566 
2567 	/* Spin waiting for server's Start STAG/TO/Len */
2568 	while (cb->state < RDMA_WRITE_ADV) {
2569 		krping_cq_event_handler(cb->cq, cb);
2570 	}
2571 
2572 	wlat_test(cb);
2573 }
2574 
2575 static void krping_bw_test_client(struct krping_cb *cb)
2576 {
2577 	struct ib_send_wr *bad_wr;
2578 	struct ib_wc wc;
2579 	int ret;
2580 
2581 	cb->state = RDMA_READ_ADV;
2582 
2583 	/* Send STAG/TO/Len to client */
2584 	krping_format_send(cb, cb->start_dma_addr);
2585 	if (cb->state == ERROR) {
2586 		PRINTF(cb, "krping_format_send failed\n");
2587 		return;
2588 	}
2589 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2590 	if (ret) {
2591 		PRINTF(cb, "post send error %d\n", ret);
2592 		return;
2593 	}
2594 
2595 	/* Spin waiting for send completion */
2596 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2597 	if (ret < 0) {
2598 		PRINTF(cb, "poll error %d\n", ret);
2599 		return;
2600 	}
2601 	if (wc.status) {
2602 		PRINTF(cb, "send completion error %d\n", wc.status);
2603 		return;
2604 	}
2605 
2606 	/* Spin waiting for server's Start STAG/TO/Len */
2607 	while (cb->state < RDMA_WRITE_ADV) {
2608 		krping_cq_event_handler(cb->cq, cb);
2609 	}
2610 
2611 	bw_test(cb);
2612 }
2613 
2614 
2615 /*
2616  * fastreg 2 valid different mrs and verify the completions.
2617  */
2618 static void krping_fr_test1(struct krping_cb *cb)
2619 {
2620 	struct ib_fast_reg_page_list *pl;
2621 	struct ib_send_wr fr, *bad;
2622 	struct ib_wc wc;
2623 	struct ib_mr *mr1, *mr2;
2624 	int i;
2625 	int ret;
2626 	int size = cb->size;
2627 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
2628 	int count = 0;
2629 
2630 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2631 	if (IS_ERR(pl)) {
2632 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
2633 		return;
2634 	}
2635 
2636 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
2637 	if (IS_ERR(mr1)) {
2638 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
2639 		goto err1;
2640 	}
2641 	mr2 = ib_alloc_fast_reg_mr(cb->pd, plen);
2642 	if (IS_ERR(mr2)) {
2643 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
2644 		goto err2;
2645 	}
2646 
2647 
2648 	for (i=0; i<plen; i++)
2649 		pl->page_list[i] = i * PAGE_SIZE;
2650 
2651 	memset(&fr, 0, sizeof fr);
2652 	fr.opcode = IB_WR_FAST_REG_MR;
2653 	fr.wr_id = 1;
2654 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
2655 	fr.wr.fast_reg.length = size;
2656 	fr.wr.fast_reg.page_list = pl;
2657 	fr.wr.fast_reg.page_list_len = plen;
2658 	fr.wr.fast_reg.iova_start = 0;
2659 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2660 	fr.send_flags = IB_SEND_SIGNALED;
2661 	fr.wr.fast_reg.rkey = mr1->rkey;
2662 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
2663 	ret = ib_post_send(cb->qp, &fr, &bad);
2664 	if (ret) {
2665 		PRINTF(cb, "ib_post_send failed %d\n", ret);
2666 		goto err3;
2667 	}
2668 	fr.wr.fast_reg.rkey = mr2->rkey;
2669 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
2670 	ret = ib_post_send(cb->qp, &fr, &bad);
2671 	if (ret) {
2672 		PRINTF(cb, "ib_post_send failed %d\n", ret);
2673 		goto err3;
2674 	}
2675 
2676 	DEBUG_LOG(cb, "sleeping 1 second\n");
2677 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2678 	do {
2679 		ret = ib_poll_cq(cb->cq, 1, &wc);
2680 		if (ret < 0) {
2681 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2682 			goto err3;
2683 		}
2684 		if (ret == 1) {
2685 			DEBUG_LOG(cb, "completion status %u wr %s\n",
2686 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
2687 			count++;
2688 		} else if (krping_sigpending()) {
2689 			PRINTF(cb, "signal!\n");
2690 			goto err3;
2691 		}
2692 
2693 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2694 	} while (count != 2);
2695 err3:
2696 	DEBUG_LOG(cb, "sleeping 1 second\n");
2697 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2698 	DEBUG_LOG(cb, "draining the cq...\n");
2699 	do {
2700 		ret = ib_poll_cq(cb->cq, 1, &wc);
2701 		if (ret < 0) {
2702 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2703 			break;
2704 		}
2705 		if (ret == 1) {
2706 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
2707 		}
2708 	} while (ret == 1);
2709 	DEBUG_LOG(cb, "destroying fr mr2!\n");
2710 
2711 	ib_dereg_mr(mr2);
2712 err2:
2713 	DEBUG_LOG(cb, "destroying fr mr1!\n");
2714 	ib_dereg_mr(mr1);
2715 err1:
2716 	DEBUG_LOG(cb, "destroying fr page list!\n");
2717 	ib_free_fast_reg_page_list(pl);
2718 	DEBUG_LOG(cb, "%s done!\n", __func__);
2719 }
2720 
2721 /*
2722  * fastreg the same mr twice, 2nd one should produce error cqe.
2723  */
2724 static void krping_fr_test2(struct krping_cb *cb)
2725 {
2726 	struct ib_fast_reg_page_list *pl;
2727 	struct ib_send_wr fr, *bad;
2728 	struct ib_wc wc;
2729 	struct ib_mr *mr1;
2730 	int i;
2731 	int ret;
2732 	int size = cb->size;
2733 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
2734 	int count = 0;
2735 
2736 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2737 	if (IS_ERR(pl)) {
2738 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
2739 		return;
2740 	}
2741 
2742 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
2743 	if (IS_ERR(mr1)) {
2744 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
2745 		goto err1;
2746 	}
2747 
2748 	for (i=0; i<plen; i++)
2749 		pl->page_list[i] = i * PAGE_SIZE;
2750 
2751 	memset(&fr, 0, sizeof fr);
2752 	fr.opcode = IB_WR_FAST_REG_MR;
2753 	fr.wr_id = 1;
2754 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
2755 	fr.wr.fast_reg.length = size;
2756 	fr.wr.fast_reg.page_list = pl;
2757 	fr.wr.fast_reg.page_list_len = plen;
2758 	fr.wr.fast_reg.iova_start = 0;
2759 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2760 	fr.send_flags = IB_SEND_SIGNALED;
2761 	fr.wr.fast_reg.rkey = mr1->rkey;
2762 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
2763 	ret = ib_post_send(cb->qp, &fr, &bad);
2764 	if (ret) {
2765 		PRINTF(cb, "ib_post_send failed %d\n", ret);
2766 		goto err3;
2767 	}
2768 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
2769 	ret = ib_post_send(cb->qp, &fr, &bad);
2770 	if (ret) {
2771 		PRINTF(cb, "ib_post_send failed %d\n", ret);
2772 		goto err3;
2773 	}
2774 
2775 	DEBUG_LOG(cb, "sleeping 1 second\n");
2776 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2777 	do {
2778 		ret = ib_poll_cq(cb->cq, 1, &wc);
2779 		if (ret < 0) {
2780 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2781 			goto err3;
2782 		}
2783 		if (ret == 1) {
2784 			DEBUG_LOG(cb, "completion status %u wr %s\n",
2785 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
2786 			count++;
2787 		} else if (krping_sigpending()) {
2788 			PRINTF(cb, "signal!\n");
2789 			goto err3;
2790 		}
2791 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2792 	} while (count != 2);
2793 err3:
2794 	DEBUG_LOG(cb, "sleeping 1 second\n");
2795 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2796 	DEBUG_LOG(cb, "draining the cq...\n");
2797 	do {
2798 		ret = ib_poll_cq(cb->cq, 1, &wc);
2799 		if (ret < 0) {
2800 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2801 			break;
2802 		}
2803 		if (ret == 1) {
2804 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
2805 		}
2806 	} while (ret == 1);
2807 	DEBUG_LOG(cb, "destroying fr mr1!\n");
2808 	ib_dereg_mr(mr1);
2809 err1:
2810 	DEBUG_LOG(cb, "destroying fr page list!\n");
2811 	ib_free_fast_reg_page_list(pl);
2812 	DEBUG_LOG(cb, "%s done!\n", __func__);
2813 }
2814 
2815 /*
2816  * fastreg pipelined in a loop as fast as we can until the user interrupts.
2817  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
2818  */
2819 static void krping_fr_test3(struct krping_cb *cb)
2820 {
2821 	struct ib_fast_reg_page_list *pl;
2822 	struct ib_send_wr fr, inv, *bad;
2823 	struct ib_wc wc;
2824 	u8 key = 0;
2825 	struct ib_mr *mr;
2826 	int i;
2827 	int ret;
2828 	int size = cb->size;
2829 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
2830 	unsigned long start;
2831 	int count = 0;
2832 	int scnt = 0;
2833 
2834 
2835 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2836 	if (IS_ERR(pl)) {
2837 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
2838 		return;
2839 	}
2840 
2841 	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
2842 	if (IS_ERR(mr)) {
2843 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
2844 		goto err1;
2845 	}
2846 
2847 	for (i=0; i<plen; i++)
2848 		pl->page_list[i] = i * PAGE_SIZE;
2849 
2850 	memset(&fr, 0, sizeof fr);
2851 	fr.opcode = IB_WR_FAST_REG_MR;
2852 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
2853 	fr.wr.fast_reg.length = size;
2854 	fr.wr.fast_reg.page_list = pl;
2855 	fr.wr.fast_reg.page_list_len = plen;
2856 	fr.wr.fast_reg.iova_start = 0;
2857 	fr.send_flags = IB_SEND_SIGNALED;
2858 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2859 	fr.next = &inv;
2860 	memset(&inv, 0, sizeof inv);
2861 	inv.opcode = IB_WR_LOCAL_INV;
2862 	inv.send_flags = IB_SEND_SIGNALED;
2863 
2864 	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
2865 	start = time_uptime;
2866 	while (1) {
2867 		if ((time_uptime - start) >= 9) {
2868 			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
2869 			wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2870 			if (cb->state == ERROR)
2871 				break;
2872 			start = time_uptime;
2873 		}
2874 		while (scnt < (cb->txdepth>>1)) {
2875 			ib_update_fast_reg_key(mr, ++key);
2876 			fr.wr.fast_reg.rkey = mr->rkey;
2877 			inv.ex.invalidate_rkey = mr->rkey;
2878 			size = arc4random() % cb->size;
2879 			if (size == 0)
2880 				size = cb->size;
2881 			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
2882 			fr.wr.fast_reg.length = size;
2883 			fr.wr.fast_reg.page_list_len = plen;
2884 			ret = ib_post_send(cb->qp, &fr, &bad);
2885 			if (ret) {
2886 				PRINTF(cb, "ib_post_send failed %d\n", ret);
2887 				goto err2;
2888 			}
2889 			scnt+=2;
2890 		}
2891 
2892 		do {
2893 			ret = ib_poll_cq(cb->cq, 1, &wc);
2894 			if (ret < 0) {
2895 				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2896 				goto err2;
2897 			}
2898 			if (ret == 1) {
2899 				if (wc.status) {
2900 					PRINTF(cb, "completion error %u\n", wc.status);
2901 					goto err2;
2902 				}
2903 				count++;
2904 				scnt--;
2905 			}
2906 			else if (krping_sigpending()) {
2907 				PRINTF(cb, "signal!\n");
2908 				goto err2;
2909 			}
2910 		} while (ret == 1);
2911 	}
2912 err2:
2913 	DEBUG_LOG(cb, "sleeping 1 second\n");
2914 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2915 	DEBUG_LOG(cb, "draining the cq...\n");
2916 	do {
2917 		ret = ib_poll_cq(cb->cq, 1, &wc);
2918 		if (ret < 0) {
2919 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2920 			break;
2921 		}
2922 		if (ret == 1) {
2923 			if (wc.status) {
2924 				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2925 			}
2926 		}
2927 	} while (ret == 1);
2928 	DEBUG_LOG(cb, "fr_test: done!\n");
2929 	ib_dereg_mr(mr);
2930 err1:
2931 	DEBUG_LOG(cb, "destroying fr page list!\n");
2932 	ib_free_fast_reg_page_list(pl);
2933 	DEBUG_LOG(cb, "%s done!\n", __func__);
2934 }
2935 
2936 /*
2937  * fastreg 1 and invalidate 1 mr and verify completion.
2938  */
2939 static void krping_fr_test4(struct krping_cb *cb)
2940 {
2941 	struct ib_fast_reg_page_list *pl;
2942 	struct ib_send_wr fr, inv, *bad;
2943 	struct ib_wc wc;
2944 	struct ib_mr *mr1;
2945 	int i;
2946 	int ret;
2947 	int size = cb->size;
2948 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
2949 	int count = 0;
2950 
2951 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2952 	if (IS_ERR(pl)) {
2953 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
2954 		return;
2955 	}
2956 
2957 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
2958 	if (IS_ERR(mr1)) {
2959 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
2960 		goto err1;
2961 	}
2962 
2963 	for (i=0; i<plen; i++)
2964 		pl->page_list[i] = i * PAGE_SIZE;
2965 
2966 	memset(&fr, 0, sizeof fr);
2967 	fr.opcode = IB_WR_FAST_REG_MR;
2968 	fr.wr_id = 1;
2969 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
2970 	fr.wr.fast_reg.length = size;
2971 	fr.wr.fast_reg.page_list = pl;
2972 	fr.wr.fast_reg.page_list_len = plen;
2973 	fr.wr.fast_reg.iova_start = 0;
2974 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2975 	fr.send_flags = IB_SEND_SIGNALED;
2976 	fr.wr.fast_reg.rkey = mr1->rkey;
2977 	fr.next = &inv;
2978 	memset(&inv, 0, sizeof inv);
2979 	inv.opcode = IB_WR_LOCAL_INV;
2980 	inv.ex.invalidate_rkey = mr1->rkey;
2981 
2982 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
2983 	ret = ib_post_send(cb->qp, &fr, &bad);
2984 	if (ret) {
2985 		PRINTF(cb, "ib_post_send failed %d\n", ret);
2986 		goto err3;
2987 	}
2988 	DEBUG_LOG(cb, "sleeping 1 second\n");
2989 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2990 	do {
2991 		ret = ib_poll_cq(cb->cq, 1, &wc);
2992 		if (ret < 0) {
2993 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2994 			goto err3;
2995 		}
2996 		if (ret == 1) {
2997 			DEBUG_LOG(cb, "completion status %u wr %s\n",
2998 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
2999 			count++;
3000 		} else if (krping_sigpending()) {
3001 			PRINTF(cb, "signal!\n");
3002 			goto err3;
3003 		}
3004 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
3005 	} while (count != 1);
3006 err3:
3007 	DEBUG_LOG(cb, "sleeping 1 second\n");
3008 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
3009 	DEBUG_LOG(cb, "draining the cq...\n");
3010 	do {
3011 		ret = ib_poll_cq(cb->cq, 1, &wc);
3012 		if (ret < 0) {
3013 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
3014 			break;
3015 		}
3016 		if (ret == 1) {
3017 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
3018 		}
3019 	} while (ret == 1);
3020 	DEBUG_LOG(cb, "destroying fr mr1!\n");
3021 	ib_dereg_mr(mr1);
3022 err1:
3023 	DEBUG_LOG(cb, "destroying fr page list!\n");
3024 	ib_free_fast_reg_page_list(pl);
3025 	DEBUG_LOG(cb, "%s done!\n", __func__);
3026 }
3027 
3028 static void krping_fr_test(struct krping_cb *cb)
3029 {
3030 	switch (cb->testnum) {
3031 	case 1:
3032 		krping_fr_test1(cb);
3033 		break;
3034 	case 2:
3035 		krping_fr_test2(cb);
3036 		break;
3037 	case 3:
3038 		krping_fr_test3(cb);
3039 		break;
3040 	case 4:
3041 		krping_fr_test4(cb);
3042 		break;
3043 	case 5:
3044 		krping_fr_test5_client(cb);
3045 		break;
3046 	case 6:
3047 		krping_fr_test6_client(cb);
3048 		break;
3049 	default:
3050 		PRINTF(cb, "Unkown frtest num %u\n", cb->testnum);
3051 		break;
3052 	}
3053 }
3054 
3055 static int krping_connect_client(struct krping_cb *cb)
3056 {
3057 	struct rdma_conn_param conn_param;
3058 	int ret;
3059 
3060 	memset(&conn_param, 0, sizeof conn_param);
3061 	conn_param.responder_resources = 1;
3062 	conn_param.initiator_depth = 1;
3063 	conn_param.retry_count = 10;
3064 
3065 	ret = rdma_connect(cb->cm_id, &conn_param);
3066 	if (ret) {
3067 		PRINTF(cb, "rdma_connect error %d\n", ret);
3068 		return ret;
3069 	}
3070 
3071 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
3072 	if (cb->state == ERROR) {
3073 		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
3074 		return -1;
3075 	}
3076 
3077 	DEBUG_LOG(cb, "rdma_connect successful\n");
3078 	return 0;
3079 }
3080 
3081 static int krping_bind_client(struct krping_cb *cb)
3082 {
3083 	union {
3084 		struct sockaddr_in v4;
3085 		struct sockaddr_in6 v6;
3086 	} sin;
3087 	int ret;
3088 
3089 	memset(&sin, 0, sizeof(sin));
3090 
3091 	switch (cb->addr_type) {
3092 	case AF_INET:
3093 		sin.v4.sin_len = sizeof sin.v4;
3094 		sin.v4.sin_family = AF_INET;
3095 		sin.v4.sin_addr = cb->addr.v4;
3096 		sin.v4.sin_port = cb->port;
3097 		break;
3098 	case AF_INET6:
3099 		sin.v6.sin6_len = sizeof sin.v6;
3100 		sin.v6.sin6_family = AF_INET6;
3101 		sin.v6.sin6_addr = cb->addr.v6;
3102 		sin.v6.sin6_port = cb->port;
3103 		break;
3104 	default:
3105 		return (-EINVAL);
3106 	}
3107 
3108 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
3109 				2000);
3110 	if (ret) {
3111 		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
3112 		return ret;
3113 	}
3114 
3115 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
3116 	if (cb->state != ROUTE_RESOLVED) {
3117 		PRINTF(cb,
3118 		       "addr/route resolution did not resolve: state %d\n",
3119 		       cb->state);
3120 		return -EINTR;
3121 	}
3122 
3123 	if (cb->mem == FASTREG && !fastreg_supported(cb, 0))
3124 		return -EINVAL;
3125 
3126 	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
3127 	return 0;
3128 }
3129 
3130 static void krping_run_client(struct krping_cb *cb)
3131 {
3132 	struct ib_recv_wr *bad_wr;
3133 	int ret;
3134 
3135 	ret = krping_bind_client(cb);
3136 	if (ret)
3137 		return;
3138 
3139 	ret = krping_setup_qp(cb, cb->cm_id);
3140 	if (ret) {
3141 		PRINTF(cb, "setup_qp failed: %d\n", ret);
3142 		return;
3143 	}
3144 
3145 	ret = krping_setup_buffers(cb);
3146 	if (ret) {
3147 		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
3148 		goto err1;
3149 	}
3150 
3151 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
3152 	if (ret) {
3153 		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
3154 		goto err2;
3155 	}
3156 
3157 	ret = krping_connect_client(cb);
3158 	if (ret) {
3159 		PRINTF(cb, "connect error %d\n", ret);
3160 		goto err2;
3161 	}
3162 
3163 	if (cb->wlat)
3164 		krping_wlat_test_client(cb);
3165 	else if (cb->rlat)
3166 		krping_rlat_test_client(cb);
3167 	else if (cb->bw)
3168 		krping_bw_test_client(cb);
3169 	else if (cb->frtest)
3170 		krping_fr_test(cb);
3171 	else
3172 		krping_test_client(cb);
3173 	rdma_disconnect(cb->cm_id);
3174 err2:
3175 	krping_free_buffers(cb);
3176 err1:
3177 	krping_free_qp(cb);
3178 }
3179 
3180 static uint16_t
3181 krping_get_ipv6_scope_id(char *name)
3182 {
3183 	struct ifnet *ifp;
3184 	uint16_t retval;
3185 
3186 	if (name == NULL)
3187 		return (0);
3188 	CURVNET_SET_QUIET(TD_TO_VNET(curthread));
3189 	ifp = ifunit_ref(name);
3190 	CURVNET_RESTORE();
3191 	if (ifp == NULL)
3192 		return (0);
3193 	retval = ifp->if_index;
3194 	if_rele(ifp);
3195 	return (retval);
3196 }
3197 
3198 int krping_doit(char *cmd, void *cookie)
3199 {
3200 	struct krping_cb *cb;
3201 	int op;
3202 	int ret = 0;
3203 	char *optarg;
3204 	char *scope;
3205 	unsigned long optint;
3206 
3207 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
3208 	if (!cb)
3209 		return -ENOMEM;
3210 
3211 	mutex_lock(&krping_mutex);
3212 	list_add_tail(&cb->list, &krping_cbs);
3213 	mutex_unlock(&krping_mutex);
3214 
3215 	cb->cookie = cookie;
3216 	cb->server = -1;
3217 	cb->state = IDLE;
3218 	cb->size = 64;
3219 	cb->txdepth = RPING_SQ_DEPTH;
3220 	cb->mem = DMA;
3221 	cb->addr_type = AF_INET;
3222 	init_waitqueue_head(&cb->sem);
3223 
3224 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
3225 			      &optint)) != 0) {
3226 		switch (op) {
3227 		case 'a':
3228 			cb->addr_str = optarg;
3229 			cb->addr_type = AF_INET;
3230 			DEBUG_LOG(cb, "ipv4addr (%s)\n", optarg);
3231 			if (inet_pton(AF_INET, optarg, &cb->addr) != 1) {
3232 				PRINTF(cb, "bad addr string %s\n",
3233 				    optarg);
3234 				ret = EINVAL;
3235 			}
3236 			break;
3237 		case 'A':
3238 			cb->addr_str = optarg;
3239 			cb->addr_type = AF_INET6;
3240 			DEBUG_LOG(cb, "ipv6addr (%s)\n", optarg);
3241 			scope = strstr(optarg, "%");
3242 			/* extract scope ID, if any */
3243 			if (scope != NULL)
3244 				*scope++ = 0;
3245 			/* extract IPv6 network address */
3246 			if (inet_pton(AF_INET6, optarg, &cb->addr) != 1) {
3247 				PRINTF(cb, "bad addr string %s\n",
3248 				    optarg);
3249 				ret = EINVAL;
3250 			} else if (IN6_IS_SCOPE_LINKLOCAL(&cb->addr.v6) ||
3251 			    IN6_IS_ADDR_MC_INTFACELOCAL(&cb->addr.v6)) {
3252 				uint16_t scope_id = krping_get_ipv6_scope_id(scope);
3253 				DEBUG_LOG(cb, "ipv6 scope ID = %d\n", scope_id);
3254 				cb->addr.v6.s6_addr[2] = scope_id >> 8;
3255 				cb->addr.v6.s6_addr[3] = scope_id & 0xFF;
3256 			}
3257 			break;
3258 		case 'p':
3259 			cb->port = htons(optint);
3260 			DEBUG_LOG(cb, "port %d\n", (int)optint);
3261 			break;
3262 		case 'P':
3263 			cb->poll = 1;
3264 			DEBUG_LOG(cb, "server\n");
3265 			break;
3266 		case 's':
3267 			cb->server = 1;
3268 			DEBUG_LOG(cb, "server\n");
3269 			break;
3270 		case 'c':
3271 			cb->server = 0;
3272 			DEBUG_LOG(cb, "client\n");
3273 			break;
3274 		case 'S':
3275 			cb->size = optint;
3276 			if ((cb->size < 1) ||
3277 			    (cb->size > RPING_BUFSIZE)) {
3278 				PRINTF(cb, "Invalid size %d "
3279 				       "(valid range is 1 to %d)\n",
3280 				       cb->size, RPING_BUFSIZE);
3281 				ret = EINVAL;
3282 			} else
3283 				DEBUG_LOG(cb, "size %d\n", (int)optint);
3284 			break;
3285 		case 'C':
3286 			cb->count = optint;
3287 			if (cb->count < 0) {
3288 				PRINTF(cb, "Invalid count %d\n",
3289 					cb->count);
3290 				ret = EINVAL;
3291 			} else
3292 				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
3293 			break;
3294 		case 'v':
3295 			cb->verbose++;
3296 			DEBUG_LOG(cb, "verbose\n");
3297 			break;
3298 		case 'V':
3299 			cb->validate++;
3300 			DEBUG_LOG(cb, "validate data\n");
3301 			break;
3302 		case 'l':
3303 			cb->wlat++;
3304 			break;
3305 		case 'L':
3306 			cb->rlat++;
3307 			break;
3308 		case 'B':
3309 			cb->bw++;
3310 			break;
3311 		case 'd':
3312 			cb->duplex++;
3313 			break;
3314 		case 'm':
3315 			if (!strncmp(optarg, "dma", 3))
3316 				cb->mem = DMA;
3317 			else if (!strncmp(optarg, "fastreg", 7))
3318 				cb->mem = FASTREG;
3319 			else if (!strncmp(optarg, "mw", 2))
3320 				cb->mem = MW;
3321 			else if (!strncmp(optarg, "mr", 2))
3322 				cb->mem = MR;
3323 			else {
3324 				PRINTF(cb, "unknown mem mode %s.  "
3325 					"Must be dma, fastreg, mw, or mr\n",
3326 					optarg);
3327 				ret = -EINVAL;
3328 				break;
3329 			}
3330 			break;
3331 		case 'I':
3332 			cb->server_invalidate = 1;
3333 			break;
3334 		case 'T':
3335 			cb->txdepth = optint;
3336 			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
3337 			break;
3338 		case 'Z':
3339 			cb->local_dma_lkey = 1;
3340 			DEBUG_LOG(cb, "using local dma lkey\n");
3341 			break;
3342 		case 'R':
3343 			cb->read_inv = 1;
3344 			DEBUG_LOG(cb, "using read-with-inv\n");
3345 			break;
3346 		case 'f':
3347 			cb->frtest = 1;
3348 			cb->testnum = optint;
3349 			DEBUG_LOG(cb, "fast-reg test!\n");
3350 			break;
3351 		default:
3352 			PRINTF(cb, "unknown opt %s\n", optarg);
3353 			ret = -EINVAL;
3354 			break;
3355 		}
3356 	}
3357 	if (ret)
3358 		goto out;
3359 
3360 	if (cb->server == -1) {
3361 		PRINTF(cb, "must be either client or server\n");
3362 		ret = -EINVAL;
3363 		goto out;
3364 	}
3365 
3366 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
3367 		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
3368 		ret = -EINVAL;
3369 		goto out;
3370 	}
3371 	if (cb->server_invalidate && cb->mem != FASTREG) {
3372 		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
3373 		ret = -EINVAL;
3374 		goto out;
3375 	}
3376 
3377 	if (cb->read_inv && cb->mem != FASTREG) {
3378 		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
3379 		ret = -EINVAL;
3380 		goto out;
3381 	}
3382 
3383 	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) {
3384 		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
3385 		ret = -EINVAL;
3386 		goto out;
3387 	}
3388 
3389 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
3390 	if (IS_ERR(cb->cm_id)) {
3391 		ret = PTR_ERR(cb->cm_id);
3392 		PRINTF(cb, "rdma_create_id error %d\n", ret);
3393 		goto out;
3394 	}
3395 	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
3396 
3397 	if (cb->server)
3398 		krping_run_server(cb);
3399 	else
3400 		krping_run_client(cb);
3401 
3402 	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
3403 	rdma_destroy_id(cb->cm_id);
3404 out:
3405 	mutex_lock(&krping_mutex);
3406 	list_del(&cb->list);
3407 	mutex_unlock(&krping_mutex);
3408 	kfree(cb);
3409 	return ret;
3410 }
3411 
3412 void
3413 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
3414 {
3415 	struct krping_cb *cb;
3416 
3417 	mutex_lock(&krping_mutex);
3418 	list_for_each_entry(cb, &krping_cbs, list)
3419 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
3420 	mutex_unlock(&krping_mutex);
3421 }
3422 
/* One-time module initialization: set up the mutex guarding krping_cbs. */
void krping_init(void)
{

	mutex_init(&krping_mutex);
}
3428