xref: /freebsd/sys/dev/iser/icl_iser.h (revision 4b9d6057)
1 /*-
2  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #ifndef ICL_ISER_H
27 #define ICL_ISER_H
28 
29 /*
30  * iSCSI Common Layer for RDMA.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/capsicum.h>
35 #include <sys/condvar.h>
36 #include <sys/conf.h>
37 #include <sys/file.h>
38 #include <sys/kernel.h>
39 #include <sys/kthread.h>
40 #include <sys/lock.h>
41 #include <sys/mbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/module.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/systm.h>
49 #include <sys/sx.h>
50 #include <sys/uio.h>
51 #include <sys/taskqueue.h>
52 #include <sys/bio.h>
53 #include <vm/uma.h>
54 #include <netinet/in.h>
55 #include <netinet/tcp.h>
56 #include <dev/iscsi/icl.h>
57 #include <dev/iscsi/iscsi_proto.h>
58 #include <icl_conn_if.h>
59 #include <cam/cam.h>
60 #include <cam/cam_ccb.h>
61 #include <rdma/ib_verbs.h>
62 #include <rdma/ib_fmr_pool.h>
63 #include <rdma/rdma_cm.h>
64 
65 
/*
 * Logging helpers.
 *
 * Each macro takes a printf-style format string literal X followed by
 * optional arguments.  The message is prefixed with a severity tag and the
 * name of the calling function, and a newline is appended.
 *
 * ISER_DBG, ISER_INFO and ISER_WARN print only when the global iser_debug
 * level is greater than 2, 1 and 0 respectively; ISER_ERR always prints and
 * expands to the printf() call itself.
 */
#define	ISER_DBG(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 2)) {				\
			printf("DEBUG: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ISER_INFO(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 1)) {				\
			printf("INFO: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ISER_WARN(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 0)) {				\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ISER_ERR(X, ...)						\
	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
90 
/*
 * NOTE(review): presumably bit values for the flags byte of struct iser_hdr
 * (protocol version, write-stag valid, read-stag valid) -- confirm against
 * the code that fills in the header.
 */
#define	ISER_VER			0x10
#define	ISER_WSV			0x08
#define	ISER_RSV			0x04

/*
 * Reserved 64-bit work-request ids.
 * NOTE(review): judging by the names these tag the fast-registration
 * local-invalidate WR and the beacon WR -- confirm in the completion path.
 */
#define	ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
#define	ISER_BEACON_WRID		0xfffffffffffffffeULL

/* 4K granule: shift count, size in bytes, and mask clearing the low 12 bits. */
#define	SHIFT_4K			12
#define	SIZE_4K				(1ULL << SHIFT_4K)
#define	MASK_4K				(~(SIZE_4K - 1))

/* Support up to 512KB in one RDMA, counted in 4K entries. */
#define	ISCSI_ISER_SG_TABLESIZE		(0x80000 >> SHIFT_4K)
#define	ISER_DEF_XMIT_CMDS_MAX		256

/*
 * The max RX (recv) WR supported by the iSER QP is defined by
 * max_recv_wr = commands_max + recv_beacon.
 */
#define	ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX + 1)
#define	ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
110 
/* QP settings */

/* Maximal bounds on asynchronous (non-command) PDUs. */
#define	ISER_MAX_RX_MISC_PDUS		4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
#define	ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/*
 * The max TX (send) WR supported by the iSER QP is defined by
 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect
 * to have at max for a SCSI command.  The tx posting & completion handling
 * code supports an -EAGAIN scheme where tx is suspended till the QP has room
 * for more send WR.  D=8 comes from 64K/8K.
 */
#define	ISER_INFLIGHT_DATAOUTS		8

/* The send beacon increases max_send_wr by 1. */
#define	ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
					(1 + ISER_INFLIGHT_DATAOUTS) + \
					ISER_MAX_TX_MISC_PDUS        + \
					ISER_MAX_RX_MISC_PDUS + 1)

/*
 * Inverse of ISER_QP_MAX_REQ_DTOS: given a send-queue depth send_wr, yield
 * the number of commands (T above) that fit once the misc PDUs and the
 * beacon slot are subtracted.
 *
 * The argument is parenthesized so that an expression argument (for example
 * a conditional or bitwise expression) is evaluated as a unit before the
 * subtraction instead of being re-associated by operator precedence.
 */
#define	ISER_GET_MAX_XMIT_CMDS(send_wr)	(((send_wr)			\
					 - ISER_MAX_TX_MISC_PDUS	\
					 - ISER_MAX_RX_MISC_PDUS - 1) /	\
					 (1 + ISER_INFLIGHT_DATAOUTS))
134 
/* Size of the per-completion-context work completion array (iser_comp.wcs). */
#define	ISER_WC_BATCH_COUNT		16
/* NOTE(review): presumably the interval between signaled sends -- confirm. */
#define	ISER_SIGNAL_CMD_COUNT		32

/*
 * Maximal QP's recommended per CQ.  In case we use more QP's per CQ we might
 * encounter a CQ overrun state.
 */
#define	ISCSI_ISER_MAX_CONN		8
#define	ISER_MAX_RX_LEN			(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define	ISER_MAX_TX_LEN			(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
#define	ISER_MAX_CQ_LEN			(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
					 ISCSI_ISER_MAX_CONN)

/*
 * NOTE(review): presumably capability bits carried in iser_cm_hdr.flags --
 * confirm against the connection-establishment code.
 */
#define	ISER_ZBVA_NOT_SUPPORTED		0x80
#define	ISER_SEND_W_INV_NOT_SUPPORTED	0x40

/* Default receive data segment length; sizes the login buffer below. */
#define	ISCSI_DEF_MAX_RECV_SEG_LEN	8192
/* NOTE(review): presumably masks the opcode bits of a BHS opcode byte. */
#define	ISCSI_OPCODE_MASK		0x3f
151 
/*
 * Map a pointer to the embedded ICL object back to the iSER object that
 * contains it: struct iser_conn embeds its struct icl_conn as "icl_conn" and
 * struct icl_iser_pdu embeds its struct icl_pdu as "icl_pdu".
 */
#define	icl_to_iser_conn(ic)	container_of(ic, struct iser_conn, icl_conn)
#define	icl_to_iser_pdu(ip)	container_of(ip, struct icl_iser_pdu, icl_pdu)
156 
157 /**
158  * struct iser_hdr - iSER header
159  *
160  * @flags:        flags support (zbva, remote_inv)
161  * @rsvd:         reserved
162  * @write_stag:   write rkey
163  * @write_va:     write virtual address
164  * @reaf_stag:    read rkey
165  * @read_va:      read virtual address
166  */
167 struct iser_hdr {
168 	u8      flags;
169 	u8      rsvd[3];
170 	__be32  write_stag;
171 	__be64  write_va;
172 	__be32  read_stag;
173 	__be64  read_va;
174 } __attribute__((packed));
175 
176 struct iser_cm_hdr {
177 	u8      flags;
178 	u8      rsvd[3];
179 } __packed;
180 
/*
 * Constant PDU length calculations.
 *
 * ISER_HEADERS_LEN     - iSER header followed by the iSCSI basic header
 *                        segment.
 * ISER_RX_PAYLOAD_SIZE - headers plus the fixed-size receive data segment.
 * ISER_RX_LOGIN_SIZE   - headers plus the default max receive segment length.
 */
#define	ISER_HEADERS_LEN	(sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

#define	ISER_RECV_DATA_SEG_LEN	128
#define	ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

#define	ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
188 
/*
 * Connection life-cycle states.  ISER_CONN_STATES_NUM is not a state; it is
 * the number of states defined above it.
 */
enum iser_conn_state {
	ISER_CONN_INIT = 0,	/* descriptor allocd, no conn */
	ISER_CONN_PENDING,	/* in the process of being established */
	ISER_CONN_UP,		/* up and running */
	ISER_CONN_TERMINATING,	/* in the process of being terminated */
	ISER_CONN_DOWN,		/* shut down */
	ISER_CONN_STATES_NUM
};
197 
/* Progress of a task; stored in icl_iser_pdu.status. */
enum iser_task_status {
	ISER_TASK_STATUS_INIT		= 0,
	ISER_TASK_STATUS_STARTED	= 1,
	ISER_TASK_STATUS_COMPLETED	= 2
};
203 
/*
 * Data transfer direction.  ISER_DIRS_NUM is the number of directions and is
 * used to size the per-direction arrays in struct icl_iser_pdu.
 */
enum iser_data_dir {
	ISER_DIR_IN	= 0,	/* to initiator */
	ISER_DIR_OUT	= 1,	/* from initiator */
	ISER_DIRS_NUM
};
209 
210 /**
211  * struct iser_mem_reg - iSER memory registration info
212  *
213  * @sge:          memory region sg element
214  * @rkey:         memory region remote key
215  * @mem_h:        pointer to registration context (FMR/Fastreg)
216  */
217 struct iser_mem_reg {
218 	struct ib_sge	 sge;
219 	u32		 rkey;
220 	void		*mem_h;
221 };
222 
/* Kind of PDU a TX descriptor carries; stored in iser_tx_desc.type. */
enum iser_desc_type {
	ISCSI_TX_CONTROL	= 0,
	ISCSI_TX_SCSI_COMMAND	= 1,
	ISCSI_TX_DATAOUT	= 2
};
228 
229 /**
230  * struct iser_data_buf - iSER data buffer
231  *
232  * @sg:           pointer to the sg list
233  * @size:         num entries of this sg
234  * @data_len:     total beffer byte len
235  * @dma_nents:    returned by dma_map_sg
236  * @copy_buf:     allocated copy buf for SGs unaligned
237  *                for rdma which are copied
238  * @orig_sg:      pointer to the original sg list (in case
239  *                we used a copy)
240  * @sg_single:    SG-ified clone of a non SG SC or
241  *                unaligned SG
242  */
243 struct iser_data_buf {
244 	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
245 	void               *sg;
246 	int                size;
247 	unsigned long      data_len;
248 	unsigned int       dma_nents;
249 	char               *copy_buf;
250 	struct scatterlist *orig_sg;
251 	struct scatterlist sg_single;
252   };
253 
/*
 * Forward declarations, so the structures below can hold pointers to these
 * types before their full definitions appear.
 */
struct ib_conn;
struct iser_conn;
struct iser_device;
258 
259 /**
260  * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
261  *
262  * @iser_header:   iser header
263  * @iscsi_header:  iscsi header (bhs)
264  * @type:          command/control/dataout
265  * @dma_addr:      header buffer dma_address
266  * @tx_sg:         sg[0] points to iser/iscsi headers
267  *                 sg[1] optionally points to either of immediate data
268  *                 unsolicited data-out or control
269  * @num_sge:       number sges used on this TX task
270  * @mapped:        indicates if the descriptor is dma mapped
271  */
272 struct iser_tx_desc {
273 	struct iser_hdr              iser_header;
274 	struct iscsi_bhs             iscsi_header __attribute__((packed));
275 	enum   iser_desc_type        type;
276 	u64		             dma_addr;
277 	struct ib_sge		     tx_sg[2];
278 	int                          num_sge;
279 	bool                         mapped;
280 };
281 
282 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
283 					sizeof(u64) + sizeof(struct ib_sge)))
284 /**
285  * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
286  *
287  * @iser_header:   iser header
288  * @iscsi_header:  iscsi header
289  * @data:          received data segment
290  * @dma_addr:      receive buffer dma address
291  * @rx_sg:         ib_sge of receive buffer
292  * @pad:           for sense data TODO: Modify to maximum sense length supported
293  */
294 struct iser_rx_desc {
295 	struct iser_hdr              iser_header;
296 	struct iscsi_bhs             iscsi_header;
297 	char		             data[ISER_RECV_DATA_SEG_LEN];
298 	u64		             dma_addr;
299 	struct ib_sge		     rx_sg;
300 	char		             pad[ISER_RX_PAD_SIZE];
301 } __attribute__((packed));
302 
303 struct icl_iser_pdu {
304 	struct icl_pdu               icl_pdu;
305 	struct iser_tx_desc          desc;
306 	struct iser_conn             *iser_conn;
307 	enum iser_task_status        status;
308 	struct ccb_scsiio 			 *csio;
309 	int                          command_sent;
310 	int                          dir[ISER_DIRS_NUM];
311 	struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
312 	struct iser_data_buf         data[ISER_DIRS_NUM];
313 };
314 
315 /**
316  * struct iser_comp - iSER completion context
317  *
318  * @device:     pointer to device handle
319  * @cq:         completion queue
320  * @wcs:        work completion array
321  * @tq:    	taskqueue handle
322  * @task:    	task to run task_fn
323  * @active_qps: Number of active QPs attached
324  *              to completion context
325  */
326 struct iser_comp {
327 	struct iser_device      *device;
328 	struct ib_cq		*cq;
329 	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
330 	struct taskqueue        *tq;
331 	struct task             task;
332 	int                      active_qps;
333 };
334 
335 /**
336  * struct iser_device - iSER device handle
337  *
338  * @ib_device:     RDMA device
339  * @pd:            Protection Domain for this device
340  * @dev_attr:      Device attributes container
341  * @mr:            Global DMA memory region
342  * @event_handler: IB events handle routine
343  * @ig_list:	   entry in devices list
344  * @refcount:      Reference counter, dominated by open iser connections
345  * @comps_used:    Number of completion contexts used, Min between online
346  *                 cpus and device max completion vectors
347  * @comps:         Dinamically allocated array of completion handlers
348  */
349 struct iser_device {
350 	struct ib_device             *ib_device;
351 	struct ib_pd	             *pd;
352 	struct ib_device_attr	     dev_attr;
353 	struct ib_mr	             *mr;
354 	struct ib_event_handler      event_handler;
355 	struct list_head             ig_list;
356 	int                          refcount;
357 	int			     comps_used;
358 	struct iser_comp	     *comps;
359 };
360 
361 /**
362  * struct iser_reg_resources - Fast registration recources
363  *
364  * @mr:         memory region
365  * @mr_valid:   is mr valid indicator
366  */
367 struct iser_reg_resources {
368 	struct ib_mr                     *mr;
369 	u8                                mr_valid:1;
370 };
371 
372 /**
373  * struct fast_reg_descriptor - Fast registration descriptor
374  *
375  * @list:           entry in connection fastreg pool
376  * @rsc:            data buffer registration resources
377  */
378 struct fast_reg_descriptor {
379 	struct list_head		  list;
380 	struct iser_reg_resources	  rsc;
381 };
382 
383 
384 /**
385  * struct iser_beacon - beacon to signal all flush errors were drained
386  *
387  * @send:           send wr
388  * @recv:           recv wr
389  * @flush_lock:     protects flush_cv
390  * @flush_cv:       condition variable for beacon flush
391  */
392 struct iser_beacon {
393 	union {
394 		struct ib_send_wr	send;
395 		struct ib_recv_wr	recv;
396 	};
397 	struct mtx		     flush_lock;
398 	struct cv		     flush_cv;
399 };
400 
401 /**
402  * struct ib_conn - Infiniband related objects
403  *
404  * @cma_id:              rdma_cm connection maneger handle
405  * @qp:                  Connection Queue-pair
406  * @device:              reference to iser device
407  * @comp:                iser completion context
408   */
409 struct ib_conn {
410 	struct rdma_cm_id           *cma_id;
411 	struct ib_qp	            *qp;
412 	int                          post_recv_buf_count;
413 	u8                           sig_count;
414 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
415 	struct iser_device          *device;
416 	struct iser_comp	    *comp;
417 	struct iser_beacon	     beacon;
418 	struct mtx               lock;
419 	union {
420 		struct {
421 			struct ib_fmr_pool      *pool;
422 			struct iser_page_vec	*page_vec;
423 		} fmr;
424 		struct {
425 			struct list_head	 pool;
426 			int			 pool_size;
427 		} fastreg;
428 	};
429 };
430 
431 struct iser_conn {
432 	struct icl_conn             icl_conn;
433 	struct ib_conn               ib_conn;
434 	struct cv                    up_cv;
435 	struct list_head             conn_list;
436 	struct sx		     		 state_mutex;
437 	enum iser_conn_state	     state;
438 	int		     				 qp_max_recv_dtos;
439 	int		     				 min_posted_rx;
440 	u16                          max_cmds;
441 	char  			     *login_buf;
442 	char			     *login_req_buf, *login_resp_buf;
443 	u64			     login_req_dma, login_resp_dma;
444 	unsigned int 		     rx_desc_head;
445 	struct iser_rx_desc	     *rx_descs;
446 	u32                          num_rx_descs;
447 	bool                         handoff_done;
448 };
449 
450 /**
451  * struct iser_global: iSER global context
452  *
453  * @device_list_mutex:    protects device_list
454  * @device_list:          iser devices global list
455  * @connlist_mutex:       protects connlist
456  * @connlist:             iser connections global list
457  * @desc_cache:           kmem cache for tx dataout
458  * @close_conns_mutex:    serializes conns closure
459  */
460 struct iser_global {
461 	struct sx        device_list_mutex;
462 	struct list_head  device_list;
463 	struct mtx        connlist_mutex;
464 	struct list_head  connlist;
465 	struct sx         close_conns_mutex;
466 };
467 
/* Debug verbosity level tested by ISER_DBG, ISER_INFO and ISER_WARN. */
extern int iser_debug;
/* The single iSER global context; defined outside this header. */
extern struct iser_global ig;
470 
/*
 * TX descriptor setup, receive posting and login buffer management.
 * The implementations live outside this header.
 * NOTE(review): judging by the names, iser_post_recvl posts the login receive
 * and iser_post_recvm posts the given number of receives -- confirm.
 */
void	iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);
int	iser_post_recvl(struct iser_conn *);
int	iser_post_recvm(struct iser_conn *, int);
int	iser_alloc_login_buf(struct iser_conn *iser_conn);
void	iser_free_login_buf(struct iser_conn *iser_conn);
485 
486 int
487 iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);
488 
489 void
490 iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);
491 
492 void
493 iser_rcv_completion(struct iser_rx_desc *, unsigned long,
494 		    struct ib_conn *);
495 
496 void
497 iser_pdu_free(struct icl_conn *, struct icl_pdu *);
498 
499 struct icl_pdu *
500 iser_new_pdu(struct icl_conn *ic, int flags);
501 
/*
 * RX descriptor management, header initialization and PDU transmission.
 * The implementations live outside this header.
 * NOTE(review): the int passed to iser_alloc_rx_descriptors is presumably a
 * command count -- confirm against the definition.
 */
int	iser_alloc_rx_descriptors(struct iser_conn *, int);
void	iser_free_rx_descriptors(struct iser_conn *);
int	iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);
int	iser_send_control(struct iser_conn *, struct icl_iser_pdu *);
int	iser_send_command(struct iser_conn *, struct icl_iser_pdu *);
516 
/*
 * RDMA memory registration, fast registration pool and DMA mapping.
 * The implementations live outside this header.
 * NOTE(review): the unsigned passed to iser_create_fastreg_pool is presumably
 * the pool size -- confirm against the definition.
 */
int	iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
void	iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
int	iser_create_fastreg_pool(struct ib_conn *, unsigned);
void	iser_free_fastreg_pool(struct ib_conn *);
int	iser_dma_map_task_data(struct icl_iser_pdu *,
	    struct iser_data_buf *, enum iser_data_dir,
	    enum dma_data_direction);
533 
534 int
535 iser_conn_terminate(struct iser_conn *);
536 
537 void
538 iser_free_ib_conn_res(struct iser_conn *, bool);
539 
540 void
541 iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
542 			 enum dma_data_direction);
543 
544 int
545 iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
546 
547 #endif /* !ICL_ISER_H */
548