/*
 * This file contains definitions imported from the OFED rds header ib.h.
 * Oracle elects to have and use the contents of ib.h under and
 * governed by the OpenIB.org BSD license.
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

10 #ifndef _RDSV3_IB_H
11 #define _RDSV3_IB_H
12
13 #include <sys/rds.h>
14 #include <sys/ib/clients/rdsv3/rdsv3.h>
15 #include <sys/ib/clients/rdsv3/rdma_transport.h>
16 #include <sys/ib/clients/rdsv3/rdsv3_af_thr.h>
17
/* FMR (fast memory registration) sizing */
#define	RDSV3_FMR_SIZE			256
#define	RDSV3_FMR_POOL_SIZE		(12 * 1024)

#define	RDSV3_IB_MAX_SGE		8
#define	RDSV3_IB_RECV_SGE		2

/* default receive/send work-queue depths */
#define	RDSV3_IB_DEFAULT_RECV_WR	1024
#define	RDSV3_IB_DEFAULT_SEND_WR	256

#define	RDSV3_IB_DEFAULT_RETRY_COUNT	2

/* minor versions supported (bitmask; 0x3 => minors 0 and 1) */
#define	RDSV3_IB_SUPPORTED_PROTOCOLS	0x00000003

/* upper bound on receive fragment allocations (512 MB worth of frags) */
#define	RDSV3_IB_MAX_RECV_ALLOC	((512 * 1024 * 1024) / RDSV3_FRAG_SIZE)

/* completions polled per tasklet invocation */
#define	RDSV3_IB_WC_POLL_SIZE	16

/* all IB devices known to this driver */
extern struct list rdsv3_ib_devices;
36
/*
 * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to
 * try and minimize the amount of memory tied up both in the device and
 * socket receive queues.
 */
/* page offset of the final full frag that fits in the page */
#define	RDSV3_PAGE_LAST_OFF \
	(((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE)
45 struct rdsv3_page_frag {
46 struct list_node f_item;
47 caddr_t f_page;
48 unsigned long f_offset;
49 ibt_wr_ds_t f_sge;
50 ibt_mi_hdl_t f_mapped;
51 };
52
53 struct rdsv3_ib_incoming {
54 list_node_t ii_obj; /* list obj of rdsv3_inc_pool list */
55 struct list ii_frags;
56 struct rdsv3_incoming ii_inc;
57 struct rdsv3_inc_pool *ii_pool;
58 struct rdsv3_ib_device *ii_ibdev;
59 };
60
61 struct rdsv3_ib_connect_private {
62 /* Add new fields at the end, and don't permute existing fields. */
63 uint32_be_t dp_saddr;
64 uint32_be_t dp_daddr;
65 uint8_t dp_protocol_major;
66 uint8_t dp_protocol_minor;
67 uint16_be_t dp_protocol_minor_mask; /* bitmask */
68 uint32_be_t dp_reserved1;
69 uint32_be_t dp_ack_seq;
70 uint32_be_t dp_credit; /* non-zero enables flow ctl */
71 };
72
73 struct rdsv3_ib_send_work {
74 struct rdsv3_message *s_rm;
75 struct rdsv3_rdma_op *s_op;
76 ibt_wrc_opcode_t s_opcode;
77 unsigned long s_queued;
78 };
79
80 struct rdsv3_ib_recv_work {
81 struct rdsv3_ib_incoming *r_ibinc;
82 struct rdsv3_page_frag *r_frag;
83 ibt_wr_ds_t r_sge[2];
84 };
85
86 struct rdsv3_ib_work_ring {
87 uint32_t w_nr;
88 uint32_t w_alloc_ptr;
89 uint32_t w_alloc_ctr;
90 uint32_t w_free_ptr;
91 atomic_t w_free_ctr;
92 rdsv3_wait_queue_t w_empty_wait;
93 };
94
/*
 * Rings are posted with all the allocations they'll need to queue the
 * incoming message to the receiving socket so this can't fail.
 * All fragments start with a header, so we can make sure we're not receiving
 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
 */
/* ACK state accumulated while handling a batch of completions. */
struct rdsv3_ib_ack_state {
	uint64_t	ack_next;		/* next ACK seq to send */
	uint64_t	ack_recv;		/* last ACK seq received */
	unsigned int	ack_required:1;		/* peer requested an ACK */
	unsigned int	ack_next_valid:1;	/* ack_next holds a value */
	unsigned int	ack_recv_valid:1;	/* ack_recv holds a value */
};
108
109 struct rdsv3_ib_device;
110
111 struct rdsv3_ib_connection {
112
113 struct list_node ib_node;
114 boolean_t i_on_dev_list;
115 struct rdsv3_ib_device *rds_ibdev;
116 struct rdsv3_connection *conn;
117
118 /* alphabet soup, IBTA style */
119 struct rdma_cm_id *i_cm_id;
120 struct ib_pd *i_pd;
121 struct rdsv3_hdrs_mr *i_mr;
122 struct ib_cq *i_cq;
123 struct ib_cq *i_snd_cq;
124
125 /* tx */
126 struct rdsv3_ib_work_ring i_send_ring;
127 struct rdsv3_message *i_rm;
128 struct rdsv3_header *i_send_hdrs;
129 uint64_t i_send_hdrs_dma;
130 struct rdsv3_ib_send_work *i_sends;
131 ibt_send_wr_t *i_send_wrs;
132
133 /* soft CQ */
134 rdsv3_af_thr_t *i_soft_cq;
135 rdsv3_af_thr_t *i_snd_soft_cq;
136 rdsv3_af_thr_t *i_refill_rq;
137
138 /* rx */
139 struct mutex i_recv_mutex;
140 struct rdsv3_ib_work_ring i_recv_ring;
141 struct rdsv3_ib_incoming *i_ibinc;
142 uint32_t i_recv_data_rem;
143 struct rdsv3_header *i_recv_hdrs;
144 uint64_t i_recv_hdrs_dma;
145 struct rdsv3_ib_recv_work *i_recvs;
146 ibt_recv_wr_t *i_recv_wrs;
147 struct rdsv3_page_frag i_frag;
148 uint64_t i_ack_recv; /* last ACK received */
149
150 /* sending acks */
151 unsigned long i_ack_flags;
152 #ifdef KERNEL_HAS_ATOMIC64
153 atomic64_t i_ack_next; /* next ACK to send */
154 #else
155 kmutex_t i_ack_lock; /* protect i_ack_next */
156 uint64_t i_ack_next; /* next ACK to send */
157 #endif
158 struct rdsv3_header *i_ack;
159 ibt_send_wr_t i_ack_wr;
160 ibt_wr_ds_t i_ack_sge;
161 uint64_t i_ack_dma;
162 unsigned long i_ack_queued;
163
164 /*
165 * Flow control related information
166 *
167 * Our algorithm uses a pair variables that we need to access
168 * atomically - one for the send credits, and one posted
169 * recv credits we need to transfer to remote.
170 * Rather than protect them using a slow spinlock, we put both into
171 * a single atomic_t and update it using cmpxchg
172 */
173 atomic_t i_credits;
174
175 /* Protocol version specific information */
176 unsigned int i_flowctl:1; /* enable/disable flow ctl */
177
178 /* Batched completions */
179 unsigned int i_unsignaled_wrs;
180 long i_unsignaled_bytes;
181
182 unsigned long i_max_recv_alloc;
183 };
184
/*
 * i_credits packs two 16-bit counters into one word: send credits in
 * the low half, posted-recv credits to advertise in the high half.
 * This assumes that atomic_t is at least 32 bits.
 */
#define	IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
#define	IB_GET_POST_CREDITS(v)	((v) >> 16)
#define	IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
#define	IB_SET_POST_CREDITS(v)	((v) << 16)
190
191 struct rdsv3_ib_ipaddr {
192 struct list_node list;
193 uint32_be_t ipaddr;
194 };
195
196 struct rdsv3_ib_device {
197 struct list_node list;
198 struct list ipaddr_list;
199 struct list conn_list;
200 ib_device_t *dev;
201 struct ib_pd *pd;
202 struct kmem_cache *ib_frag_slab;
203 kmutex_t spinlock; /* protect the above */
204 krwlock_t rwlock; /* protect paddr_list */
205 unsigned int fmr_max_remaps;
206 unsigned int max_fmrs;
207 unsigned int fmr_message_size;
208 int max_sge;
209 unsigned int max_wrs;
210 unsigned int max_initiator_depth;
211 unsigned int max_responder_resources;
212 struct rdsv3_fmr_pool *fmr_pool;
213 struct rdsv3_inc_pool *inc_pool;
214 ibt_fmr_pool_hdl_t fmr_pool_hdl;
215 ibt_hca_attr_t hca_attr;
216 rdsv3_af_thr_t *fmr_soft_cq;
217 rdsv3_af_thr_t *inc_soft_cq;
218 ibt_hca_hdl_t ibt_hca_hdl;
219 rdsv3_af_grp_t *aft_hcagp;
220 };
221
/* bits for i_ack_flags */
#define	IB_ACK_IN_FLIGHT	0
#define	IB_ACK_REQUESTED	1

/* wr_id flag bit marking send work requests */
#define	RDSV3_IB_SEND_OP	(1ULL << 63)

/* Magic WR_ID for ACKs */
#define	RDSV3_IB_ACK_WR_ID	(~(uint64_t)0)
230
/*
 * Per-CPU IB transport counters; bumped via rdsv3_ib_stats_inc() and
 * copied out by rdsv3_ib_stats_info_copy().
 */
struct rdsv3_ib_statistics {
	uint64_t	s_ib_connect_raced;
	uint64_t	s_ib_listen_closed_stale;
	uint64_t	s_ib_evt_handler_call;
	uint64_t	s_ib_tasklet_call;
	uint64_t	s_ib_tx_cq_event;
	uint64_t	s_ib_tx_ring_full;
	uint64_t	s_ib_tx_throttle;
	uint64_t	s_ib_tx_sg_mapping_failure;
	uint64_t	s_ib_tx_stalled;
	uint64_t	s_ib_tx_credit_updates;
	uint64_t	s_ib_rx_cq_event;
	uint64_t	s_ib_rx_ring_empty;
	uint64_t	s_ib_rx_refill_from_cq;
	uint64_t	s_ib_rx_refill_from_thread;
	uint64_t	s_ib_rx_alloc_limit;
	uint64_t	s_ib_rx_credit_updates;
	uint64_t	s_ib_ack_sent;
	uint64_t	s_ib_ack_send_failure;
	uint64_t	s_ib_ack_send_delayed;
	uint64_t	s_ib_ack_send_piggybacked;
	uint64_t	s_ib_ack_received;
	uint64_t	s_ib_rdma_mr_alloc;
	uint64_t	s_ib_rdma_mr_free;
	uint64_t	s_ib_rdma_mr_used;
	uint64_t	s_ib_rdma_mr_pool_flush;
	uint64_t	s_ib_rdma_mr_pool_wait;
	uint64_t	s_ib_rdma_mr_pool_depleted;
};
260
261 extern struct rdsv3_workqueue_struct_s *rds_ib_wq;
262
263 /* ib.c */
264 extern struct rdsv3_transport rdsv3_ib_transport;
265 extern void rdsv3_ib_add_one(ib_device_t *device);
266 extern void rdsv3_ib_remove_one(ib_device_t *device);
267 extern struct ib_client rdsv3_ib_client;
268
269 extern unsigned int fmr_pool_size;
270 extern unsigned int fmr_message_size;
271 extern unsigned int rdsv3_ib_retry_count;
272
273 extern kmutex_t ib_nodev_conns_lock;
274 extern struct list ib_nodev_conns;
275
/* ib_cm.c */
int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp);
void rdsv3_ib_conn_free(void *arg);
int rdsv3_ib_conn_connect(struct rdsv3_connection *conn);
void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn);
void rdsv3_conn_drop(struct rdsv3_connection *conn);
int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event);
int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event);
void rdsv3_ib_tasklet_fn(void *data);
void rdsv3_ib_snd_tasklet_fn(void *data);
void rdsv3_ib_refill_fn(void *data);
290
291 /* ib_rdma.c */
292 int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev,
293 uint32_be_t ipaddr);
294 void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
295 struct rdsv3_connection *conn);
296 void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
297 struct rdsv3_connection *conn);
298 void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock);
rdsv3_ib_destroy_nodev_conns(void)299 static inline void rdsv3_ib_destroy_nodev_conns(void)
300 {
301 __rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
302 }
rdsv3_ib_destroy_conns(struct rdsv3_ib_device * rds_ibdev)303 static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev)
304 {
305 __rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
306 }
307
int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
    struct rds_info_rdma_connection *iinfo);
void *rdsv3_ib_get_mr(struct rds_iovec *args, unsigned long nents,
    struct rdsv3_sock *rs, uint32_t *key_ret);
void rdsv3_ib_sync_mr(void *trans_private, int dir);
void rdsv3_ib_free_mr(void *trans_private, int invalidate);
void rdsv3_ib_flush_mrs(void);
void rdsv3_ib_drain_mrlist_fn(void *data);
318
319 /* ib_recv.c */
320 int rdsv3_ib_recv_init(void);
321 void rdsv3_ib_recv_exit(void);
322 int rdsv3_ib_recv(struct rdsv3_connection *conn);
323 int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill);
324 void rdsv3_ib_inc_free(struct rdsv3_incoming *inc);
325 int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
326 size_t size);
327 void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
328 struct rdsv3_ib_ack_state *state);
329 void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic);
330 void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic);
331 void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic);
332 void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic);
333 void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic);
334 uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic);
335 void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
336 int ack_required);
337 int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *);
338 void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *);
339 void rdsv3_ib_drain_inclist(void *);
340
/* ib_ring.c */
void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr);
void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr);
uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val,
    uint32_t *pos);
void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val);
void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val);
int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring);
int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring,
    uint32_t wr_id, uint32_t oldest);
353
354 /* ib_send.c */
355 void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn);
356 int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
357 unsigned int hdr_off, unsigned int sg, unsigned int off);
358 void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc);
359 void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic);
360 void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic);
361 int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op);
362 void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn,
363 unsigned int credits);
364 void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn,
365 unsigned int posted);
366 int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted,
367 uint32_t *adv_credits, int need_posted);
368
/* ib_stats.c */
extern struct rdsv3_ib_statistics *rdsv3_ib_stats;
#define	rdsv3_ib_stats_inc(member) \
	rdsv3_stats_add_which(rdsv3_ib_stats, member, 1)
unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter,
    unsigned int avail);
375
/* ib_sysctl.c */
int rdsv3_ib_sysctl_init(void);
void rdsv3_ib_sysctl_exit(void);
extern unsigned long rdsv3_ib_sysctl_max_send_wr;
extern unsigned long rdsv3_ib_sysctl_max_recv_wr;
extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs;
extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes;
extern unsigned long rdsv3_ib_sysctl_max_recv_allocation;
extern unsigned int rdsv3_ib_sysctl_flow_control;
385
386 #endif /* _RDSV3_IB_H */
387