1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2013 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2009 IBM Corporation. All rights reserved.
14 * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
15 * reserved.
16 * Copyright (c) 2006-2007 Voltaire All rights reserved.
17 * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
18 * $COPYRIGHT$
19 *
20 * Additional copyrights may follow
21 *
22 * $HEADER$
23 */
24
25 #ifndef MCA_BTL_IB_FRAG_H
26 #define MCA_BTL_IB_FRAG_H
27
28 #include "opal_config.h"
29 #include "opal/align.h"
30 #include "opal/mca/btl/btl.h"
31
32 #include <infiniband/verbs.h>
33
34 BEGIN_C_DECLS
35
36 struct mca_btl_openib_reg_t;
37
38 struct mca_btl_openib_header_t {
39 mca_btl_base_tag_t tag;
40 uint8_t cm_seen;
41 uint16_t credits;
42 #if OPAL_OPENIB_PAD_HDR
43 uint8_t padding[4];
44 #endif
45 };
46 typedef struct mca_btl_openib_header_t mca_btl_openib_header_t;
/* Bit 15 of a credits value is a flag bit.  BTL_OPENIB_IS_RDMA_CREDITS()
 * yields non-zero when that bit is set in I; BTL_OPENIB_CREDITS() yields I
 * with that bit cleared. */
#define BTL_OPENIB_RDMA_CREDITS_FLAG    (0x8000)
#define BTL_OPENIB_IS_RDMA_CREDITS(I)   ((I) & BTL_OPENIB_RDMA_CREDITS_FLAG)
#define BTL_OPENIB_CREDITS(I)           ((I) & ~BTL_OPENIB_RDMA_CREDITS_FLAG)
50
/* Byte-order conversion for a mca_btl_openib_header_t.  h is the header
 * object itself (an lvalue, not a pointer); only its 16-bit credits member
 * is converted.  h is evaluated more than once. */
#define BTL_OPENIB_HEADER_HTON(h) ((h).credits = htons((h).credits))

#define BTL_OPENIB_HEADER_NTOH(h) ((h).credits = ntohs((h).credits))
60
61 typedef struct mca_btl_openib_header_coalesced_t {
62 mca_btl_base_tag_t tag;
63 uint32_t size;
64 uint32_t alloc_size;
65 #if OPAL_OPENIB_PAD_HDR
66 uint8_t padding[4];
67 #endif
68 } mca_btl_openib_header_coalesced_t;
69
/* Byte-order conversion for a mca_btl_openib_header_coalesced_t.  h is the
 * header object itself (an lvalue, not a pointer) and is evaluated more
 * than once.  Both 32-bit members, size and alloc_size, are converted. */
#define BTL_OPENIB_HEADER_COALESCED_HTON(h)             \
    do {                                                \
        (h).size       = htonl((h).size);               \
        (h).alloc_size = htonl((h).alloc_size);         \
    } while (0)

#define BTL_OPENIB_HEADER_COALESCED_NTOH(h)             \
    do {                                                \
        (h).size       = ntohl((h).size);               \
        (h).alloc_size = ntohl((h).alloc_size);         \
    } while (0)
81
#if OPAL_OPENIB_PAD_HDR
/* BTL_OPENIB_FTR_PADDING
 * This macro is used to keep the pointer to openib footers aligned for
 * systems like SPARC64 that take a big performance hit when addresses
 * are not aligned (and by default sigbus instead of coercing the type on
 * an unaligned address).
 *
 * We assure alignment of a packet's structures when OPAL_OPENIB_PAD_HDR
 * is set to 1.  When this is the case, several structures are padded
 * to assure alignment, and the BTL_OPENIB_FTR_PADDING macro is used to
 * shift the location of the mca_btl_openib_footer_t pointer to assure
 * proper alignment after the PML Header and data.
 * For example, when sending a 1 byte data packet, the memory layout
 * without footer alignment would look something like the following:
 *
 *   0x00 : mca_btl_openib_header_coalesced_t (12 bytes + 4 byte pad)
 *   0x10 : mca_btl_openib_control_header_t (1 byte + 7 byte pad)
 *   0x18 : mca_btl_openib_header_t (4 bytes + 4 byte pad)
 *   0x20 : PML Header and data (16 bytes PML + 1 byte data)
 *   0x29 : mca_btl_openib_footer_t (4 bytes + 4 byte pad)
 *   0x31 : end of packet
 *
 * By applying BTL_OPENIB_FTR_PADDING() in the progress_one_device
 * and post_send routines we adjust the pointer to mca_btl_openib_footer_t
 * from 0x29 to 0x2C, thus correctly aligning the start of the
 * footer pointer.  This adjustment will cause the padding field of
 * mca_btl_openib_footer_t to overlap with the neighboring memory, but since
 * we never use the padding we do not end up inadvertently overwriting
 * memory that does not belong to the fragment.
 *
 * NOTE(review): the offsets in the example above are not self-consistent
 * (0x20 + 17 bytes is 0x31, not 0x29, and 0x2C is not a multiple of
 * sizeof(uint64_t), which is what this macro pads to); treat them as
 * illustrative only and confirm against the send/receive code.
 */
#define BTL_OPENIB_FTR_PADDING(size) \
    OPAL_ALIGN_PAD_AMOUNT(size, sizeof(uint64_t))

/* BTL_OPENIB_ALIGN_COALESCE_HDR
 * This macro is used in btl_openib.c, while creating a coalesce fragment,
 * to align the coalesce headers.
 */
#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) \
    OPAL_ALIGN_PTR(ptr, sizeof(uint32_t), unsigned char*)

/* BTL_OPENIB_COALESCE_HDR_PADDING
 * This macro is used in btl_openib_component.c, while parsing an incoming
 * coalesce fragment, to determine the padding amount used to align the
 * mca_btl_openib_header_coalesced_t.
 */
#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) \
    OPAL_ALIGN_PAD_AMOUNT(ptr, sizeof(uint32_t))
#else
/* Header padding disabled: the padding amounts are 0 and the coalesce
 * header pointer is used unchanged.  The pointer argument is parenthesized
 * so that expressions such as *BTL_OPENIB_ALIGN_COALESCE_HDR(p + 1) apply
 * the surrounding operator to the whole argument.
 */
#define BTL_OPENIB_FTR_PADDING(size) 0
#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) (ptr)
#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) 0
#endif
134
/**
 * openib BTL message footer.
 *
 * u overlays a 32-bit size with its four individual bytes (the byte view is
 * what MCA_BTL_OPENIB_FTR_SIZE_REVERSE() operates on).  seq is present only
 * in OPAL_ENABLE_DEBUG builds; padding only when OPAL_OPENIB_PAD_HDR is
 * enabled.
 */
typedef struct mca_btl_openib_footer_t {
#if OPAL_ENABLE_DEBUG
    uint32_t seq;
#endif
    union {
        uint32_t size;
        uint8_t buf[4];
    } u;
#if OPAL_OPENIB_PAD_HDR && OPAL_ENABLE_DEBUG
    /* this footer needs to be of a 8-byte multiple so by adding the
     * seq field you throw this off and you cannot just remove the
     * padding because the padding is needed in order to adjust the alignment
     * and not overwrite other packets.
     */
    uint8_t padding[12];
#elif OPAL_OPENIB_PAD_HDR
    uint8_t padding[8];
#endif
} mca_btl_openib_footer_t;
157
/* When WORDS_BIGENDIAN is not defined, exchange bytes 0 and 2 of the
 * footer's u.buf view; bytes 1 and 3 are left untouched.  When
 * WORDS_BIGENDIAN is defined the macro expands to nothing.  ftr is the
 * footer object itself (not a pointer) and is evaluated more than once.
 */
#ifndef WORDS_BIGENDIAN
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)            \
    do {                                                \
        uint8_t ftr_byte0_ = (ftr).u.buf[0];            \
        (ftr).u.buf[0] = (ftr).u.buf[2];                \
        (ftr).u.buf[2] = ftr_byte0_;                    \
    } while (0)
#else
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
#endif
168
/* The footer's seq member exists only in OPAL_ENABLE_DEBUG builds, so its
 * byte-order conversion expands to nothing otherwise. */
#if !OPAL_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_SEQ_HTON(h)
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h)
#else
#define BTL_OPENIB_FOOTER_SEQ_HTON(h) ((h).seq = htonl((h).seq))
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h) ((h).seq = ntohl((h).seq))
#endif

/* Convert a mca_btl_openib_footer_t (the object itself, not a pointer):
 * first the debug-only seq member, then the size bytes via
 * MCA_BTL_OPENIB_FTR_SIZE_REVERSE().  The same byte exchange is applied in
 * both directions. */
#define BTL_OPENIB_FOOTER_HTON(h)                       \
    do {                                                \
        BTL_OPENIB_FOOTER_SEQ_HTON(h);                  \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);             \
    } while (0)

#define BTL_OPENIB_FOOTER_NTOH(h)                       \
    do {                                                \
        BTL_OPENIB_FOOTER_SEQ_NTOH(h);                  \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);             \
    } while (0)
188
/* Control message kinds.  Presumably these are the values carried in
 * mca_btl_openib_control_header_t.type -- confirm against the senders. */
#define MCA_BTL_OPENIB_CONTROL_CREDITS      0
#define MCA_BTL_OPENIB_CONTROL_RDMA         1
#define MCA_BTL_OPENIB_CONTROL_COALESCED    2
#define MCA_BTL_OPENIB_CONTROL_CTS          3
193
/**
 * Common leading part of the control headers declared in this file
 * (mca_btl_openib_eager_rdma_header_t and
 * mca_btl_openib_rdma_credits_header_t both start with it).
 * With OPAL_OPENIB_PAD_HDR enabled it is padded out to 8 bytes.
 */
typedef struct mca_btl_openib_control_header_t {
    uint8_t type;
#if OPAL_OPENIB_PAD_HDR
    uint8_t padding[7];
#endif
} mca_btl_openib_control_header_t;
201
202 struct mca_btl_openib_eager_rdma_header_t {
203 mca_btl_openib_control_header_t control;
204 uint32_t rkey;
205 opal_ptr_t rdma_start;
206 };
207 typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t;
208
/* Byte-order conversion for a mca_btl_openib_eager_rdma_header_t.  h is the
 * header object itself (not a pointer) and is evaluated more than once.
 * The 32-bit rkey goes through htonl/ntohl and the 64-bit rdma_start.lval
 * through hton64/ntoh64; the embedded control header is not touched. */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h)            \
    do {                                                        \
        (h).rkey            = htonl((h).rkey);                  \
        (h).rdma_start.lval = hton64((h).rdma_start.lval);      \
    } while (0)

#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h)            \
    do {                                                        \
        (h).rkey            = ntohl((h).rkey);                  \
        (h).rdma_start.lval = ntoh64((h).rdma_start.lval);      \
    } while (0)
220
221
222 struct mca_btl_openib_rdma_credits_header_t {
223 mca_btl_openib_control_header_t control;
224 #if OPAL_OPENIB_PAD_HDR
225 uint8_t padding[1];
226 #endif
227 uint8_t qpn;
228 uint16_t rdma_credits;
229 };
230 typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t;
231
/* Byte-order conversion for a mca_btl_openib_rdma_credits_header_t.  h is
 * the header object itself (not a pointer) and is evaluated more than once;
 * only the 16-bit rdma_credits member is converted. */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h) \
    ((h).rdma_credits = htons((h).rdma_credits))

#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \
    ((h).rdma_credits = ntohs((h).rdma_credits))
241
/* Discriminator stored in mca_btl_openib_frag_t.type and read back through
 * openib_frag_type().  Enumerators take the implicit values 0..6 in the
 * order listed. */
typedef enum mca_btl_openib_frag_type_t {
    MCA_BTL_OPENIB_FRAG_RECV,
    MCA_BTL_OPENIB_FRAG_RECV_USER,
    MCA_BTL_OPENIB_FRAG_SEND,
    MCA_BTL_OPENIB_FRAG_SEND_USER,
    MCA_BTL_OPENIB_FRAG_EAGER_RDMA,
    MCA_BTL_OPENIB_FRAG_CONTROL,
    MCA_BTL_OPENIB_FRAG_COALESCED
} mca_btl_openib_frag_type_t;
252
253 #define openib_frag_type(f) (to_base_frag(f)->type)
254 /**
255 * IB fragment derived type.
256 */
257 /* base openib frag */
258 typedef struct mca_btl_openib_frag_t {
259 mca_btl_base_descriptor_t base;
260 mca_btl_base_segment_t segment;
261 mca_btl_openib_frag_type_t type;
262 opal_free_list_t* list;
263 } mca_btl_openib_frag_t;
264 OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t);
265
266 #define to_base_frag(f) ((mca_btl_openib_frag_t*)(f))
267
268 /* frag used for communication */
269 typedef struct mca_btl_openib_com_frag_t {
270 mca_btl_openib_frag_t super;
271 struct ibv_sge sg_entry;
272 struct mca_btl_openib_reg_t *registration;
273 struct mca_btl_base_endpoint_t *endpoint;
274 /* number of unsignaled frags sent before this frag. */
275 uint32_t n_wqes_inflight;
276 } mca_btl_openib_com_frag_t;
277 OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
278
279 #define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f))
280
281 typedef struct mca_btl_openib_out_frag_t {
282 mca_btl_openib_com_frag_t super;
283 struct ibv_send_wr sr_desc;
284 } mca_btl_openib_out_frag_t;
285 OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t);
286
287 #define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f))
288
/* Incoming frag: an alias for the communication frag struct (no extra
 * members), with its own class declaration. */
typedef struct mca_btl_openib_com_frag_t
    mca_btl_openib_in_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t);

/* View a frag pointer as an incoming frag. */
#define to_in_frag(f) ((mca_btl_openib_in_frag_t *)(f))
293
294 typedef struct mca_btl_openib_send_frag_t {
295 mca_btl_openib_out_frag_t super;
296 mca_btl_openib_header_t *hdr, *chdr;
297 mca_btl_openib_footer_t *ftr;
298 uint8_t qp_idx;
299 uint32_t coalesced_length;
300 opal_list_t coalesced_frags;
301 } mca_btl_openib_send_frag_t;
302 OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t);
303
304 #define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f))
305
306 typedef struct mca_btl_openib_recv_frag_t {
307 mca_btl_openib_in_frag_t super;
308 mca_btl_openib_header_t *hdr;
309 mca_btl_openib_footer_t *ftr;
310 struct ibv_recv_wr rd_desc;
311 uint8_t qp_idx;
312 } mca_btl_openib_recv_frag_t;
313 OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
314
315 #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
316
317 typedef struct mca_btl_openib_put_frag_t {
318 mca_btl_openib_out_frag_t super;
319 struct {
320 mca_btl_base_rdma_completion_fn_t func;
321 mca_btl_base_registration_handle_t *local_handle;
322 void *context;
323 void *data;
324 } cb;
325 } mca_btl_openib_put_frag_t;
326 OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
327
328 #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
329
330 typedef struct mca_btl_openib_get_frag_t {
331 mca_btl_openib_in_frag_t super;
332 struct ibv_send_wr sr_desc;
333 struct {
334 mca_btl_base_rdma_completion_fn_t func;
335 mca_btl_base_registration_handle_t *local_handle;
336 void *context;
337 void *data;
338 } cb;
339 } mca_btl_openib_get_frag_t;
340 OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
341
342 #define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f))
343
/* Send control frag: an alias for the send frag struct (no extra members),
 * with its own class declaration. */
typedef struct mca_btl_openib_send_frag_t
    mca_btl_openib_send_control_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t);

/* View a frag pointer as a send control frag. */
#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t *)(f))
348
349 typedef struct mca_btl_openib_coalesced_frag_t {
350 mca_btl_openib_frag_t super;
351 mca_btl_openib_send_frag_t *send_frag;
352 mca_btl_openib_header_coalesced_t *hdr;
353 bool sent;
354 } mca_btl_openib_coalesced_frag_t;
355 OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);
356
357 #define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f))
358
359 /*
360 * Allocate an IB send descriptor
361 *
362 */
363
364 static inline mca_btl_openib_send_control_frag_t *
alloc_control_frag(mca_btl_openib_module_t * btl)365 alloc_control_frag(mca_btl_openib_module_t *btl)
366 {
367 return to_send_control_frag(opal_free_list_wait (&btl->device->send_free_control));
368 }
369
/*
 * Map a fragment size to a QP index.
 *
 * Walks mca_btl_openib_component.qp_infos[] from index 0 and returns the
 * index of the first entry whose size is >= size.  Returns
 * MCA_BTL_NO_ORDER when no entry is large enough.  btl is not used.
 */
static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl,
                                         size_t size)
{
    int qp = 0;

    while (qp < mca_btl_openib_component.num_qps) {
        if (mca_btl_openib_component.qp_infos[qp].size >= size) {
            return qp;
        }
        ++qp;
    }

    return MCA_BTL_NO_ORDER;
}
380
alloc_send_user_frag(void)381 static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void)
382 {
383 return to_com_frag(opal_free_list_get (&mca_btl_openib_component.send_user_free));
384 }
385
alloc_recv_user_frag(void)386 static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void)
387 {
388 return to_com_frag(opal_free_list_get (&mca_btl_openib_component.recv_user_free));
389 }
390
alloc_coalesced_frag(void)391 static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void)
392 {
393 return to_coalesced_frag(opal_free_list_get (&mca_btl_openib_component.send_free_coalesced));
394 }
395
/* Hand a frag back to the free list recorded in its base part
 * (to_base_frag(frag)->list).  frag is evaluated twice. */
#define MCA_BTL_IB_FRAG_RETURN(frag)                                    \
    do {                                                                \
        opal_free_list_return(to_base_frag(frag)->list,                 \
                              (opal_free_list_item_t *)(frag));         \
    } while (0)
401
/* Drain list: repeatedly remove its first item and return each one to its
 * free list with MCA_BTL_IB_FRAG_RETURN() until opal_list_remove_first()
 * yields NULL.  list is evaluated once per removal attempt. */
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list)                        \
    do {                                                                \
        opal_list_item_t *_frag_item;                                   \
        for (_frag_item = opal_list_remove_first(list);                 \
             NULL != _frag_item;                                        \
             _frag_item = opal_list_remove_first(list)) {               \
            MCA_BTL_IB_FRAG_RETURN(_frag_item);                         \
        }                                                               \
    } while (0)
409
410 struct mca_btl_openib_module_t;
411
412 struct mca_btl_openib_frag_init_data_t {
413 uint8_t order;
414 opal_free_list_t* list;
415 };
416 typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;
417
418 int mca_btl_openib_frag_init(opal_free_list_item_t* item, void* ctx);
419
420
421 END_C_DECLS
422 #endif
423