1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2013 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2009      IBM Corporation.  All rights reserved.
14  * Copyright (c) 2006-2015 Los Alamos National Security, LLC.  All rights
15  *                         reserved.
16  * Copyright (c) 2006-2007 Voltaire All rights reserved.
17  * Copyright (c) 2010-2012 Oracle and/or its affiliates.  All rights reserved.
18  * $COPYRIGHT$
19  *
20  * Additional copyrights may follow
21  *
22  * $HEADER$
23  */
24 
25 #ifndef MCA_BTL_IB_FRAG_H
26 #define MCA_BTL_IB_FRAG_H
27 
28 #include "opal_config.h"
29 #include "opal/align.h"
30 #include "opal/mca/btl/btl.h"
31 
32 #include <infiniband/verbs.h>
33 
34 BEGIN_C_DECLS
35 
/* Opaque forward declaration: this header only stores a pointer to the
 * registration object (mca_btl_openib_com_frag_t.registration), so the
 * full definition is not required here. */
struct mca_btl_openib_reg_t;
37 
38 struct mca_btl_openib_header_t {
39     mca_btl_base_tag_t tag;
40     uint8_t cm_seen;
41     uint16_t credits;
42 #if OPAL_OPENIB_PAD_HDR
43     uint8_t padding[4];
44 #endif
45 };
46 typedef struct mca_btl_openib_header_t mca_btl_openib_header_t;
/* Bit 15 is used as a flag inside a credit word.
 *   BTL_OPENIB_IS_RDMA_CREDITS(I) - non-zero when the flag bit is set in I.
 *   BTL_OPENIB_CREDITS(I)         - I with the flag bit cleared.
 */
#define BTL_OPENIB_RDMA_CREDITS_FLAG (1<<15)
#define BTL_OPENIB_IS_RDMA_CREDITS(I) ((I)&BTL_OPENIB_RDMA_CREDITS_FLAG)
#define BTL_OPENIB_CREDITS(I) ((I)&~BTL_OPENIB_RDMA_CREDITS_FLAG)
50 
/* Convert the 16-bit `credits` field of a header (passed as an lvalue, not
 * a pointer) from host to network byte order, in place. */
#define BTL_OPENIB_HEADER_HTON(h)     \
do {                                  \
    (h).credits = htons((h).credits); \
} while (0)

/* Inverse of BTL_OPENIB_HEADER_HTON: network to host byte order, in place. */
#define BTL_OPENIB_HEADER_NTOH(h)     \
do {                                  \
    (h).credits = ntohs((h).credits); \
} while (0)
60 
61 typedef struct mca_btl_openib_header_coalesced_t {
62     mca_btl_base_tag_t tag;
63     uint32_t size;
64     uint32_t alloc_size;
65 #if OPAL_OPENIB_PAD_HDR
66     uint8_t padding[4];
67 #endif
68 } mca_btl_openib_header_coalesced_t;
69 
/* Convert the 32-bit `size` and `alloc_size` fields of a coalesced header
 * (passed as an lvalue) from network to host byte order, in place. */
#define BTL_OPENIB_HEADER_COALESCED_NTOH(h)     \
    do {                                        \
        (h).size = ntohl((h).size);             \
        (h).alloc_size = ntohl((h).alloc_size); \
    } while (0)

/* Inverse of BTL_OPENIB_HEADER_COALESCED_NTOH: host to network order. */
#define BTL_OPENIB_HEADER_COALESCED_HTON(h)     \
    do {                                        \
        (h).size = htonl((h).size);             \
        (h).alloc_size = htonl((h).alloc_size); \
    } while (0)
81 
#if OPAL_OPENIB_PAD_HDR
/* BTL_OPENIB_FTR_PADDING
 * This macro is used to keep the pointer to openib footers aligned for
 * systems like SPARC64 that take a big performance hit when addresses
 * are not aligned (and by default raise SIGBUS instead of coercing the
 * type on an unaligned address).
 *
 * We assure alignment of a packet's structures when OPAL_OPENIB_PAD_HDR
 * is set to 1.  When this is the case several structures are padded
 * to assure alignment, and the mca_btl_openib_footer_t structure itself
 * uses the BTL_OPENIB_FTR_PADDING macro to shift the location of the
 * pointer to assure proper alignment after the PML header and data.
 * For example, sending a 1 byte data packet, the memory layout without
 * footer alignment would look something like the following:
 *
 * 0x00   : mca_btl_openib_header_coalesced_t (12 bytes + 4 byte pad)
 * 0x10   : mca_btl_openib_control_header_t (1 byte + 7 byte pad)
 * 0x18   : mca_btl_openib_header_t (4 bytes + 4 byte pad)
 * 0x20   : PML Header and data (16 bytes PML + 1 byte data)
 * 0x29   : mca_btl_openib_footer_t (4 bytes + 4 byte pad)
 * 0x31   : end of packet
 *
 * By applying BTL_OPENIB_FTR_PADDING() in the progress_one_device
 * and post_send routines we adjust the pointer to mca_btl_openib_footer_t
 * from 0x29 to 0x2C, thus correctly aligning the start of the
 * footer pointer.  This adjustment will cause the padding field of
 * mca_btl_openib_footer_t to overlap with the neighboring memory, but since
 * we never use the padding we do not end up inadvertently overwriting
 * memory that does not belong to the fragment.
 *
 * NOTE(review): the offsets in the example above are illustrative and do
 * not add up as written (0x20 + 17 bytes ends at 0x31, not 0x29), and this
 * macro pads to sizeof(uint64_t), which would move 0x29 to 0x30 rather
 * than 0x2C.  The footer padding sizes quoted also differ from the
 * padding[] sizes declared in mca_btl_openib_footer_t.  TODO: confirm and
 * refresh the example.
 */
#define BTL_OPENIB_FTR_PADDING(size) \
    OPAL_ALIGN_PAD_AMOUNT(size, sizeof(uint64_t))

/* BTL_OPENIB_ALIGN_COALESCE_HDR
 * This macro is used in btl_openib.c, while creating a coalesce fragment,
 * to align the coalesce headers (to sizeof(uint32_t)).
 */
#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) \
  OPAL_ALIGN_PTR(ptr, sizeof(uint32_t), unsigned char*)

/* BTL_OPENIB_COALESCE_HDR_PADDING
 * This macro is used in btl_openib_component.c, while parsing an incoming
 * coalesce fragment, to determine the padding amount used to align the
 * mca_btl_openib_header_coalesced_t.
 */
#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) \
  OPAL_ALIGN_PAD_AMOUNT(ptr, sizeof(uint32_t))
#else
/* Without header padding no alignment adjustment is made: padding amounts
 * are 0 and the pointer is passed through unchanged.  The argument is
 * parenthesized so that expression arguments (e.g. `p + 1`) keep their
 * meaning when the expansion is dereferenced or indexed. */
#define BTL_OPENIB_FTR_PADDING(size) 0
#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) (ptr)
#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) 0
#endif
134 
/*
 * Footer placed after the payload of a fragment.
 *
 * `u` exposes the 32-bit size both as an integer and as its four bytes;
 * MCA_BTL_OPENIB_FTR_SIZE_REVERSE works on the byte view.  `seq` exists
 * only in debug builds (OPAL_ENABLE_DEBUG).
 */
struct mca_btl_openib_footer_t {
#if OPAL_ENABLE_DEBUG
    uint32_t seq;
#endif
    union {
        uint32_t size;
        uint8_t buf[4];
    } u;
#if OPAL_OPENIB_PAD_HDR
#if OPAL_ENABLE_DEBUG
    /* This footer needs to be an 8-byte multiple; adding the seq field
     * throws that off.  The padding cannot simply be removed, because it
     * is needed to adjust the alignment without overwriting other
     * packets, so it is enlarged instead.
     * NOTE(review): as declared, the padded footer is 20 bytes with seq
     * and 12 bytes without it -- confirm against the 8-byte claim above.
     */
    uint8_t padding[12];
#else
    uint8_t padding[8];
#endif
#endif
};
typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
157 
/*
 * MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
 * When WORDS_BIGENDIAN is not defined, exchanges bytes 0 and 2 of
 * (ftr).u.buf and leaves bytes 1 and 3 untouched; when it is defined the
 * macro expands to nothing.  Applying it twice restores the original
 * bytes.  Note that this is not a full 32-bit byte swap.
 * `ftr` is a footer lvalue, not a pointer.  The temporary has a
 * macro-specific name so that a caller variable called `tmp` can be
 * passed as the argument.
 */
#ifdef WORDS_BIGENDIAN
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)
#else
#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr)            \
    do {                                                \
        uint8_t _ftr_swap_tmp = (ftr).u.buf[0];         \
        (ftr).u.buf[0] = (ftr).u.buf[2];                \
        (ftr).u.buf[2] = _ftr_swap_tmp;                 \
    } while (0)
#endif

/* The `seq` field only exists in debug builds, so its byte-order
 * conversion compiles away otherwise. */
#if OPAL_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_SEQ_HTON(h)  ((h).seq = htonl((h).seq))
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h)  ((h).seq = ntohl((h).seq))
#else
#define BTL_OPENIB_FOOTER_SEQ_HTON(h)
#define BTL_OPENIB_FOOTER_SEQ_NTOH(h)
#endif

/* Prepare a footer (lvalue) for the wire: convert `seq` (debug builds
 * only) and apply MCA_BTL_OPENIB_FTR_SIZE_REVERSE to the size bytes. */
#define BTL_OPENIB_FOOTER_HTON(h)               \
    do {                                        \
        BTL_OPENIB_FOOTER_SEQ_HTON(h);          \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);     \
    } while (0)

/* Inverse of BTL_OPENIB_FOOTER_HTON, applied to a received footer. */
#define BTL_OPENIB_FOOTER_NTOH(h)               \
    do {                                        \
        BTL_OPENIB_FOOTER_SEQ_NTOH(h);          \
        MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h);     \
    } while (0)
188 
/* Control message kinds (presumably the values carried in
 * mca_btl_openib_control_header_t.type -- confirm against the senders). */
#define MCA_BTL_OPENIB_CONTROL_CREDITS      0
#define MCA_BTL_OPENIB_CONTROL_RDMA         1
#define MCA_BTL_OPENIB_CONTROL_COALESCED    2
#define MCA_BTL_OPENIB_CONTROL_CTS          3
193 
/*
 * Common first member of the control message headers in this file
 * (eager RDMA and RDMA credits).  It is a single type byte, padded to
 * 8 bytes when OPAL_OPENIB_PAD_HDR is set.
 */
struct mca_btl_openib_control_header_t {
    uint8_t  type;
#if OPAL_OPENIB_PAD_HDR
    uint8_t  padding[7];
#endif
};
typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t;
201 
202 struct mca_btl_openib_eager_rdma_header_t {
203     mca_btl_openib_control_header_t control;
204     uint32_t rkey;
205     opal_ptr_t rdma_start;
206 };
207 typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t;
208 
/* Convert the 32-bit `rkey` and the 64-bit `rdma_start.lval` of an eager
 * RDMA header (passed as an lvalue) to network byte order, in place. */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h)       \
    do {                                                   \
        (h).rkey = htonl((h).rkey);                        \
        (h).rdma_start.lval = hton64((h).rdma_start.lval); \
    } while (0)

/* Inverse of BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON. */
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h)       \
    do {                                                   \
        (h).rkey = ntohl((h).rkey);                        \
        (h).rdma_start.lval = ntoh64((h).rdma_start.lval); \
    } while (0)
220 
221 
222 struct mca_btl_openib_rdma_credits_header_t {
223     mca_btl_openib_control_header_t control;
224 #if OPAL_OPENIB_PAD_HDR
225     uint8_t  padding[1];
226 #endif
227     uint8_t qpn;
228     uint16_t rdma_credits;
229 };
230 typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t;
231 
/* Convert the 16-bit `rdma_credits` field of an RDMA credits header
 * (passed as an lvalue) to network byte order, in place. */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h)     \
do {                                               \
    (h).rdma_credits = htons((h).rdma_credits);    \
} while (0)

/* Inverse of BTL_OPENIB_RDMA_CREDITS_HEADER_HTON. */
#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h)     \
do {                                               \
    (h).rdma_credits = ntohs((h).rdma_credits);    \
} while (0)
241 
/* Discriminator stored in mca_btl_openib_frag_t.type and read through
 * openib_frag_type().  Values are assigned sequentially from 0. */
enum mca_btl_openib_frag_type_t {
    MCA_BTL_OPENIB_FRAG_RECV,
    MCA_BTL_OPENIB_FRAG_RECV_USER,
    MCA_BTL_OPENIB_FRAG_SEND,
    MCA_BTL_OPENIB_FRAG_SEND_USER,
    MCA_BTL_OPENIB_FRAG_EAGER_RDMA,
    MCA_BTL_OPENIB_FRAG_CONTROL,
    MCA_BTL_OPENIB_FRAG_COALESCED
};
typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t;
252 
253 #define openib_frag_type(f) (to_base_frag(f)->type)
254 /**
255  * IB fragment derived type.
256  */
257 /* base openib frag */
258 typedef struct mca_btl_openib_frag_t {
259     mca_btl_base_descriptor_t base;
260     mca_btl_base_segment_t segment;
261     mca_btl_openib_frag_type_t type;
262     opal_free_list_t* list;
263 } mca_btl_openib_frag_t;
264 OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t);
265 
266 #define to_base_frag(f) ((mca_btl_openib_frag_t*)(f))
267 
268 /* frag used for communication */
269 typedef struct mca_btl_openib_com_frag_t {
270     mca_btl_openib_frag_t super;
271     struct ibv_sge sg_entry;
272     struct mca_btl_openib_reg_t *registration;
273     struct mca_btl_base_endpoint_t *endpoint;
274     /* number of unsignaled frags sent before this frag. */
275     uint32_t n_wqes_inflight;
276 } mca_btl_openib_com_frag_t;
277 OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
278 
279 #define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f))
280 
281 typedef struct mca_btl_openib_out_frag_t {
282     mca_btl_openib_com_frag_t super;
283     struct ibv_send_wr sr_desc;
284 } mca_btl_openib_out_frag_t;
285 OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t);
286 
287 #define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f))
288 
/* Incoming fragment: a plain alias of the communication fragment; it adds
 * no members of its own. */
typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t);

/* View a fragment pointer as an incoming fragment (unchecked cast). */
#define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f))
293 
294 typedef struct mca_btl_openib_send_frag_t {
295     mca_btl_openib_out_frag_t super;
296     mca_btl_openib_header_t *hdr, *chdr;
297     mca_btl_openib_footer_t *ftr;
298     uint8_t qp_idx;
299     uint32_t coalesced_length;
300     opal_list_t coalesced_frags;
301 } mca_btl_openib_send_frag_t;
302 OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t);
303 
304 #define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f))
305 
306 typedef struct mca_btl_openib_recv_frag_t {
307     mca_btl_openib_in_frag_t super;
308     mca_btl_openib_header_t *hdr;
309     mca_btl_openib_footer_t *ftr;
310     struct ibv_recv_wr rd_desc;
311     uint8_t qp_idx;
312 } mca_btl_openib_recv_frag_t;
313 OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
314 
315 #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
316 
317 typedef struct mca_btl_openib_put_frag_t {
318     mca_btl_openib_out_frag_t super;
319     struct {
320 	mca_btl_base_rdma_completion_fn_t func;
321 	mca_btl_base_registration_handle_t *local_handle;
322 	void *context;
323 	void *data;
324     } cb;
325 } mca_btl_openib_put_frag_t;
326 OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
327 
328 #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
329 
330 typedef struct mca_btl_openib_get_frag_t {
331     mca_btl_openib_in_frag_t super;
332     struct ibv_send_wr sr_desc;
333     struct {
334 	mca_btl_base_rdma_completion_fn_t func;
335 	mca_btl_base_registration_handle_t *local_handle;
336 	void *context;
337 	void *data;
338     } cb;
339 } mca_btl_openib_get_frag_t;
340 OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
341 
342 #define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f))
343 
/* Control send fragment: a plain alias of the send fragment structure,
 * declared as its own class. */
typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t);

/* View a fragment pointer as a control send fragment (unchecked cast). */
#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f))
348 
349 typedef struct mca_btl_openib_coalesced_frag_t {
350     mca_btl_openib_frag_t super;
351     mca_btl_openib_send_frag_t *send_frag;
352     mca_btl_openib_header_coalesced_t *hdr;
353     bool sent;
354 } mca_btl_openib_coalesced_frag_t;
355 OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);
356 
357 #define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f))
358 
359 /*
360  * Allocate an IB send descriptor
361  *
362  */
363 
364 static inline mca_btl_openib_send_control_frag_t *
alloc_control_frag(mca_btl_openib_module_t * btl)365 alloc_control_frag(mca_btl_openib_module_t *btl)
366 {
367     return to_send_control_frag(opal_free_list_wait (&btl->device->send_free_control));
368 }
369 
/*
 * Map a fragment size to a QP index.
 *
 * Scans the component's qp_infos table in index order and returns the
 * index of the first QP whose size is at least `size`.
 *
 * @param btl   unused; the QP table is read from mca_btl_openib_component
 * @param size  required size in bytes
 * @return      index of the first QP that is large enough, or
 *              MCA_BTL_NO_ORDER when none is
 */
static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl,
        size_t size)
{
    int qp;

    (void) btl;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        if (mca_btl_openib_component.qp_infos[qp].size >= size) {
            return (uint8_t) qp;
        }
    }

    return MCA_BTL_NO_ORDER;
}
380 
alloc_send_user_frag(void)381 static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void)
382 {
383     return to_com_frag(opal_free_list_get (&mca_btl_openib_component.send_user_free));
384 }
385 
alloc_recv_user_frag(void)386 static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void)
387 {
388     return to_com_frag(opal_free_list_get (&mca_btl_openib_component.recv_user_free));
389 }
390 
alloc_coalesced_frag(void)391 static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void)
392 {
393     return to_coalesced_frag(opal_free_list_get (&mca_btl_openib_component.send_free_coalesced));
394 }
395 
/* Hand a fragment back to the free list recorded in its base fragment's
 * `list` member.  `frag` may be a pointer to any openib fragment type; it
 * is evaluated twice, so it must not have side effects. */
#define MCA_BTL_IB_FRAG_RETURN(frag)                                    \
    do {                                                                \
        opal_free_list_return (to_base_frag(frag)->list,                \
                (opal_free_list_item_t*)(frag));                        \
    } while(0)
401 
/* Drain `list`: repeatedly remove its first item and hand it back to its
 * free list with MCA_BTL_IB_FRAG_RETURN until opal_list_remove_first()
 * returns NULL.  Every item is treated as an openib fragment. */
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list)                        \
    do {                                                                \
        opal_list_item_t *_frag_item;                                   \
        while (NULL != (_frag_item = opal_list_remove_first(list))) {   \
            MCA_BTL_IB_FRAG_RETURN(_frag_item);                         \
        }                                                               \
    } while (0)
409 
410 struct mca_btl_openib_module_t;
411 
412 struct mca_btl_openib_frag_init_data_t {
413     uint8_t order;
414     opal_free_list_t* list;
415 };
416 typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;
417 
418 int mca_btl_openib_frag_init(opal_free_list_item_t* item, void* ctx);
419 
420 
421 END_C_DECLS
422 #endif
423