1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED.
3 * Copyright (C) The University of Tennessee and The University
4 * of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
5 *
6 * See file LICENSE for terms.
7 */
8
9 #ifndef UCT_IB_MD_H_
10 #define UCT_IB_MD_H_
11
12 #include "ib_device.h"
13
14 #include <uct/base/uct_md.h>
15 #include <ucs/stats/stats.h>
16 #include <ucs/memory/numa.h>
17 #include <ucs/memory/rcache.h>
18
/* Upper bound on a single MR size — NOTE(review): presumably areas larger than
 * this are split into multiple MRs; confirm against ib_md.c usage */
#define UCT_IB_MD_MAX_MR_SIZE        0x80000000UL

/* Size of an rkey produced by uct_ib_md_pack_rkey(): direct and indirect
 * 32-bit keys packed into one uint64_t */
#define UCT_IB_MD_PACKED_RKEY_SIZE   sizeof(uint64_t)

#define UCT_IB_MD_DEFAULT_GID_INDEX  0 /**< The gid index used by default for an IB/RoCE port */

/* IB verbs access rights requested when registering memory */
#define UCT_IB_MEM_ACCESS_FLAGS  (IBV_ACCESS_LOCAL_WRITE | \
                                  IBV_ACCESS_REMOTE_WRITE | \
                                  IBV_ACCESS_REMOTE_READ | \
                                  IBV_ACCESS_REMOTE_ATOMIC)

/* NOTE(review): presumably a sentinel access value marking deregistration —
 * confirm against callers */
#define UCT_IB_MEM_DEREG 0

/* Prefix of IB-specific configuration variables */
#define UCT_IB_CONFIG_PREFIX     "IB_"
31
32
/**
 * IB MD statistics counters
 */
enum {
    UCT_IB_MD_STAT_MEM_ALLOC, /**< Counter index for memory allocations */
    UCT_IB_MD_STAT_MEM_REG,   /**< Counter index for memory registrations */
    UCT_IB_MD_STAT_LAST       /**< Number of counters; must stay last */
};
41
42
/**
 * Flags stored in uct_ib_mem_t::flags, describing properties of a registered
 * memory region.
 */
enum {
    UCT_IB_MEM_FLAG_ODP              = UCS_BIT(0), /**< The memory region has on
                                                        demand paging enabled */
    UCT_IB_MEM_FLAG_ATOMIC_MR        = UCS_BIT(1), /**< The memory region has UMR
                                                        for the atomic access */
    UCT_IB_MEM_ACCESS_REMOTE_ATOMIC  = UCS_BIT(2), /**< An atomic access was
                                                        requested for the memory
                                                        region */
    UCT_IB_MEM_MULTITHREADED         = UCS_BIT(3), /**< The memory region registration
                                                        handled by chunks in parallel
                                                        threads */
    UCT_IB_MEM_FLAG_RELAXED_ORDERING = UCS_BIT(4), /**< The memory region will issue
                                                        PCIe writes with relaxed order
                                                        attribute */
};
58
/**
 * Object types which can be created through DevX — NOTE(review): presumably
 * used as bit indices for the "devx_objs" configuration
 * (see uct_ib_md_config_t::devx_objs); confirm against the parser.
 */
enum {
    UCT_IB_DEVX_OBJ_RCQP,  /**< RC QP */
    UCT_IB_DEVX_OBJ_RCSRQ, /**< RC SRQ */
    UCT_IB_DEVX_OBJ_DCT,   /**< DC target */
    UCT_IB_DEVX_OBJ_DCSRQ  /**< DC SRQ */
};
65
/**
 * Extended IB MD configuration; embedded both in the memory domain
 * (uct_ib_md_t::config) and in its configuration (uct_ib_md_config_t::ext).
 */
typedef struct uct_ib_md_ext_config {
    int                      eth_pause;        /**< Whether or not Pause Frame is
                                                    enabled on the Ethernet network */
    int                      prefer_nearest_device; /**< Give priority for near
                                                         device */
    int                      enable_indirect_atomic; /**< Enable indirect atomic */
    int                      enable_gpudirect_rdma;  /**< Enable GPUDirect RDMA */
#ifdef HAVE_EXP_UMR
    unsigned                 max_inline_klm_list; /* Maximal length of inline KLM list */
#endif

    struct {
        ucs_numa_policy_t    numa_policy;  /**< NUMA policy flags for ODP */
        int                  prefetch;     /**< Auto-prefetch non-blocking memory
                                                registrations / allocations */
        size_t               max_size;     /**< Maximal memory region size for ODP */
    } odp;

    size_t                   gid_index;    /**< IB GID index to use */

    size_t                   min_mt_reg;   /**< Multi-threaded registration threshold */
    size_t                   mt_reg_chunk; /**< Multi-threaded registration chunk */
    int                      mt_reg_bind;  /**< Multi-threaded registration bind to core */
} uct_ib_md_ext_config_t;
90
91
/**
 * Registered memory region handle; exposed to the user as the memh
 * (see uct_ib_rcache_region_t and uct_ib_memh_get_lkey()).
 */
typedef struct uct_ib_mem {
    uint32_t                lkey;        /**< Local access key */
    uint32_t                rkey;        /**< Remote access key (packed into the
                                              low 32 bits of the uct rkey) */
    uint32_t                atomic_rkey; /**< Indirect rkey for atomic access;
                                              initialized by the reg_atomic_key
                                              method (packed into the high 32
                                              bits of the uct rkey) */
    uint32_t                flags;       /**< Bitmask of UCT_IB_MEM_* flags */
} uct_ib_mem_t;
98
99
/**
 * A single low-level memory region handle — NOTE(review): presumably used as
 * an array indexed by uct_ib_mr_type_t (see uct_ib_reg_key_impl()); confirm.
 */
typedef union uct_ib_mr {
    struct ibv_mr           *ib; /**< Plain verbs MR */
} uct_ib_mr_t;
103
104
/**
 * Roles of the low-level MRs backing a single registered region.
 */
typedef enum {
    /* Default memory region with either strict or relaxed order */
    UCT_IB_MR_DEFAULT,
    /* Additional memory region with strict order,
     * if the default region is relaxed order */
    UCT_IB_MR_STRICT_ORDER,
    UCT_IB_MR_LAST /* Number of MR types; must stay last */
} uct_ib_mr_type_t;
113
114
/**
 * IB memory domain.
 */
typedef struct uct_ib_md {
    uct_md_t                 super;
    ucs_rcache_t             *rcache;   /**< Registration cache (can be NULL) */
    uct_mem_h                global_odp;/**< Implicit ODP memory handle */
    struct ibv_pd            *pd;       /**< IB memory domain */
    uct_ib_device_t          dev;       /**< IB device */
    ucs_linear_func_t        reg_cost;  /**< Memory registration cost */
    struct uct_ib_md_ops     *ops;      /**< Implementation method table */
    UCS_STATS_NODE_DECLARE(stats)
    uct_ib_md_ext_config_t   config;    /* IB external configuration */
    struct {
        uct_ib_device_spec_t *specs;    /* Custom device specifications */
        unsigned             count;     /* Number of custom devices */
    } custom_devices;
    int                      check_subnet_filter; /**< Whether subnet_filter is set
                                                       and ports must match it */
    uint64_t                 subnet_filter;       /**< Subnet prefix filter for IB
                                                       ports (from subnet_prefix
                                                       config) */
    double                   pci_bw;              /**< PCI bandwidth (from pci_bw
                                                       config) */
    int                      relaxed_order;       /**< Whether MRs are registered
                                                       with relaxed-order access */
    int                      fork_init;           /**< Whether ibv_fork_init() is
                                                       in effect */
    size_t                   memh_struct_size;    /**< Size of the memh structure
                                                       to allocate per registration
                                                       (implementation-specific) */
} uct_ib_md_t;
139
140
/**
 * IB memory domain configuration.
 */
typedef struct uct_ib_md_config {
    uct_md_config_t          super;

    /** List of registration methods in order of preference */
    UCS_CONFIG_STRING_ARRAY_FIELD(rmtd) reg_methods;

    uct_md_rcache_config_t   rcache;       /**< Registration cache config */
    ucs_linear_func_t        uc_reg_cost;  /**< Memory registration cost estimation
                                                without using the cache */
    unsigned                 fork_init;    /**< Use ibv_fork_init() */
    int                      async_events; /**< Whether async events should be
                                                delivered */

    uct_ib_md_ext_config_t   ext;          /**< External configuration */

    UCS_CONFIG_STRING_ARRAY_FIELD(spec) custom_devices; /**< Custom device
                                                             specifications */

    char                     *subnet_prefix; /**< Filter of subnet_prefix for IB
                                                  ports */

    UCS_CONFIG_ARRAY_FIELD(ucs_config_bw_spec_t, device) pci_bw; /**< List of PCI
                                                                      BW for devices */

    unsigned                 devx;         /**< DEVX support */
    unsigned                 devx_objs;    /**< Objects to be created by DevX */
    ucs_on_off_auto_value_t  mr_relaxed_order; /**< Allow reorder memory accesses */
} uct_ib_md_config_t;
168
/**
 * Memory domain constructor.
 *
 * @param [in]  ibv_device  IB device.
 * @param [in]  md_config   Memory domain configuration parameters.
 * @param [out] md_p        Handle to memory domain.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_open_func_t)(struct ibv_device *ibv_device,
                                              const uct_ib_md_config_t *md_config,
                                              struct uct_ib_md **md_p);
183
/**
 * Memory domain destructor.
 *
 * @param [in] md  Memory domain to destroy.
 */
typedef void (*uct_ib_md_cleanup_func_t)(struct uct_ib_md *md);
190
/**
 * Memory domain method to register memory area.
 *
 * @param [in] md       Memory domain.
 * @param [in] address  Memory area start address.
 * @param [in] length   Memory area length.
 * @param [in] access   IB verbs registration access flags.
 * @param [in] memh     Memory region handle.
 *                      Method should initialize lkey & rkey.
 * @param [in] mr_type  Which low-level MR of the region to register
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_key_func_t)(struct uct_ib_md *md,
                                                 void *address, size_t length,
                                                 uint64_t access,
                                                 uct_ib_mem_t *memh,
                                                 uct_ib_mr_type_t mr_type);
212
/**
 * Memory domain method to deregister memory area.
 *
 * @param [in] md       Memory domain.
 * @param [in] memh     Memory region handle registered with
 *                      uct_ib_md_reg_key_func_t.
 * @param [in] mr_type  Which low-level MR of the region to deregister
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_key_func_t)(struct uct_ib_md *md,
                                                   uct_ib_mem_t *memh,
                                                   uct_ib_mr_type_t mr_type);
226
/**
 * Memory domain method to register memory area optimized for atomic ops.
 *
 * @param [in] md       Memory domain.
 * @param [in] memh     Memory region handle registered for regular ops.
 *                      Method should initialize atomic_rkey.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_atomic_key_func_t)(struct uct_ib_md *md,
                                                        uct_ib_mem_t *memh);
239
/**
 * Memory domain method to release resources registered for atomic ops.
 *
 * @param [in] md       Memory domain.
 * @param [in] memh     Memory region handle registered with
 *                      uct_ib_md_reg_atomic_key_func_t.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_atomic_key_func_t)(struct uct_ib_md *md,
                                                          uct_ib_mem_t *memh);
252
/**
 * Memory domain method to register memory area using multiple threads.
 *
 * @param [in] md       Memory domain.
 * @param [in] address  Memory area start address.
 * @param [in] length   Memory area length.
 * @param [in] access   IB verbs registration access flags.
 * @param [in] memh     Memory region handle.
 *                      Method should initialize lkey & rkey.
 * @param [in] mr_type  Which low-level MR of the region to register
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_multithreaded_func_t)(uct_ib_md_t *md,
                                                           void *address,
                                                           size_t length,
                                                           uint64_t access,
                                                           uct_ib_mem_t *memh,
                                                           uct_ib_mr_type_t mr_type);
275
/**
 * Memory domain method to deregister, using multiple threads, a memory area
 * that was registered with uct_ib_md_reg_multithreaded_func_t.
 *
 * @param [in] md       Memory domain.
 * @param [in] memh     Memory region handle registered with
 *                      uct_ib_md_reg_multithreaded_func_t.
 * @param [in] mr_type  Which low-level MR of the region to deregister
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_multithreaded_func_t)(uct_ib_md_t *md,
                                                             uct_ib_mem_t *memh,
                                                             uct_ib_mr_type_t mr_type);
289
/**
 * Memory domain method to prefetch physical memory for virtual memory area.
 *
 * @param [in] md      Memory domain.
 * @param [in] memh    Memory region handle.
 * @param [in] addr    Memory area start address.
 * @param [in] length  Memory area length.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md,
                                                      uct_ib_mem_t *memh,
                                                      void *addr, size_t length);
306
/**
 * Memory domain method to get unique atomic mr id.
 *
 * @param [in]  md     Memory domain.
 * @param [out] mr_id  Id to access atomic MR
 *                     (see uct_ib_md_atomic_offset()).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md,
                                                          uint8_t *mr_id);
318
/**
 * Method table of an IB memory domain implementation; see the corresponding
 * function-pointer typedefs above for the contract of each method.
 */
typedef struct uct_ib_md_ops {
    uct_ib_md_open_func_t                open;                /**< Constructor */
    uct_ib_md_cleanup_func_t             cleanup;             /**< Destructor */
    uct_ib_md_reg_key_func_t             reg_key;             /**< Register region */
    uct_ib_md_dereg_key_func_t           dereg_key;           /**< Deregister region */
    uct_ib_md_reg_atomic_key_func_t      reg_atomic_key;      /**< Register atomic MR */
    uct_ib_md_dereg_atomic_key_func_t    dereg_atomic_key;    /**< Release atomic MR */
    uct_ib_md_reg_multithreaded_func_t   reg_multithreaded;   /**< MT registration */
    uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; /**< MT deregistration */
    uct_ib_md_mem_prefetch_func_t        mem_prefetch;        /**< Prefetch pages */
    uct_ib_md_get_atomic_mr_id_func_t    get_atomic_mr_id;    /**< Unique atomic MR id */
} uct_ib_md_ops_t;
331
332
/**
 * IB memory region in the registration cache.
 */
typedef struct uct_ib_rcache_region {
    ucs_rcache_region_t  super;
    uct_ib_mem_t         memh;  /**< mr exposed to the user as the memh */
} uct_ib_rcache_region_t;
340
341
/**
 * IB memory domain constructor. Should have following logic:
 * - probe provided IB device, may return UCS_ERR_UNSUPPORTED
 * - allocate MD and IB context
 * - setup atomic MR ops
 * - determine device attributes and flags
 *
 * This entry links one implementation's method table into the global
 * uct_ib_md_ops_list (see UCT_IB_MD_OPS below), which is kept sorted by
 * descending priority.
 */
typedef struct uct_ib_md_ops_entry {
    ucs_list_link_t             list;     /**< Link in uct_ib_md_ops_list */
    const char                  *name;    /**< Stringified ops symbol name */
    uct_ib_md_ops_t             *ops;     /**< Implementation method table */
    int                         priority; /**< Higher values are placed earlier
                                               in the list */
} uct_ib_md_ops_entry_t;
355
/*
 * Register a method table in the global MD ops list at static-initialization
 * time. The list is kept sorted by descending priority: the new entry is
 * inserted before the first existing entry whose priority is lower, or
 * appended at the tail if none is lower.
 * Note: 'entry' must be static — the list stores it by reference.
 */
#define UCT_IB_MD_OPS(_md_ops, _priority) \
    extern ucs_list_link_t uct_ib_md_ops_list; \
    UCS_STATIC_INIT { \
        static uct_ib_md_ops_entry_t *p, entry = { \
            .name     = UCS_PP_MAKE_STRING(_md_ops), \
            .ops      = &_md_ops, \
            .priority = _priority, \
        }; \
        ucs_list_for_each(p, &uct_ib_md_ops_list, list) { \
            if (p->priority < _priority) { \
                ucs_list_insert_before(&p->list, &entry.list); \
                return; \
            } \
        } \
        ucs_list_add_tail(&uct_ib_md_ops_list, &entry.list); \
    }
372
373 extern uct_component_t uct_ib_component;
374
/**
 * Extract the direct (regular) remote key from a packed uct rkey.
 * The direct key occupies the low 32 bits (see uct_ib_md_pack_rkey()).
 */
static inline uint32_t uct_ib_md_direct_rkey(uct_rkey_t uct_rkey)
{
    uint32_t direct_rkey = uct_rkey & UINT32_MAX;

    return direct_rkey;
}
379
380
/**
 * Extract the indirect (atomic) remote key from a packed uct rkey.
 * The indirect key occupies the high 32 bits (see uct_ib_md_pack_rkey());
 * it equals UCT_IB_INVALID_RKEY when no atomic key was packed.
 *
 * Declared 'inline' (was plain 'static') to match uct_ib_md_direct_rkey():
 * a non-inline static function defined in a header is duplicated in every
 * translation unit and triggers unused-function warnings.
 */
static inline uint32_t uct_ib_md_indirect_rkey(uct_rkey_t uct_rkey)
{
    return (uint32_t)(uct_rkey >> 32);
}
385
386
387 static UCS_F_ALWAYS_INLINE void
uct_ib_md_pack_rkey(uint32_t rkey,uint32_t atomic_rkey,void * rkey_buffer)388 uct_ib_md_pack_rkey(uint32_t rkey, uint32_t atomic_rkey, void *rkey_buffer)
389 {
390 uint64_t *rkey_p = (uint64_t*)rkey_buffer;
391 *rkey_p = (((uint64_t)atomic_rkey) << 32) | rkey;
392 ucs_trace("packed rkey: direct 0x%x indirect 0x%x", rkey, atomic_rkey);
393 }
394
395
396 /**
397 * rkey is packed/unpacked is such a way that:
398 * low 32 bits contain a direct key
399 * high 32 bits contain either UCT_IB_INVALID_RKEY or a valid indirect key.
400 */
uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey,uint16_t atomic_mr_offset,uint64_t * remote_addr_p)401 static inline uint32_t uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey,
402 uint16_t atomic_mr_offset,
403 uint64_t *remote_addr_p)
404 {
405 uint32_t atomic_rkey = uct_ib_md_indirect_rkey(uct_rkey);
406 if (atomic_rkey == UCT_IB_INVALID_RKEY) {
407 return uct_ib_md_direct_rkey(uct_rkey);
408 } else {
409 *remote_addr_p += atomic_mr_offset;
410 return atomic_rkey;
411 }
412 }
413
414
/**
 * Convert an atomic MR id to the remote-address offset used with that MR.
 * Each id maps to its own 8-byte slot.
 */
static inline uint16_t uct_ib_md_atomic_offset(uint8_t atomic_mr_id)
{
    return (uint16_t)((uint16_t)atomic_mr_id << 3);
}
419
420 static inline void
uct_ib_memh_init_keys(uct_ib_mem_t * memh,uint32_t lkey,uint32_t rkey)421 uct_ib_memh_init_keys(uct_ib_mem_t *memh, uint32_t lkey, uint32_t rkey)
422 {
423 memh->lkey = lkey;
424 memh->rkey = rkey;
425 }
426
427 static inline uct_ib_mr_type_t
uct_ib_memh_get_atomic_base_mr_type(uct_ib_mem_t * memh)428 uct_ib_memh_get_atomic_base_mr_type(uct_ib_mem_t *memh)
429 {
430 if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) {
431 return UCT_IB_MR_STRICT_ORDER;
432 } else {
433 return UCT_IB_MR_DEFAULT;
434 }
435 }
436
uct_ib_memh_get_lkey(uct_mem_h memh)437 static UCS_F_ALWAYS_INLINE uint32_t uct_ib_memh_get_lkey(uct_mem_h memh)
438 {
439 ucs_assert(memh != UCT_MEM_HANDLE_NULL);
440 return ((uct_ib_mem_t*)memh)->lkey;
441 }
442
443
/* Open an IB memory domain by name; MD-open entry point of the IB component */
ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name,
                            const uct_md_config_t *uct_md_config, uct_md_h *md_p);

/* Initialization common to all IB MD implementations, performed after the
 * implementation-specific open */
ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
                                   struct ibv_device *ib_device,
                                   const uct_ib_md_config_t *md_config);

/* Destroy a memory domain created by uct_ib_md_open() */
void uct_ib_md_close(uct_md_h uct_md);

/* Register [addr, addr+length) on 'pd' with the given verbs access flags;
 * on success *mr_p holds the new MR */
ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
                           uint64_t access, struct ibv_mr **mr_p);

/* Deregister a single MR */
ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr);

/* Deregister an array of mr_num MRs */
ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num);

/* Handle (register/deregister) a memory area as a list of 'chunk'-sized MRs
 * using multiple threads — NOTE(review): exact register-vs-deregister
 * semantics live in ib_md.c; confirm before relying on them */
ucs_status_t
uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address,
                                       size_t length, uint64_t access,
                                       size_t chunk, struct ibv_mr **mrs);

/* Derive md->relaxed_order from the mr_relaxed_order configuration value */
void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md,
                                   const uct_ib_md_config_t *md_config);

/* Default reg_key implementation: register the mr_type MR of 'mrs' for the
 * given area and initialize 'memh' accordingly */
ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address,
                                 size_t length, uint64_t access_flags,
                                 uct_ib_mem_t *memh, uct_ib_mr_t *mrs,
                                 uct_ib_mr_type_t mr_type);
470 #endif
471