1 /**
2  * Copyright (C) Mellanox Technologies Ltd. 2001-2016.  ALL RIGHTS RESERVED.
3  * Copyright (C) The University of Tennessee and The University
4  *               of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
5  *
6  * See file LICENSE for terms.
7  */
8 
9 #ifndef UCT_IB_MD_H_
10 #define UCT_IB_MD_H_
11 
#include "ib_device.h"

#include <uct/base/uct_md.h>
#include <ucs/stats/stats.h>
#include <ucs/memory/numa.h>
#include <ucs/memory/rcache.h>

#include <string.h>
18 
/** Maximal size of a single memory region (2 GiB) */
#define UCT_IB_MD_MAX_MR_SIZE       0x80000000UL
/** Size of an rkey as packed by uct_ib_md_pack_rkey() */
#define UCT_IB_MD_PACKED_RKEY_SIZE  sizeof(uint64_t)

#define UCT_IB_MD_DEFAULT_GID_INDEX 0   /**< The gid index used by default for an IB/RoCE port */

/** Default verbs access flags for memory registration */
#define UCT_IB_MEM_ACCESS_FLAGS  (IBV_ACCESS_LOCAL_WRITE | \
                                  IBV_ACCESS_REMOTE_WRITE | \
                                  IBV_ACCESS_REMOTE_READ | \
                                  IBV_ACCESS_REMOTE_ATOMIC)

/* NOTE(review): presumably a sentinel access value marking a deregistration
 * request (see uct_ib_md_handle_mr_list_multithreaded) — confirm in ib_md.c */
#define UCT_IB_MEM_DEREG          0
/** Prefix of IB-specific configuration variables */
#define UCT_IB_CONFIG_PREFIX      "IB_"
31 
32 
/**
 * IB MD statistics counters
 */
enum {
    UCT_IB_MD_STAT_MEM_ALLOC, /**< Memory allocation counter */
    UCT_IB_MD_STAT_MEM_REG,   /**< Memory registration counter */
    UCT_IB_MD_STAT_LAST       /**< Number of counters */
};
41 
42 
/**
 * Bit values for uct_ib_mem_t::flags.
 */
enum {
    UCT_IB_MEM_FLAG_ODP              = UCS_BIT(0), /**< The memory region has on
                                                        demand paging enabled */
    UCT_IB_MEM_FLAG_ATOMIC_MR        = UCS_BIT(1), /**< The memory region has UMR
                                                        for the atomic access */
    UCT_IB_MEM_ACCESS_REMOTE_ATOMIC  = UCS_BIT(2), /**< An atomic access was
                                                        requested for the memory
                                                        region */
    UCT_IB_MEM_MULTITHREADED         = UCS_BIT(3), /**< The memory region registration
                                                        handled by chunks in parallel
                                                        threads */
    UCT_IB_MEM_FLAG_RELAXED_ORDERING = UCS_BIT(4), /**< The memory region will issue
                                                        PCIe writes with relaxed order
                                                        attribute */
};
58 
/**
 * Object types that can be created through DevX
 * (see uct_ib_md_config_t::devx_objs).
 */
enum {
    UCT_IB_DEVX_OBJ_RCQP,  /**< RC queue pair */
    UCT_IB_DEVX_OBJ_RCSRQ, /**< RC shared receive queue */
    UCT_IB_DEVX_OBJ_DCT,   /**< DC target */
    UCT_IB_DEVX_OBJ_DCSRQ  /**< DC shared receive queue */
};
65 
/**
 * Extended (IB-specific) memory domain configuration.
 */
typedef struct uct_ib_md_ext_config {
    int                      eth_pause;    /**< Whether or not Pause Frame is
                                                enabled on the Ethernet network */
    int                      prefer_nearest_device; /**< Give priority for near
                                                         device */
    int                      enable_indirect_atomic; /**< Enable indirect atomic */
    int                      enable_gpudirect_rdma; /**< Enable GPUDirect RDMA */
#ifdef HAVE_EXP_UMR
    unsigned                 max_inline_klm_list; /**< Maximal length of inline KLM list */
#endif

    struct {
        ucs_numa_policy_t    numa_policy;  /**< NUMA policy flags for ODP */
        int                  prefetch;     /**< Auto-prefetch non-blocking memory
                                                registrations / allocations */
        size_t               max_size;     /**< Maximal memory region size for ODP */
    } odp;

    size_t                   gid_index;    /**< IB GID index to use  */

    size_t                   min_mt_reg;   /**< Multi-threaded registration threshold */
    size_t                   mt_reg_chunk; /**< Multi-threaded registration chunk */
    int                      mt_reg_bind;  /**< Multi-threaded registration bind to core */
} uct_ib_md_ext_config_t;
90 
91 
/**
 * IB registered memory handle.
 */
typedef struct uct_ib_mem {
    uint32_t                lkey;        /**< Local access key */
    uint32_t                rkey;        /**< Remote access key */
    uint32_t                atomic_rkey; /**< Remote key of the region optimized
                                              for atomic access (initialized by
                                              uct_ib_md_reg_atomic_key_func_t) */
    uint32_t                flags;       /**< UCT_IB_MEM_xx flags */
} uct_ib_mem_t;
98 
99 
/**
 * Underlying IB verbs memory region of a registered handle.
 */
typedef union uct_ib_mr {
    struct ibv_mr           *ib; /**< Verbs memory region */
} uct_ib_mr_t;
103 
104 
/**
 * Type (index) of a memory region kept for one registered handle.
 */
typedef enum {
    /* Default memory region with either strict or relaxed order */
    UCT_IB_MR_DEFAULT,
    /* Additional memory region with strict order,
     * if the default region is relaxed order */
    UCT_IB_MR_STRICT_ORDER,
    UCT_IB_MR_LAST /* Number of MR types */
} uct_ib_mr_type_t;
113 
114 
/**
 * IB memory domain.
 */
typedef struct uct_ib_md {
    uct_md_t                 super;
    ucs_rcache_t             *rcache;   /**< Registration cache (can be NULL) */
    uct_mem_h                global_odp;/**< Implicit ODP memory handle */
    struct ibv_pd            *pd;       /**< IB memory domain */
    uct_ib_device_t          dev;       /**< IB device */
    ucs_linear_func_t        reg_cost;  /**< Memory registration cost */
    struct uct_ib_md_ops     *ops;      /**< Implementation methods */
    UCS_STATS_NODE_DECLARE(stats)
    uct_ib_md_ext_config_t   config;    /* IB external configuration */
    struct {
        uct_ib_device_spec_t *specs;    /* Custom device specifications */
        unsigned             count;     /* Number of custom devices */
    } custom_devices;
    int                      check_subnet_filter; /* Whether to filter ports by
                                                     subnet prefix */
    uint64_t                 subnet_filter;       /* Subnet prefix to match; valid
                                                     only if check_subnet_filter is
                                                     set */
    double                   pci_bw;              /* PCI bandwidth */
    int                      relaxed_order;       /* Register memory with relaxed
                                                     order access */
    int                      fork_init;           /* Whether ibv_fork_init() was
                                                     requested */
    size_t                   memh_struct_size;    /* Size of the memh struct to
                                                     allocate — presumably the
                                                     implementation may extend
                                                     uct_ib_mem_t; confirm in .c */
} uct_ib_md_t;
139 
140 
/**
 * IB memory domain configuration.
 */
typedef struct uct_ib_md_config {
    uct_md_config_t          super;

    /** List of registration methods in order of preference */
    UCS_CONFIG_STRING_ARRAY_FIELD(rmtd) reg_methods;

    uct_md_rcache_config_t   rcache;       /**< Registration cache config */
    ucs_linear_func_t        uc_reg_cost;  /**< Memory registration cost estimation
                                                without using the cache */
    unsigned                 fork_init;    /**< Use ibv_fork_init() */
    int                      async_events; /**< Whether async events should be delivered */

    uct_ib_md_ext_config_t   ext;          /**< External configuration */

    UCS_CONFIG_STRING_ARRAY_FIELD(spec) custom_devices; /**< Custom device specifications */

    char                     *subnet_prefix; /**< Filter of subnet_prefix for IB ports */

    UCS_CONFIG_ARRAY_FIELD(ucs_config_bw_spec_t, device) pci_bw; /**< List of PCI BW for devices */

    unsigned                 devx;         /**< DEVX support */
    unsigned                 devx_objs;    /**< Objects to be created by DevX
                                                (see UCT_IB_DEVX_OBJ_xx) */
    ucs_on_off_auto_value_t  mr_relaxed_order; /**< Allow reorder memory accesses */
} uct_ib_md_config_t;
168 
/**
 * Memory domain constructor.
 *
 * Implementations are registered in uct_ib_md_ops_list via UCT_IB_MD_OPS().
 *
 * @param [in]  ibv_device    IB device.
 *
 * @param [in]  md_config     Memory domain configuration parameters.
 *
 * @param [out] md_p          Handle to memory domain.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_open_func_t)(struct ibv_device *ibv_device,
                                              const uct_ib_md_config_t *md_config,
                                              struct uct_ib_md **md_p);
183 
/**
 * Memory domain destructor.
 *
 * @param [in]  md      Memory domain to destroy.
 */
/* Named the parameter 'md' for consistency with the other method typedefs
 * in this header; the typedef's type is unchanged. */
typedef void (*uct_ib_md_cleanup_func_t)(struct uct_ib_md *md);
190 
/**
 * Memory domain method to register memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  address Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @param [in]  access  IB verbs registration access flags
 *
 * @param [in]  memh    Memory region handle.
 *                      Method should initialize lkey & rkey.
 *
 * @param [in]  mr_type Type of the memory region to register
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_key_func_t)(struct uct_ib_md *md,
                                                 void *address, size_t length,
                                                 uint64_t access,
                                                 uct_ib_mem_t *memh,
                                                 uct_ib_mr_type_t mr_type);
212 
/**
 * Memory domain method to deregister memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered with
 *                      uct_ib_md_reg_key_func_t.
 *
 * @param [in]  mr_type Type of the memory region to deregister
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_key_func_t)(struct uct_ib_md *md,
                                                   uct_ib_mem_t *memh,
                                                   uct_ib_mr_type_t mr_type);
226 
/**
 * Memory domain method to register memory area optimized for atomic ops.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered for regular ops.
 *                      Method should initialize atomic_rkey.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_atomic_key_func_t)(struct uct_ib_md *md,
                                                        uct_ib_mem_t *memh);
239 
/**
 * Memory domain method to release resources registered for atomic ops.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle previously passed to
 *                      uct_ib_md_reg_atomic_key_func_t.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_atomic_key_func_t)(struct uct_ib_md *md,
                                                          uct_ib_mem_t *memh);
252 
/**
 * Memory domain method to register memory area using multiple threads.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  address Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @param [in]  access  IB verbs registration access flags
 *
 * @param [in]  memh    Memory region handle.
 *                      Method should initialize lkey & rkey.
 *
 * @param [in]  mr_type Type of the memory region to register
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_reg_multithreaded_func_t)(uct_ib_md_t *md,
                                                           void *address,
                                                           size_t length,
                                                           uint64_t access,
                                                           uct_ib_mem_t *memh,
                                                           uct_ib_mr_type_t mr_type);
275 
/**
 * Memory domain method to deregister memory area that was registered in
 * chunks by multiple threads.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle registered with
 *                      uct_ib_md_reg_multithreaded_func_t.
 *
 * @param [in]  mr_type Type of the memory region to deregister
 *                      (see uct_ib_mr_type_t).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_dereg_multithreaded_func_t)(uct_ib_md_t *md,
                                                             uct_ib_mem_t *memh,
                                                             uct_ib_mr_type_t mr_type);
289 
/**
 * Memory domain method to prefetch physical memory for virtual memory area.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [in]  memh    Memory region handle.
 *
 * @param [in]  addr    Memory area start address.
 *
 * @param [in]  length  Memory area length.
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md,
                                                      uct_ib_mem_t *memh,
                                                      void *addr, size_t length);
306 
/**
 * Memory domain method to get unique atomic mr id.
 *
 * @param [in]  md      Memory domain.
 *
 * @param [out] mr_id   Id used to access the atomic MR
 *                      (see uct_ib_md_atomic_offset()).
 *
 * @return UCS_OK on success or error code in case of failure.
 */
typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md,
                                                          uint8_t *mr_id);
318 
/**
 * Memory domain implementation methods; see the per-method typedefs
 * above for the exact contract of each entry.
 */
typedef struct uct_ib_md_ops {
    uct_ib_md_open_func_t                open;
    uct_ib_md_cleanup_func_t             cleanup;
    uct_ib_md_reg_key_func_t             reg_key;
    uct_ib_md_dereg_key_func_t           dereg_key;
    uct_ib_md_reg_atomic_key_func_t      reg_atomic_key;
    uct_ib_md_dereg_atomic_key_func_t    dereg_atomic_key;
    uct_ib_md_reg_multithreaded_func_t   reg_multithreaded;
    uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded;
    uct_ib_md_mem_prefetch_func_t        mem_prefetch;
    uct_ib_md_get_atomic_mr_id_func_t    get_atomic_mr_id;
} uct_ib_md_ops_t;
331 
332 
/**
 * IB memory region in the registration cache.
 */
typedef struct uct_ib_rcache_region {
    ucs_rcache_region_t  super;     /**< Base registration-cache region */
    uct_ib_mem_t         memh;      /**< MR exposed to the user as the memh */
} uct_ib_rcache_region_t;
340 
341 
/**
 * Entry of the global list of IB memory domain implementations,
 * registered with UCT_IB_MD_OPS(). The "open" method of the referenced
 * ops is an IB memory domain constructor and should have following logic:
 * - probe provided IB device, may return UCS_ERR_UNSUPPORTED
 * - allocate MD and IB context
 * - setup atomic MR ops
 * - determine device attributes and flags
 */
typedef struct uct_ib_md_ops_entry {
    ucs_list_link_t             list;     /**< Link in uct_ib_md_ops_list */
    const char                  *name;    /**< Implementation name */
    uct_ib_md_ops_t             *ops;     /**< Implementation methods */
    int                         priority; /**< Selection priority (higher tried first) */
} uct_ib_md_ops_entry_t;
355 
/**
 * Register an IB memory domain implementation at static-initialization
 * time. The global list is kept sorted by descending priority: the new
 * entry is inserted before the first existing entry whose priority is
 * lower, or appended at the tail otherwise.
 */
#define UCT_IB_MD_OPS(_md_ops, _priority) \
    extern ucs_list_link_t uct_ib_md_ops_list; \
    UCS_STATIC_INIT { \
        /* 'entry' must be static: it stays linked into the global list \
         * after the initializer returns */ \
        static uct_ib_md_ops_entry_t *p, entry = { \
            .name     = UCS_PP_MAKE_STRING(_md_ops), \
            .ops      = &_md_ops, \
            .priority = _priority, \
        }; \
        ucs_list_for_each(p, &uct_ib_md_ops_list, list) { \
            if (p->priority < _priority) { \
                ucs_list_insert_before(&p->list, &entry.list); \
                return; \
            } \
        } \
        ucs_list_add_tail(&uct_ib_md_ops_list, &entry.list); \
    }
372 
373 extern uct_component_t uct_ib_component;
374 
/**
 * Extract the direct rkey (low 32 bits) from a packed uct rkey.
 */
static inline uint32_t uct_ib_md_direct_rkey(uct_rkey_t uct_rkey)
{
    return (uint32_t)(uct_rkey & 0xffffffffUL);
}
379 
380 
/**
 * Extract the indirect (atomic) rkey from the high 32 bits of a packed
 * uct rkey.
 *
 * Declared 'static inline' (the original was plain 'static'), matching
 * the sibling accessors and avoiding unused-function warnings in every
 * translation unit that includes this header without calling it.
 */
static inline uint32_t uct_ib_md_indirect_rkey(uct_rkey_t uct_rkey)
{
    return uct_rkey >> 32;
}
385 
386 
387 static UCS_F_ALWAYS_INLINE void
uct_ib_md_pack_rkey(uint32_t rkey,uint32_t atomic_rkey,void * rkey_buffer)388 uct_ib_md_pack_rkey(uint32_t rkey, uint32_t atomic_rkey, void *rkey_buffer)
389 {
390     uint64_t *rkey_p = (uint64_t*)rkey_buffer;
391     *rkey_p = (((uint64_t)atomic_rkey) << 32) | rkey;
392      ucs_trace("packed rkey: direct 0x%x indirect 0x%x", rkey, atomic_rkey);
393 }
394 
395 
396 /**
397  * rkey is packed/unpacked is such a way that:
398  * low  32 bits contain a direct key
399  * high 32 bits contain either UCT_IB_INVALID_RKEY or a valid indirect key.
400  */
uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey,uint16_t atomic_mr_offset,uint64_t * remote_addr_p)401 static inline uint32_t uct_ib_resolve_atomic_rkey(uct_rkey_t uct_rkey,
402                                                   uint16_t atomic_mr_offset,
403                                                   uint64_t *remote_addr_p)
404 {
405     uint32_t atomic_rkey = uct_ib_md_indirect_rkey(uct_rkey);
406     if (atomic_rkey == UCT_IB_INVALID_RKEY) {
407         return uct_ib_md_direct_rkey(uct_rkey);
408     } else {
409         *remote_addr_p += atomic_mr_offset;
410         return atomic_rkey;
411     }
412 }
413 
414 
/**
 * Offset of the atomic MR slot that belongs to the given atomic mr id;
 * each id owns an 8-byte slot.
 */
static inline uint16_t uct_ib_md_atomic_offset(uint8_t atomic_mr_id)
{
    return (uint16_t)(atomic_mr_id << 3);
}
419 
420 static inline void
uct_ib_memh_init_keys(uct_ib_mem_t * memh,uint32_t lkey,uint32_t rkey)421 uct_ib_memh_init_keys(uct_ib_mem_t *memh, uint32_t lkey, uint32_t rkey)
422 {
423     memh->lkey = lkey;
424     memh->rkey = rkey;
425 }
426 
427 static inline uct_ib_mr_type_t
uct_ib_memh_get_atomic_base_mr_type(uct_ib_mem_t * memh)428 uct_ib_memh_get_atomic_base_mr_type(uct_ib_mem_t *memh)
429 {
430     if (memh->flags & UCT_IB_MEM_FLAG_RELAXED_ORDERING) {
431         return UCT_IB_MR_STRICT_ORDER;
432     } else {
433         return UCT_IB_MR_DEFAULT;
434     }
435 }
436 
uct_ib_memh_get_lkey(uct_mem_h memh)437 static UCS_F_ALWAYS_INLINE uint32_t uct_ib_memh_get_lkey(uct_mem_h memh)
438 {
439     ucs_assert(memh != UCT_MEM_HANDLE_NULL);
440     return ((uct_ib_mem_t*)memh)->lkey;
441 }
442 
443 
/* Component entry point: open an IB memory domain by name. */
ucs_status_t uct_ib_md_open(uct_component_t *component, const char *md_name,
                            const uct_md_config_t *uct_md_config, uct_md_h *md_p);

/* Initialization shared by all IB MD implementations, called on an
 * already-allocated MD. NOTE(review): exact responsibilities are defined
 * in the corresponding .c file. */
ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
                                   struct ibv_device *ib_device,
                                   const uct_ib_md_config_t *md_config);

/* Destroy a memory domain opened with uct_ib_md_open(). */
void uct_ib_md_close(uct_md_h uct_md);

/* Register/deregister plain verbs memory regions. */
ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
                           uint64_t access, struct ibv_mr **mr_p);
ucs_status_t uct_ib_dereg_mr(struct ibv_mr *mr);
ucs_status_t uct_ib_dereg_mrs(struct ibv_mr **mrs, size_t mr_num);

/* Process [address, address+length) in chunks of 'chunk' bytes using
 * multiple threads; 'mrs' holds one MR per chunk.
 * NOTE(review): presumably access == UCT_IB_MEM_DEREG requests
 * deregistration — confirm against the implementation. */
ucs_status_t
uct_ib_md_handle_mr_list_multithreaded(uct_ib_md_t *md, void *address,
                                       size_t length, uint64_t access,
                                       size_t chunk, struct ibv_mr **mrs);

/* Derive md->relaxed_order from md_config->mr_relaxed_order. */
void uct_ib_md_parse_relaxed_order(uct_ib_md_t *md,
                                   const uct_ib_md_config_t *md_config);

/* Register the given area into the MR of the requested type and
 * initialize the memh keys (default reg_key implementation). */
ucs_status_t uct_ib_reg_key_impl(uct_ib_md_t *md, void *address,
                                 size_t length, uint64_t access_flags,
                                 uct_ib_mem_t *memh, uct_ib_mr_t *mrs,
                                 uct_ib_mr_type_t mr_type);
470 #endif
471