1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2009 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2011 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
15  * Copyright (c) 2006-2018 Los Alamos National Security, LLC.  All rights
16  *                         reserved.
17  * Copyright (c) 2006-2007 Voltaire All rights reserved.
18  * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
19  * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
20  * Copyright (c) 2014      Bull SAS.  All rights reserved.
21  * Copyright (c) 2015-2018 Research Organization for Information Science
22  *                         and Technology (RIST).  All rights reserved.
 * Copyright (c) 2019      Triad National Security, LLC. All rights reserved.
24  *
25  * $COPYRIGHT$
26  *
27  * Additional copyrights may follow
28  *
29  * $HEADER$
30  *
31  * @file
32  */
33 
34 #ifndef MCA_BTL_IB_H
35 #define MCA_BTL_IB_H
36 
37 #include "opal_config.h"
38 #include <sys/types.h>
39 #include <string.h>
40 #include <infiniband/verbs.h>
41 
42 /* Open MPI includes */
43 #include "opal/class/opal_pointer_array.h"
44 #include "opal/class/opal_hash_table.h"
45 #include "opal/util/arch.h"
46 #include "opal/util/output.h"
47 #include "opal/mca/event/event.h"
48 #include "opal/threads/threads.h"
49 #include "opal/mca/btl/btl.h"
50 #include "opal/mca/rcache/rcache.h"
51 #include "opal/mca/mpool/mpool.h"
52 #include "opal/mca/btl/base/btl_base_error.h"
53 #include "opal/mca/btl/base/base.h"
54 #include "opal/runtime/opal_progress_threads.h"
55 
56 #include "connect/connect.h"
57 
58 BEGIN_C_DECLS
59 
/* True when either OPAL_HAVE_CONNECTX_XRC or OPAL_HAVE_CONNECTX_XRC_DOMAINS
 * is set. */
#define HAVE_XRC (OPAL_HAVE_CONNECTX_XRC || OPAL_HAVE_CONNECTX_XRC_DOMAINS)
/* Local alias for the OPAL_ENABLE_DYNAMIC_SL build switch. */
#define ENABLE_DYNAMIC_SL OPAL_ENABLE_DYNAMIC_SL

#define MCA_BTL_IB_LEAVE_PINNED 1
/* Default GID prefix; the value does not fit in a signed 64-bit integer, so
 * the constant is spelled as (and always had the type of) an unsigned one. */
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ull
/* Mask keeping the low 15 bits of a pkey value. */
#define MCA_BTL_IB_PKEY_MASK 0x7fff
/* Default value for the CQ poll batch setting. */
#define MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT (256)
67 
68 
69 /*--------------------------------------------------------------------*/
70 
/*
 * ATTACH(): debugging aid.
 *
 * In debug builds it prints "WAITING TO DEBUG ATTACH" and then sleeps in a
 * loop until the local variable `i` becomes non-zero (i.e. until somebody
 * attaches a debugger and changes it).  In non-debug builds it expands to
 * nothing.
 *
 * Always invoke it as a statement: `ATTACH();`.
 *
 *  - The debug expansion has no trailing semicolon, so `ATTACH();` is exactly
 *    one statement and can be used as the body of an unbraced if/else.
 *  - `i` is volatile so the loop really re-reads it on every iteration; a
 *    plain int could legally be assumed to stay 0 forever by the compiler.
 */
#if OPAL_ENABLE_DEBUG
#define ATTACH() do { \
    volatile int i = 0; \
    opal_output(0, "WAITING TO DEBUG ATTACH"); \
    while (i == 0) sleep(5); \
  } while (0)
#else
#define ATTACH()
#endif
80 
81 /*--------------------------------------------------------------------*/
82 
/**
 * Infiniband (IB) BTL component.
 *
 * Completion queue (CQ) indices.  BTL_OPENIB_MAX_CQ is not a queue of its
 * own: it equals the number of CQ indices and is used to size per-CQ arrays.
 * HP/LP presumably stand for high/low priority -- confirm with the users of
 * these indices.
 */
enum {
    BTL_OPENIB_HP_CQ  = 0,
    BTL_OPENIB_LP_CQ  = 1,
    BTL_OPENIB_MAX_CQ = 2
};
92 
/**
 * Transport kinds.  MCA_BTL_OPENIB_TRANSPORT_SIZE is not a transport; it
 * equals the number of enumerators that precede it.
 */
typedef enum {
    MCA_BTL_OPENIB_TRANSPORT_IB      = 0,
    MCA_BTL_OPENIB_TRANSPORT_IWARP   = 1,
    MCA_BTL_OPENIB_TRANSPORT_RDMAOE  = 2,
    MCA_BTL_OPENIB_TRANSPORT_UNKNOWN = 3,
    MCA_BTL_OPENIB_TRANSPORT_SIZE    = 4
} mca_btl_openib_transport_type_t;
100 
/**
 * Queue pair kinds: per-peer (PP), shared receive queue (SRQ) and XRC.
 */
typedef enum {
    MCA_BTL_OPENIB_PP_QP  = 0,
    MCA_BTL_OPENIB_SRQ_QP = 1,
    MCA_BTL_OPENIB_XRC_QP = 2
} mca_btl_openib_qp_type_t;
106 
/**
 * Settings that exist only for MCA_BTL_OPENIB_PP_QP queue pairs.
 * NOTE(review): rd_win / rd_rsv look like a receive credit window and a
 * reserved-descriptor count -- confirm against the code that fills them in.
 */
typedef struct mca_btl_openib_pp_qp_info_t {
    int32_t rd_win;
    int32_t rd_rsv;
} mca_btl_openib_pp_qp_info_t;
111 
/**
 * Settings that exist only for shared-receive-queue (SRQ) queue pairs.
 */
typedef struct mca_btl_openib_srq_qp_info_t {
    int32_t sd_max;

    /** Initial value for the rd_curr_num variable of every SRQ. */
    int32_t rd_init;

    /** Watermark: when the number of WQEs in an SRQ drops below this value,
        the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) is generated on that
        SRQ.  As a result the maximal number of pre-posted WQEs on the SRQ
        will be increased. */
    int32_t srq_limit;
} mca_btl_openib_srq_qp_info_t;
121 
122 struct mca_btl_openib_qp_info_t {
123     mca_btl_openib_qp_type_t type;
124     size_t size;
125     int32_t rd_num;
126     int32_t rd_low;
127     union {
128         mca_btl_openib_pp_qp_info_t pp_qp;
129         mca_btl_openib_srq_qp_info_t srq_qp;
130     } u;
131 }; typedef struct mca_btl_openib_qp_info_t mca_btl_openib_qp_info_t;
132 
/* Type of the queue pair with index qp_idx, read from the component-wide
 * qp_infos table. */
#define BTL_OPENIB_QP_TYPE(qp_idx) \
    (mca_btl_openib_component.qp_infos[(qp_idx)].type)

/* Non-zero when queue pair qp_idx has type qp_type. */
#define BTL_OPENIB_QP_TYPE_IS(qp_idx, qp_type) \
    (BTL_OPENIB_QP_TYPE(qp_idx) == (qp_type))

/* Per-type predicates built on the comparison above. */
#define BTL_OPENIB_QP_TYPE_PP(qp_idx) \
    BTL_OPENIB_QP_TYPE_IS(qp_idx, MCA_BTL_OPENIB_PP_QP)
#define BTL_OPENIB_QP_TYPE_SRQ(qp_idx) \
    BTL_OPENIB_QP_TYPE_IS(qp_idx, MCA_BTL_OPENIB_SRQ_QP)
#define BTL_OPENIB_QP_TYPE_XRC(qp_idx) \
    BTL_OPENIB_QP_TYPE_IS(qp_idx, MCA_BTL_OPENIB_XRC_QP)
140 
141 typedef enum {
142     BTL_OPENIB_RQ_SOURCE_DEVICE_INI = MCA_BASE_VAR_SOURCE_MAX,
143 } btl_openib_receive_queues_source_t;
144 
/**
 * Device type selector: InfiniBand, iWARP, or all.
 */
typedef enum {
    BTL_OPENIB_DT_IB    = 0,
    BTL_OPENIB_DT_IWARP = 1,
    BTL_OPENIB_DT_ALL   = 2
} btl_openib_device_type_t;
150 
151 /* The structer for manage all BTL SRQs */
152 typedef struct mca_btl_openib_srq_manager_t {
153     opal_mutex_t lock;
154     /* The keys of this hash table are addresses of
155        SRQs structures, and the elements are BTL modules
156        pointers that associated with these SRQs */
157     opal_hash_table_t srq_addr_table;
158 } mca_btl_openib_srq_manager_t;
159 
160 struct mca_btl_openib_component_t {
161     mca_btl_base_component_3_0_0_t          super;  /**< base BTL component */
162 
163     int                                ib_max_btls;
164     /**< maximum number of devices available to openib component */
165 
166     int                                ib_num_btls;
167     /**< number of devices available to the openib component */
168 
169     int                                ib_allowed_btls;
170     /**< number of devices allowed to the openib component */
171 
172     struct mca_btl_openib_module_t             **openib_btls;
173     /**< array of available BTLs */
174 
175     opal_pointer_array_t devices; /**< array of available devices */
176     int devices_count;
177 
178     int ib_free_list_num;
179     /**< initial size of free lists */
180 
181     int ib_free_list_max;
182     /**< maximum size of free lists */
183 
184     int ib_free_list_inc;
185     /**< number of elements to alloc when growing free lists */
186 
187     opal_list_t                             ib_procs;
188     /**< list of ib proc structures */
189 
190     opal_event_t                            ib_send_event;
191     /**< event structure for sends */
192 
193     opal_event_t                            ib_recv_event;
194     /**< event structure for recvs */
195 
196     opal_mutex_t                            ib_lock;
197     /**< lock for accessing module state */
198 
199     char* ib_mpool_hints;
200     /**< hints for selecting an mpool component */
201 
202     char *ib_rcache_name;
203     /**< name of ib registration cache */
204 
205     uint8_t num_pp_qps;          /**< number of pp qp's */
206     uint8_t num_srq_qps;         /**< number of srq qp's */
207     uint8_t num_xrc_qps;         /**< number of xrc qp's */
208     uint8_t num_qps;             /**< total number of qp's */
209 
210     opal_hash_table_t ib_addr_table; /**< used only for xrc.hash-table that
211                                        keeps table of all lids/subnets */
212     mca_btl_openib_qp_info_t* qp_infos;
213 
214     size_t eager_limit;      /**< Eager send limit of first fragment, in Bytes */
215     size_t max_send_size;    /**< Maximum send size, in Bytes */
216     uint32_t max_hw_msg_size;/**< Maximum message size for RDMA protocols in Bytes */
217     uint32_t reg_mru_len;    /**< Length of the registration cache most recently used list */
218     uint32_t use_srq;        /**< Use the Shared Receive Queue (SRQ mode) */
219 
220     uint32_t ib_cq_size[BTL_OPENIB_MAX_CQ];  /**< Max outstanding CQE on the CQ */
221 
222     int      ib_max_inline_data; /**< Max size of inline data */
223     unsigned int ib_pkey_val;
224     unsigned int ib_psn;
225     unsigned int ib_qp_ous_rd_atom;
226     uint32_t ib_mtu;
227     unsigned int ib_min_rnr_timer;
228     unsigned int ib_timeout;
229     unsigned int ib_retry_count;
230     unsigned int ib_rnr_retry;
231     unsigned int ib_max_rdma_dst_ops;
232     unsigned int ib_service_level;
233 #if (ENABLE_DYNAMIC_SL)
234     unsigned int ib_path_record_service_level;
235 #endif
236     int     use_eager_rdma;
237     int     eager_rdma_threshold; /**< After this number of msg, use RDMA for short messages, always */
238     int     eager_rdma_num;
239     int32_t max_eager_rdma;
240     unsigned int btls_per_lid;
241     unsigned int max_lmc;
242     int     apm_lmc;
243     int     apm_ports;
244     unsigned int buffer_alignment;    /**< Preferred communication buffer alignment in Bytes (must be power of two) */
245     int32_t error_counter;           /**< Counts number on error events that we got on all devices */
246     opal_event_base_t *async_evbase; /**< Async event base */
247     bool use_async_event_thread;     /**< Use the async event handler */
248     mca_btl_openib_srq_manager_t srq_manager;     /**< Hash table for all BTL SRQs */
249     /* declare as an int instead of btl_openib_device_type_t since there is no
250        guarantee about the size of an enum. this value will be registered as an
251        integer with the MCA variable system */
252     int device_type;
253     bool allow_ib;
254     char *if_include;
255     char **if_include_list;
256     char *if_exclude;
257     char **if_exclude_list;
258     char *ipaddr_include;
259     char *ipaddr_exclude;
260 
261     /* MCA param btl_openib_receive_queues */
262     char *receive_queues;
263     /* Whether we got a non-default value of btl_openib_receive_queues */
264     mca_base_var_source_t receive_queues_source;
265 
266     /** Colon-delimited list of filenames for device parameters */
267     char *device_params_file_names;
268 
269     /** Whether we're in verbose mode or not */
270     bool verbose;
271 
272     /** Whether we want a warning if no device-specific parameters are
273         found in INI files */
274     bool warn_no_device_params_found;
275     /** Whether we want a warning if non default GID prefix is not configured
276         on multiport setup */
277     bool warn_default_gid_prefix;
278     /** Whether we want a warning if the user specifies a non-existent
279         device and/or port via btl_openib_if_[in|ex]clude MCA params */
280     bool warn_nonexistent_if;
281     /** Whether we want to abort if there's not enough registered
282         memory available */
283     bool abort_not_enough_reg_mem;
284 
285     /** Dummy argv-style list; a copy of names from the
286         if_[in|ex]clude list that we use for error checking (to ensure
287         that they all exist) */
288     char **if_list;
289     bool use_message_coalescing;
290     unsigned int cq_poll_ratio;
291     unsigned int cq_poll_progress;
292     unsigned int cq_poll_batch;
293     unsigned int eager_rdma_poll_ratio;
294     int rdma_qp;
295     int credits_qp; /* qp used for software flow control */
296     bool cpc_explicitly_defined;
297     /**< free list of frags only; used for pining user memory */
298     opal_free_list_t send_user_free;
299     /**< free list of frags only; used for pining user memory */
300     opal_free_list_t recv_user_free;
301     /**< frags for coalesced massages */
302     opal_free_list_t send_free_coalesced;
303     /** Default receive queues */
304     char* default_recv_qps;
305     /** GID index to use */
306     int gid_index;
307     /*  Whether we want to allow connecting processes from different subnets.
308      *  set to 'no' by default */
309     bool allow_different_subnets;
310     /** Whether we want a dynamically resizing srq, enabled by default */
311     bool enable_srq_resize;
312     bool allow_max_memory_registration;
313     int memory_registration_verbose_level;
314     int memory_registration_verbose;
315     int ignore_locality;
316 #if OPAL_CUDA_SUPPORT
317     bool cuda_async_send;
318     bool cuda_async_recv;
319     bool cuda_have_gdr;
320     bool driver_have_gdr;
321     bool cuda_want_gdr;
322 #endif /* OPAL_CUDA_SUPPORT */
323 #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
324     bool rroce_enable;
325 #endif
326     unsigned int num_default_gid_btls; /* numbers of btl in the default subnet */
327 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
328 
329 OPAL_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
330 
331 typedef mca_btl_base_recv_reg_t mca_btl_openib_recv_reg_t;
332 
/**
 * Per-port information that is sent in the modex message; the layout is the
 * same for every port.
 */
typedef struct mca_btl_openib_modex_message_t {
    uint64_t subnet_id;      /**< The subnet ID of this port */
    uint16_t lid;            /**< LID of this port */
    uint16_t apm_lid;        /**< APM LID for this port */
    uint8_t  mtu;            /**< The MTU used by this port */
    uint32_t vendor_id;      /**< Vendor id: defines device type and tuning */
    uint32_t vendor_part_id; /**< Vendor part id: defines device type and tuning */
    uint8_t  transport_type; /**< Transport type of remote port */
    uint8_t  end;            /**< Dummy field used to calculate the real length */
} mca_btl_openib_modex_message_t;
354 
/*
 * Byte-order conversion of a modex message, in place.  The argument is a
 * structure lvalue (not a pointer) and is evaluated several times.
 *
 * Only subnet_id (64 bit) and lid (16 bit) are converted.
 * NOTE(review): apm_lid, vendor_id and vendor_part_id are multi-byte fields
 * too but are left untouched -- confirm whether that is intentional.
 */
#define MCA_BTL_OPENIB_MODEX_MSG_NTOH(msg)                  \
    do {                                                    \
        (msg).subnet_id = ntoh64((msg).subnet_id);          \
        (msg).lid       = ntohs((msg).lid);                 \
    } while (0)

#define MCA_BTL_OPENIB_MODEX_MSG_HTON(msg)                  \
    do {                                                    \
        (msg).subnet_id = hton64((msg).subnet_id);          \
        (msg).lid       = htons((msg).lid);                 \
    } while (0)
365 
366 typedef struct mca_btl_openib_device_qp_t {
367     opal_free_list_t send_free;     /**< free lists of send buffer descriptors */
368     opal_free_list_t recv_free;     /**< free lists of receive buffer descriptors */
369 } mca_btl_openib_device_qp_t;
370 
371 struct mca_btl_base_endpoint_t;
372 
373 typedef struct mca_btl_openib_device_t {
374     opal_object_t super;
375     struct ibv_device *ib_dev;  /* the ib device */
376 #if OPAL_ENABLE_PROGRESS_THREADS == 1
377     struct ibv_comp_channel *ib_channel; /* Channel event for the device */
378     opal_thread_t thread;                /* Progress thread */
379     volatile bool progress;              /* Progress status */
380 #endif
381     opal_mutex_t device_lock;          /* device level lock */
382     struct ibv_context *ib_dev_context;
383 #if HAVE_DECL_IBV_EXP_QUERY_DEVICE
384     struct ibv_exp_device_attr ib_exp_dev_attr;
385 #endif
386     struct ibv_device_attr ib_dev_attr;
387     struct ibv_pd *ib_pd;
388     struct ibv_cq *ib_cq[BTL_OPENIB_MAX_CQ];
389     uint32_t cq_size[BTL_OPENIB_MAX_CQ];
390     mca_mpool_base_module_t *mpool;
391     mca_rcache_base_module_t *rcache;
392     /* MTU for this device */
393     uint32_t mtu;
394     /* Whether this device supports eager RDMA */
395     uint8_t use_eager_rdma;
396     uint8_t btls;              /** < number of btls using this device */
397     uint8_t allowed_btls;      /** < number of allowed btls using this device */
398     opal_pointer_array_t *endpoints;
399     opal_pointer_array_t *device_btls;
400     uint16_t hp_cq_polls;
401     uint16_t eager_rdma_polls;
402     bool pollme;
403     volatile bool got_fatal_event;
404     volatile bool got_port_event;
405 #if HAVE_XRC
406 #if OPAL_HAVE_CONNECTX_XRC_DOMAINS
407     struct ibv_xrcd *xrcd;
408 #else
409     struct ibv_xrc_domain *xrc_domain;
410 #endif
411     int xrc_fd;
412 #endif
413     int32_t non_eager_rdma_endpoints;
414     int32_t eager_rdma_buffers_count;
415     struct mca_btl_base_endpoint_t **eager_rdma_buffers;
416     /**< frags for control massages */
417     opal_free_list_t send_free_control;
418     /* QP types and attributes that will be used on this device */
419     mca_btl_openib_device_qp_t *qps;
420     /* Maximum value supported by this device for max_inline_data */
421     uint32_t max_inline_data;
422     /* Registration limit and current count */
423     uint64_t mem_reg_max, mem_reg_max_total, mem_reg_active;
424     /* Device is ready for use */
425     bool ready_for_use;
426     /* Async event */
427     opal_event_t async_event;
428 } mca_btl_openib_device_t;
429 OBJ_CLASS_DECLARATION(mca_btl_openib_device_t);
430 
/**
 * Per-module state for a per-peer QP.  It only carries a placeholder member.
 */
typedef struct mca_btl_openib_module_pp_qp_t {
    int32_t dummy;
} mca_btl_openib_module_pp_qp_t;
434 
435 struct mca_btl_openib_module_srq_qp_t {
436     struct ibv_srq *srq;
437     int32_t rd_posted;
438     int32_t sd_credits;  /* the max number of outstanding sends on a QP when using SRQ */
439                          /*  i.e. the number of frags that  can be outstanding (down counter) */
440     opal_list_t pending_frags[2];    /**< list of high/low prio frags */
441     /** The number of receive buffers that can be post in the current time.
442         The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
443         event handler. The value starts from (rd_num / 4) and increased up to rd_num */
444     int32_t rd_curr_num;
445     /** We post additional WQEs only if a number of WQEs (in specific SRQ) is less of this value.
446          The value increased together with rd_curr_num. The value is unique for every SRQ. */
447     int32_t rd_low_local;
448     /** The flag points if we want to get the
449          IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */
450     bool srq_limit_event_flag;
451     /**< In difference of the "--mca enable_srq_resize" parameter that says, if we want(or no)
452          to start with small num of pre-posted receive buffers (rd_curr_num) and to increase this number by needs
453          (the max of this value is rd_num * the whole size of SRQ), the "srq_limit_event_flag" says if we want to get limit event
454          from device if the defined srq limit was reached (signal to the main thread) and we put off this flag if the rd_curr_num
455          was increased up to rd_num.
456          In order to prevent lock/unlock operation in the critical path we prefer only put-on
457          the srq_limit_event_flag in asynchronous thread, because in this way we post receive buffers
458          in the main thread only and only after posting we set (if srq_limit_event_flag is true)
459          the limit for IBV_EVENT_SRQ_LIMIT_REACHED event. */
460 }; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
461 
462 struct mca_btl_openib_module_qp_t {
463     union {
464         mca_btl_openib_module_pp_qp_t pp_qp;
465         mca_btl_openib_module_srq_qp_t srq_qp;
466     } u;
467 }; typedef struct mca_btl_openib_module_qp_t mca_btl_openib_module_qp_t;
468 
469 /**
470  * IB BTL Interface
471  */
472 struct mca_btl_openib_module_t {
473     /* Base BTL module */
474     mca_btl_base_module_t  super;
475 
476     bool btl_inited;
477     bool srqs_created;
478 
479     /** Common information about all ports */
480     mca_btl_openib_modex_message_t port_info;
481 
482     /** Array of CPCs on this port */
483     opal_btl_openib_connect_base_module_t **cpcs;
484 
485     /** Number of elements in the cpcs array */
486     uint8_t num_cpcs;
487 
488     mca_btl_openib_device_t *device;
489     char * device_name;
490     uint8_t port_num;                  /**< ID of the PORT */
491     uint16_t pkey_index;
492     struct ibv_port_attr ib_port_attr;
493     uint16_t lid;                      /**< lid that is actually used (for LMC) */
494     int apm_port;                      /**< Alternative port that may be used for APM */
495     uint8_t src_path_bits;             /**< offset from base lid (for LMC) */
496 
497     int32_t num_peers;
498 
499     opal_mutex_t ib_lock;              /**< module level lock */
500 
501     size_t eager_rdma_frag_size;                /**< length of eager frag */
502     volatile int32_t eager_rdma_channels;  /**< number of open RDMA channels */
503 
504     mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
505 
506     mca_btl_openib_module_qp_t * qps;
507 
508     int local_procs;                   /** number of local procs */
509 
510     bool atomic_ops_be;                /** atomic result is big endian */
511 };
512 typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
513 
514 extern mca_btl_openib_module_t mca_btl_openib_module;
515 
/**
 * openib definition of the BTL registration handle: a pair of 32-bit keys
 * (presumably the remote and local keys of a memory registration).
 */
struct mca_btl_base_registration_handle_t {
    uint32_t rkey;
    uint32_t lkey;
};
520 
521 struct mca_btl_openib_reg_t {
522     mca_rcache_base_registration_t base;
523     struct ibv_mr *mr;
524     mca_btl_base_registration_handle_t btl_handle;
525 };
526 typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t;
527 
#if OPAL_ENABLE_PROGRESS_THREADS == 1
/** Progress thread entry point; only declared when progress threads are
    enabled. */
void *mca_btl_openib_progress_thread(opal_object_t *);
#endif
531 
532 
533 /**
534  * Register a callback function that is called on error..
535  *
536  * @param btl (IN)     BTL module
537  * @return             Status indicating if cleanup was successful
538  */
539 
540 int mca_btl_openib_register_error_cb(
541     struct mca_btl_base_module_t* btl,
542     mca_btl_base_module_error_cb_fn_t cbfunc
543 );
544 
545 
/**
 * Cleanup any resources held by the BTL.
 *
 * @param btl (IN)  BTL instance
 * @return          OPAL_SUCCESS or error status on failure
 */
int mca_btl_openib_finalize(struct mca_btl_base_module_t *btl);
556 
557 
558 /**
559  * PML->BTL notification of change in the process list.
560  *
561  * @param btl (IN)            BTL module
562  * @param nprocs (IN)         Number of processes
563  * @param procs (IN)          Set of processes
564  * @param peers (OUT)         Set of (optional) peer addressing info.
565  * @param reachable (IN/OUT)  Set of processes that are reachable via this BTL.
566  * @return     OPAL_SUCCESS or error status on failure.
567  *
568  */
569 
570 extern int mca_btl_openib_add_procs(
571     struct mca_btl_base_module_t* btl,
572     size_t nprocs,
573     struct opal_proc_t **procs,
574     struct mca_btl_base_endpoint_t** peers,
575     opal_bitmap_t* reachable
576 );
577 
/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)     BTL instance
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 * @param peers (IN)   Set of peer data structures
 * @return             Status indicating if cleanup was successful
 */
int mca_btl_openib_del_procs(struct mca_btl_base_module_t *btl,
                             size_t nprocs,
                             struct opal_proc_t **procs,
                             struct mca_btl_base_endpoint_t **peers);
594 
595 
596 /**
597  * PML->BTL Initiate a send of the specified size.
598  *
599  * @param btl (IN)               BTL instance
600  * @param btl_peer (IN)          BTL peer addressing
601  * @param descriptor (IN)        Descriptor of data to be transmitted.
602  * @param tag (IN)               Tag.
603  */
604 extern int mca_btl_openib_send(
605     struct mca_btl_base_module_t* btl,
606     struct mca_btl_base_endpoint_t* btl_peer,
607     struct mca_btl_base_descriptor_t* descriptor,
608     mca_btl_base_tag_t tag
609 );
610 
611 /**
612  * PML->BTL Initiate a immediate send of the specified size.
613  *
614  * @param btl (IN)               BTL instance
615  * @param ep (IN)                Endpoint
616  * @param convertor (IN)         Datatypes converter
617  * @param header (IN)            PML header
618  * @param header_size (IN)       PML header size
619  * @param payload_size (IN)      Payload size
620  * @param order (IN)             Order
621  * @param flags (IN)             Flags
622  * @param tag (IN)               Tag
623  * @param descriptor (OUT)       Messages descriptor
624  */
625 extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
626     struct mca_btl_base_endpoint_t* ep,
627     struct opal_convertor_t* convertor,
628     void* header,
629     size_t header_size,
630     size_t payload_size,
631     uint8_t order,
632     uint32_t flags,
633     mca_btl_base_tag_t tag,
634     mca_btl_base_descriptor_t** descriptor
635 );
636 
/* Forward declarations of the fragment types used by the internal put/get
 * prototypes; only pointers to them are needed in this header. */
struct mca_btl_openib_put_frag_t;
struct mca_btl_openib_get_frag_t;
640 
641 /**
642  * @brief Schedule a put fragment with the HCA (internal)
643  *
644  * @param btl (IN)               BTL instance
645  * @param ep (IN)                BTL endpoint
646  * @param frag (IN)              Fragment prepared by mca_btl_openib_put
647  *
648  * If the fragment can not be scheduled due to resource limitations then
649  * the fragment will be put on the pending put fragment list and retried
650  * when another get/put fragment has completed.
651  */
652 int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
653                                  struct mca_btl_openib_put_frag_t *frag);
654 
655 /**
656  * @brief Schedule an RDMA write with the HCA
657  *
658  * @param btl (IN)               BTL instance
659  * @param ep (IN)                BTL endpoint
660  * @param local_address (IN)     Source address
661  * @param remote_address (IN)    Destination address
662  * @param local_handle (IN)      Registration handle for region containing the region {local_address, size}
663  * @param remote_handle (IN)     Registration handle for region containing the region {remote_address, size}
664  * @param size (IN)              Number of bytes to write
665  * @param flags (IN)             Transfer flags
666  * @param order (IN)             Ordering
667  * @param cbfunc (IN)            Function to call on completion
668  * @param cbcontext (IN)         Context for completion callback
669  * @param cbdata (IN)            Data for completion callback
670  *
671  * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
672  * @return OPAL_SUCCCESS if the operation was successfully scheduled
673  *
674  * This function will attempt to schedule a put operation with the HCA.
675  */
676 int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
677                         uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
678                         mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
679                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
680 
681 /**
682  * @brief Schedule a get fragment with the HCA (internal)
683  *
684  * @param btl (IN)               BTL instance
685  * @param ep (IN)                BTL endpoint
686  * @param qp (IN)                ID of queue pair to schedule the get on
687  * @param frag (IN)              Fragment prepared by mca_btl_openib_get
688  *
689  * If the fragment can not be scheduled due to resource limitations then
690  * the fragment will be put on the pending get fragment list and retried
691  * when another get/put fragment has completed.
692  */
693 int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
694                                  struct mca_btl_openib_get_frag_t *frag);
695 
696 /**
697  * @brief Schedule an RDMA read with the HCA
698  *
699  * @param btl (IN)               BTL instance
700  * @param ep (IN)                BTL endpoint
701  * @param local_address (IN)     Destination address
702  * @param remote_address (IN)    Source address
703  * @param local_handle (IN)      Registration handle for region containing the region {local_address, size}
704  * @param remote_handle (IN)     Registration handle for region containing the region {remote_address, size}
705  * @param size (IN)              Number of bytes to read
706  * @param flags (IN)             Transfer flags
707  * @param order (IN)             Ordering
708  * @param cbfunc (IN)            Function to call on completion
709  * @param cbcontext (IN)         Context for completion callback
710  * @param cbdata (IN)            Data for completion callback
711  *
712  * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
713  * @return OPAL_SUCCCESS if the operation was successfully scheduled
714  *
715  * This function will attempt to schedule a get operation with the HCA.
716  */
717 int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
718                         uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
719                         mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
720                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
721 
722 /**
723  * Initiate an asynchronous fetching atomic operation.
724  * Completion Semantics: if this function returns a 1 then the operation
725  *                       is complete. a return of OPAL_SUCCESS indicates
726  *                       the atomic operation has been queued with the
727  *                       network.
728  *
729  * @param btl (IN)            BTL module
730  * @param endpoint (IN)       BTL addressing information
731  * @param local_address (OUT) Local address to store the result in
732  * @param remote_address (IN) Remote address perfom operation on to (registered remotely)
733  * @param local_handle (IN)   Local registration handle for region containing
734  *                            (local_address, local_address + 8)
735  * @param remote_handle (IN)  Remote registration handle for region containing
736  *                            (remote_address, remote_address + 8)
737  * @param op (IN)             Operation to perform
738  * @param operand (IN)        Operand for the operation
739  * @param flags (IN)          Flags for this put operation
740  * @param order (IN)          Ordering
741  * @param cbfunc (IN)         Function to call on completion (if queued)
742  * @param cbcontext (IN)      Context for the callback
743  * @param cbdata (IN)         Data for callback
744  *
745  * @retval OPAL_SUCCESS    The operation was successfully queued
746  * @retval 1               The operation is complete
747  * @retval OPAL_ERROR      The operation was NOT successfully queued
748  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
749  *                         operation. Try again later
750  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation can not be performed due to
751  *                         alignment restrictions or the operation {op} is not supported
752  *                         by the hardware.
753  *
754  * After the operation is complete the remote address specified by {remote_address} and
755  * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
756  * {local_address} will be updated with the previous value stored in {remote_address}.
757  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
758  * however, that not all btls will provide consistency between btl atomic operations and
759  * cpu atomics.
760  */
761 int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
762                                void *local_address, uint64_t remote_address,
763                                struct mca_btl_base_registration_handle_t *local_handle,
764                                struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
765                                uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
766                                void *cbcontext, void *cbdata);
767 
768 /**
769  * Initiate an asynchronous compare and swap operation.
770  * Completion Semantics: if this function returns a 1 then the operation
 *                       is complete. A return of OPAL_SUCCESS indicates
772  *                       the atomic operation has been queued with the
773  *                       network.
774  *
775  * @param btl (IN)            BTL module
776  * @param endpoint (IN)       BTL addressing information
777  * @param local_address (OUT) Local address to store the result in
 * @param remote_address (IN) Remote address to perform the operation on (registered remotely)
779  * @param local_handle (IN)   Local registration handle for region containing
780  *                            (local_address, local_address + 8)
781  * @param remote_handle (IN)  Remote registration handle for region containing
782  *                            (remote_address, remote_address + 8)
 * @param compare (IN)        Value to compare against the current contents of {remote_address}
784  * @param value (IN)          Value to store on success
 * @param flags (IN)          Flags for this atomic operation
786  * @param order (IN)          Ordering
787  * @param cbfunc (IN)         Function to call on completion (if queued)
788  * @param cbcontext (IN)      Context for the callback
789  * @param cbdata (IN)         Data for callback
790  *
791  * @retval OPAL_SUCCESS    The operation was successfully queued
792  * @retval 1               The operation is complete
793  * @retval OPAL_ERROR      The operation was NOT successfully queued
794  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
795  *                         operation. Try again later
 * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation can not be performed due to
 *                         alignment restrictions or because compare-and-swap is
 *                         not supported by the hardware.
799  *
800  * After the operation is complete the remote address specified by {remote_address} and
801  * {remote_handle} will be updated with {value} if *remote_address == compare.
802  * {local_address} will be updated with the previous value stored in {remote_address}.
803  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
804  * however, that not all btls will provide consistency between btl atomic operations and
805  * cpu atomics.
806  */
807 int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
808                                  void *local_address, uint64_t remote_address,
809                                  struct mca_btl_base_registration_handle_t *local_handle,
810                                  struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
811                                  uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
812                                  void *cbcontext, void *cbdata);
813 
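/**
 * Allocate a descriptor.
 *
 * @param btl (IN)       BTL module
 * @param endpoint (IN)  BTL addressing information
 * @param order (IN)     Ordering
 * @param size (IN)      Requested descriptor size.
 * @param flags (IN)     Flags for this allocation
 *
 * @return The allocated descriptor
 */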
820 extern mca_btl_base_descriptor_t* mca_btl_openib_alloc(
821         struct mca_btl_base_module_t* btl,
822         struct mca_btl_base_endpoint_t* endpoint,
823         uint8_t order,
824         size_t size,
825         uint32_t flags);
826 
827 
828 /**
829  * Return a segment allocated by this BTL.
830  *
831  * @param btl (IN)         BTL module
 * @param des (IN)         Allocated descriptor.
833  */
834 extern int mca_btl_openib_free(
835                                struct mca_btl_base_module_t* btl,
836                                mca_btl_base_descriptor_t* des);
837 
838 
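/**
 * Pack data and return a descriptor that can be
 * used for send/put.
 *
 * @param btl (IN)        BTL module
 * @param peer (IN)       BTL peer addressing
 * @param convertor (IN)  Convertor for the data to pack
 * @param order (IN)      Ordering
 * @param reserve (IN)    Number of bytes to reserve in the descriptor
 * @param size (IN/OUT)   Pointer to the number of bytes to pack
 * @param flags (IN)      Flags for this descriptor
 *
 * @return The prepared descriptor
 *
 * NOTE(review): the descriptions of convertor, reserve, size and flags are
 * inferred from the parameter names and types; in particular, size is
 * presumably updated with the number of bytes actually packed. Confirm
 * against the implementation.
 */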
846 mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
847                                                       struct mca_btl_base_module_t* btl,
848                                                       struct mca_btl_base_endpoint_t* peer,
849                                                       struct opal_convertor_t* convertor,
850                                                       uint8_t order,
851                                                       size_t reserve,
852                                                       size_t* size,
853                                                       uint32_t flags
854                                                       );
855 
856 extern void mca_btl_openib_frag_progress_pending_put_get(
857         struct mca_btl_base_endpoint_t*, const int);
858 
859 /**
860  * Fault Tolerance Event Notification Function
861  *
862  * @param state (IN)  Checkpoint State
863  * @return OPAL_SUCCESS or failure status
864  */
865 extern int mca_btl_openib_ft_event(int state);
866 
867 
868 /**
869  * Show an error during init, particularly when running out of
870  * registered memory.
871  */
872 void mca_btl_openib_show_init_error(const char *file, int line,
873                                     const char *func, const char *dev);
/**
 * Post to the Shared Receive Queue of a QP
 *
 * @param openib_btl (IN) BTL module
 * @param qp (IN)         QP whose shared receive queue is posted to
 *                        (NOTE(review): presumably an index into the
 *                        component's qp_infos array; confirm against the
 *                        implementation)
 * @return OPAL_SUCCESS or failure status
 */
882 
883 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
884 
885 /**
886  * Get a transport name of btl by its transport type.
887  */
888 
889 const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
890 
891 /**
892  * Get an endpoint for a process
893  *
894  * @param btl (IN)    BTL module
895  * @param proc (IN)   opal process object
896  *
897  * This function will return an existing endpoint if one exists otherwise it will allocate
898  * a new endpoint and return it.
899  */
900 struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl,
901                                                        struct opal_proc_t *proc);
902 
903 /**
904  * Get a transport type of btl.
905  */
906 
907 mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
908 
qp_cq_prio(const int qp)909 static inline int qp_cq_prio(const int qp)
910 {
911     if(0 == qp)
912         return BTL_OPENIB_HP_CQ; /* smallest qp is always HP */
913 
914     /* If the size for this qp is <= the eager limit, make it a
915        high priority QP.  Otherwise, make it a low priority QP. */
916     return (mca_btl_openib_component.qp_infos[qp].size <=
917             mca_btl_openib_component.eager_limit) ?
918         BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
919 }
920 
/**
 * Evaluates to 1 when QP equals mca_btl_openib_component.rdma_qp and to 0
 * otherwise.
 */
#define BTL_OPENIB_RDMA_QP(QP) \
    (mca_btl_openib_component.rdma_qp == (QP))
923 
924 /**
925  * Run function as part of opal_progress()
926  *
927  * @param[in] fn    function to run
928  * @param[in] arg   function data
929  */
930 int mca_btl_openib_run_in_main (void *(*fn)(void *), void *arg);
931 
932 
933 END_C_DECLS
934 
935 #endif /* MCA_BTL_IB_H */
936