1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2016 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2016 Los Alamos National Security, LLC.  All rights
14  *                         reserved.
15  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
16  * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
17  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
18  * Copyright (c) 2015      Research Organization for Information Science
19  *                         and Technology (RIST). All rights reserved.
20  * $COPYRIGHT$
21  *
22  * Additional copyrights may follow
23  *
24  * $HEADER$
25  */
26 /**
27  * @file
28  *
29  * Byte Transfer Layer (BTL)
30  *
31  *
32  * BTL Initialization:
33  *
34  * During library initialization, all available BTL components are
35  * loaded and opened via their mca_base_open_component_fn_t
36  * function. The BTL open function should register any mca parameters
37  * used to tune/adjust the behaviour of the BTL (mca_base_var_register()
38  * mca_base_component_var_register()). Note that the open function may fail
39  * if the resources (e.g. shared libraries, etc) required by the network
40  * transport are not available.
41  *
 * The mca_btl_base_component_init_fn_t() is then called for each of the
 * components that are successfully opened. The component init function may
 * return either:
45  *
 * (1) a NULL list of BTL modules if the transport is not available,
 * (2) a list containing one or more BTL modules, where the BTL provides
 *     a layer of abstraction over one or more physical devices (e.g. NICs).
49  *
50  * During module initialization, the module should post any addressing
51  * information required by its peers. An example would be the TCP
52  * listen port opened by the TCP module for incoming connection
53  * requests. This information is published to peers via the
54  * modex_send() interface. Note that peer information is not
55  * guaranteed to be available via modex_recv() during the
56  * module's init function. However, it will be available during
57  * BTL selection (mca_btl_base_add_proc_fn_t()).
58  *
59  * BTL Selection:
60  *
61  * The upper layer builds an ordered list of the available BTL modules sorted
62  * by their exclusivity ranking. This is a relative ranking that is used
63  * to determine the set of BTLs that may be used to reach a given destination.
64  * During startup the BTL modules are queried via their
65  * mca_btl_base_add_proc_fn_t() to determine if they are able to reach
66  * a given destination.  The BTL module with the highest ranking that
67  * returns success is selected. Subsequent BTL modules are selected only
68  * if they have the same exclusivity ranking.
69  *
70  * An example of how this might be used:
71  *
72  * BTL         Exclusivity   Comments
73  * --------    -----------   ------------------
74  * LO              100       Selected exclusively for local process
75  * SM               50       Selected exclusively for other processes on host
76  * IB                0       Selected based on network reachability
77  * IB                0       Selected based on network reachability
78  * TCP               0       Selected based on network reachability
79  * TCP               0       Selected based on network reachability
80  *
81  * When mca_btl_base_add_proc_fn_t() is called on a  BTL module, the BTL
82  * will populate an OUT variable with mca_btl_base_endpoint_t pointers.
83  * Each pointer is treated as an opaque handle by the upper layer and is
84  * returned to the BTL on subsequent data transfer calls to the
85  * corresponding destination process.  The actual contents of the
86  * data structure are defined on a per BTL basis, and may be used to
87  * cache addressing or connection information, such as a TCP socket
88  * or IB queue pair.
89  *
90  * Progress:
91  *
92  * By default, the library provides for polling based progress of outstanding
93  * requests. The BTL component exports an interface function (btl_progress)
94  * that is called in a polling mode by the PML during calls into the MPI
95  * library. Note that the btl_progress() function is called on the BTL component
96  * rather than each BTL module. This implies that the BTL author is responsible
97  * for iterating over the pending operations in each of the BTL modules associated
98  * with the component.
99  *
100  * On platforms where threading support is provided, the library provides the
101  * option of building with asynchronous threaded progress. In this case, the BTL
102  * author is responsible for providing a thread to progress pending operations.
103  * A thread is associated with the BTL component/module such that transport specific
104  * functionality/APIs may be used to block the thread until a pending operation
105  * completes. This thread MUST NOT poll for completion as this would oversubscribe
106  * the CPU.
107  *
108  * Note that in the threaded case the PML may choose to use a hybrid approach,
109  * such that polling is implemented from the user thread for a fixed number of
110  * cycles before relying on the background thread(s) to complete requests. If
111  * possible the BTL should support the use of both modes concurrently.
112  *
113  */
114 
115 #ifndef OPAL_MCA_BTL_H
116 #define OPAL_MCA_BTL_H
117 
118 #include "opal_config.h"
119 #include "opal/types.h"
120 #include "opal/prefetch.h" /* For OPAL_LIKELY */
121 #include "opal/class/opal_bitmap.h"
122 #include "opal/datatype/opal_convertor.h"
123 #include "opal/mca/mca.h"
124 #include "opal/mca/mpool/mpool.h"
125 #include "opal/mca/rcache/rcache.h"
126 #include "opal/mca/crs/crs.h"
127 #include "opal/mca/crs/base/base.h"
128 
129 BEGIN_C_DECLS
130 
131 /*
132  * BTL types
133  */
134 
135 struct mca_btl_base_module_t;
136 struct mca_btl_base_endpoint_t;
137 struct mca_btl_base_descriptor_t;
138 struct mca_mpool_base_resources_t;
139 struct opal_proc_t;
140 
/**
 * Opaque registration handle for executing RDMA and atomic
 * operations on a memory region.
 *
 * The data inside this handle is appropriate for passing
 * to remote peers to execute RDMA and atomic operations. The
 * size needed to send the registration handle can be
 * obtained from the btl via the btl_registration_handle_size
 * member. If this size is 0 then no registration data is
 * needed to execute RDMA or atomic operations.
 *
 * Only the incomplete type is declared here, so code including this
 * header can hold pointers to a handle but cannot look inside it.
 */
struct mca_btl_base_registration_handle_t;
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
154 
155 
/* Wildcard endpoint for use in the register_mem function.
 *
 * The replacement list is fully parenthesized so that the cast always binds
 * as one expression, whatever operator surrounds the macro at the point of
 * use (e.g. sizeof, a comparison, or a following postfix operator). */
#define MCA_BTL_ENDPOINT_ANY ((struct mca_btl_base_endpoint_t *) -1)
158 
/* send/recv operations require tag matching; a tag is a single byte */
typedef uint8_t mca_btl_base_tag_t;

/* Ordering value meaning "no ordering requested". It may be passed wherever
 * an ordering tag is expected, e.g. as the order argument of
 * mca_btl_base_module_alloc_fn_t. */
#define MCA_BTL_NO_ORDER       255
163 
/*
 * Communication specific defines. There are a number of active message IDs
 * that can be shared between all frameworks that need to communicate (i.e.
 * use the PML or the BTL directly). These IDs are exchanged between the
 * processes, therefore they need to be identical everywhere. The simplest
 * approach is to have them defined as constants, and give each framework a
 * small number. Here is the rule that defines these IDs (they are 8 bits):
 * - the first 3 bits are used to code the framework (i.e. PML, OSC, COLL)
 * - the remaining 5 bits are used internally by the framework, and divided
 *   based on the components requirements. Therefore, the way the PML and
 *   the OSC frameworks use these defines will be different. For more
 *   information about how these framework IDs are defined, take a look in
 *   the header file associated with the framework.
 *
 * MCA_BTL_AM_FRAMEWORK_MASK selects exactly those 3 framework bits (the top
 * three bits of the tag), so that (tag & MCA_BTL_AM_FRAMEWORK_MASK) yields
 * one of the MCA_BTL_TAG_<framework> base values defined below.
 */
#define MCA_BTL_AM_FRAMEWORK_MASK   0xE0
#define MCA_BTL_TAG_BTL             0x20
#define MCA_BTL_TAG_PML             0x40
#define MCA_BTL_TAG_OSC_RDMA        0x60
#define MCA_BTL_TAG_USR             0x80
#define MCA_BTL_TAG_MAX             255 /* 1 + highest allowed tag num */

/*
 * Reserved tags for specific BTLs. As multiple BTLs can be active
 * simultaneously, their tags should not collide. All of them live in the
 * MCA_BTL_TAG_BTL framework range.
 */
#define MCA_BTL_TAG_IB                (MCA_BTL_TAG_BTL + 0)
#define MCA_BTL_TAG_UDAPL             (MCA_BTL_TAG_BTL + 1)
#define MCA_BTL_TAG_SMCUDA            (MCA_BTL_TAG_BTL + 2)
192 
/* preferred protocol */
#define MCA_BTL_FLAGS_SEND            0x0001
#define MCA_BTL_FLAGS_PUT             0x0002
#define MCA_BTL_FLAGS_GET             0x0004
/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
 * rdma_btls list. This allows the updated one-sided component to
 * use btls that are not otherwise used for send/recv. */
#define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)

/* btl can send directly from user buffer w/out registration */
#define MCA_BTL_FLAGS_SEND_INPLACE    0x0008

/* btl transport reliability flags - currently used only by the DR PML */
#define MCA_BTL_FLAGS_NEED_ACK        0x0010
#define MCA_BTL_FLAGS_NEED_CSUM       0x0020

/** deprecated (BTL 3.0) */
#define MCA_BTL_FLAGS_RDMA_MATCHED    0x0040

/* btl needs local rdma completion */
#define MCA_BTL_FLAGS_RDMA_COMPLETION 0x0080

/* btl can do heterogeneous rdma operations on byte buffers */
#define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA 0x0100

/* btl can support failover if enabled */
#define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200

/* CUDA related flags. MCA_BTL_FLAGS_CUDA_RDMA is the combination of the
 * CUDA put and get flags, mirroring MCA_BTL_FLAGS_RDMA above.
 * NOTE(review): presumably these advertise put/get and asynchronous copy
 * support for GPU buffers -- confirm against the CUDA-aware BTLs. */
#define MCA_BTL_FLAGS_CUDA_PUT        0x0400
#define MCA_BTL_FLAGS_CUDA_GET        0x0800
#define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000

/* btl can support signaled operations. BTLs that support this flag are
 * expected to provide a mechanism for asynchronous progress on descriptors
 * where the feature is requested. BTLs should also be aware that users can
 * (and probably will) turn this flag on and off using the MCA variable
 * system.
 */
#define MCA_BTL_FLAGS_SIGNALED        0x4000

/** The BTL supports network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_OPS      0x08000
/** The BTL supports fetching network atomic operations */
#define MCA_BTL_FLAGS_ATOMIC_FOPS     0x10000

/** The BTL requires add_procs to be with all procs including non-local. Shared-memory
 * BTLs should not set this flag. */
#define MCA_BTL_FLAGS_SINGLE_ADD_PROCS 0x20000

/* The BTL is using a progress thread and needs protection on matching */
#define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED 0x40000
246 
/* Default exclusivity levels. The exclusivity ranking is the value used to
 * order the BTL modules during BTL selection (see "BTL Selection" in the
 * file header): the highest ranked module that can reach a peer is selected
 * and further modules are only selected if they share the same ranking. */
#define MCA_BTL_EXCLUSIVITY_HIGH     (64*1024) /* internal loopback */
#define MCA_BTL_EXCLUSIVITY_DEFAULT  1024      /* GM/IB/etc. */
#define MCA_BTL_EXCLUSIVITY_LOW      0         /* TCP used as a last resort */

/* error callback flags.
 * NOTE(review): presumably the values of the flags argument ("type of
 * error") of mca_btl_base_module_error_cb_fn_t -- confirm against callers. */
#define MCA_BTL_ERROR_FLAGS_FATAL 0x1
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
256 
/** Registration flags. The access flags are a 1-1 mapping with the
 * registration cache access flags: every access enumerator below is defined
 * directly as the corresponding MCA_RCACHE_ACCESS_* constant. */
enum {
    /** Allow local write on the registered region. If a region is registered
     * with this flag the registration can be used as the local handle for a
     * btl_get operation. */
    MCA_BTL_REG_FLAG_LOCAL_WRITE   = MCA_RCACHE_ACCESS_LOCAL_WRITE,
    /** Allow remote read on the registered region. If a region is registered
     * with this flag the registration can be used as the remote handle for a
     * btl_get operation. */
    MCA_BTL_REG_FLAG_REMOTE_READ   = MCA_RCACHE_ACCESS_REMOTE_READ,
    /** Allow remote write on the registered region. If a region is registered
     * with this flag the registration can be used as the remote handle for a
     * btl_put operation. */
    MCA_BTL_REG_FLAG_REMOTE_WRITE  = MCA_RCACHE_ACCESS_REMOTE_WRITE,
    /** Allow remote atomic operations on the registered region. If a region is
     * registered with this flag the registration can be used as the remote
     * handle for a btl_atomic_op or btl_atomic_fop operation. */
    MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC,
    /** Allow any btl operation on the registered region. If a region is registered
     * with this flag the registration can be used as the local or remote handle for
     * any btl operation. */
    MCA_BTL_REG_FLAG_ACCESS_ANY    = MCA_RCACHE_ACCESS_ANY,
#if OPAL_CUDA_GDR_SUPPORT
    /** Region is in GPU memory. Only defined when OPAL_CUDA_GDR_SUPPORT is
     * enabled. */
    MCA_BTL_REG_FLAG_CUDA_GPU_MEM  = 0x00010000,
#endif
};
285 
/** Supported atomic operations. Each enumerator is a distinct bit.
 * NOTE(review): presumably or-ed together into a per-BTL capability mask --
 * confirm against the BTL module structure. */
enum {
    /** The btl supports atomic add */
    MCA_BTL_ATOMIC_SUPPORTS_ADD    = 0x00000001,
    /** The btl supports atomic bitwise and */
    MCA_BTL_ATOMIC_SUPPORTS_AND    = 0x00000200,
    /** The btl supports atomic bitwise or */
    MCA_BTL_ATOMIC_SUPPORTS_OR     = 0x00000400,
    /** The btl supports atomic bitwise exclusive or */
    MCA_BTL_ATOMIC_SUPPORTS_XOR    = 0x00000800,

    /** The btl supports logical and */
    MCA_BTL_ATOMIC_SUPPORTS_LAND   = 0x00001000,
    /** The btl supports logical or */
    MCA_BTL_ATOMIC_SUPPORTS_LOR    = 0x00002000,
    /** The btl supports logical exclusive or */
    MCA_BTL_ATOMIC_SUPPORTS_LXOR   = 0x00004000,

    /** The btl supports atomic swap */
    MCA_BTL_ATOMIC_SUPPORTS_SWAP   = 0x00010000,

    /** The btl supports atomic min */
    MCA_BTL_ATOMIC_SUPPORTS_MIN    = 0x00100000,
    /** The btl supports atomic max */
    MCA_BTL_ATOMIC_SUPPORTS_MAX    = 0x00200000,

    /** The btl supports 32-bit integer operations. Keep in mind the btl may
     * support only a subset of the available atomics. */
    MCA_BTL_ATOMIC_SUPPORTS_32BIT  = 0x01000000,

    /** The btl supports floating-point operations. Keep in mind the btl may
     * support only a subset of the available atomics and may not support
     * both 64 and 32-bit floating point. */
    MCA_BTL_ATOMIC_SUPPORTS_FLOAT  = 0x02000000,

    /** The btl supports atomic compare-and-swap */
    MCA_BTL_ATOMIC_SUPPORTS_CSWAP  = 0x10000000,

    /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */
    MCA_BTL_ATOMIC_SUPPORTS_GLOB   = 0x20000000,
};
327 
/** Flags selecting the kind of operand an atomic operation works on.
 * NOTE(review): presumably passed in the flags argument of the btl atomic
 * functions, which are declared outside this part of the file -- confirm. */
enum {
    /** Use 32-bit atomics */
    MCA_BTL_ATOMIC_FLAG_32BIT = 0x00000001,
    /** Use floating-point atomics */
    MCA_BTL_ATOMIC_FLAG_FLOAT = 0x00000002,
};
334 
/** Identifiers of the atomic operations. The per-operation comments give the
 * effect on the remote memory location in terms of the supplied operand. */
enum mca_btl_base_atomic_op_t {
    /** Atomic add: (*remote_address) = (*remote_address) + operand */
    MCA_BTL_ATOMIC_ADD = 0x0001,
    /** Atomic and: (*remote_address) = (*remote_address) & operand */
    MCA_BTL_ATOMIC_AND = 0x0011,
    /** Atomic or: (*remote_address) = (*remote_address) | operand */
    MCA_BTL_ATOMIC_OR  = 0x0012,
    /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */
    MCA_BTL_ATOMIC_XOR = 0x0014,
    /** Atomic logical and: (*remote_address) = (*remote_address) && operand */
    MCA_BTL_ATOMIC_LAND = 0x0015,
    /** Atomic logical or: (*remote_address) = (*remote_address) || operand */
    MCA_BTL_ATOMIC_LOR = 0x0016,
    /** Atomic logical xor: (*remote_address) = (*remote_address) != operand */
    MCA_BTL_ATOMIC_LXOR = 0x0017,
    /** Atomic swap: (*remote_address) = operand */
    MCA_BTL_ATOMIC_SWAP = 0x001a,
    /** Atomic min */
    MCA_BTL_ATOMIC_MIN = 0x0020,
    /** Atomic max */
    MCA_BTL_ATOMIC_MAX = 0x0021,

    /** End marker; takes the value following MCA_BTL_ATOMIC_MAX. Note that
     * the operation values above are not contiguous. */
    MCA_BTL_ATOMIC_LAST,
};
typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
360 
/**
 * Callback invoked asynchronously when an operation completes.
 *
 * Completion semantics: once this callback runs, the descriptor can be
 * reused or handed back to the BTL via mca_btl_base_module_free_fn_t. The
 * operation has been queued to the network device, or will otherwise make
 * asynchronous progress without subsequent calls to btl_progress.
 *
 * @param[IN] module      the BTL module
 * @param[IN] endpoint    the BTL endpoint
 * @param[IN] descriptor  the BTL descriptor
 * @param[IN] status      status value reported for the operation
 */
typedef void (*mca_btl_base_completion_fn_t)(struct mca_btl_base_module_t *module,
                                             struct mca_btl_base_endpoint_t *endpoint,
                                             struct mca_btl_base_descriptor_t *descriptor,
                                             int status);
378 
379 
/**
 * Callback invoked asynchronously when an rdma or atomic operation
 * completes.
 *
 * Completion semantics: the rdma or atomic memory operation has completed
 * remotely (i.e. it is remotely visible), and the caller is free to
 * deregister the local_handle or to modify the memory at local_address.
 *
 * @param[IN] module        the BTL module
 * @param[IN] endpoint      the BTL endpoint
 * @param[IN] local_address local address for the operation (if any)
 * @param[IN] local_handle  local handle associated with the local_address
 * @param[IN] context       callback context supplied to the rdma/atomic operation
 * @param[IN] cbdata        callback data supplied to the rdma/atomic operation
 * @param[IN] status        status of the operation
 */
typedef void (*mca_btl_base_rdma_completion_fn_t)(struct mca_btl_base_module_t *module,
                                                  struct mca_btl_base_endpoint_t *endpoint,
                                                  void *local_address,
                                                  struct mca_btl_base_registration_handle_t *local_handle,
                                                  void *context,
                                                  void *cbdata,
                                                  int status);
403 
404 
/**
 * Describes a region/segment of memory that is addressable
 * by a BTL.
 *
 * Note: In many cases the alloc and prepare methods of BTLs
 * do not return a mca_btl_base_segment_t but instead return a
 * subclass. Extreme care should be used when modifying
 * BTL segments to prevent overwriting internal BTL data.
 *
 * All BTLs MUST use base segments when calling registered
 * callbacks.
 *
 * BTLs MUST use mca_btl_base_segment_t or a subclass and
 * MUST store their segment length in btl_seg_size. BTLs
 * MUST specify a segment no larger than MCA_BTL_SEG_MAX_SIZE.
 */

struct mca_btl_base_segment_t {
    /** Address of the memory */
    opal_ptr_t seg_addr;
    /** Length in bytes */
    uint64_t   seg_len;
};
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
429 
430 
/*
 * Byte-order conversion of a base segment: convert both the address
 * (seg_addr.lval) and the length (seg_len) of segment s in place with
 * hton64()/ntoh64(). The conversions are only compiled in for heterogeneous
 * builds on hosts that are not big endian; otherwise the macros expand to
 * nothing.
 *
 * The two assignments are wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. as the body of an unbraced if/else.
 * Invoke it like a function call, terminated by a semicolon. The argument is
 * evaluated more than once, so it must not have side effects.
 */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && !defined(WORDS_BIGENDIAN)
#define MCA_BTL_BASE_SEGMENT_HTON(s)                       \
    do {                                                   \
        (s).seg_addr.lval = hton64((s).seg_addr.lval);     \
        (s).seg_len = hton64((s).seg_len);                 \
    } while (0)
#define MCA_BTL_BASE_SEGMENT_NTOH(s)                       \
    do {                                                   \
        (s).seg_addr.lval = ntoh64((s).seg_addr.lval);     \
        (s).seg_len = ntoh64((s).seg_len);                 \
    } while (0)
#else
#define MCA_BTL_BASE_SEGMENT_HTON(s)
#define MCA_BTL_BASE_SEGMENT_NTOH(s)
#endif
/**
 * A descriptor that holds the parameters to a send/put/get
 * operation along w/ a callback routine that is called on
 * completion of the request.
 * Note: receive callbacks will store the incoming data segments in
 *       des_segments
 */

struct mca_btl_base_descriptor_t {
    opal_free_list_item_t super;              /**< base type: free list item */
    mca_btl_base_segment_t *des_segments;     /**< local segments */
    size_t des_segment_count;                 /**< number of local segments */
    mca_btl_base_completion_fn_t des_cbfunc;  /**< local callback function */
    void* des_cbdata;                         /**< opaque callback data */
    void* des_context;                        /**< more opaque callback data */
    uint32_t des_flags;                       /**< hints to BTL */
    /** order value, this is only
        valid in the local completion callback
        and may be used in subsequent calls to
        btl_alloc, btl_prepare_src to request
        a descriptor that will be ordered w.r.t.
        this descriptor
    */
    uint8_t order;
};
typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;

/* Class declaration for the descriptor type */
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
470 
/* Descriptor flags.
 * NOTE(review): presumably the values stored in des_flags ("hints to BTL")
 * of mca_btl_base_descriptor_t -- confirm against the BTL implementations. */
#define MCA_BTL_DES_FLAGS_PRIORITY          0x0001
/* Allow the BTL to dispose the descriptor once the callback
 * associated was triggered.
 */
#define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP     0x0002
/* Allow the BTL to avoid calling the descriptor callback
 * if the send succeeded in the btl_send (i.e in the fast path).
 * NOTE(review): the name suggests that setting this flag requests that the
 * callback is always called, and that the BTL may skip it only when the
 * flag is absent -- confirm against the BTL implementations.
 */
#define MCA_BTL_DES_SEND_ALWAYS_CALLBACK    0x0004

/* Tell the PML that the copy is being done asynchronously
 */
#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC   0x0008

/* Type of transfer that will be done with this frag.
 */
#define MCA_BTL_DES_FLAGS_PUT               0x0010
#define MCA_BTL_DES_FLAGS_GET               0x0020

/* Ask the BTL to wake the remote process (send/sendi) or local process
 * (put/get) to handle this message. The BTL may ignore this flag if
 * signaled operations are not supported.
 */
#define MCA_BTL_DES_FLAGS_SIGNAL            0x0040

/**
 * Maximum number of allowed segments in src/dst fields of a descriptor.
 */
#define MCA_BTL_DES_MAX_SEGMENTS 16

/**
 * Maximum size of a BTL segment (NTH: does it really save us anything
 * to hardcode this?)
 */
#define MCA_BTL_SEG_MAX_SIZE 256

/**
 * Maximum size of a BTL registration handle in bytes
 */
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
511 
512 /*
513  *  BTL base header, stores the tag at a minimum
514  */
515 struct mca_btl_base_header_t{
516     mca_btl_base_tag_t tag;
517 };
518 typedef struct mca_btl_base_header_t mca_btl_base_header_t;
519 
520 #define MCA_BTL_BASE_HEADER_HTON(hdr)
521 #define MCA_BTL_BASE_HEADER_NTOH(hdr)
522 
523 /*
524  *  BTL component interface functions and datatype.
525  */
526 
/**
 * MCA->BTL Initializes the BTL component and creates its specific BTL
 * module(s).
 *
 * During component initialization, the BTL component should discover the
 * physical devices that are available for the given transport and create
 * one BTL module to represent each device. Any addressing information that
 * peers require to reach a device should be published from within this
 * function via the modex_send() interface.
 *
 * @param num_btls (OUT) Number of btl modules created, or 0 if the
 *                       transport is not available.
 *
 * @param enable_progress_threads (IN) Whether this component is allowed
 *                       to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI threads
 *                       is enabled or not (i.e., MPI_THREAD_MULTIPLE),
 *                       which indicates whether multiple threads may
 *                       invoke this component simultaneously or not.
 *
 * @return Array of pointers to BTL modules, or NULL if the transport
 *         is not available.
 */
typedef struct mca_btl_base_module_t **(*mca_btl_base_component_init_fn_t)(
    int *num_btls, bool enable_progress_threads, bool enable_mpi_threads);
558 
/**
 * MCA->BTL Called to progress outstanding requests for
 * non-threaded polling environments.
 *
 * The function takes no arguments: it is a function of the BTL component
 * rather than of an individual BTL module (see "Progress" in the file
 * header).
 *
 * @return           Count of "completions", a metric of
 *                   how many items were completed in the call
 *                   to progress.
 */

typedef int (*mca_btl_base_component_progress_fn_t)(void);
569 
570 
571 /**
572  * Callback function that is called asynchronously on receipt
573  * of data by the transport layer.
574  * Note that the the mca_btl_base_descriptor_t is only valid within the
575  * completion function, this implies that all data payload in the
576  * mca_btl_base_descriptor_t must be copied out within this callback or
577  * forfeited back to the BTL.
578  * Note also that descriptor segments (des_segments) must be base
579  * segments for all callbacks.
580  *
581  * @param[IN] btl        BTL module
582  * @param[IN] tag        The active message receive callback tag value
583  * @param[IN] descriptor The BTL descriptor (contains the receive payload)
584  * @param[IN] cbdata     Opaque callback data
585  */
586 
587 typedef void (*mca_btl_base_module_recv_cb_fn_t)(
588     struct mca_btl_base_module_t* btl,
589     mca_btl_base_tag_t tag,
590     mca_btl_base_descriptor_t* descriptor,
591     void* cbdata
592 );
593 
/**
 * A receive callback function together with its opaque callback data.
 */
typedef struct mca_btl_active_message_callback_t {
    mca_btl_base_module_recv_cb_fn_t cbfunc; /**< receive callback function */
    void* cbdata;                            /**< opaque callback data */
} mca_btl_active_message_callback_t;

/* Table of active message callbacks holding MCA_BTL_TAG_MAX entries; only
 * declared here, the definition lives elsewhere.
 * NOTE(review): presumably indexed by mca_btl_base_tag_t -- confirm against
 * the definition and its users. */
OPAL_DECLSPEC extern
mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TAG_MAX];
601 
/**
 *  BTL component descriptor. Contains component version information
 *  and component open/close/init functions.
 */

struct mca_btl_base_component_3_0_0_t {
  /** base MCA component (version information) */
  mca_base_component_t btl_version;
  /** base MCA component data */
  mca_base_component_data_t btl_data;
  /** component init function, creates the BTL module(s) */
  mca_btl_base_component_init_fn_t btl_init;
  /** component progress function */
  mca_btl_base_component_progress_fn_t btl_progress;
};
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t;
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t;

/*  add the 2_0_0_t typedef for source compatibility
 *  we can do this safely because 2_0_0 components are the same as
 *  3_0_0 components, the difference is in the btl module.
 *  Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and
 *  can not be used with the new interface.
 */
typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t;
623 
624 
625 /*
626  * BTL module interface functions and datatype.
627  */
628 
/**
 * MCA->BTL Release the resources held by a BTL module before the module
 * is unloaded.
 *
 * The MCA framework calls the finalize method of a BTL module prior to
 * unloading it. The BTL should release any resources it holds and, if
 * required, free the memory corresponding to the BTL module itself.
 *
 * @param btl (IN)   BTL module.
 * @return           OPAL_SUCCESS or error status on failure.
 */
typedef int (*mca_btl_base_module_finalize_fn_t)(struct mca_btl_base_module_t *btl);
645 
/**
 * BML->BTL notification of change in the process list.
 *
 * The BML calls mca_btl_base_module_add_procs_fn_t() to determine the set
 * of BTLs that should be used to reach each process. Any addressing
 * information exported by the peer via the modex_send() function should be
 * available during this call via the corresponding modex_recv() function,
 * and the BTL may use it to determine the reachability of each peer
 * process.
 *
 * The caller may pass a "reachable" bitmap pointer. If it is not NULL, then
 * for each process that is reachable by the BTL the bit corresponding to
 * the index of that process in the procs array should be set in the
 * reachable bitmask. The BTL returns an array of pointers to a data
 * structure defined by the BTL; these pointers are handed back to the BTL
 * on subsequent calls to the BTL data transfer functions (e.g. btl_send).
 * The BTL may use them to cache any addressing or connection information
 * (e.g. TCP socket, IB queue pair).
 *
 * @param btl (IN)            BTL module
 * @param nprocs (IN)         Number of processes
 * @param procs (IN)          Array of processes
 * @param endpoints (OUT)     Array of mca_btl_base_endpoint_t structures by BTL.
 * @param reachable (OUT)     Bitmask indicating set of peer processes that
 *                            are reachable by this BTL (may be NULL).
 * @return                    OPAL_SUCCESS or error status on failure.
 */
typedef int (*mca_btl_base_module_add_procs_fn_t)(struct mca_btl_base_module_t *btl,
                                                  size_t nprocs,
                                                  struct opal_proc_t **procs,
                                                  struct mca_btl_base_endpoint_t **endpoints,
                                                  struct opal_bitmap_t *reachable);
680 
/**
 * Notification of change to the process list.
 *
 * Whenever the process list changes the BML notifies the BTL of the change,
 * giving it the opportunity to clean up or release any resources
 * associated with the peer.
 *
 * @param btl (IN)     BTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 * @param peer (IN)    Set of peer addressing information.
 * @return             Status indicating if cleanup was successful
 */
typedef int (*mca_btl_base_module_del_procs_fn_t)(struct mca_btl_base_module_t *btl,
                                                  size_t nprocs,
                                                  struct opal_proc_t **procs,
                                                  struct mca_btl_base_endpoint_t **peer);
700 
701 /**
702  * Register a callback function that is called on receipt
703  * of a fragment.
704  *
705  * @param[IN] btl      BTL module
706  * @param[IN] tag      tag value of this callback
707  *                     (specified on subsequent send operations)
708  * @param[IN] cbfunc   The callback function
709  * @param[IN] cbdata   Opaque callback data
710  *
711  * @return OPAL_SUCCESS The callback was registered successfully
712  * @return OPAL_ERROR   The callback was NOT registered successfully
713  *
714  */
715 typedef int (*mca_btl_base_module_register_fn_t)(
716     struct mca_btl_base_module_t* btl,
717     mca_btl_base_tag_t tag,
718     mca_btl_base_module_recv_cb_fn_t cbfunc,
719     void* cbdata
720 );
721 
722 
/**
 * Error callback: called asynchronously when an error is received from
 * the transport layer.
 *
 * @param[IN] btl     BTL module
 * @param[IN] flags   type of error
 * @param[IN] errproc process that had an error
 * @param[IN] btlinfo descriptive string from the BTL
 */
typedef void (*mca_btl_base_module_error_cb_fn_t)(struct mca_btl_base_module_t *btl,
                                                  int32_t flags,
                                                  struct opal_proc_t *errproc,
                                                  char *btlinfo);
739 
740 
741 /**
742  * Register a callback function that is called on receipt
743  * of an error.
744  *
745  * @param[IN] btl       BTL module
746  * @param[IN] cbfunc    The callback function
747  *
748  * @return OPAL_SUCCESS The callback was registered successfully
749  * @return OPAL_ERROR   The callback was NOT registered successfully
750  *
751  */
752 typedef int (*mca_btl_base_module_register_error_fn_t)(
753     struct mca_btl_base_module_t* btl,
754     mca_btl_base_module_error_cb_fn_t cbfunc
755 );
756 
757 
758 /**
759  * Allocate a descriptor with a segment of the requested size.
760  * Note that the BTL layer may choose to return a smaller size
761  * if it cannot support the request. The order tag value ensures that
762  * operations on the descriptor that is allocated will be
763  * ordered w.r.t. a previous operation on a particular descriptor.
764  * Ordering is only guaranteed if the previous descriptor had its
765  * local completion callback function called and the order tag of
766  * that descriptor is only valid upon the local completion callback function.
767  *
768  *
769  * @param btl (IN)      BTL module
770  * @param size (IN)     Request segment size.
771  * @param order (IN)    The ordering tag (may be MCA_BTL_NO_ORDER)
772  */
773 
774 typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
775     struct mca_btl_base_module_t* btl,
776     struct mca_btl_base_endpoint_t* endpoint,
777     uint8_t order,
778     size_t size,
779     uint32_t flags
780 );
781 
782 /**
783  * Return a descriptor allocated from this BTL via alloc/prepare.
784  * A descriptor can only be deallocated after its local completion
785  * callback function has called for all send/put/get operations.
786  *
787  * @param btl (IN)      BTL module
788  * @param segment (IN)  Descriptor allocated from the BTL
789  */
790 typedef int (*mca_btl_base_module_free_fn_t)(
791     struct mca_btl_base_module_t* btl,
792     mca_btl_base_descriptor_t* descriptor
793 );
794 
795 
/**
 * Prepare a descriptor for send using the supplied convertor.
 *
 * If the convertor references data that is contiguous, the descriptor
 * may simply point to the user buffer. Otherwise, this routine is
 * responsible for allocating buffer space and packing if required.
 *
 * The order tag value ensures that operations on the prepared
 * descriptor will be ordered w.r.t. a previous operation on a
 * particular descriptor. Ordering is only guaranteed if the previous
 * descriptor had its local completion callback function called; the
 * order tag of that descriptor is only valid upon the local completion
 * callback function.
 *
 * @param btl (IN)          BTL module
 * @param endpoint (IN)     BTL peer addressing
 * @param convertor (IN)    Data type convertor
 * @param order (IN)        The ordering tag (may be MCA_BTL_NO_ORDER)
 * @param reserve (IN)      Additional bytes requested by upper layer to
 *                          precede user data
 * @param size (IN/OUT)     Number of bytes to prepare (IN),
 *                          number of bytes actually prepared (OUT)
 * @param flags (IN)        Flags
 */
typedef struct mca_btl_base_descriptor_t *(*mca_btl_base_module_prepare_fn_t) (
    struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
    struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size,
    uint32_t flags);
828 
/**
 * @brief Register a memory region for put/get/atomic operations.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information (or NULL for all endpoints)
 * @param base (IN)        Pointer to start of region
 * @param size (IN)        Size of region
 * @param flags (IN)       Flags including access permissions
 *
 * @returns a memory registration handle valid for both local and remote operations
 * @returns NULL if the region could not be registered
 *
 * The specified region is registered with the hardware so that it can
 * be used with the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and
 * btl_atomic_fop functions. Registrations may use limited system/NIC
 * resources, so care should be taken to not hold an excessive number of
 * them.
 *
 * The caller takes ownership of the memory pointed to by the returned
 * (struct mca_btl_base_registration_handle_t*). The BTL module cannot
 * free or reuse the handle until it is returned via the
 * mca_btl_base_module_deregister_mem_fn_t function.
 */
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint,
    void *base,
    size_t size,
    uint32_t flags
);
854 
/**
 * @brief Deregister a memory region
 *
 * @param btl (IN)         BTL module region was registered with
 * @param handle (IN)      BTL registration handle to deregister
 *
 * Deregisters the memory region associated with the specified handle.
 * Care should be taken to not perform any RDMA or atomic operation on
 * this memory region after it is deregistered. It is erroneous to
 * specify a memory handle associated with a remote node.
 *
 * The handle passed in will be a value previously returned by the
 * mca_btl_base_module_register_mem_fn_t function. Ownership of the
 * memory pointed to by handle passes to the BTL module; this function
 * is now allowed to free the memory, return it to a freelist, etc.
 */
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_registration_handle_t *handle
);
873 
874 /**
875  * Initiate an asynchronous send.
876  * Completion Semantics: the descriptor has been queued for a send operation
877  *                       the BTL now controls the descriptor until local
878  *                       completion callback is made on the descriptor
879  *
880  * All BTLs allow multiple concurrent asynchronous send operations on a descriptor
881  *
882  * @param btl (IN)         BTL module
883  * @param endpoint (IN)    BTL addressing information
884  * @param descriptor (IN)  Description of the data to be transfered
885  * @param tag (IN)         The tag value used to notify the peer.
886  *
887  * @retval OPAL_SUCCESS    The descriptor was successfully queued for a send
888  * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a send
889  * @retval OPAL_ERR_UNREACH The endpoint is not reachable
890  */
891 typedef int (*mca_btl_base_module_send_fn_t)(
892     struct mca_btl_base_module_t* btl,
893     struct mca_btl_base_endpoint_t* endpoint,
894     struct mca_btl_base_descriptor_t* descriptor,
895     mca_btl_base_tag_t tag
896 );
897 
898 /**
899  * Initiate an immediate blocking send.
900  * Completion Semantics: the BTL will make a best effort
901  *  to send the header and "size" bytes from the datatype using the convertor.
902  *  The header is guaranteed to be delivered entirely in the first segment.
903  *  Should the BTL be unable to deliver the data due to resource constraints
904  *  the BTL will return a descriptor (via the OUT param)
905  *  of size "payload_size + header_size".
906  *
907  * @param btl (IN)             BTL module
908  * @param endpoint (IN)        BTL addressing information
909  * @param convertor (IN)       Data type convertor
910  * @param header (IN)          Pointer to header.
911  * @param header_size (IN)     Size of header.
912  * @param payload_size (IN)    Size of payload (from convertor).
913  * @param order (IN)           The ordering tag (may be MCA_BTL_NO_ORDER)
914  * @param flags (IN)           Flags.
915  * @param tag (IN)             The tag value used to notify the peer.
916  * @param descriptor (OUT)     The descriptor to be returned unable to be sent immediately
917  *                             (may be NULL).
918  *
919  * @retval OPAL_SUCCESS           The send was successfully queued
920  * @retval OPAL_ERROR             The send failed
921  * @retval OPAL_ERR_UNREACH       The endpoint is not reachable
922  * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned
923  *                                (via the OUT param) if descriptors are available
924  */
925 
926 typedef int (*mca_btl_base_module_sendi_fn_t)(
927     struct mca_btl_base_module_t* btl,
928     struct mca_btl_base_endpoint_t* endpoint,
929     struct opal_convertor_t* convertor,
930     void* header,
931     size_t header_size,
932     size_t payload_size,
933     uint8_t order,
934     uint32_t flags,
935     mca_btl_base_tag_t tag,
936     mca_btl_base_descriptor_t** descriptor
937  );
938 
939 /**
940  * Initiate an asynchronous put.
941  * Completion Semantics: if this function returns a 1 then the operation
942  *                       is complete. a return of OPAL_SUCCESS indicates
943  *                       the put operation has been queued with the
944  *                       network. the local_handle can not be deregistered
945  *                       until all outstanding operations on that handle
946  *                       have been completed.
947  *
948  * @param btl (IN)            BTL module
949  * @param endpoint (IN)       BTL addressing information
950  * @param local_address (IN)  Local address to put from (registered)
951  * @param remote_address (IN) Remote address to put to (registered remotely)
952  * @param local_handle (IN)   Registration handle for region containing
953  *                            (local_address, local_address + size)
954  * @param remote_handle (IN)  Remote registration handle for region containing
955  *                            (remote_address, remote_address + size)
956  * @param size (IN)           Number of bytes to put
957  * @param flags (IN)          Flags for this put operation
958  * @param order (IN)          Ordering
959  * @param cbfunc (IN)         Function to call on completion (if queued)
960  * @param cbcontext (IN)      Context for the callback
961  * @param cbdata (IN)         Data for callback
962  *
963  * @retval OPAL_SUCCESS    The descriptor was successfully queued for a put
964  * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a put
965  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the put
966  *                         operation. Try again later
967  * @retval OPAL_ERR_NOT_AVAILABLE  Put can not be performed due to size or
968  *                         alignment restrictions.
969  */
970 typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
971     struct mca_btl_base_endpoint_t *endpoint, void *local_address,
972     uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
973     struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
974     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
975 
976 /**
977  * Initiate an asynchronous get.
978  * Completion Semantics: if this function returns a 1 then the operation
979  *                       is complete. a return of OPAL_SUCCESS indicates
980  *                       the get operation has been queued with the
981  *                       network. the local_handle can not be deregistered
982  *                       until all outstanding operations on that handle
983  *                       have been completed.
984  *
985  * @param btl (IN)            BTL module
986  * @param endpoint (IN)       BTL addressing information
987  * @param local_address (IN)  Local address to put from (registered)
988  * @param remote_address (IN) Remote address to put to (registered remotely)
989  * @param local_handle (IN)   Registration handle for region containing
990  *                            (local_address, local_address + size)
991  * @param remote_handle (IN)  Remote registration handle for region containing
992  *                            (remote_address, remote_address + size)
993  * @param size (IN)           Number of bytes to put
994  * @param flags (IN)          Flags for this put operation
995  * @param order (IN)          Ordering
996  * @param cbfunc (IN)         Function to call on completion (if queued)
997  * @param cbcontext (IN)      Context for the callback
998  * @param cbdata (IN)         Data for callback
999  *
1000  * @retval OPAL_SUCCESS    The descriptor was successfully queued for a put
1001  * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a put
1002  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the put
1003  *                         operation. Try again later
1004  * @retval OPAL_ERR_NOT_AVAILABLE  Put can not be performed due to size or
1005  *                         alignment restrictions.
1006  */
1007 typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
1008     struct mca_btl_base_endpoint_t *endpoint, void *local_address,
1009     uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
1010     struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
1011     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
1012 
1013 /**
1014  * Initiate an asynchronous atomic operation.
1015  * Completion Semantics: if this function returns a 1 then the operation
1016  *                       is complete. a return of OPAL_SUCCESS indicates
1017  *                       the atomic operation has been queued with the
1018  *                       network.
1019  *
1020  * @param btl (IN)            BTL module
1021  * @param endpoint (IN)       BTL addressing information
1022  * @param remote_address (IN) Remote address to put to (registered remotely)
1023  * @param remote_handle (IN)  Remote registration handle for region containing
1024  *                            (remote_address, remote_address + 8)
1025  * @param op (IN)             Operation to perform
1026  * @param operand (IN)        Operand for the operation
1027  * @param flags (IN)          Flags for this atomic operation
1028  * @param order (IN)          Ordering
1029  * @param cbfunc (IN)         Function to call on completion (if queued)
1030  * @param cbcontext (IN)      Context for the callback
1031  * @param cbdata (IN)         Data for callback
1032  *
1033  * @retval OPAL_SUCCESS    The operation was successfully queued
1034  * @retval 1               The operation is complete
1035  * @retval OPAL_ERROR      The operation was NOT successfully queued
1036  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1037  *                         operation. Try again later
1038  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation can not be performed due to
1039  *                         alignment restrictions or the operation {op} is not supported
1040  *                         by the hardware.
1041  *
1042  * After the operation is complete the remote address specified by {remote_address} and
1043  * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1044  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1045  * however, that not all btls will provide consistency between btl atomic operations and
1046  * cpu or other btl atomics.
1047  */
1048 typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
1049     struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
1050     struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1051     uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1052     void *cbcontext, void *cbdata);
1053 
1054 /**
1055  * Initiate an asynchronous fetching atomic operation.
1056  * Completion Semantics: if this function returns a 1 then the operation
1057  *                       is complete. a return of OPAL_SUCCESS indicates
1058  *                       the atomic operation has been queued with the
1059  *                       network.
1060  *
1061  * @param btl (IN)            BTL module
1062  * @param endpoint (IN)       BTL addressing information
1063  * @param local_address (OUT) Local address to store the result in
1064  * @param remote_address (IN) Remote address perfom operation on to (registered remotely)
1065  * @param local_handle (IN)   Local registration handle for region containing
1066  *                            (local_address, local_address + 8)
1067  * @param remote_handle (IN)  Remote registration handle for region containing
1068  *                            (remote_address, remote_address + 8)
1069  * @param op (IN)             Operation to perform
1070  * @param operand (IN)        Operand for the operation
1071  * @param flags (IN)          Flags for this atomic operation
1072  * @param order (IN)          Ordering
1073  * @param cbfunc (IN)         Function to call on completion (if queued)
1074  * @param cbcontext (IN)      Context for the callback
1075  * @param cbdata (IN)         Data for callback
1076  *
1077  * @retval OPAL_SUCCESS    The operation was successfully queued
1078  * @retval 1               The operation is complete
1079  * @retval OPAL_ERROR      The operation was NOT successfully queued
1080  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1081  *                         operation. Try again later
1082  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation can not be performed due to
1083  *                         alignment restrictions or the operation {op} is not supported
1084  *                         by the hardware.
1085  *
1086  * After the operation is complete the remote address specified by {remote_address} and
1087  * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
1088  * {local_address} will be updated with the previous value stored in {remote_address}.
1089  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1090  * however, that not all btls will provide consistency between btl atomic operations and
1091  * cpu or other btl atomics.
1092  */
1093 typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
1094     struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1095     struct mca_btl_base_registration_handle_t *local_handle,
1096     struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
1097     uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1098     void *cbcontext, void *cbdata);
1099 
1100 /**
1101  * Initiate an asynchronous compare and swap operation.
1102  * Completion Semantics: if this function returns a 1 then the operation
1103  *                       is complete. a return of OPAL_SUCCESS indicates
1104  *                       the atomic operation has been queued with the
1105  *                       network.
1106  *
1107  * @param btl (IN)            BTL module
1108  * @param endpoint (IN)       BTL addressing information
1109  * @param local_address (OUT) Local address to store the result in
1110  * @param remote_address (IN) Remote address perfom operation on to (registered remotely)
1111  * @param local_handle (IN)   Local registration handle for region containing
1112  *                            (local_address, local_address + 8)
1113  * @param remote_handle (IN)  Remote registration handle for region containing
1114  *                            (remote_address, remote_address + 8)
1115  * @param compare (IN)        Operand for the operation
1116  * @param value (IN)          Value to store on success
1117  * @param flags (IN)          Flags for this atomic operation
1118  * @param order (IN)          Ordering
1119  * @param cbfunc (IN)         Function to call on completion (if queued)
1120  * @param cbcontext (IN)      Context for the callback
1121  * @param cbdata (IN)         Data for callback
1122  *
1123  * @retval OPAL_SUCCESS    The operation was successfully queued
1124  * @retval 1               The operation is complete
1125  * @retval OPAL_ERROR      The operation was NOT successfully queued
1126  * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the atomic
1127  *                         operation. Try again later
1128  * @retval OPAL_ERR_NOT_AVAILABLE  Atomic operation can not be performed due to
1129  *                         alignment restrictions or the operation {op} is not supported
1130  *                         by the hardware.
1131  *
1132  * After the operation is complete the remote address specified by {remote_address} and
1133  * {remote_handle} will be updated with {value} if *remote_address == compare.
1134  * {local_address} will be updated with the previous value stored in {remote_address}.
1135  * The btl will guarantee consistency of atomic operations performed via the btl. Note,
1136  * however, that not all btls will provide consistency between btl atomic operations and
1137  * cpu atomics.
1138  */
1139 typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl,
1140     struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
1141     struct mca_btl_base_registration_handle_t *local_handle,
1142     struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
1143     uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
1144     void *cbcontext, void *cbdata);
1145 
/**
 * Dump btl state for diagnostic purposes.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL endpoint
 * @param verbose (IN)     Verbosity level
 */
typedef void (*mca_btl_base_module_dump_fn_t) (struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint, int verbose);
1159 
/**
 * Fault Tolerance Event Notification Function
 *
 * @param state (IN)  Checkpoint Status
 *
 * @return OPAL_SUCCESS or failure status
 */
typedef int (*mca_btl_base_module_ft_event_fn_t)(
    int state
);
1166 
1167 /**
1168  * BTL module interface functions and attributes.
1169  */
1170 struct mca_btl_base_module_t {
1171 
1172     /* BTL common attributes */
1173     mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
1174     size_t      btl_eager_limit;      /**< maximum size of first fragment -- eager send */
1175     size_t      btl_rndv_eager_limit;    /**< the size of a data sent in a first fragment of rendezvous protocol */
1176     size_t      btl_max_send_size;    /**< maximum send fragment size supported by the BTL */
1177     size_t      btl_rdma_pipeline_send_length; /**< amount of bytes that should be send by pipeline protocol */
1178     size_t      btl_rdma_pipeline_frag_size; /**< maximum rdma fragment size supported by the BTL */
1179     size_t      btl_min_rdma_pipeline_size; /**< minimum packet size for pipeline protocol  */
1180     uint32_t    btl_exclusivity;      /**< indicates this BTL should be used exclusively */
1181     uint32_t    btl_latency;          /**< relative ranking of latency used to prioritize btls */
1182     uint32_t    btl_bandwidth;        /**< bandwidth (Mbytes/sec) supported by each endpoint */
1183     uint32_t    btl_flags;            /**< flags (put/get...) */
1184     uint32_t    btl_atomic_flags;     /**< atomic operations supported (add, and, xor, etc) */
1185     size_t      btl_registration_handle_size; /**< size of the BTLs registration handles */
1186 
1187     /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
1188     size_t      btl_get_limit;        /**< maximum size supported by the btl_get function */
1189     size_t      btl_get_alignment;    /**< minimum alignment/size needed by btl_get (power of 2) */
1190     size_t      btl_put_limit;        /**< maximum size supported by the btl_put function */
1191     size_t      btl_put_alignment;    /**< minimum alignment/size needed by btl_put (power of 2) */
1192 
1193     /* minimum transaction sizes for which registration is required for local memory */
1194     size_t      btl_get_local_registration_threshold;
1195     size_t      btl_put_local_registration_threshold;
1196 
1197     /* BTL function table */
1198     mca_btl_base_module_add_procs_fn_t      btl_add_procs;
1199     mca_btl_base_module_del_procs_fn_t      btl_del_procs;
1200     mca_btl_base_module_register_fn_t       btl_register;
1201     mca_btl_base_module_finalize_fn_t       btl_finalize;
1202 
1203     mca_btl_base_module_alloc_fn_t          btl_alloc;
1204     mca_btl_base_module_free_fn_t           btl_free;
1205     mca_btl_base_module_prepare_fn_t        btl_prepare_src;
1206     mca_btl_base_module_send_fn_t           btl_send;
1207     mca_btl_base_module_sendi_fn_t          btl_sendi;
1208     mca_btl_base_module_put_fn_t            btl_put;
1209     mca_btl_base_module_get_fn_t            btl_get;
1210     mca_btl_base_module_dump_fn_t           btl_dump;
1211 
1212     /* atomic operations */
1213     mca_btl_base_module_atomic_op64_fn_t    btl_atomic_op;
1214     mca_btl_base_module_atomic_fop64_fn_t   btl_atomic_fop;
1215     mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap;
1216 
1217     /* new memory registration functions */
1218     mca_btl_base_module_register_mem_fn_t   btl_register_mem;   /**< memory registration function (NULL if not needed) */
1219     mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
1220 
1221     /** the mpool associated with this btl (optional) */
1222     mca_mpool_base_module_t*             btl_mpool;
1223     /** register a default error handler */
1224     mca_btl_base_module_register_error_fn_t btl_register_error;
1225     /** fault tolerant even notification */
1226     mca_btl_base_module_ft_event_fn_t btl_ft_event;
1227 #if OPAL_CUDA_GDR_SUPPORT
1228     size_t      btl_cuda_eager_limit;  /**< switch from eager to RDMA */
1229     size_t      btl_cuda_rdma_limit;   /**< switch from RDMA to rndv pipeline */
1230 #endif /* OPAL_CUDA_GDR_SUPPORT */
1231 #if OPAL_CUDA_SUPPORT
1232     size_t      btl_cuda_max_send_size;   /**< set if CUDA max send_size is different from host max send size */
1233 #endif /* OPAL_CUDA_SUPPORT */
1234 };
1235 typedef struct mca_btl_base_module_t mca_btl_base_module_t;
1236 
/*
 * Version macro for modules that are of type btl v3.0.0.
 *
 * NOTE: This is not the final version of 3.0.0. Consider it
 * alpha until this comment is removed.
 */
#define MCA_BTL_BASE_VERSION_3_0_0 \
    OPAL_MCA_BASE_VERSION_2_1_0("btl", 3, 0, 0)
1244 
/*
 * Expands to a comma-separated initializer fragment: the btl v3.0.0
 * base version, the .mca_component_name designator set to {name}, and
 * MCA_BASE_MAKE_VERSION applied to the OPAL major/minor/release
 * version macros.
 */
#define MCA_BTL_DEFAULT_VERSION(name)                                    \
    MCA_BTL_BASE_VERSION_3_0_0,                                          \
    .mca_component_name = name,                                          \
    MCA_BASE_MAKE_VERSION(component,                                     \
                          OPAL_MAJOR_VERSION,                            \
                          OPAL_MINOR_VERSION,                            \
                          OPAL_RELEASE_VERSION)
1250 
1251 END_C_DECLS
1252 
1253 #endif /* OPAL_MCA_BTL_H */
1254