/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2016 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2018 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Byte Transfer Layer (BTL)
 *
 *
 * BTL Initialization:
 *
 * During library initialization, all available BTL components are
 * loaded and opened via their mca_base_open_component_fn_t
 * function. The BTL open function should register any mca parameters
 * used to tune/adjust the behaviour of the BTL (mca_base_var_register()
 * or mca_base_component_var_register()). Note that the open function may fail
 * if the resources (e.g. shared libraries, etc) required by the network
 * transport are not available.
 *
 * The mca_btl_base_component_init_fn_t() is then called for each of the
 * components that are successfully opened. The component init function may
 * return either:
 *
 * (1) a NULL list of BTL modules if the transport is not available,
 * (2) a list containing one or more BTL modules, where the BTL provides
 *     a layer of abstraction over one or more physical devices (e.g. NICs),
 *
 * During module initialization, the module should post any addressing
 * information required by its peers. An example would be the TCP
 * listen port opened by the TCP module for incoming connection
 * requests. This information is published to peers via the
 * modex_send() interface. Note that peer information is not
 * guaranteed to be available via modex_recv() during the
 * module's init function. However, it will be available during
 * BTL selection (mca_btl_base_add_proc_fn_t()).
 *
 * BTL Selection:
 *
 * The upper layer builds an ordered list of the available BTL modules sorted
 * by their exclusivity ranking. This is a relative ranking that is used
 * to determine the set of BTLs that may be used to reach a given destination.
 * During startup the BTL modules are queried via their
 * mca_btl_base_add_proc_fn_t() to determine if they are able to reach
 * a given destination. The BTL module with the highest ranking that
 * returns success is selected. Subsequent BTL modules are selected only
 * if they have the same exclusivity ranking.
 *
 * An example of how this might be used:
 *
 *   BTL         Exclusivity   Comments
 *   --------    -----------   ------------------
 *   LO              100       Selected exclusively for local process
 *   SM               50       Selected exclusively for other processes on host
 *   IB                0       Selected based on network reachability
 *   IB                0       Selected based on network reachability
 *   TCP               0       Selected based on network reachability
 *   TCP               0       Selected based on network reachability
 *
 * When mca_btl_base_add_proc_fn_t() is called on a BTL module, the BTL
 * will populate an OUT variable with mca_btl_base_endpoint_t pointers.
 * Each pointer is treated as an opaque handle by the upper layer and is
 * returned to the BTL on subsequent data transfer calls to the
 * corresponding destination process. The actual contents of the
 * data structure are defined on a per BTL basis, and may be used to
 * cache addressing or connection information, such as a TCP socket
 * or IB queue pair.
 *
 * Progress:
 *
 * By default, the library provides for polling based progress of outstanding
 * requests. The BTL component exports an interface function (btl_progress)
 * that is called in a polling mode by the PML during calls into the MPI
 * library. Note that the btl_progress() function is called on the BTL component
 * rather than each BTL module. This implies that the BTL author is responsible
 * for iterating over the pending operations in each of the BTL modules associated
 * with the component.
 *
 * On platforms where threading support is provided, the library provides the
 * option of building with asynchronous threaded progress. In this case, the BTL
 * author is responsible for providing a thread to progress pending operations.
 * A thread is associated with the BTL component/module such that transport specific
 * functionality/APIs may be used to block the thread until a pending operation
 * completes. This thread MUST NOT poll for completion as this would oversubscribe
 * the CPU.
 *
 * Note that in the threaded case the PML may choose to use a hybrid approach,
 * such that polling is implemented from the user thread for a fixed number of
 * cycles before relying on the background thread(s) to complete requests. If
 * possible the BTL should support the use of both modes concurrently.
112 * 113 */ 114 115 #ifndef OPAL_MCA_BTL_H 116 #define OPAL_MCA_BTL_H 117 118 #include "opal_config.h" 119 #include "opal/types.h" 120 #include "opal/prefetch.h" /* For OPAL_LIKELY */ 121 #include "opal/class/opal_bitmap.h" 122 #include "opal/datatype/opal_convertor.h" 123 #include "opal/mca/mca.h" 124 #include "opal/mca/mpool/mpool.h" 125 #include "opal/mca/rcache/rcache.h" 126 #include "opal/mca/crs/crs.h" 127 #include "opal/mca/crs/base/base.h" 128 129 BEGIN_C_DECLS 130 131 /* 132 * BTL types 133 */ 134 135 struct mca_btl_base_module_t; 136 struct mca_btl_base_endpoint_t; 137 struct mca_btl_base_descriptor_t; 138 struct mca_mpool_base_resources_t; 139 struct opal_proc_t; 140 141 /** 142 * Opaque registration handle for executing RDMA and atomic 143 * operations on a memory region. 144 * 145 * This data inside this handle is appropriate for passing 146 * to remote peers to execute RDMA and atomic operations. The 147 * size needed to send the registration handle can be 148 * obtained from the btl via the btl_registration_handle_size 149 * member. If this size is 0 then no registration data is 150 * needed to execute RDMA or atomic operations. 151 */ 152 struct mca_btl_base_registration_handle_t; 153 typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t; 154 155 156 /* Wildcard endpoint for use in the register_mem function */ 157 #define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1 158 159 /* send/recv operations require tag matching */ 160 typedef uint8_t mca_btl_base_tag_t; 161 162 #define MCA_BTL_NO_ORDER 255 163 164 /* 165 * Communication specific defines. There are a number of active message ID 166 * that can be shred between all frameworks that need to communicate (i.e. 167 * use the PML or the BTL directly). These ID are exchanged between the 168 * processes, therefore they need to be identical everywhere. 
The simplest 169 * approach is to have them defined as constants, and give each framework a 170 * small number. Here is the rule that defines these ID (they are 8 bits): 171 * - the first 3 bits are used to code the framework (i.e. PML, OSC, COLL) 172 * - the remaining 5 bytes are used internally by the framework, and divided 173 * based on the components requirements. Therefore, the way the PML and 174 * the OSC frameworks use these defines will be different. For more 175 * information about how these framework ID are defined, take a look in the 176 * header file associated with the framework. 177 */ 178 #define MCA_BTL_AM_FRAMEWORK_MASK 0xD0 179 #define MCA_BTL_TAG_BTL 0x20 180 #define MCA_BTL_TAG_PML 0x40 181 #define MCA_BTL_TAG_OSC_RDMA 0x60 182 #define MCA_BTL_TAG_USR 0x80 183 #define MCA_BTL_TAG_MAX 255 /* 1 + highest allowed tag num */ 184 185 /* 186 * Reserved tags for specific BTLs. As multiple BTLs can be active 187 * simultaneously, their tags should not collide. 188 */ 189 #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) 190 #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) 191 #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) 192 #define MCA_BTL_TAG_VADER (MCA_BTL_TAG_BTL + 3) 193 194 /* prefered protocol */ 195 #define MCA_BTL_FLAGS_SEND 0x0001 196 #define MCA_BTL_FLAGS_PUT 0x0002 197 #define MCA_BTL_FLAGS_GET 0x0004 198 /* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML 199 * rdma_btls list. This allows the updated one-sided component to 200 * use btls that are not otherwise used for send/recv. 
*/ 201 #define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT) 202 203 /* btl can send directly from user buffer w/out registration */ 204 #define MCA_BTL_FLAGS_SEND_INPLACE 0x0008 205 206 /* btl transport reliability flags - currently used only by the DR PML */ 207 #define MCA_BTL_FLAGS_NEED_ACK 0x0010 208 #define MCA_BTL_FLAGS_NEED_CSUM 0x0020 209 210 /** deprecated (BTL 3.0) */ 211 #define MCA_BTL_FLAGS_RDMA_MATCHED 0x0040 212 213 /* btl needs local rdma completion */ 214 #define MCA_BTL_FLAGS_RDMA_COMPLETION 0x0080 215 216 /* btl can do heterogeneous rdma operations on byte buffers */ 217 #define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA 0x0100 218 219 /* btl can support failover if enabled */ 220 #define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200 221 222 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400 223 #define MCA_BTL_FLAGS_CUDA_GET 0x0800 224 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT) 225 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000 226 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000 227 228 /* btl can support signaled operations. BTLs that support this flag are 229 * expected to provide a mechanism for asynchronous progress on descriptors 230 * where the feature is requested. BTLs should also be aware that users can 231 * (and probably will) turn this flag on and off using the MCA variable 232 * system. 233 */ 234 #define MCA_BTL_FLAGS_SIGNALED 0x4000 235 236 /** The BTL supports network atomic operations */ 237 #define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000 238 /** The BTL supports fetching network atomic operations */ 239 #define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000 240 241 /** The BTL requires add_procs to be with all procs including non-local. Shared-memory 242 * BTLs should not set this flag. 
*/ 243 #define MCA_BTL_FLAGS_SINGLE_ADD_PROCS 0x20000 244 245 /* The BTL is using progress thread and need the protection on matching */ 246 #define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED 0x40000 247 248 /* The BTL supports RMDA flush */ 249 #define MCA_BTL_FLAGS_RDMA_FLUSH 0x80000 250 251 /* Default exclusivity levels */ 252 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ 253 #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */ 254 #define MCA_BTL_EXCLUSIVITY_LOW 0 /* TCP used as a last resort */ 255 256 /* error callback flags */ 257 #define MCA_BTL_ERROR_FLAGS_FATAL 0x1 258 #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 259 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4 260 261 /** registration flags. the access flags are a 1-1 mapping with the mpool 262 * access flags. */ 263 enum { 264 /** Allow local write on the registered region. If a region is registered 265 * with this flag the registration can be used as the local handle for a 266 * btl_get operation. */ 267 MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_RCACHE_ACCESS_LOCAL_WRITE, 268 /** Allow remote read on the registered region. If a region is registered 269 * with this flag the registration can be used as the remote handle for a 270 * btl_get operation. */ 271 MCA_BTL_REG_FLAG_REMOTE_READ = MCA_RCACHE_ACCESS_REMOTE_READ, 272 /** Allow remote write on the registered region. If a region is registered 273 * with this flag the registration can be used as the remote handle for a 274 * btl_put operation. */ 275 MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_RCACHE_ACCESS_REMOTE_WRITE, 276 /** Allow remote atomic operations on the registered region. If a region is 277 * registered with this flag the registration can be used as the remote 278 * handle for a btl_atomic_op or btl_atomic_fop operation. */ 279 MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC, 280 /** Allow any btl operation on the registered region. 
If a region is registered 281 * with this flag the registration can be used as the local or remote handle for 282 * any btl operation. */ 283 MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_RCACHE_ACCESS_ANY, 284 #if OPAL_CUDA_GDR_SUPPORT 285 /** Region is in GPU memory */ 286 MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, 287 #endif 288 }; 289 290 /** supported atomic operations */ 291 enum { 292 /** The btl supports atomic add */ 293 MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001, 294 /** The btl supports atomic bitwise and */ 295 MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200, 296 /** The btl supports atomic bitwise or */ 297 MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400, 298 /** The btl supports atomic bitwise exclusive or */ 299 MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800, 300 301 /** The btl supports logical and */ 302 MCA_BTL_ATOMIC_SUPPORTS_LAND = 0x00001000, 303 /** The btl supports logical or */ 304 MCA_BTL_ATOMIC_SUPPORTS_LOR = 0x00002000, 305 /** The btl supports logical exclusive or */ 306 MCA_BTL_ATOMIC_SUPPORTS_LXOR = 0x00004000, 307 308 /** The btl supports atomic swap */ 309 MCA_BTL_ATOMIC_SUPPORTS_SWAP = 0x00010000, 310 311 /** The btl supports atomic min */ 312 MCA_BTL_ATOMIC_SUPPORTS_MIN = 0x00100000, 313 /** The btl supports atomic min */ 314 MCA_BTL_ATOMIC_SUPPORTS_MAX = 0x00200000, 315 316 /** The btl supports 32-bit integer operations. Keep in mind the btl may 317 * support only a subset of the available atomics. */ 318 MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x01000000, 319 320 /** The btl supports floating-point operations. Keep in mind the btl may 321 * support only a subset of the available atomics and may not support 322 * both 64 or 32-bit floating point. 
*/ 323 MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x02000000, 324 325 /** The btl supports atomic compare-and-swap */ 326 MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, 327 328 /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ 329 MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, 330 }; 331 332 enum { 333 /** Use 32-bit atomics */ 334 MCA_BTL_ATOMIC_FLAG_32BIT = 0x00000001, 335 /** Use floating-point atomics */ 336 MCA_BTL_ATOMIC_FLAG_FLOAT = 0x00000002, 337 }; 338 339 enum mca_btl_base_atomic_op_t { 340 /** Atomic add: (*remote_address) = (*remote_address) + operand */ 341 MCA_BTL_ATOMIC_ADD = 0x0001, 342 /** Atomic and: (*remote_address) = (*remote_address) & operand */ 343 MCA_BTL_ATOMIC_AND = 0x0011, 344 /** Atomic or: (*remote_address) = (*remote_address) | operand */ 345 MCA_BTL_ATOMIC_OR = 0x0012, 346 /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */ 347 MCA_BTL_ATOMIC_XOR = 0x0014, 348 /** Atomic logical and: (*remote_address) = (*remote_address) && operand */ 349 MCA_BTL_ATOMIC_LAND = 0x0015, 350 /** Atomic logical or: (*remote_address) = (*remote_address) || operand */ 351 MCA_BTL_ATOMIC_LOR = 0x0016, 352 /** Atomic logical xor: (*remote_address) = (*remote_address) != operand */ 353 MCA_BTL_ATOMIC_LXOR = 0x0017, 354 /** Atomic swap: (*remote_address) = operand */ 355 MCA_BTL_ATOMIC_SWAP = 0x001a, 356 /** Atomic min */ 357 MCA_BTL_ATOMIC_MIN = 0x0020, 358 /** Atomic max */ 359 MCA_BTL_ATOMIC_MAX = 0x0021, 360 361 MCA_BTL_ATOMIC_LAST, 362 }; 363 typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t; 364 365 /** 366 * Asynchronous callback function on completion of an operation. 367 * Completion Semantics: The descriptor can be reused or returned to the 368 * BTL via mca_btl_base_module_free_fn_t. The operation has been queued to 369 * the network device or will otherwise make asynchronous progress without 370 * subsequent calls to btl_progress. 
371 * 372 * @param[IN] module the BTL module 373 * @param[IN] endpoint the BTL endpoint 374 * @param[IN] descriptor the BTL descriptor 375 * 376 */ 377 typedef void (*mca_btl_base_completion_fn_t)( 378 struct mca_btl_base_module_t* module, 379 struct mca_btl_base_endpoint_t* endpoint, 380 struct mca_btl_base_descriptor_t* descriptor, 381 int status); 382 383 384 /** 385 * Asynchronous callback function on completion of an rdma or atomic operation. 386 * Completion Semantics: The rdma or atomic memory operation has completed 387 * remotely (i.e.) is remotely visible and the caller is free to deregister 388 * the local_handle or modify the memory in local_address. 389 * 390 * @param[IN] module the BTL module 391 * @param[IN] endpoint the BTL endpoint 392 * @param[IN] local_address local address for the operation (if any) 393 * @param[IN] local_handle local handle associated with the local_address 394 * @param[IN] context callback context supplied to the rdma/atomic operation 395 * @param[IN] cbdata callback data supplied to the rdma/atomic operation 396 * @param[IN] status status of the operation 397 * 398 */ 399 typedef void (*mca_btl_base_rdma_completion_fn_t)( 400 struct mca_btl_base_module_t* module, 401 struct mca_btl_base_endpoint_t* endpoint, 402 void *local_address, 403 struct mca_btl_base_registration_handle_t *local_handle, 404 void *context, 405 void *cbdata, 406 int status); 407 408 409 /** 410 * Describes a region/segment of memory that is addressable 411 * by an BTL. 412 * 413 * Note: In many cases the alloc and prepare methods of BTLs 414 * do not return a mca_btl_base_segment_t but instead return a 415 * subclass. Extreme care should be used when modifying 416 * BTL segments to prevent overwriting internal BTL data. 417 * 418 * All BTLs MUST use base segments when calling registered 419 * Callbacks. 420 * 421 * BTL MUST use mca_btl_base_segment_t or a subclass and 422 * MUST store their segment length in btl_seg_size. 
BTLs 423 * MUST specify a segment no larger than MCA_BTL_SEG_MAX_SIZE. 424 */ 425 426 struct mca_btl_base_segment_t { 427 /** Address of the memory */ 428 opal_ptr_t seg_addr; 429 /** Length in bytes */ 430 uint64_t seg_len; 431 }; 432 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t; 433 434 435 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && !defined(WORDS_BIGENDIAN) 436 #define MCA_BTL_BASE_SEGMENT_HTON(s) \ 437 (s).seg_addr.lval = hton64((s).seg_addr.lval); \ 438 (s).seg_len = hton64((s).seg_len); 439 #define MCA_BTL_BASE_SEGMENT_NTOH(s) \ 440 (s).seg_addr.lval = ntoh64((s).seg_addr.lval); \ 441 (s).seg_len = ntoh64((s).seg_len); 442 #else 443 #define MCA_BTL_BASE_SEGMENT_HTON(s) 444 #define MCA_BTL_BASE_SEGMENT_NTOH(s) 445 #endif 446 /** 447 * A descriptor that holds the parameters to a send/put/get 448 * operation along w/ a callback routine that is called on 449 * completion of the request. 450 * Note: receive callbacks will store the incomming data segments in 451 * des_segments 452 */ 453 454 struct mca_btl_base_descriptor_t { 455 opal_free_list_item_t super; 456 mca_btl_base_segment_t *des_segments; /**< local segments */ 457 size_t des_segment_count; /**< number of local segments */ 458 mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */ 459 void* des_cbdata; /**< opaque callback data */ 460 void* des_context; /**< more opaque callback data */ 461 uint32_t des_flags; /**< hints to BTL */ 462 /** order value, this is only 463 valid in the local completion callback 464 and may be used in subsequent calls to 465 btl_alloc, btl_prepare_src to request 466 a descriptor that will be ordered w.r.t. 467 this descriptor 468 */ 469 uint8_t order; 470 }; 471 typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t; 472 473 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); 474 475 #define MCA_BTL_DES_FLAGS_PRIORITY 0x0001 476 /* Allow the BTL to dispose the descriptor once the callback 477 * associated was triggered. 
478 */ 479 #define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP 0x0002 480 /* Allow the BTL to avoid calling the descriptor callback 481 * if the send succeded in the btl_send (i.e in the fast path). 482 */ 483 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004 484 485 /* Tell the PML that the copy is being done asynchronously 486 */ 487 #define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008 488 489 /* Type of transfer that will be done with this frag. 490 */ 491 #define MCA_BTL_DES_FLAGS_PUT 0x0010 492 #define MCA_BTL_DES_FLAGS_GET 0x0020 493 494 /* Ask the BTL to wake the remote process (send/sendi) or local process 495 * (put/get) to handle this message. The BTL may ignore this flag if 496 * signaled operations are not supported. 497 */ 498 #define MCA_BTL_DES_FLAGS_SIGNAL 0x0040 499 500 /** 501 * Maximum number of allowed segments in src/dst fields of a descriptor. 502 */ 503 #define MCA_BTL_DES_MAX_SEGMENTS 16 504 505 /** 506 * Maximum size of a BTL segment (NTH: does it really save us anything 507 * to hardcode this?) 508 */ 509 #define MCA_BTL_SEG_MAX_SIZE 256 510 511 /** 512 * Maximum size of a BTL registration handle in bytes 513 */ 514 #define MCA_BTL_REG_HANDLE_MAX_SIZE 256 515 516 /* 517 * BTL base header, stores the tag at a minimum 518 */ 519 struct mca_btl_base_header_t{ 520 mca_btl_base_tag_t tag; 521 }; 522 typedef struct mca_btl_base_header_t mca_btl_base_header_t; 523 524 #define MCA_BTL_BASE_HEADER_HTON(hdr) 525 #define MCA_BTL_BASE_HEADER_NTOH(hdr) 526 527 /* 528 * BTL component interface functions and datatype. 529 */ 530 531 /** 532 * MCA->BTL Initializes the BTL component and creates specific BTL 533 * module(s). 534 * 535 * @param num_btls (OUT) Returns the number of btl modules created, or 0 536 * if the transport is not available. 537 * 538 * @param enable_progress_threads (IN) Whether this component is 539 * allowed to run a hidden/progress thread or not. 
540 * 541 * @param enable_mpi_threads (IN) Whether support for multiple MPI 542 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which 543 * indicates whether multiple threads may invoke this component 544 * simultaneously or not. 545 * 546 * @return Array of pointers to BTL modules, or NULL if the transport 547 * is not available. 548 * 549 * During component initialization, the BTL component should discover 550 * the physical devices that are available for the given transport, 551 * and create a BTL module to represent each device. Any addressing 552 * information required by peers to reach the device should be published 553 * during this function via the modex_send() interface. 554 * 555 */ 556 557 typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)( 558 int *num_btls, 559 bool enable_progress_threads, 560 bool enable_mpi_threads 561 ); 562 563 /** 564 * MCA->BTL Called to progress outstanding requests for 565 * non-threaded polling environments. 566 * 567 * @return Count of "completions", a metric of 568 * how many items where completed in the call 569 * to progress. 570 */ 571 572 typedef int (*mca_btl_base_component_progress_fn_t)(void); 573 574 575 /** 576 * Callback function that is called asynchronously on receipt 577 * of data by the transport layer. 578 * Note that the the mca_btl_base_descriptor_t is only valid within the 579 * completion function, this implies that all data payload in the 580 * mca_btl_base_descriptor_t must be copied out within this callback or 581 * forfeited back to the BTL. 582 * Note also that descriptor segments (des_segments) must be base 583 * segments for all callbacks. 
584 * 585 * @param[IN] btl BTL module 586 * @param[IN] tag The active message receive callback tag value 587 * @param[IN] descriptor The BTL descriptor (contains the receive payload) 588 * @param[IN] cbdata Opaque callback data 589 */ 590 591 typedef void (*mca_btl_base_module_recv_cb_fn_t)( 592 struct mca_btl_base_module_t* btl, 593 mca_btl_base_tag_t tag, 594 mca_btl_base_descriptor_t* descriptor, 595 void* cbdata 596 ); 597 598 typedef struct mca_btl_active_message_callback_t { 599 mca_btl_base_module_recv_cb_fn_t cbfunc; 600 void* cbdata; 601 } mca_btl_active_message_callback_t; 602 603 OPAL_DECLSPEC extern 604 mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TAG_MAX]; 605 606 /** 607 * BTL component descriptor. Contains component version information 608 * and component open/close/init functions. 609 */ 610 611 struct mca_btl_base_component_3_0_0_t { 612 mca_base_component_t btl_version; 613 mca_base_component_data_t btl_data; 614 mca_btl_base_component_init_fn_t btl_init; 615 mca_btl_base_component_progress_fn_t btl_progress; 616 }; 617 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t; 618 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t; 619 620 /* add the 2_0_0_t typedef for source compatibility 621 * we can do this safely because 2_0_0 components are the same as 622 * 3_0_0 components, the difference is in the btl module. 623 * Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and 624 * can not be used with the new interface. 625 */ 626 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t; 627 628 629 /* 630 * BTL module interface functions and datatype. 631 */ 632 633 /** 634 * MCA->BTL Clean up any resources held by BTL module 635 * before the module is unloaded. 636 * 637 * @param btl (IN) BTL module. 638 * @return OPAL_SUCCESS or error status on failure. 
639 * 640 * Prior to unloading a BTL module, the MCA framework will call 641 * the BTL finalize method of the module. Any resources held by 642 * the BTL should be released and if required the memory corresponding 643 * to the BTL module freed. 644 * 645 */ 646 typedef int (*mca_btl_base_module_finalize_fn_t)( 647 struct mca_btl_base_module_t* btl 648 ); 649 650 /** 651 * BML->BTL notification of change in the process list. 652 * 653 * @param btl (IN) BTL module 654 * @param nprocs (IN) Number of processes 655 * @param procs (IN) Array of processes 656 * @param endpoint (OUT) Array of mca_btl_base_endpoint_t structures by BTL. 657 * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL. 658 * @return OPAL_SUCCESS or error status on failure. 659 * 660 * The mca_btl_base_module_add_procs_fn_t() is called by the BML to 661 * determine the set of BTLs that should be used to reach each process. 662 * Any addressing information exported by the peer via the modex_send() 663 * function should be available during this call via the corresponding 664 * modex_recv() function. The BTL may utilize this information to 665 * determine reachability of each peer process. 666 * 667 * The caller may pass a "reachable" bitmap pointer. If it is not 668 * NULL, for each process that is reachable by the BTL, the bit 669 * corresponding to the index into the proc array (nprocs) should be 670 * set in the reachable bitmask. The BTL will return an array of 671 * pointers to a data structure defined by the BTL that is then 672 * returned to the BTL on subsequent calls to the BTL data transfer 673 * functions (e.g btl_send). This may be used by the BTL to cache any 674 * addressing or connection information (e.g. TCP socket, IB queue 675 * pair). 
676 */ 677 typedef int (*mca_btl_base_module_add_procs_fn_t)( 678 struct mca_btl_base_module_t* btl, 679 size_t nprocs, 680 struct opal_proc_t** procs, 681 struct mca_btl_base_endpoint_t** endpoints, 682 struct opal_bitmap_t* reachable 683 ); 684 685 /** 686 * Notification of change to the process list. 687 * 688 * @param btl (IN) BTL module 689 * @param nprocs (IN) Number of processes 690 * @param proc (IN) Set of processes 691 * @param peer (IN) Set of peer addressing information. 692 * @return Status indicating if cleanup was successful 693 * 694 * When the process list changes, the BML notifies the BTL of the 695 * change, to provide the opportunity to cleanup or release any 696 * resources associated with the peer. 697 */ 698 typedef int (*mca_btl_base_module_del_procs_fn_t)( 699 struct mca_btl_base_module_t* btl, 700 size_t nprocs, 701 struct opal_proc_t** procs, 702 struct mca_btl_base_endpoint_t** peer 703 ); 704 705 /** 706 * Register a callback function that is called on receipt 707 * of a fragment. 
708 * 709 * @param[IN] btl BTL module 710 * @param[IN] tag tag value of this callback 711 * (specified on subsequent send operations) 712 * @param[IN] cbfunc The callback function 713 * @param[IN] cbdata Opaque callback data 714 * 715 * @return OPAL_SUCCESS The callback was registered successfully 716 * @return OPAL_ERROR The callback was NOT registered successfully 717 * 718 */ 719 typedef int (*mca_btl_base_module_register_fn_t)( 720 struct mca_btl_base_module_t* btl, 721 mca_btl_base_tag_t tag, 722 mca_btl_base_module_recv_cb_fn_t cbfunc, 723 void* cbdata 724 ); 725 726 727 /** 728 * Callback function that is called asynchronously on receipt 729 * of an error from the transport layer 730 * 731 * @param[IN] btl BTL module 732 * @param[IN] flags type of error 733 * @param[IN] errproc process that had an error 734 * @param[IN] btlinfo descriptive string from the BTL 735 */ 736 737 typedef void (*mca_btl_base_module_error_cb_fn_t)( 738 struct mca_btl_base_module_t* btl, 739 int32_t flags, 740 struct opal_proc_t* errproc, 741 char* btlinfo 742 ); 743 744 745 /** 746 * Register a callback function that is called on receipt 747 * of an error. 748 * 749 * @param[IN] btl BTL module 750 * @param[IN] cbfunc The callback function 751 * 752 * @return OPAL_SUCCESS The callback was registered successfully 753 * @return OPAL_ERROR The callback was NOT registered successfully 754 * 755 */ 756 typedef int (*mca_btl_base_module_register_error_fn_t)( 757 struct mca_btl_base_module_t* btl, 758 mca_btl_base_module_error_cb_fn_t cbfunc 759 ); 760 761 762 /** 763 * Allocate a descriptor with a segment of the requested size. 764 * Note that the BTL layer may choose to return a smaller size 765 * if it cannot support the request. The order tag value ensures that 766 * operations on the descriptor that is allocated will be 767 * ordered w.r.t. a previous operation on a particular descriptor. 
768 * Ordering is only guaranteed if the previous descriptor had its 769 * local completion callback function called and the order tag of 770 * that descriptor is only valid upon the local completion callback function. 771 * 772 * 773 * @param btl (IN) BTL module 774 * @param size (IN) Request segment size. 775 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 776 */ 777 778 typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)( 779 struct mca_btl_base_module_t* btl, 780 struct mca_btl_base_endpoint_t* endpoint, 781 uint8_t order, 782 size_t size, 783 uint32_t flags 784 ); 785 786 /** 787 * Return a descriptor allocated from this BTL via alloc/prepare. 788 * A descriptor can only be deallocated after its local completion 789 * callback function has called for all send/put/get operations. 790 * 791 * @param btl (IN) BTL module 792 * @param segment (IN) Descriptor allocated from the BTL 793 */ 794 typedef int (*mca_btl_base_module_free_fn_t)( 795 struct mca_btl_base_module_t* btl, 796 mca_btl_base_descriptor_t* descriptor 797 ); 798 799 800 /** 801 * Prepare a descriptor for send using the supplied convertor. If the convertor 802 * references data that is contiguous, the descriptor may simply point to the 803 * user buffer. Otherwise, this routine is responsible for allocating buffer 804 * space and packing if required. 805 * 806 * The order tag value ensures that operations on the 807 * descriptor that is prepared will be ordered w.r.t. a previous 808 * operation on a particular descriptor. Ordering is only guaranteed if 809 * the previous descriptor had its local completion callback function 810 * called and the order tag of that descriptor is only valid upon the local 811 * completion callback function. 
812 * 813 * @param btl (IN) BTL module 814 * @param endpoint (IN) BTL peer addressing 815 * @param registration (IN) Memory registration 816 * @param convertor (IN) Data type convertor 817 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 818 * @param reserve (IN) Additional bytes requested by upper layer to precede user data 819 * @param size (IN/OUT) Number of bytes to prepare (IN), 820 * number of bytes actually prepared (OUT) 821 * 822 */ 823 typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( 824 struct mca_btl_base_module_t* btl, 825 struct mca_btl_base_endpoint_t* endpoint, 826 struct opal_convertor_t* convertor, 827 uint8_t order, 828 size_t reserve, 829 size_t* size, 830 uint32_t flags 831 ); 832 833 /** 834 * @brief Register a memory region for put/get/atomic operations. 835 * 836 * @param btl (IN) BTL module 837 * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) 838 * @param base (IN) Pointer to start of region 839 * @param size (IN) Size of region 840 * @param flags (IN) Flags including access permissions 841 * 842 * @returns a memory registration handle valid for both local and remote operations 843 * @returns NULL if the region could not be registered 844 * 845 * This function registers the specified region with the hardware for use with 846 * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop 847 * functions. Care should be taken to not hold an excessive number of registrations 848 * as they may use limited system/NIC resources. 849 * 850 * Ownership of the memory pointed to by the returned (struct 851 * mca_btl_base_registration_handle_t*) is passed to the caller. The 852 * BTL module cannot free or reuse the handle until it is returned via 853 * the mca_btl_base_module_deregister_mem_fn_t function. 
854 */ 855 typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)( 856 struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, 857 size_t size, uint32_t flags); 858 859 /** 860 * @brief Deregister a memory region 861 * 862 * @param btl (IN) BTL module region was registered with 863 * @param handle (IN) BTL registration handle to deregister 864 * 865 * This function deregisters the memory region associated with the specified handle. Care 866 * should be taken to not perform any RDMA or atomic operation on this memory region 867 * after it is deregistered. It is erroneous to specify a memory handle associated with 868 * a remote node. 869 * 870 * The handle passed in will be a value previously returned by the 871 * mca_btl_base_module_register_mem_fn_t function. Ownership of the 872 * memory pointed to by handle passes to the BTL module; this function 873 * is now is allowed to free the memory, return it to a freelist, etc. 874 */ 875 typedef int (*mca_btl_base_module_deregister_mem_fn_t)( 876 struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); 877 878 /** 879 * Initiate an asynchronous send. 880 * Completion Semantics: the descriptor has been queued for a send operation 881 * the BTL now controls the descriptor until local 882 * completion callback is made on the descriptor 883 * 884 * All BTLs allow multiple concurrent asynchronous send operations on a descriptor 885 * 886 * @param btl (IN) BTL module 887 * @param endpoint (IN) BTL addressing information 888 * @param descriptor (IN) Description of the data to be transfered 889 * @param tag (IN) The tag value used to notify the peer. 
890 * 891 * @retval OPAL_SUCCESS The descriptor was successfully queued for a send 892 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a send 893 * @retval OPAL_ERR_UNREACH The endpoint is not reachable 894 */ 895 typedef int (*mca_btl_base_module_send_fn_t)( 896 struct mca_btl_base_module_t* btl, 897 struct mca_btl_base_endpoint_t* endpoint, 898 struct mca_btl_base_descriptor_t* descriptor, 899 mca_btl_base_tag_t tag 900 ); 901 902 /** 903 * Initiate an immediate blocking send. 904 * Completion Semantics: the BTL will make a best effort 905 * to send the header and "size" bytes from the datatype using the convertor. 906 * The header is guaranteed to be delivered entirely in the first segment. 907 * Should the BTL be unable to deliver the data due to resource constraints 908 * the BTL will return a descriptor (via the OUT param) 909 * of size "payload_size + header_size". 910 * 911 * @param btl (IN) BTL module 912 * @param endpoint (IN) BTL addressing information 913 * @param convertor (IN) Data type convertor 914 * @param header (IN) Pointer to header. 915 * @param header_size (IN) Size of header. 916 * @param payload_size (IN) Size of payload (from convertor). 917 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 918 * @param flags (IN) Flags. 919 * @param tag (IN) The tag value used to notify the peer. 920 * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately 921 * (may be NULL). 
922 * 923 * @retval OPAL_SUCCESS The send was successfully queued 924 * @retval OPAL_ERROR The send failed 925 * @retval OPAL_ERR_UNREACH The endpoint is not reachable 926 * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned 927 * (via the OUT param) if descriptors are available 928 */ 929 930 typedef int (*mca_btl_base_module_sendi_fn_t)( 931 struct mca_btl_base_module_t* btl, 932 struct mca_btl_base_endpoint_t* endpoint, 933 struct opal_convertor_t* convertor, 934 void* header, 935 size_t header_size, 936 size_t payload_size, 937 uint8_t order, 938 uint32_t flags, 939 mca_btl_base_tag_t tag, 940 mca_btl_base_descriptor_t** descriptor 941 ); 942 943 /** 944 * Initiate an asynchronous put. 945 * Completion Semantics: if this function returns a 1 then the operation 946 * is complete. a return of OPAL_SUCCESS indicates 947 * the put operation has been queued with the 948 * network. the local_handle can not be deregistered 949 * until all outstanding operations on that handle 950 * have been completed. 
951 * 952 * @param btl (IN) BTL module 953 * @param endpoint (IN) BTL addressing information 954 * @param local_address (IN) Local address to put from (registered) 955 * @param remote_address (IN) Remote address to put to (registered remotely) 956 * @param local_handle (IN) Registration handle for region containing 957 * (local_address, local_address + size) 958 * @param remote_handle (IN) Remote registration handle for region containing 959 * (remote_address, remote_address + size) 960 * @param size (IN) Number of bytes to put 961 * @param flags (IN) Flags for this put operation 962 * @param order (IN) Ordering 963 * @param cbfunc (IN) Function to call on completion (if queued) 964 * @param cbcontext (IN) Context for the callback 965 * @param cbdata (IN) Data for callback 966 * 967 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put 968 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put 969 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put 970 * operation. Try again later 971 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or 972 * alignment restrictions. 973 */ 974 typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl, 975 struct mca_btl_base_endpoint_t *endpoint, void *local_address, 976 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, 977 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 978 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 979 980 /** 981 * Initiate an asynchronous get. 982 * Completion Semantics: if this function returns a 1 then the operation 983 * is complete. a return of OPAL_SUCCESS indicates 984 * the get operation has been queued with the 985 * network. the local_handle can not be deregistered 986 * until all outstanding operations on that handle 987 * have been completed. 
988 * 989 * @param btl (IN) BTL module 990 * @param endpoint (IN) BTL addressing information 991 * @param local_address (IN) Local address to put from (registered) 992 * @param remote_address (IN) Remote address to put to (registered remotely) 993 * @param local_handle (IN) Registration handle for region containing 994 * (local_address, local_address + size) 995 * @param remote_handle (IN) Remote registration handle for region containing 996 * (remote_address, remote_address + size) 997 * @param size (IN) Number of bytes to put 998 * @param flags (IN) Flags for this put operation 999 * @param order (IN) Ordering 1000 * @param cbfunc (IN) Function to call on completion (if queued) 1001 * @param cbcontext (IN) Context for the callback 1002 * @param cbdata (IN) Data for callback 1003 * 1004 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put 1005 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put 1006 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put 1007 * operation. Try again later 1008 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or 1009 * alignment restrictions. 1010 */ 1011 typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, 1012 struct mca_btl_base_endpoint_t *endpoint, void *local_address, 1013 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, 1014 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 1015 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 1016 1017 /** 1018 * Initiate an asynchronous atomic operation. 1019 * Completion Semantics: if this function returns a 1 then the operation 1020 * is complete. a return of OPAL_SUCCESS indicates 1021 * the atomic operation has been queued with the 1022 * network. 
1023 * 1024 * @param btl (IN) BTL module 1025 * @param endpoint (IN) BTL addressing information 1026 * @param remote_address (IN) Remote address to put to (registered remotely) 1027 * @param remote_handle (IN) Remote registration handle for region containing 1028 * (remote_address, remote_address + 8) 1029 * @param op (IN) Operation to perform 1030 * @param operand (IN) Operand for the operation 1031 * @param flags (IN) Flags for this atomic operation 1032 * @param order (IN) Ordering 1033 * @param cbfunc (IN) Function to call on completion (if queued) 1034 * @param cbcontext (IN) Context for the callback 1035 * @param cbdata (IN) Data for callback 1036 * 1037 * @retval OPAL_SUCCESS The operation was successfully queued 1038 * @retval 1 The operation is complete 1039 * @retval OPAL_ERROR The operation was NOT successfully queued 1040 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1041 * operation. Try again later 1042 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1043 * alignment restrictions or the operation {op} is not supported 1044 * by the hardware. 1045 * 1046 * After the operation is complete the remote address specified by {remote_address} and 1047 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. 1048 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1049 * however, that not all btls will provide consistency between btl atomic operations and 1050 * cpu or other btl atomics. 
1051 */ 1052 typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl, 1053 struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, 1054 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, 1055 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1056 void *cbcontext, void *cbdata); 1057 1058 /** 1059 * Initiate an asynchronous fetching atomic operation. 1060 * Completion Semantics: if this function returns a 1 then the operation 1061 * is complete. a return of OPAL_SUCCESS indicates 1062 * the atomic operation has been queued with the 1063 * network. 1064 * 1065 * @param btl (IN) BTL module 1066 * @param endpoint (IN) BTL addressing information 1067 * @param local_address (OUT) Local address to store the result in 1068 * @param remote_address (IN) Remote address perfom operation on to (registered remotely) 1069 * @param local_handle (IN) Local registration handle for region containing 1070 * (local_address, local_address + 8) 1071 * @param remote_handle (IN) Remote registration handle for region containing 1072 * (remote_address, remote_address + 8) 1073 * @param op (IN) Operation to perform 1074 * @param operand (IN) Operand for the operation 1075 * @param flags (IN) Flags for this atomic operation 1076 * @param order (IN) Ordering 1077 * @param cbfunc (IN) Function to call on completion (if queued) 1078 * @param cbcontext (IN) Context for the callback 1079 * @param cbdata (IN) Data for callback 1080 * 1081 * @retval OPAL_SUCCESS The operation was successfully queued 1082 * @retval 1 The operation is complete 1083 * @retval OPAL_ERROR The operation was NOT successfully queued 1084 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1085 * operation. 
Try again later 1086 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1087 * alignment restrictions or the operation {op} is not supported 1088 * by the hardware. 1089 * 1090 * After the operation is complete the remote address specified by {remote_address} and 1091 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. 1092 * {local_address} will be updated with the previous value stored in {remote_address}. 1093 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1094 * however, that not all btls will provide consistency between btl atomic operations and 1095 * cpu or other btl atomics. 1096 */ 1097 typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl, 1098 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, 1099 struct mca_btl_base_registration_handle_t *local_handle, 1100 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, 1101 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1102 void *cbcontext, void *cbdata); 1103 1104 /** 1105 * Initiate an asynchronous compare and swap operation. 1106 * Completion Semantics: if this function returns a 1 then the operation 1107 * is complete. a return of OPAL_SUCCESS indicates 1108 * the atomic operation has been queued with the 1109 * network. 
1110 * 1111 * @param btl (IN) BTL module 1112 * @param endpoint (IN) BTL addressing information 1113 * @param local_address (OUT) Local address to store the result in 1114 * @param remote_address (IN) Remote address perfom operation on to (registered remotely) 1115 * @param local_handle (IN) Local registration handle for region containing 1116 * (local_address, local_address + 8) 1117 * @param remote_handle (IN) Remote registration handle for region containing 1118 * (remote_address, remote_address + 8) 1119 * @param compare (IN) Operand for the operation 1120 * @param value (IN) Value to store on success 1121 * @param flags (IN) Flags for this atomic operation 1122 * @param order (IN) Ordering 1123 * @param cbfunc (IN) Function to call on completion (if queued) 1124 * @param cbcontext (IN) Context for the callback 1125 * @param cbdata (IN) Data for callback 1126 * 1127 * @retval OPAL_SUCCESS The operation was successfully queued 1128 * @retval 1 The operation is complete 1129 * @retval OPAL_ERROR The operation was NOT successfully queued 1130 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1131 * operation. Try again later 1132 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1133 * alignment restrictions or the operation {op} is not supported 1134 * by the hardware. 1135 * 1136 * After the operation is complete the remote address specified by {remote_address} and 1137 * {remote_handle} will be updated with {value} if *remote_address == compare. 1138 * {local_address} will be updated with the previous value stored in {remote_address}. 1139 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1140 * however, that not all btls will provide consistency between btl atomic operations and 1141 * cpu atomics. 
1142 */ 1143 typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl, 1144 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, 1145 struct mca_btl_base_registration_handle_t *local_handle, 1146 struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, 1147 uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1148 void *cbcontext, void *cbdata); 1149 1150 /** 1151 * Diagnostic dump of btl state. 1152 * 1153 * @param btl (IN) BTL module 1154 * @param endpoint (IN) BTL endpoint 1155 * @param verbose (IN) Verbosity level 1156 */ 1157 1158 typedef void (*mca_btl_base_module_dump_fn_t)( 1159 struct mca_btl_base_module_t* btl, 1160 struct mca_btl_base_endpoint_t* endpoint, 1161 int verbose 1162 ); 1163 1164 /** 1165 * Fault Tolerance Event Notification Function 1166 * @param state Checkpoint Status 1167 * @return OPAL_SUCCESS or failure status 1168 */ 1169 typedef int (*mca_btl_base_module_ft_event_fn_t)(int state); 1170 1171 /** 1172 * Flush all outstanding RDMA operations on an endpoint or all endpoints. 1173 * 1174 * @param btl (IN) BTL module 1175 * @param endpoint (IN) Endpoint to flush (NULL == all) 1176 * 1177 * This function returns when all outstanding RDMA (put, get, atomic) operations 1178 * that were started prior to the flush call have completed. This call does 1179 * NOT guarantee that all BTL callbacks have been completed. 1180 * 1181 * The BTL is allowed to ignore the endpoint parameter and flush *all* endpoints. 1182 */ 1183 typedef int (*mca_btl_base_module_flush_fn_t) (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); 1184 1185 /** 1186 * BTL module interface functions and attributes. 
1187 */ 1188 struct mca_btl_base_module_t { 1189 1190 /* BTL common attributes */ 1191 mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */ 1192 size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */ 1193 size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */ 1194 size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */ 1195 size_t btl_rdma_pipeline_send_length; /**< amount of bytes that should be send by pipeline protocol */ 1196 size_t btl_rdma_pipeline_frag_size; /**< maximum rdma fragment size supported by the BTL */ 1197 size_t btl_min_rdma_pipeline_size; /**< minimum packet size for pipeline protocol */ 1198 uint32_t btl_exclusivity; /**< indicates this BTL should be used exclusively */ 1199 uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */ 1200 uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */ 1201 uint32_t btl_flags; /**< flags (put/get...) 
*/ 1202 uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */ 1203 size_t btl_registration_handle_size; /**< size of the BTLs registration handles */ 1204 1205 /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */ 1206 size_t btl_get_limit; /**< maximum size supported by the btl_get function */ 1207 size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */ 1208 size_t btl_put_limit; /**< maximum size supported by the btl_put function */ 1209 size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */ 1210 1211 /* minimum transaction sizes for which registration is required for local memory */ 1212 size_t btl_get_local_registration_threshold; 1213 size_t btl_put_local_registration_threshold; 1214 1215 /* BTL function table */ 1216 mca_btl_base_module_add_procs_fn_t btl_add_procs; 1217 mca_btl_base_module_del_procs_fn_t btl_del_procs; 1218 mca_btl_base_module_register_fn_t btl_register; 1219 mca_btl_base_module_finalize_fn_t btl_finalize; 1220 1221 mca_btl_base_module_alloc_fn_t btl_alloc; 1222 mca_btl_base_module_free_fn_t btl_free; 1223 mca_btl_base_module_prepare_fn_t btl_prepare_src; 1224 mca_btl_base_module_send_fn_t btl_send; 1225 mca_btl_base_module_sendi_fn_t btl_sendi; 1226 mca_btl_base_module_put_fn_t btl_put; 1227 mca_btl_base_module_get_fn_t btl_get; 1228 mca_btl_base_module_dump_fn_t btl_dump; 1229 1230 /* atomic operations */ 1231 mca_btl_base_module_atomic_op64_fn_t btl_atomic_op; 1232 mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop; 1233 mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap; 1234 1235 /* new memory registration functions */ 1236 mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */ 1237 mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */ 1238 1239 /** the mpool associated with this btl (optional) 
*/ 1240 mca_mpool_base_module_t* btl_mpool; 1241 /** register a default error handler */ 1242 mca_btl_base_module_register_error_fn_t btl_register_error; 1243 /** fault tolerant even notification */ 1244 mca_btl_base_module_ft_event_fn_t btl_ft_event; 1245 #if OPAL_CUDA_GDR_SUPPORT 1246 size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */ 1247 size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */ 1248 #endif /* OPAL_CUDA_GDR_SUPPORT */ 1249 #if OPAL_CUDA_SUPPORT 1250 size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ 1251 #endif /* OPAL_CUDA_SUPPORT */ 1252 1253 mca_btl_base_module_flush_fn_t btl_flush; /**< flush all previous operations on an endpoint */ 1254 1255 unsigned char padding[256]; /**< padding to future-proof the btl module */ 1256 }; 1257 typedef struct mca_btl_base_module_t mca_btl_base_module_t; 1258 1259 /* 1260 * Macro for use in modules that are of type btl v3.1.0 1261 */ 1262 #define MCA_BTL_BASE_VERSION_3_1_0 \ 1263 OPAL_MCA_BASE_VERSION_2_1_0("btl", 3, 1, 0) 1264 1265 #define MCA_BTL_DEFAULT_VERSION(name) \ 1266 MCA_BTL_BASE_VERSION_3_1_0, \ 1267 .mca_component_name = name, \ 1268 MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, \ 1269 OPAL_RELEASE_VERSION) 1270 1271 /** 1272 * Convinience macro for detecting the BTL interface version. 1273 */ 1274 #define BTL_VERSION 310 1275 1276 END_C_DECLS 1277 1278 #endif /* OPAL_MCA_BTL_H */ 1279