1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 2 /* 3 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana 4 * University Research and Technology 5 * Corporation. All rights reserved. 6 * Copyright (c) 2004-2016 The University of Tennessee and The University 7 * of Tennessee Research Foundation. All rights 8 * reserved. 9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 10 * University of Stuttgart. All rights reserved. 11 * Copyright (c) 2004-2005 The Regents of the University of California. 12 * All rights reserved. 13 * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights 14 * reserved. 15 * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 16 * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. 17 * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. 18 * Copyright (c) 2015 Research Organization for Information Science 19 * and Technology (RIST). All rights reserved. 20 * $COPYRIGHT$ 21 * 22 * Additional copyrights may follow 23 * 24 * $HEADER$ 25 */ 26 /** 27 * @file 28 * 29 * Byte Transfer Layer (BTL) 30 * 31 * 32 * BTL Initialization: 33 * 34 * During library initialization, all available BTL components are 35 * loaded and opened via their mca_base_open_component_fn_t 36 * function. The BTL open function should register any mca parameters 37 * used to tune/adjust the behaviour of the BTL (mca_base_var_register(), 38 * mca_base_component_var_register()). Note that the open function may fail 39 * if the resources (e.g. shared libraries, etc) required by the network 40 * transport are not available. 41 * 42 * The mca_btl_base_component_init_fn_t() is then called for each of the 43 * components that are successfully opened.
The component init function may 44 * return either: 45 * 46 * (1) a NULL list of BTL modules if the transport is not available, or 47 * (2) a list containing one or more BTL modules, where each BTL module provides 48 * a layer of abstraction over one or more physical devices (e.g. NICs). 49 * 50 * During module initialization, the module should post any addressing 51 * information required by its peers. An example would be the TCP 52 * listen port opened by the TCP module for incoming connection 53 * requests. This information is published to peers via the 54 * modex_send() interface. Note that peer information is not 55 * guaranteed to be available via modex_recv() during the 56 * module's init function. However, it will be available during 57 * BTL selection (mca_btl_base_module_add_procs_fn_t()). 58 * 59 * BTL Selection: 60 * 61 * The upper layer builds an ordered list of the available BTL modules sorted 62 * by their exclusivity ranking. This is a relative ranking that is used 63 * to determine the set of BTLs that may be used to reach a given destination. 64 * During startup the BTL modules are queried via their 65 * mca_btl_base_module_add_procs_fn_t() to determine if they are able to reach 66 * a given destination. The BTL module with the highest ranking that 67 * returns success is selected. Subsequent BTL modules are selected only 68 * if they have the same exclusivity ranking. 69 * 70 * An example of how this might be used: 71 * 72 * BTL Exclusivity Comments 73 * -------- ----------- ------------------ 74 * LO 100 Selected exclusively for local process 75 * SM 50 Selected exclusively for other processes on host 76 * IB 0 Selected based on network reachability 77 * IB 0 Selected based on network reachability 78 * TCP 0 Selected based on network reachability 79 * TCP 0 Selected based on network reachability 80 * 81 * When mca_btl_base_module_add_procs_fn_t() is called on a BTL module, the BTL 82 * will populate an OUT variable with mca_btl_base_endpoint_t pointers.
83 * Each pointer is treated as an opaque handle by the upper layer and is 84 * returned to the BTL on subsequent data transfer calls to the 85 * corresponding destination process. The actual contents of the 86 * data structure are defined on a per BTL basis, and may be used to 87 * cache addressing or connection information, such as a TCP socket 88 * or IB queue pair. 89 * 90 * Progress: 91 * 92 * By default, the library provides for polling based progress of outstanding 93 * requests. The BTL component exports an interface function (btl_progress) 94 * that is called in a polling mode by the PML during calls into the MPI 95 * library. Note that the btl_progress() function is called on the BTL component 96 * rather than each BTL module. This implies that the BTL author is responsible 97 * for iterating over the pending operations in each of the BTL modules associated 98 * with the component. 99 * 100 * On platforms where threading support is provided, the library provides the 101 * option of building with asynchronous threaded progress. In this case, the BTL 102 * author is responsible for providing a thread to progress pending operations. 103 * A thread is associated with the BTL component/module such that transport specific 104 * functionality/APIs may be used to block the thread until a pending operation 105 * completes. This thread MUST NOT poll for completion as this would oversubscribe 106 * the CPU. 107 * 108 * Note that in the threaded case the PML may choose to use a hybrid approach, 109 * such that polling is implemented from the user thread for a fixed number of 110 * cycles before relying on the background thread(s) to complete requests. If 111 * possible the BTL should support the use of both modes concurrently. 
112 * 113 */ 114 115 #ifndef OPAL_MCA_BTL_H 116 #define OPAL_MCA_BTL_H 117 118 #include "opal_config.h" 119 #include "opal/types.h" 120 #include "opal/prefetch.h" /* For OPAL_LIKELY */ 121 #include "opal/class/opal_bitmap.h" 122 #include "opal/datatype/opal_convertor.h" 123 #include "opal/mca/mca.h" 124 #include "opal/mca/mpool/mpool.h" 125 #include "opal/mca/rcache/rcache.h" 126 #include "opal/mca/crs/crs.h" 127 #include "opal/mca/crs/base/base.h" 128 129 BEGIN_C_DECLS 130 131 /* 132 * BTL types 133 */ 134 135 struct mca_btl_base_module_t; 136 struct mca_btl_base_endpoint_t; 137 struct mca_btl_base_descriptor_t; 138 struct mca_mpool_base_resources_t; 139 struct opal_proc_t; 140 141 /** 142 * Opaque registration handle for executing RDMA and atomic 143 * operations on a memory region. 144 * 145 * The data inside this handle is appropriate for passing 146 * to remote peers to execute RDMA and atomic operations. The 147 * size needed to send the registration handle can be 148 * obtained from the btl via the btl_registration_handle_size 149 * member. If this size is 0 then no registration data is 150 * needed to execute RDMA or atomic operations. 151 */ 152 struct mca_btl_base_registration_handle_t; 153 typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t; 154 155 156 /* Wildcard endpoint for use in the register_mem function. The expansion is fully parenthesized so that the cast cannot bind to operators written next to the macro at the use site. */ 157 #define MCA_BTL_ENDPOINT_ANY ((struct mca_btl_base_endpoint_t *) -1) 158 159 /* send/recv operations require tag matching */ 160 typedef uint8_t mca_btl_base_tag_t; 161 /* order value meaning that no ordering is requested (accepted by the order parameters of the alloc/prepare functions) */ 162 #define MCA_BTL_NO_ORDER 255 163 164 /* 165 * Communication specific defines. There are a number of active message IDs 166 * that can be shared between all frameworks that need to communicate (i.e. 167 * use the PML or the BTL directly). These IDs are exchanged between the 168 * processes, therefore they need to be identical everywhere.
The simplest 169 * approach is to have them defined as constants, and give each framework a 170 * small number. Here is the rule that defines these IDs (they are 8 bits): 171 * - the first 3 bits are used to code the framework (i.e. PML, OSC, COLL) 172 * - the remaining 5 bits are used internally by the framework, and divided 173 * based on the components requirements. Therefore, the way the PML and 174 * the OSC frameworks use these defines will be different. For more 175 * information about how these framework IDs are defined, take a look in the 176 * header file associated with the framework. 177 */ /* Mask selecting the 3 framework bits (0x80|0x40|0x20). It must include 0x20, otherwise MCA_BTL_TAG_BTL masks to 0 and MCA_BTL_TAG_OSC_RDMA masks to MCA_BTL_TAG_PML. */ 178 #define MCA_BTL_AM_FRAMEWORK_MASK 0xE0 179 #define MCA_BTL_TAG_BTL 0x20 180 #define MCA_BTL_TAG_PML 0x40 181 #define MCA_BTL_TAG_OSC_RDMA 0x60 182 #define MCA_BTL_TAG_USR 0x80 183 #define MCA_BTL_TAG_MAX 255 /* 1 + highest allowed tag num */ 184 185 /* 186 * Reserved tags for specific BTLs. As multiple BTLs can be active 187 * simultaneously, their tags should not collide. 188 */ 189 #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) 190 #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) 191 #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) 192 193 /* preferred protocol */ 194 #define MCA_BTL_FLAGS_SEND 0x0001 195 #define MCA_BTL_FLAGS_PUT 0x0002 196 #define MCA_BTL_FLAGS_GET 0x0004 197 /* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML 198 * rdma_btls list. This allows the updated one-sided component to 199 * use btls that are not otherwise used for send/recv.
*/ 200 #define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT) 201 202 /* btl can send directly from user buffer w/out registration */ 203 #define MCA_BTL_FLAGS_SEND_INPLACE 0x0008 204 205 /* btl transport reliability flags - currently used only by the DR PML */ 206 #define MCA_BTL_FLAGS_NEED_ACK 0x0010 207 #define MCA_BTL_FLAGS_NEED_CSUM 0x0020 208 209 /** deprecated (BTL 3.0) */ 210 #define MCA_BTL_FLAGS_RDMA_MATCHED 0x0040 211 212 /* btl needs local rdma completion */ 213 #define MCA_BTL_FLAGS_RDMA_COMPLETION 0x0080 214 215 /* btl can do heterogeneous rdma operations on byte buffers */ 216 #define MCA_BTL_FLAGS_HETEROGENEOUS_RDMA 0x0100 217 218 /* btl can support failover if enabled */ 219 #define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200 220 /* CUDA-specific flags; MCA_BTL_FLAGS_CUDA_RDMA is the combination of the CUDA get and CUDA put flags */ 221 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400 222 #define MCA_BTL_FLAGS_CUDA_GET 0x0800 223 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT) 224 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000 225 #define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000 226 227 /* btl can support signaled operations. BTLs that support this flag are 228 * expected to provide a mechanism for asynchronous progress on descriptors 229 * where the feature is requested. BTLs should also be aware that users can 230 * (and probably will) turn this flag on and off using the MCA variable 231 * system. 232 */ 233 #define MCA_BTL_FLAGS_SIGNALED 0x4000 234 235 /** The BTL supports network atomic operations */ 236 #define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000 237 /** The BTL supports fetching network atomic operations */ 238 #define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000 239 240 /** The BTL requires add_procs to be called with all procs including non-local. Shared-memory 241 * BTLs should not set this flag.
*/ 242 #define MCA_BTL_FLAGS_SINGLE_ADD_PROCS 0x20000 243 244 /* The BTL is using a progress thread and needs protection on matching */ 245 #define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED 0x40000 246 247 /* Default exclusivity levels */ 248 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ 249 #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */ 250 #define MCA_BTL_EXCLUSIVITY_LOW 0 /* TCP used as a last resort */ 251 252 /* error callback flags */ 253 #define MCA_BTL_ERROR_FLAGS_FATAL 0x1 254 #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 255 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4 256 257 /** registration flags. the access flags are a 1-1 mapping with the rcache 258 * access flags (MCA_RCACHE_ACCESS_*). */ 259 enum { 260 /** Allow local write on the registered region. If a region is registered 261 * with this flag the registration can be used as the local handle for a 262 * btl_get operation. */ 263 MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_RCACHE_ACCESS_LOCAL_WRITE, 264 /** Allow remote read on the registered region. If a region is registered 265 * with this flag the registration can be used as the remote handle for a 266 * btl_get operation. */ 267 MCA_BTL_REG_FLAG_REMOTE_READ = MCA_RCACHE_ACCESS_REMOTE_READ, 268 /** Allow remote write on the registered region. If a region is registered 269 * with this flag the registration can be used as the remote handle for a 270 * btl_put operation. */ 271 MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_RCACHE_ACCESS_REMOTE_WRITE, 272 /** Allow remote atomic operations on the registered region. If a region is 273 * registered with this flag the registration can be used as the remote 274 * handle for a btl_atomic_op or btl_atomic_fop operation. */ 275 MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC, 276 /** Allow any btl operation on the registered region. If a region is registered 277 * with this flag the registration can be used as the local or remote handle for 278 * any btl operation.
*/ 279 MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_RCACHE_ACCESS_ANY, 280 #if OPAL_CUDA_GDR_SUPPORT 281 /** Region is in GPU memory */ 282 MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, 283 #endif 284 }; 285 286 /** supported atomic operations */ 287 enum { 288 /** The btl supports atomic add */ 289 MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001, 290 /** The btl supports atomic bitwise and */ 291 MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200, 292 /** The btl supports atomic bitwise or */ 293 MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400, 294 /** The btl supports atomic bitwise exclusive or */ 295 MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800, 296 297 /** The btl supports logical and */ 298 MCA_BTL_ATOMIC_SUPPORTS_LAND = 0x00001000, 299 /** The btl supports logical or */ 300 MCA_BTL_ATOMIC_SUPPORTS_LOR = 0x00002000, 301 /** The btl supports logical exclusive or */ 302 MCA_BTL_ATOMIC_SUPPORTS_LXOR = 0x00004000, 303 304 /** The btl supports atomic swap */ 305 MCA_BTL_ATOMIC_SUPPORTS_SWAP = 0x00010000, 306 307 /** The btl supports atomic min */ 308 MCA_BTL_ATOMIC_SUPPORTS_MIN = 0x00100000, 309 /** The btl supports atomic max */ 310 MCA_BTL_ATOMIC_SUPPORTS_MAX = 0x00200000, 311 312 /** The btl supports 32-bit integer operations. Keep in mind the btl may 313 * support only a subset of the available atomics. */ 314 MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x01000000, 315 316 /** The btl supports floating-point operations. Keep in mind the btl may 317 * support only a subset of the available atomics and may not support 318 * both 64- and 32-bit floating point.
*/ 319 MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x02000000, 320 321 /** The btl supports atomic compare-and-swap */ 322 MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, 323 324 /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ 325 MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, 326 }; 327 /** flags selecting the operand type of an atomic operation */ 328 enum { 329 /** Use 32-bit atomics */ 330 MCA_BTL_ATOMIC_FLAG_32BIT = 0x00000001, 331 /** Use floating-point atomics */ 332 MCA_BTL_ATOMIC_FLAG_FLOAT = 0x00000002, 333 }; 334 /** atomic operation codes; note that the values are not contiguous */ 335 enum mca_btl_base_atomic_op_t { 336 /** Atomic add: (*remote_address) = (*remote_address) + operand */ 337 MCA_BTL_ATOMIC_ADD = 0x0001, 338 /** Atomic and: (*remote_address) = (*remote_address) & operand */ 339 MCA_BTL_ATOMIC_AND = 0x0011, 340 /** Atomic or: (*remote_address) = (*remote_address) | operand */ 341 MCA_BTL_ATOMIC_OR = 0x0012, 342 /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */ 343 MCA_BTL_ATOMIC_XOR = 0x0014, 344 /** Atomic logical and: (*remote_address) = (*remote_address) && operand */ 345 MCA_BTL_ATOMIC_LAND = 0x0015, 346 /** Atomic logical or: (*remote_address) = (*remote_address) || operand */ 347 MCA_BTL_ATOMIC_LOR = 0x0016, 348 /** Atomic logical xor: (*remote_address) = (*remote_address) != operand */ 349 MCA_BTL_ATOMIC_LXOR = 0x0017, 350 /** Atomic swap: (*remote_address) = operand */ 351 MCA_BTL_ATOMIC_SWAP = 0x001a, 352 /** Atomic min */ 353 MCA_BTL_ATOMIC_MIN = 0x0020, 354 /** Atomic max */ 355 MCA_BTL_ATOMIC_MAX = 0x0021, 356 /** Sentinel: one more than MCA_BTL_ATOMIC_MAX (0x0022). Because the values above are not contiguous this is not a count of operations. */ 357 MCA_BTL_ATOMIC_LAST, 358 }; 359 typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t; 360 361 /** 362 * Asynchronous callback function on completion of an operation. 363 * Completion Semantics: The descriptor can be reused or returned to the 364 * BTL via mca_btl_base_module_free_fn_t. The operation has been queued to 365 * the network device or will otherwise make asynchronous progress without 366 * subsequent calls to btl_progress.
367 * 368 * @param[IN] module the BTL module 369 * @param[IN] endpoint the BTL endpoint 370 * @param[IN] descriptor the BTL descriptor * @param[IN] status status of the operation 371 * 372 */ 373 typedef void (*mca_btl_base_completion_fn_t)( 374 struct mca_btl_base_module_t* module, 375 struct mca_btl_base_endpoint_t* endpoint, 376 struct mca_btl_base_descriptor_t* descriptor, 377 int status); 378 379 380 /** 381 * Asynchronous callback function on completion of an rdma or atomic operation. 382 * Completion Semantics: The rdma or atomic memory operation has completed 383 * remotely (i.e. is remotely visible) and the caller is free to deregister 384 * the local_handle or modify the memory in local_address. 385 * 386 * @param[IN] module the BTL module 387 * @param[IN] endpoint the BTL endpoint 388 * @param[IN] local_address local address for the operation (if any) 389 * @param[IN] local_handle local handle associated with the local_address 390 * @param[IN] context callback context supplied to the rdma/atomic operation 391 * @param[IN] cbdata callback data supplied to the rdma/atomic operation 392 * @param[IN] status status of the operation 393 * 394 */ 395 typedef void (*mca_btl_base_rdma_completion_fn_t)( 396 struct mca_btl_base_module_t* module, 397 struct mca_btl_base_endpoint_t* endpoint, 398 void *local_address, 399 struct mca_btl_base_registration_handle_t *local_handle, 400 void *context, 401 void *cbdata, 402 int status); 403 404 405 /** 406 * Describes a region/segment of memory that is addressable 407 * by a BTL. 408 * 409 * Note: In many cases the alloc and prepare methods of BTLs 410 * do not return a mca_btl_base_segment_t but instead return a 411 * subclass. Extreme care should be used when modifying 412 * BTL segments to prevent overwriting internal BTL data. 413 * 414 * All BTLs MUST use base segments when calling registered 415 * callbacks. 416 * 417 * BTLs MUST use mca_btl_base_segment_t or a subclass and 418 * MUST store their segment length in btl_seg_size.
BTLs 419 * MUST specify a segment no larger than MCA_BTL_SEG_MAX_SIZE. 420 */ 421 422 struct mca_btl_base_segment_t { 423 /** Address of the memory */ 424 opal_ptr_t seg_addr; 425 /** Length in bytes */ 426 uint64_t seg_len; 427 }; 428 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t; 429 430 /* Convert both fields of a base segment (seg_addr.lval and seg_len) to/from network byte order. The conversions are only compiled in for heterogeneous builds on hosts where WORDS_BIGENDIAN is not defined; otherwise the macros expand to nothing. NOTE(review): the non-empty variants expand to two separate statements, so they must not be used as the unbraced body of an if/else/loop. */ 431 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && !defined(WORDS_BIGENDIAN) 432 #define MCA_BTL_BASE_SEGMENT_HTON(s) \ 433 (s).seg_addr.lval = hton64((s).seg_addr.lval); \ 434 (s).seg_len = hton64((s).seg_len); 435 #define MCA_BTL_BASE_SEGMENT_NTOH(s) \ 436 (s).seg_addr.lval = ntoh64((s).seg_addr.lval); \ 437 (s).seg_len = ntoh64((s).seg_len); 438 #else 439 #define MCA_BTL_BASE_SEGMENT_HTON(s) 440 #define MCA_BTL_BASE_SEGMENT_NTOH(s) 441 #endif 442 /** 443 * A descriptor that holds the parameters to a send/put/get 444 * operation along w/ a callback routine that is called on 445 * completion of the request. 446 * Note: receive callbacks will store the incoming data segments in 447 * des_segments 448 */ 449 450 struct mca_btl_base_descriptor_t { 451 opal_free_list_item_t super; 452 mca_btl_base_segment_t *des_segments; /**< local segments */ 453 size_t des_segment_count; /**< number of local segments */ 454 mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */ 455 void* des_cbdata; /**< opaque callback data */ 456 void* des_context; /**< more opaque callback data */ 457 uint32_t des_flags; /**< hints to BTL */ 458 /** order value, this is only 459 valid in the local completion callback 460 and may be used in subsequent calls to 461 btl_alloc, btl_prepare_src to request 462 a descriptor that will be ordered w.r.t. 463 this descriptor 464 */ 465 uint8_t order; 466 }; 467 typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t; 468 469 OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); 470 /* Values for the des_flags field of a descriptor (MCA_BTL_DES_*) */ 471 #define MCA_BTL_DES_FLAGS_PRIORITY 0x0001 472 /* Allow the BTL to dispose the descriptor once the callback 473 * associated was triggered.
474 */ 475 #define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP 0x0002 476 /* Allow the BTL to avoid calling the descriptor callback 477 * if the send succeeded in the btl_send (i.e. in the fast path). * NOTE(review): the flag name suggests that setting it forces the callback * even on the fast path, and that the skip applies when it is clear - confirm. 478 */ 479 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004 480 481 /* Tell the PML that the copy is being done asynchronously 482 */ 483 #define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008 484 485 /* Type of transfer that will be done with this frag. 486 */ 487 #define MCA_BTL_DES_FLAGS_PUT 0x0010 488 #define MCA_BTL_DES_FLAGS_GET 0x0020 489 490 /* Ask the BTL to wake the remote process (send/sendi) or local process 491 * (put/get) to handle this message. The BTL may ignore this flag if 492 * signaled operations are not supported. 493 */ 494 #define MCA_BTL_DES_FLAGS_SIGNAL 0x0040 495 496 /** 497 * Maximum number of allowed segments in the des_segments field of a descriptor. 498 */ 499 #define MCA_BTL_DES_MAX_SEGMENTS 16 500 501 /** 502 * Maximum size of a BTL segment (NTH: does it really save us anything 503 * to hardcode this?) 504 */ 505 #define MCA_BTL_SEG_MAX_SIZE 256 506 507 /** 508 * Maximum size of a BTL registration handle in bytes 509 */ 510 #define MCA_BTL_REG_HANDLE_MAX_SIZE 256 511 512 /* 513 * BTL base header, stores the tag at a minimum 514 */ 515 struct mca_btl_base_header_t{ 516 mca_btl_base_tag_t tag; 517 }; 518 typedef struct mca_btl_base_header_t mca_btl_base_header_t; 519 /* Byte-order conversion hooks for the base header; both expand to nothing (the only field is the one-byte tag). */ 520 #define MCA_BTL_BASE_HEADER_HTON(hdr) 521 #define MCA_BTL_BASE_HEADER_NTOH(hdr) 522 523 /* 524 * BTL component interface functions and datatype. 525 */ 526 527 /** 528 * MCA->BTL Initializes the BTL component and creates specific BTL 529 * module(s). 530 * 531 * @param num_btls (OUT) Returns the number of btl modules created, or 0 532 * if the transport is not available. 533 * 534 * @param enable_progress_threads (IN) Whether this component is 535 * allowed to run a hidden/progress thread or not.
536 * 537 * @param enable_mpi_threads (IN) Whether support for multiple MPI 538 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which 539 * indicates whether multiple threads may invoke this component 540 * simultaneously or not. 541 * 542 * @return Array of pointers to BTL modules, or NULL if the transport 543 * is not available. 544 * 545 * During component initialization, the BTL component should discover 546 * the physical devices that are available for the given transport, 547 * and create a BTL module to represent each device. Any addressing 548 * information required by peers to reach the device should be published 549 * during this function via the modex_send() interface. 550 * 551 */ 552 553 typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)( 554 int *num_btls, 555 bool enable_progress_threads, 556 bool enable_mpi_threads 557 ); 558 559 /** 560 * MCA->BTL Called to progress outstanding requests for 561 * non-threaded polling environments. 562 * 563 * @return Count of "completions", a metric of 564 * how many items were completed in the call 565 * to progress. 566 */ 567 568 typedef int (*mca_btl_base_component_progress_fn_t)(void); 569 570 571 /** 572 * Callback function that is called asynchronously on receipt 573 * of data by the transport layer. 574 * Note that the mca_btl_base_descriptor_t is only valid within the 575 * completion function; this implies that all data payload in the 576 * mca_btl_base_descriptor_t must be copied out within this callback or 577 * forfeited back to the BTL. 578 * Note also that descriptor segments (des_segments) must be base 579 * segments for all callbacks.
580 * 581 * @param[IN] btl BTL module 582 * @param[IN] tag The active message receive callback tag value 583 * @param[IN] descriptor The BTL descriptor (contains the receive payload) 584 * @param[IN] cbdata Opaque callback data 585 */ 586 587 typedef void (*mca_btl_base_module_recv_cb_fn_t)( 588 struct mca_btl_base_module_t* btl, 589 mca_btl_base_tag_t tag, 590 mca_btl_base_descriptor_t* descriptor, 591 void* cbdata 592 ); 593 /** Pairs a receive callback with the opaque callback data associated with it. */ 594 typedef struct mca_btl_active_message_callback_t { 595 mca_btl_base_module_recv_cb_fn_t cbfunc; 596 void* cbdata; 597 } mca_btl_active_message_callback_t; 598 /** Table of active message callbacks with MCA_BTL_TAG_MAX (255) entries, i.e. valid indices 0..254. NOTE(review): presumably indexed by tag; mca_btl_base_tag_t is a uint8_t and can hold 255, which would be one past the end - confirm that tag 255 is never used as an index. */ 599 OPAL_DECLSPEC extern 600 mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TAG_MAX]; 601 602 /** 603 * BTL component descriptor. Contains component version information 604 * and component open/close/init functions. 605 */ 606 607 struct mca_btl_base_component_3_0_0_t { 608 mca_base_component_t btl_version; 609 mca_base_component_data_t btl_data; 610 mca_btl_base_component_init_fn_t btl_init; 611 mca_btl_base_component_progress_fn_t btl_progress; 612 }; 613 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_3_0_0_t; 614 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_t; 615 616 /* add the 2_0_0_t typedef for source compatibility 617 * we can do this safely because 2_0_0 components are the same as 618 * 3_0_0 components, the difference is in the btl module. 619 * Unfortunately 2_0_0 modules are not compatible with BTL 3_0_0 and 620 * can not be used with the new interface. 621 */ 622 typedef struct mca_btl_base_component_3_0_0_t mca_btl_base_component_2_0_0_t; 623 624 625 /* 626 * BTL module interface functions and datatype. 627 */ 628 629 /** 630 * MCA->BTL Clean up any resources held by BTL module 631 * before the module is unloaded. 632 * 633 * @param btl (IN) BTL module. 634 * @return OPAL_SUCCESS or error status on failure.
635 * 636 * Prior to unloading a BTL module, the MCA framework will call 637 * the BTL finalize method of the module. Any resources held by 638 * the BTL should be released and if required the memory corresponding 639 * to the BTL module freed. 640 * 641 */ 642 typedef int (*mca_btl_base_module_finalize_fn_t)( 643 struct mca_btl_base_module_t* btl 644 ); 645 646 /** 647 * BML->BTL notification of change in the process list. 648 * 649 * @param btl (IN) BTL module 650 * @param nprocs (IN) Number of processes 651 * @param procs (IN) Array of processes 652 * @param endpoints (OUT) Array of mca_btl_base_endpoint_t pointers filled in by the BTL. 653 * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL (may be NULL). 654 * @return OPAL_SUCCESS or error status on failure. 655 * 656 * The mca_btl_base_module_add_procs_fn_t() is called by the BML to 657 * determine the set of BTLs that should be used to reach each process. 658 * Any addressing information exported by the peer via the modex_send() 659 * function should be available during this call via the corresponding 660 * modex_recv() function. The BTL may utilize this information to 661 * determine reachability of each peer process. 662 * 663 * The caller may pass a "reachable" bitmap pointer. If it is not 664 * NULL, for each process that is reachable by the BTL, the bit 665 * corresponding to the index into the procs array (of length nprocs) should be 666 * set in the reachable bitmask. The BTL will return an array of 667 * pointers to a data structure defined by the BTL that is then 668 * returned to the BTL on subsequent calls to the BTL data transfer 669 * functions (e.g. btl_send). This may be used by the BTL to cache any 670 * addressing or connection information (e.g. TCP socket, IB queue 671 * pair).
672 */ 673 typedef int (*mca_btl_base_module_add_procs_fn_t)( 674 struct mca_btl_base_module_t* btl, 675 size_t nprocs, 676 struct opal_proc_t** procs, 677 struct mca_btl_base_endpoint_t** endpoints, 678 struct opal_bitmap_t* reachable 679 ); 680 681 /** 682 * Notification of change to the process list. 683 * 684 * @param btl (IN) BTL module 685 * @param nprocs (IN) Number of processes 686 * @param procs (IN) Set of processes 687 * @param peer (IN) Set of peer addressing information. 688 * @return Status indicating if cleanup was successful 689 * 690 * When the process list changes, the BML notifies the BTL of the 691 * change, to provide the opportunity to clean up or release any 692 * resources associated with the peer. 693 */ 694 typedef int (*mca_btl_base_module_del_procs_fn_t)( 695 struct mca_btl_base_module_t* btl, 696 size_t nprocs, 697 struct opal_proc_t** procs, 698 struct mca_btl_base_endpoint_t** peer 699 ); 700 701 /** 702 * Register a callback function that is called on receipt 703 * of a fragment.
704 * 705 * @param[IN] btl BTL module 706 * @param[IN] tag tag value of this callback 707 * (specified on subsequent send operations) 708 * @param[IN] cbfunc The callback function 709 * @param[IN] cbdata Opaque callback data 710 * 711 * @return OPAL_SUCCESS The callback was registered successfully 712 * @return OPAL_ERROR The callback was NOT registered successfully 713 * 714 */ 715 typedef int (*mca_btl_base_module_register_fn_t)( 716 struct mca_btl_base_module_t* btl, 717 mca_btl_base_tag_t tag, 718 mca_btl_base_module_recv_cb_fn_t cbfunc, 719 void* cbdata 720 ); 721 722 723 /** 724 * Callback function that is called asynchronously on receipt 725 * of an error from the transport layer 726 * 727 * @param[IN] btl BTL module 728 * @param[IN] flags type of error (presumably the MCA_BTL_ERROR_FLAGS_* error callback flags - confirm) 729 * @param[IN] errproc process that had an error 730 * @param[IN] btlinfo descriptive string from the BTL 731 */ 732 733 typedef void (*mca_btl_base_module_error_cb_fn_t)( 734 struct mca_btl_base_module_t* btl, 735 int32_t flags, 736 struct opal_proc_t* errproc, 737 char* btlinfo 738 ); 739 740 741 /** 742 * Register a callback function that is called on receipt 743 * of an error. 744 * 745 * @param[IN] btl BTL module 746 * @param[IN] cbfunc The callback function 747 * 748 * @return OPAL_SUCCESS The callback was registered successfully 749 * @return OPAL_ERROR The callback was NOT registered successfully 750 * 751 */ 752 typedef int (*mca_btl_base_module_register_error_fn_t)( 753 struct mca_btl_base_module_t* btl, 754 mca_btl_base_module_error_cb_fn_t cbfunc 755 ); 756 757 758 /** 759 * Allocate a descriptor with a segment of the requested size. 760 * Note that the BTL layer may choose to return a smaller size 761 * if it cannot support the request. The order tag value ensures that 762 * operations on the descriptor that is allocated will be 763 * ordered w.r.t. a previous operation on a particular descriptor.
764 * Ordering is only guaranteed if the previous descriptor had its 765 * local completion callback function called and the order tag of 766 * that descriptor is only valid upon the local completion callback function. 767 * 768 * 769 * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 770 * @param size (IN) Request segment size. 771 * @param flags (IN) Flags for the descriptor (presumably MCA_BTL_DES_FLAGS_* values - confirm) 772 */ 773 774 typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)( 775 struct mca_btl_base_module_t* btl, 776 struct mca_btl_base_endpoint_t* endpoint, 777 uint8_t order, 778 size_t size, 779 uint32_t flags 780 ); 781 782 /** 783 * Return a descriptor allocated from this BTL via alloc/prepare. 784 * A descriptor can only be deallocated after its local completion 785 * callback function has been called for all send/put/get operations. 786 * 787 * @param btl (IN) BTL module 788 * @param descriptor (IN) Descriptor allocated from the BTL 789 */ 790 typedef int (*mca_btl_base_module_free_fn_t)( 791 struct mca_btl_base_module_t* btl, 792 mca_btl_base_descriptor_t* descriptor 793 ); 794 795 796 /** 797 * Prepare a descriptor for send using the supplied convertor. If the convertor 798 * references data that is contiguous, the descriptor may simply point to the 799 * user buffer. Otherwise, this routine is responsible for allocating buffer 800 * space and packing if required. 801 * 802 * The order tag value ensures that operations on the 803 * descriptor that is prepared will be ordered w.r.t. a previous 804 * operation on a particular descriptor. Ordering is only guaranteed if 805 * the previous descriptor had its local completion callback function 806 * called and the order tag of that descriptor is only valid upon the local 807 * completion callback function.
808 * 809 * @param btl (IN) BTL module 810 * @param endpoint (IN) BTL peer addressing 811 * @param registration (IN) Memory registration 812 * @param convertor (IN) Data type convertor 813 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 814 * @param reserve (IN) Additional bytes requested by upper layer to precede user data 815 * @param size (IN/OUT) Number of bytes to prepare (IN), 816 * number of bytes actually prepared (OUT) 817 * 818 */ 819 typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( 820 struct mca_btl_base_module_t* btl, 821 struct mca_btl_base_endpoint_t* endpoint, 822 struct opal_convertor_t* convertor, 823 uint8_t order, 824 size_t reserve, 825 size_t* size, 826 uint32_t flags 827 ); 828 829 /** 830 * @brief Register a memory region for put/get/atomic operations. 831 * 832 * @param btl (IN) BTL module 833 * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) 834 * @param base (IN) Pointer to start of region 835 * @param size (IN) Size of region 836 * @param flags (IN) Flags including access permissions 837 * 838 * @returns a memory registration handle valid for both local and remote operations 839 * @returns NULL if the region could not be registered 840 * 841 * This function registers the specified region with the hardware for use with 842 * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop 843 * functions. Care should be taken to not hold an excessive number of registrations 844 * as they may use limited system/NIC resources. 845 * 846 * Ownership of the memory pointed to by the returned (struct 847 * mca_btl_base_registration_handle_t*) is passed to the caller. The 848 * BTL module cannot free or reuse the handle until it is returned via 849 * the mca_btl_base_module_deregister_mem_fn_t function. 
850 */ 851 typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)( 852 struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, 853 size_t size, uint32_t flags); 854 855 /** 856 * @brief Deregister a memory region 857 * 858 * @param btl (IN) BTL module region was registered with 859 * @param handle (IN) BTL registration handle to deregister 860 * 861 * This function deregisters the memory region associated with the specified handle. Care 862 * should be taken to not perform any RDMA or atomic operation on this memory region 863 * after it is deregistered. It is erroneous to specify a memory handle associated with 864 * a remote node. 865 * 866 * The handle passed in will be a value previously returned by the 867 * mca_btl_base_module_register_mem_fn_t function. Ownership of the 868 * memory pointed to by handle passes to the BTL module; this function 869 * is now is allowed to free the memory, return it to a freelist, etc. 870 */ 871 typedef int (*mca_btl_base_module_deregister_mem_fn_t)( 872 struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); 873 874 /** 875 * Initiate an asynchronous send. 876 * Completion Semantics: the descriptor has been queued for a send operation 877 * the BTL now controls the descriptor until local 878 * completion callback is made on the descriptor 879 * 880 * All BTLs allow multiple concurrent asynchronous send operations on a descriptor 881 * 882 * @param btl (IN) BTL module 883 * @param endpoint (IN) BTL addressing information 884 * @param descriptor (IN) Description of the data to be transfered 885 * @param tag (IN) The tag value used to notify the peer. 
886 * 887 * @retval OPAL_SUCCESS The descriptor was successfully queued for a send 888 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a send 889 * @retval OPAL_ERR_UNREACH The endpoint is not reachable 890 */ 891 typedef int (*mca_btl_base_module_send_fn_t)( 892 struct mca_btl_base_module_t* btl, 893 struct mca_btl_base_endpoint_t* endpoint, 894 struct mca_btl_base_descriptor_t* descriptor, 895 mca_btl_base_tag_t tag 896 ); 897 898 /** 899 * Initiate an immediate blocking send. 900 * Completion Semantics: the BTL will make a best effort 901 * to send the header and "size" bytes from the datatype using the convertor. 902 * The header is guaranteed to be delivered entirely in the first segment. 903 * Should the BTL be unable to deliver the data due to resource constraints 904 * the BTL will return a descriptor (via the OUT param) 905 * of size "payload_size + header_size". 906 * 907 * @param btl (IN) BTL module 908 * @param endpoint (IN) BTL addressing information 909 * @param convertor (IN) Data type convertor 910 * @param header (IN) Pointer to header. 911 * @param header_size (IN) Size of header. 912 * @param payload_size (IN) Size of payload (from convertor). 913 * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) 914 * @param flags (IN) Flags. 915 * @param tag (IN) The tag value used to notify the peer. 916 * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately 917 * (may be NULL). 
918 * 919 * @retval OPAL_SUCCESS The send was successfully queued 920 * @retval OPAL_ERROR The send failed 921 * @retval OPAL_ERR_UNREACH The endpoint is not reachable 922 * @retval OPAL_ERR_RESOURCE_BUSY The BTL is busy a descriptor will be returned 923 * (via the OUT param) if descriptors are available 924 */ 925 926 typedef int (*mca_btl_base_module_sendi_fn_t)( 927 struct mca_btl_base_module_t* btl, 928 struct mca_btl_base_endpoint_t* endpoint, 929 struct opal_convertor_t* convertor, 930 void* header, 931 size_t header_size, 932 size_t payload_size, 933 uint8_t order, 934 uint32_t flags, 935 mca_btl_base_tag_t tag, 936 mca_btl_base_descriptor_t** descriptor 937 ); 938 939 /** 940 * Initiate an asynchronous put. 941 * Completion Semantics: if this function returns a 1 then the operation 942 * is complete. a return of OPAL_SUCCESS indicates 943 * the put operation has been queued with the 944 * network. the local_handle can not be deregistered 945 * until all outstanding operations on that handle 946 * have been completed. 
947 * 948 * @param btl (IN) BTL module 949 * @param endpoint (IN) BTL addressing information 950 * @param local_address (IN) Local address to put from (registered) 951 * @param remote_address (IN) Remote address to put to (registered remotely) 952 * @param local_handle (IN) Registration handle for region containing 953 * (local_address, local_address + size) 954 * @param remote_handle (IN) Remote registration handle for region containing 955 * (remote_address, remote_address + size) 956 * @param size (IN) Number of bytes to put 957 * @param flags (IN) Flags for this put operation 958 * @param order (IN) Ordering 959 * @param cbfunc (IN) Function to call on completion (if queued) 960 * @param cbcontext (IN) Context for the callback 961 * @param cbdata (IN) Data for callback 962 * 963 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put 964 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put 965 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put 966 * operation. Try again later 967 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or 968 * alignment restrictions. 969 */ 970 typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl, 971 struct mca_btl_base_endpoint_t *endpoint, void *local_address, 972 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, 973 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 974 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 975 976 /** 977 * Initiate an asynchronous get. 978 * Completion Semantics: if this function returns a 1 then the operation 979 * is complete. a return of OPAL_SUCCESS indicates 980 * the get operation has been queued with the 981 * network. the local_handle can not be deregistered 982 * until all outstanding operations on that handle 983 * have been completed. 
984 * 985 * @param btl (IN) BTL module 986 * @param endpoint (IN) BTL addressing information 987 * @param local_address (IN) Local address to put from (registered) 988 * @param remote_address (IN) Remote address to put to (registered remotely) 989 * @param local_handle (IN) Registration handle for region containing 990 * (local_address, local_address + size) 991 * @param remote_handle (IN) Remote registration handle for region containing 992 * (remote_address, remote_address + size) 993 * @param size (IN) Number of bytes to put 994 * @param flags (IN) Flags for this put operation 995 * @param order (IN) Ordering 996 * @param cbfunc (IN) Function to call on completion (if queued) 997 * @param cbcontext (IN) Context for the callback 998 * @param cbdata (IN) Data for callback 999 * 1000 * @retval OPAL_SUCCESS The descriptor was successfully queued for a put 1001 * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put 1002 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put 1003 * operation. Try again later 1004 * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or 1005 * alignment restrictions. 1006 */ 1007 typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, 1008 struct mca_btl_base_endpoint_t *endpoint, void *local_address, 1009 uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, 1010 struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 1011 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 1012 1013 /** 1014 * Initiate an asynchronous atomic operation. 1015 * Completion Semantics: if this function returns a 1 then the operation 1016 * is complete. a return of OPAL_SUCCESS indicates 1017 * the atomic operation has been queued with the 1018 * network. 
1019 * 1020 * @param btl (IN) BTL module 1021 * @param endpoint (IN) BTL addressing information 1022 * @param remote_address (IN) Remote address to put to (registered remotely) 1023 * @param remote_handle (IN) Remote registration handle for region containing 1024 * (remote_address, remote_address + 8) 1025 * @param op (IN) Operation to perform 1026 * @param operand (IN) Operand for the operation 1027 * @param flags (IN) Flags for this atomic operation 1028 * @param order (IN) Ordering 1029 * @param cbfunc (IN) Function to call on completion (if queued) 1030 * @param cbcontext (IN) Context for the callback 1031 * @param cbdata (IN) Data for callback 1032 * 1033 * @retval OPAL_SUCCESS The operation was successfully queued 1034 * @retval 1 The operation is complete 1035 * @retval OPAL_ERROR The operation was NOT successfully queued 1036 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1037 * operation. Try again later 1038 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1039 * alignment restrictions or the operation {op} is not supported 1040 * by the hardware. 1041 * 1042 * After the operation is complete the remote address specified by {remote_address} and 1043 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. 1044 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1045 * however, that not all btls will provide consistency between btl atomic operations and 1046 * cpu or other btl atomics. 
1047 */ 1048 typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl, 1049 struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, 1050 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, 1051 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1052 void *cbcontext, void *cbdata); 1053 1054 /** 1055 * Initiate an asynchronous fetching atomic operation. 1056 * Completion Semantics: if this function returns a 1 then the operation 1057 * is complete. a return of OPAL_SUCCESS indicates 1058 * the atomic operation has been queued with the 1059 * network. 1060 * 1061 * @param btl (IN) BTL module 1062 * @param endpoint (IN) BTL addressing information 1063 * @param local_address (OUT) Local address to store the result in 1064 * @param remote_address (IN) Remote address perfom operation on to (registered remotely) 1065 * @param local_handle (IN) Local registration handle for region containing 1066 * (local_address, local_address + 8) 1067 * @param remote_handle (IN) Remote registration handle for region containing 1068 * (remote_address, remote_address + 8) 1069 * @param op (IN) Operation to perform 1070 * @param operand (IN) Operand for the operation 1071 * @param flags (IN) Flags for this atomic operation 1072 * @param order (IN) Ordering 1073 * @param cbfunc (IN) Function to call on completion (if queued) 1074 * @param cbcontext (IN) Context for the callback 1075 * @param cbdata (IN) Data for callback 1076 * 1077 * @retval OPAL_SUCCESS The operation was successfully queued 1078 * @retval 1 The operation is complete 1079 * @retval OPAL_ERROR The operation was NOT successfully queued 1080 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1081 * operation. 
Try again later 1082 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1083 * alignment restrictions or the operation {op} is not supported 1084 * by the hardware. 1085 * 1086 * After the operation is complete the remote address specified by {remote_address} and 1087 * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. 1088 * {local_address} will be updated with the previous value stored in {remote_address}. 1089 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1090 * however, that not all btls will provide consistency between btl atomic operations and 1091 * cpu or other btl atomics. 1092 */ 1093 typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl, 1094 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, 1095 struct mca_btl_base_registration_handle_t *local_handle, 1096 struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, 1097 uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1098 void *cbcontext, void *cbdata); 1099 1100 /** 1101 * Initiate an asynchronous compare and swap operation. 1102 * Completion Semantics: if this function returns a 1 then the operation 1103 * is complete. a return of OPAL_SUCCESS indicates 1104 * the atomic operation has been queued with the 1105 * network. 
1106 * 1107 * @param btl (IN) BTL module 1108 * @param endpoint (IN) BTL addressing information 1109 * @param local_address (OUT) Local address to store the result in 1110 * @param remote_address (IN) Remote address perfom operation on to (registered remotely) 1111 * @param local_handle (IN) Local registration handle for region containing 1112 * (local_address, local_address + 8) 1113 * @param remote_handle (IN) Remote registration handle for region containing 1114 * (remote_address, remote_address + 8) 1115 * @param compare (IN) Operand for the operation 1116 * @param value (IN) Value to store on success 1117 * @param flags (IN) Flags for this atomic operation 1118 * @param order (IN) Ordering 1119 * @param cbfunc (IN) Function to call on completion (if queued) 1120 * @param cbcontext (IN) Context for the callback 1121 * @param cbdata (IN) Data for callback 1122 * 1123 * @retval OPAL_SUCCESS The operation was successfully queued 1124 * @retval 1 The operation is complete 1125 * @retval OPAL_ERROR The operation was NOT successfully queued 1126 * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic 1127 * operation. Try again later 1128 * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to 1129 * alignment restrictions or the operation {op} is not supported 1130 * by the hardware. 1131 * 1132 * After the operation is complete the remote address specified by {remote_address} and 1133 * {remote_handle} will be updated with {value} if *remote_address == compare. 1134 * {local_address} will be updated with the previous value stored in {remote_address}. 1135 * The btl will guarantee consistency of atomic operations performed via the btl. Note, 1136 * however, that not all btls will provide consistency between btl atomic operations and 1137 * cpu atomics. 
1138 */ 1139 typedef int (*mca_btl_base_module_atomic_cswap64_fn_t) (struct mca_btl_base_module_t *btl, 1140 struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, 1141 struct mca_btl_base_registration_handle_t *local_handle, 1142 struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, 1143 uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, 1144 void *cbcontext, void *cbdata); 1145 1146 /** 1147 * Diagnostic dump of btl state. 1148 * 1149 * @param btl (IN) BTL module 1150 * @param endpoint (IN) BTL endpoint 1151 * @param verbose (IN) Verbosity level 1152 */ 1153 1154 typedef void (*mca_btl_base_module_dump_fn_t)( 1155 struct mca_btl_base_module_t* btl, 1156 struct mca_btl_base_endpoint_t* endpoint, 1157 int verbose 1158 ); 1159 1160 /** 1161 * Fault Tolerance Event Notification Function 1162 * @param state Checkpoint Status 1163 * @return OPAL_SUCCESS or failure status 1164 */ 1165 typedef int (*mca_btl_base_module_ft_event_fn_t)(int state); 1166 1167 /** 1168 * BTL module interface functions and attributes. 
1169 */ 1170 struct mca_btl_base_module_t { 1171 1172 /* BTL common attributes */ 1173 mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */ 1174 size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */ 1175 size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */ 1176 size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */ 1177 size_t btl_rdma_pipeline_send_length; /**< amount of bytes that should be send by pipeline protocol */ 1178 size_t btl_rdma_pipeline_frag_size; /**< maximum rdma fragment size supported by the BTL */ 1179 size_t btl_min_rdma_pipeline_size; /**< minimum packet size for pipeline protocol */ 1180 uint32_t btl_exclusivity; /**< indicates this BTL should be used exclusively */ 1181 uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */ 1182 uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */ 1183 uint32_t btl_flags; /**< flags (put/get...) 
*/ 1184 uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */ 1185 size_t btl_registration_handle_size; /**< size of the BTLs registration handles */ 1186 1187 /* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */ 1188 size_t btl_get_limit; /**< maximum size supported by the btl_get function */ 1189 size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */ 1190 size_t btl_put_limit; /**< maximum size supported by the btl_put function */ 1191 size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */ 1192 1193 /* minimum transaction sizes for which registration is required for local memory */ 1194 size_t btl_get_local_registration_threshold; 1195 size_t btl_put_local_registration_threshold; 1196 1197 /* BTL function table */ 1198 mca_btl_base_module_add_procs_fn_t btl_add_procs; 1199 mca_btl_base_module_del_procs_fn_t btl_del_procs; 1200 mca_btl_base_module_register_fn_t btl_register; 1201 mca_btl_base_module_finalize_fn_t btl_finalize; 1202 1203 mca_btl_base_module_alloc_fn_t btl_alloc; 1204 mca_btl_base_module_free_fn_t btl_free; 1205 mca_btl_base_module_prepare_fn_t btl_prepare_src; 1206 mca_btl_base_module_send_fn_t btl_send; 1207 mca_btl_base_module_sendi_fn_t btl_sendi; 1208 mca_btl_base_module_put_fn_t btl_put; 1209 mca_btl_base_module_get_fn_t btl_get; 1210 mca_btl_base_module_dump_fn_t btl_dump; 1211 1212 /* atomic operations */ 1213 mca_btl_base_module_atomic_op64_fn_t btl_atomic_op; 1214 mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop; 1215 mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap; 1216 1217 /* new memory registration functions */ 1218 mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */ 1219 mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */ 1220 1221 /** the mpool associated with this btl (optional) 
*/ 1222 mca_mpool_base_module_t* btl_mpool; 1223 /** register a default error handler */ 1224 mca_btl_base_module_register_error_fn_t btl_register_error; 1225 /** fault tolerant even notification */ 1226 mca_btl_base_module_ft_event_fn_t btl_ft_event; 1227 #if OPAL_CUDA_GDR_SUPPORT 1228 size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */ 1229 size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */ 1230 #endif /* OPAL_CUDA_GDR_SUPPORT */ 1231 #if OPAL_CUDA_SUPPORT 1232 size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ 1233 #endif /* OPAL_CUDA_SUPPORT */ 1234 }; 1235 typedef struct mca_btl_base_module_t mca_btl_base_module_t; 1236 1237 /* 1238 * Macro for use in modules that are of type btl v3.0.0 1239 * NOTE: This is not the final version of 3.0.0. Consider it 1240 * alpha until this comment is removed. 1241 */ 1242 #define MCA_BTL_BASE_VERSION_3_0_0 \ 1243 OPAL_MCA_BASE_VERSION_2_1_0("btl", 3, 0, 0) 1244 1245 #define MCA_BTL_DEFAULT_VERSION(name) \ 1246 MCA_BTL_BASE_VERSION_3_0_0, \ 1247 .mca_component_name = name, \ 1248 MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, \ 1249 OPAL_RELEASE_VERSION) 1250 1251 END_C_DECLS 1252 1253 #endif /* OPAL_MCA_BTL_H */ 1254