1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 2 /* 3 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana 4 * University Research and Technology 5 * Corporation. All rights reserved. 6 * Copyright (c) 2004-2016 The University of Tennessee and The University 7 * of Tennessee Research Foundation. All rights 8 * reserved. 9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 10 * University of Stuttgart. All rights reserved. 11 * Copyright (c) 2004-2005 The Regents of the University of California. 12 * All rights reserved. 13 * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. 14 * Copyright (c) 2014-2016 Research Organization for Information Science 15 * and Technology (RIST). All rights reserved. 16 * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights 17 * reserved. 18 * $COPYRIGHT$ 19 * 20 * Additional copyrights may follow 21 * 22 * $HEADER$ 23 */ 24 /** 25 * @file 26 */ 27 #ifndef MCA_BTL_TCP_H 28 #define MCA_BTL_TCP_H 29 30 #include "opal_config.h" 31 #ifdef HAVE_SYS_TYPES_H 32 #include <sys/types.h> 33 #endif 34 #ifdef HAVE_SYS_SOCKET_H 35 #include <sys/socket.h> 36 #endif 37 #ifdef HAVE_NETINET_IN_H 38 #include <netinet/in.h> 39 #endif 40 #ifdef HAVE_UNISTD_H 41 #include <unistd.h> 42 #endif 43 44 /* Open MPI includes */ 45 #include "opal/mca/event/event.h" 46 #include "opal/class/opal_free_list.h" 47 #include "opal/mca/btl/btl.h" 48 #include "opal/mca/btl/base/base.h" 49 #include "opal/mca/mpool/mpool.h" 50 #include "opal/class/opal_hash_table.h" 51 #include "opal/util/fd.h" 52 53 #define MCA_BTL_TCP_STATISTICS 0 54 BEGIN_C_DECLS 55 56 extern opal_event_base_t* mca_btl_tcp_event_base; 57 58 #define MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ 59 do { \ 60 int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); \ 61 if( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) { \ 62 frag->base.des_cbfunc(&frag->endpoint->endpoint_btl->super, frag->endpoint, \ 63 &frag->base, frag->rc); \ 64 } \ 65 if( btl_ownership ) { \ 66 MCA_BTL_TCP_FRAG_RETURN(frag); \ 67 } \ 68 } while (0) 69 #define MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ 70 do { \ 71 if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) { \ 72 mca_btl_active_message_callback_t* reg; \ 73 reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag; \ 74 reg->cbfunc(&frag->endpoint->endpoint_btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata); \ 75 } \ 76 } while (0) 77 78 extern opal_list_t mca_btl_tcp_ready_frag_pending_queue; 79 extern opal_mutex_t mca_btl_tcp_ready_frag_mutex; 80 extern int mca_btl_tcp_pipe_to_progress[2]; 81 extern int mca_btl_tcp_progress_thread_trigger; 82 83 #define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) \ 84 opal_mutex_atomic_lock((name)) 85 #define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) \ 86 opal_mutex_atomic_unlock((name)) 87 88 #define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ 89 do { \ 90 if(0 < mca_btl_tcp_progress_thread_trigger) { \ 91 opal_event_t* _event = (opal_event_t*)(event); \ 92 (void) opal_fd_write( mca_btl_tcp_pipe_to_progress[1], sizeof(opal_event_t*), \ 93 &_event); \ 94 } \ 95 else { \ 96 opal_event_add(event, (value)); \ 97 } \ 98 } while (0) 99 100 /** 101 * TCP BTL component. 102 */ 103 104 struct mca_btl_tcp_component_t { 105 mca_btl_base_component_3_0_0_t super; /**< base BTL component */ 106 uint32_t tcp_addr_count; /**< total number of addresses */ 107 uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */ 108 unsigned int tcp_num_links; /**< number of logical links per physical device */ 109 struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */ 110 int tcp_free_list_num; /**< initial size of free lists */ 111 int tcp_free_list_max; /**< maximum size of free lists */ 112 int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */ 113 int tcp_endpoint_cache; /**< amount of cache on each endpoint */ 114 opal_proc_table_t tcp_procs; /**< hash table of tcp proc structures */ 115 opal_mutex_t tcp_lock; /**< lock for accessing module state */ 116 opal_list_t tcp_events; 117 118 opal_event_t tcp_recv_event; /**< recv event for IPv4 listen socket */ 119 int tcp_listen_sd; /**< IPv4 listen socket for incoming connection requests */ 120 unsigned short tcp_listen_port; /**< IPv4 listen port */ 121 int tcp_port_min; /**< IPv4 minimum port */ 122 int tcp_port_range; /**< IPv4 port range */ 123 #if OPAL_ENABLE_IPV6 124 opal_event_t tcp6_recv_event; /**< recv event for IPv6 listen socket */ 125 int tcp6_listen_sd; /**< IPv6 listen socket for incoming connection requests */ 126 unsigned short tcp6_listen_port; /**< IPv6 listen port */ 127 int tcp6_port_min; /**< IPv4 minimum port */ 128 int tcp6_port_range; /**< IPv4 port range */ 129 #endif 130 /* Port range restriction */ 131 132 char* tcp_if_include; /**< comma seperated list of interface to include */ 133 char* tcp_if_exclude; /**< comma seperated list of interface to exclude */ 134 int tcp_sndbuf; /**< socket sndbuf size */ 135 int tcp_rcvbuf; /**< socket rcvbuf size */ 136 int tcp_disable_family; /**< disabled AF_family */ 137 138 /* free list of fragment descriptors */ 139 opal_free_list_t tcp_frag_eager; 140 opal_free_list_t tcp_frag_max; 141 opal_free_list_t tcp_frag_user; 142 143 int tcp_enable_progress_thread; /** Support for tcp progress thread flag */ 144 145 opal_event_t tcp_recv_thread_async_event; 146 opal_mutex_t tcp_frag_eager_mutex; 147 opal_mutex_t tcp_frag_max_mutex; 148 opal_mutex_t tcp_frag_user_mutex; 149 /* Do we want to use TCP_NODELAY? */ 150 int tcp_not_use_nodelay; 151 152 /* do we want to warn on all excluded interfaces 153 * that are not found? 154 */ 155 bool report_all_unfound_interfaces; 156 }; 157 typedef struct mca_btl_tcp_component_t mca_btl_tcp_component_t; 158 159 OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component; 160 161 /** 162 * BTL Module Interface 163 */ 164 struct mca_btl_tcp_module_t { 165 mca_btl_base_module_t super; /**< base BTL interface */ 166 uint16_t tcp_ifkindex; /** <BTL kernel interface index */ 167 #if 0 168 int tcp_ifindex; /**< BTL interface index */ 169 #endif 170 struct sockaddr_storage tcp_ifaddr; /**< BTL interface address */ 171 uint32_t tcp_ifmask; /**< BTL interface netmask */ 172 173 opal_mutex_t tcp_endpoints_mutex; 174 opal_list_t tcp_endpoints; 175 176 mca_btl_base_module_error_cb_fn_t tcp_error_cb; /**< Upper layer error callback */ 177 #if MCA_BTL_TCP_STATISTICS 178 size_t tcp_bytes_sent; 179 size_t tcp_bytes_recv; 180 size_t tcp_send_handler; 181 #endif 182 }; 183 typedef struct mca_btl_tcp_module_t mca_btl_tcp_module_t; 184 extern mca_btl_tcp_module_t mca_btl_tcp_module; 185 186 #define CLOSE_THE_SOCKET(socket) {(void)shutdown(socket, SHUT_RDWR); (void)close(socket);} 187 188 /** 189 * TCP component initialization. 190 * 191 * @param num_btl_modules (OUT) Number of BTLs returned in BTL array. 192 * @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE) 193 * @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE) 194 */ 195 extern mca_btl_base_module_t** mca_btl_tcp_component_init( 196 int *num_btl_modules, 197 bool allow_multi_user_threads, 198 bool have_hidden_threads 199 ); 200 201 202 /** 203 * Cleanup any resources held by the BTL. 204 * 205 * @param btl BTL instance. 206 * @return OPAL_SUCCESS or error status on failure. 207 */ 208 209 extern int mca_btl_tcp_finalize( 210 struct mca_btl_base_module_t* btl 211 ); 212 213 214 /** 215 * PML->BTL notification of change in the process list. 216 * 217 * @param btl (IN) 218 * @param nprocs (IN) Number of processes 219 * @param procs (IN) Set of processes 220 * @param peers (OUT) Set of (optional) peer addressing info. 221 * @param peers (IN/OUT) Set of processes that are reachable via this BTL. 222 * @return OPAL_SUCCESS or error status on failure. 223 * 224 */ 225 226 extern int mca_btl_tcp_add_procs( 227 struct mca_btl_base_module_t* btl, 228 size_t nprocs, 229 struct opal_proc_t **procs, 230 struct mca_btl_base_endpoint_t** peers, 231 opal_bitmap_t* reachable 232 ); 233 234 /** 235 * PML->BTL notification of change in the process list. 236 * 237 * @param btl (IN) BTL instance 238 * @param nproc (IN) Number of processes. 239 * @param procs (IN) Set of processes. 240 * @param peers (IN) Set of peer data structures. 241 * @return Status indicating if cleanup was successful 242 * 243 */ 244 245 extern int mca_btl_tcp_del_procs( 246 struct mca_btl_base_module_t* btl, 247 size_t nprocs, 248 struct opal_proc_t **procs, 249 struct mca_btl_base_endpoint_t** peers 250 ); 251 252 253 /** 254 * Initiate an asynchronous send. 255 * 256 * @param btl (IN) BTL module 257 * @param endpoint (IN) BTL addressing information 258 * @param descriptor (IN) Description of the data to be transfered 259 * @param tag (IN) The tag value used to notify the peer. 260 */ 261 262 extern int mca_btl_tcp_send( 263 struct mca_btl_base_module_t* btl, 264 struct mca_btl_base_endpoint_t* btl_peer, 265 struct mca_btl_base_descriptor_t* descriptor, 266 mca_btl_base_tag_t tag 267 ); 268 269 270 /** 271 * Initiate an asynchronous put. 272 */ 273 274 int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, 275 uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, 276 mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 277 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 278 279 280 /** 281 * Initiate an asynchronous get. 282 */ 283 284 int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, 285 uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, 286 mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, 287 int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); 288 289 /** 290 * Allocate a descriptor with a segment of the requested size. 291 * Note that the BTL layer may choose to return a smaller size 292 * if it cannot support the request. 293 * 294 * @param btl (IN) BTL module 295 * @param size (IN) Request segment size. 296 */ 297 298 extern mca_btl_base_descriptor_t* mca_btl_tcp_alloc( 299 struct mca_btl_base_module_t* btl, 300 struct mca_btl_base_endpoint_t* endpoint, 301 uint8_t order, 302 size_t size, 303 uint32_t flags); 304 305 306 /** 307 * Return a segment allocated by this BTL. 308 * 309 * @param btl (IN) BTL module 310 * @param descriptor (IN) Allocated descriptor. 311 */ 312 313 extern int mca_btl_tcp_free( 314 struct mca_btl_base_module_t* btl, 315 mca_btl_base_descriptor_t* des); 316 317 318 /** 319 * Prepare a descriptor for send/rdma using the supplied 320 * convertor. If the convertor references data that is contigous, 321 * the descriptor may simply point to the user buffer. Otherwise, 322 * this routine is responsible for allocating buffer space and 323 * packing if required. 324 * 325 * @param btl (IN) BTL module 326 * @param endpoint (IN) BTL peer addressing 327 * @param convertor (IN) Data type convertor 328 * @param reserve (IN) Additional bytes requested by upper layer to precede user data 329 * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) 330 */ 331 332 mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( 333 struct mca_btl_base_module_t* btl, 334 struct mca_btl_base_endpoint_t* peer, 335 struct opal_convertor_t* convertor, 336 uint8_t order, 337 size_t reserve, 338 size_t* size, 339 uint32_t flags 340 ); 341 342 extern void 343 mca_btl_tcp_dump(struct mca_btl_base_module_t* btl, 344 struct mca_btl_base_endpoint_t* endpoint, 345 int verbose); 346 347 /** 348 * Fault Tolerance Event Notification Function 349 * @param state Checkpoint Stae 350 * @return OPAL_SUCCESS or failure status 351 */ 352 int mca_btl_tcp_ft_event(int state); 353 354 /* 355 * A blocking send on a non-blocking socket. Used to send the small 356 * amount of connection information that identifies the endpoints 357 * endpoint. 358 */ 359 int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size); 360 361 /* 362 * A blocking recv for both blocking and non-blocking socket. 363 * Used to receive the small amount of connection information 364 * that identifies the endpoints 365 * 366 * when the socket is blocking (the caller introduces timeout) 367 * which happens during initial handshake otherwise socket is 368 * non-blocking most of the time. 369 */ 370 int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size); 371 372 END_C_DECLS 373 #endif 374