1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2016 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2010-2011 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2014-2016 Research Organization for Information Science
15  *                         and Technology (RIST). All rights reserved.
16  * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
17  *                         reserved.
18  * $COPYRIGHT$
19  *
20  * Additional copyrights may follow
21  *
22  * $HEADER$
23  */
24 /**
25  * @file
26  */
27 #ifndef MCA_BTL_TCP_H
28 #define MCA_BTL_TCP_H
29 
30 #include "opal_config.h"
31 #ifdef HAVE_SYS_TYPES_H
32 #include <sys/types.h>
33 #endif
34 #ifdef HAVE_SYS_SOCKET_H
35 #include <sys/socket.h>
36 #endif
37 #ifdef HAVE_NETINET_IN_H
38 #include <netinet/in.h>
39 #endif
40 #ifdef HAVE_UNISTD_H
41 #include <unistd.h>
42 #endif
43 
44 /* Open MPI includes */
45 #include "opal/mca/event/event.h"
46 #include "opal/class/opal_free_list.h"
47 #include "opal/mca/btl/btl.h"
48 #include "opal/mca/btl/base/base.h"
49 #include "opal/mca/mpool/mpool.h"
50 #include "opal/class/opal_hash_table.h"
51 #include "opal/util/fd.h"
52 
53 #define MCA_BTL_TCP_STATISTICS 0
54 BEGIN_C_DECLS
55 
56 extern opal_event_base_t* mca_btl_tcp_event_base;
57 
/**
 * Complete the send of a fragment.
 *
 * If the descriptor carries MCA_BTL_DES_SEND_ALWAYS_CALLBACK, its completion
 * callback is invoked with the owning BTL module, the endpoint, the
 * descriptor and the fragment's return code.  If the descriptor carries
 * MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, the fragment is afterwards handed to
 * MCA_BTL_TCP_FRAG_RETURN().
 *
 * The ownership flag is read before the callback is invoked, so the decision
 * to return the fragment does not depend on anything the callback does to the
 * descriptor.
 *
 * @param frag  Pointer to the fragment.  Any pointer-valued expression is
 *              accepted (the parameter is parenthesized at every use), but it
 *              is evaluated several times and must be free of side effects.
 */
#define MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag)                                   \
    do {                                                                       \
        int btl_ownership =                                                    \
            ((frag)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);        \
        if ((frag)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {       \
            (frag)->base.des_cbfunc(&(frag)->endpoint->endpoint_btl->super,    \
                                    (frag)->endpoint, &(frag)->base,           \
                                    (frag)->rc);                               \
        }                                                                      \
        if (btl_ownership) {                                                   \
            MCA_BTL_TCP_FRAG_RETURN((frag));                                   \
        }                                                                      \
    } while (0)
/**
 * Deliver a received fragment to the registered active-message handler.
 *
 * Only fragments whose header type equals MCA_BTL_TCP_HDR_TYPE_SEND trigger a
 * callback.  For those, the entry of mca_btl_base_active_message_trigger[]
 * selected by the header's tag is invoked with the owning BTL module, the
 * tag, the descriptor and the callback data stored in that entry.  Fragments
 * with any other header type are left untouched.
 *
 * @param frag  Pointer to the received fragment.  Any pointer-valued
 *              expression is accepted (the parameter is parenthesized at
 *              every use), but it is evaluated several times and must be free
 *              of side effects.
 */
#define MCA_BTL_TCP_RECV_TRIGGER_CB(frag)                                      \
    do {                                                                       \
        if (MCA_BTL_TCP_HDR_TYPE_SEND == (frag)->hdr.type) {                   \
            mca_btl_active_message_callback_t *trigger_cb_reg =                \
                mca_btl_base_active_message_trigger + (frag)->hdr.base.tag;    \
            trigger_cb_reg->cbfunc(&(frag)->endpoint->endpoint_btl->super,     \
                                   (frag)->hdr.base.tag, &(frag)->base,        \
                                   trigger_cb_reg->cbdata);                    \
        }                                                                      \
    } while (0)
77 
78 extern opal_list_t mca_btl_tcp_ready_frag_pending_queue;
79 extern opal_mutex_t mca_btl_tcp_ready_frag_mutex;
80 extern int mca_btl_tcp_pipe_to_progress[2];
81 extern int mca_btl_tcp_progress_thread_trigger;
82 
/** Enter a critical section: forwards @p name to opal_mutex_atomic_lock(). */
#define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) opal_mutex_atomic_lock((name))

/** Leave a critical section: forwards @p name to opal_mutex_atomic_unlock(). */
#define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) opal_mutex_atomic_unlock((name))
87 
/**
 * Activate an event, either directly or through the progress-thread pipe.
 *
 * When mca_btl_tcp_progress_thread_trigger is greater than zero, the event
 * pointer itself (sizeof(opal_event_t *) bytes) is written to
 * mca_btl_tcp_pipe_to_progress[1]; the result of the write is discarded and
 * @p value is not used on this path.  Otherwise the event is registered
 * immediately with opal_event_add(event, value).
 *
 * @param event  Pointer to the event to activate.
 * @param value  Second argument passed to opal_event_add() on the direct path.
 */
#define MCA_BTL_TCP_ACTIVATE_EVENT(event, value)                               \
    do {                                                                       \
        if (mca_btl_tcp_progress_thread_trigger > 0) {                         \
            opal_event_t *pending_ev = (opal_event_t *) (event);               \
            (void) opal_fd_write(mca_btl_tcp_pipe_to_progress[1],              \
                                 sizeof(opal_event_t *), &pending_ev);         \
        } else {                                                               \
            opal_event_add(event, (value));                                    \
        }                                                                      \
    } while (0)
99 
100 /**
101  * TCP BTL component.
102  */
103 
104 struct mca_btl_tcp_component_t {
105     mca_btl_base_component_3_0_0_t super;   /**< base BTL component */
106     uint32_t tcp_addr_count;                /**< total number of addresses */
107     uint32_t tcp_num_btls;                  /**< number of interfaces available to the TCP component */
108     unsigned int tcp_num_links;             /**< number of logical links per physical device */
109     struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
110     int tcp_free_list_num;                  /**< initial size of free lists */
111     int tcp_free_list_max;                  /**< maximum size of free lists */
112     int tcp_free_list_inc;                  /**< number of elements to alloc when growing free lists */
113     int tcp_endpoint_cache;                 /**< amount of cache on each endpoint */
114     opal_proc_table_t tcp_procs;            /**< hash table of tcp proc structures */
115     opal_mutex_t tcp_lock;                  /**< lock for accessing module state */
116     opal_list_t tcp_events;
117 
118     opal_event_t tcp_recv_event;            /**< recv event for IPv4 listen socket */
119     int tcp_listen_sd;                      /**< IPv4 listen socket for incoming connection requests */
120     unsigned short tcp_listen_port;         /**< IPv4 listen port */
121     int tcp_port_min;                       /**< IPv4 minimum port */
122     int tcp_port_range;                     /**< IPv4 port range */
123 #if OPAL_ENABLE_IPV6
124     opal_event_t tcp6_recv_event;           /**< recv event for IPv6 listen socket */
125     int tcp6_listen_sd;                     /**< IPv6 listen socket for incoming connection requests */
126     unsigned short tcp6_listen_port;        /**< IPv6 listen port */
127     int tcp6_port_min;                      /**< IPv4 minimum port */
128     int tcp6_port_range;                    /**< IPv4 port range */
129 #endif
130     /* Port range restriction */
131 
132     char*  tcp_if_include;                  /**< comma seperated list of interface to include */
133     char*  tcp_if_exclude;                  /**< comma seperated list of interface to exclude */
134     int    tcp_sndbuf;                      /**< socket sndbuf size */
135     int    tcp_rcvbuf;                      /**< socket rcvbuf size */
136     int    tcp_disable_family;              /**< disabled AF_family */
137 
138     /* free list of fragment descriptors */
139     opal_free_list_t tcp_frag_eager;
140     opal_free_list_t tcp_frag_max;
141     opal_free_list_t tcp_frag_user;
142 
143     int tcp_enable_progress_thread;         /** Support for tcp progress thread flag */
144 
145     opal_event_t tcp_recv_thread_async_event;
146     opal_mutex_t tcp_frag_eager_mutex;
147     opal_mutex_t tcp_frag_max_mutex;
148     opal_mutex_t tcp_frag_user_mutex;
149     /* Do we want to use TCP_NODELAY? */
150     int    tcp_not_use_nodelay;
151 
152     /* do we want to warn on all excluded interfaces
153      * that are not found?
154      */
155     bool report_all_unfound_interfaces;
156 };
157 typedef struct mca_btl_tcp_component_t mca_btl_tcp_component_t;
158 
159 OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
160 
161 /**
162  * BTL Module Interface
163  */
164 struct mca_btl_tcp_module_t {
165     mca_btl_base_module_t  super;  /**< base BTL interface */
166     uint16_t           tcp_ifkindex; /** <BTL kernel interface index */
167 #if 0
168     int                tcp_ifindex; /**< BTL interface index */
169 #endif
170     struct sockaddr_storage tcp_ifaddr; /**< BTL interface address */
171     uint32_t           tcp_ifmask;  /**< BTL interface netmask */
172 
173     opal_mutex_t       tcp_endpoints_mutex;
174     opal_list_t        tcp_endpoints;
175 
176     mca_btl_base_module_error_cb_fn_t tcp_error_cb;  /**< Upper layer error callback */
177 #if MCA_BTL_TCP_STATISTICS
178     size_t tcp_bytes_sent;
179     size_t tcp_bytes_recv;
180     size_t tcp_send_handler;
181 #endif
182 };
183 typedef struct mca_btl_tcp_module_t mca_btl_tcp_module_t;
184 extern mca_btl_tcp_module_t mca_btl_tcp_module;
185 
/**
 * Shut down both directions of a socket and close its descriptor.
 *
 * The results of shutdown() and close() are deliberately discarded.  The
 * macro expands to a single statement, so it can be used (followed by a
 * semicolon) as the body of an unbraced if/else, and @p socket is evaluated
 * exactly once.
 *
 * @param socket  Socket descriptor (int).
 */
#define CLOSE_THE_SOCKET(socket)                                               \
    do {                                                                       \
        const int close_the_socket_sd = (socket);                              \
        (void) shutdown(close_the_socket_sd, SHUT_RDWR);                       \
        (void) close(close_the_socket_sd);                                     \
    } while (0)
187 
188 /**
189  * TCP component initialization.
190  *
191  * @param num_btl_modules (OUT)           Number of BTLs returned in BTL array.
192  * @param allow_multi_user_threads (OUT)  Flag indicating wether BTL supports user threads (TRUE)
193  * @param have_hidden_threads (OUT)       Flag indicating wether BTL uses threads (TRUE)
194  */
195 extern mca_btl_base_module_t** mca_btl_tcp_component_init(
196     int *num_btl_modules,
197     bool allow_multi_user_threads,
198     bool have_hidden_threads
199 );
200 
201 
/**
 * Release any resources held by the BTL.
 *
 * @param btl (IN)  BTL instance.
 * @return          OPAL_SUCCESS or an error status on failure.
 */
extern int mca_btl_tcp_finalize(struct mca_btl_base_module_t *btl);
212 
213 
214 /**
215  * PML->BTL notification of change in the process list.
216  *
217  * @param btl (IN)
218  * @param nprocs (IN)     Number of processes
219  * @param procs (IN)      Set of processes
220  * @param peers (OUT)     Set of (optional) peer addressing info.
221  * @param peers (IN/OUT)  Set of processes that are reachable via this BTL.
222  * @return     OPAL_SUCCESS or error status on failure.
223  *
224  */
225 
226 extern int mca_btl_tcp_add_procs(
227     struct mca_btl_base_module_t* btl,
228     size_t nprocs,
229     struct opal_proc_t **procs,
230     struct mca_btl_base_endpoint_t** peers,
231     opal_bitmap_t* reachable
232 );
233 
234 /**
235  * PML->BTL notification of change in the process list.
236  *
237  * @param btl (IN)     BTL instance
238  * @param nproc (IN)   Number of processes.
239  * @param procs (IN)   Set of processes.
240  * @param peers (IN)   Set of peer data structures.
241  * @return             Status indicating if cleanup was successful
242  *
243  */
244 
245 extern int mca_btl_tcp_del_procs(
246     struct mca_btl_base_module_t* btl,
247     size_t nprocs,
248     struct opal_proc_t **procs,
249     struct mca_btl_base_endpoint_t** peers
250 );
251 
252 
253 /**
254  * Initiate an asynchronous send.
255  *
256  * @param btl (IN)         BTL module
257  * @param endpoint (IN)    BTL addressing information
258  * @param descriptor (IN)  Description of the data to be transfered
259  * @param tag (IN)         The tag value used to notify the peer.
260  */
261 
262 extern int mca_btl_tcp_send(
263     struct mca_btl_base_module_t* btl,
264     struct mca_btl_base_endpoint_t* btl_peer,
265     struct mca_btl_base_descriptor_t* descriptor,
266     mca_btl_base_tag_t tag
267 );
268 
269 
270 /**
271  * Initiate an asynchronous put.
272  */
273 
274 int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
275                      uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
276                      mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
277                      int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
278 
279 
280 /**
281  * Initiate an asynchronous get.
282  */
283 
284 int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
285                      uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
286                      mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
287                      int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
288 
289 /**
290  * Allocate a descriptor with a segment of the requested size.
291  * Note that the BTL layer may choose to return a smaller size
292  * if it cannot support the request.
293  *
294  * @param btl (IN)      BTL module
295  * @param size (IN)     Request segment size.
296  */
297 
298 extern mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
299     struct mca_btl_base_module_t* btl,
300     struct mca_btl_base_endpoint_t* endpoint,
301     uint8_t order,
302     size_t size,
303     uint32_t flags);
304 
305 
306 /**
307  * Return a segment allocated by this BTL.
308  *
309  * @param btl (IN)      BTL module
310  * @param descriptor (IN)  Allocated descriptor.
311  */
312 
313 extern int mca_btl_tcp_free(
314     struct mca_btl_base_module_t* btl,
315     mca_btl_base_descriptor_t* des);
316 
317 
318 /**
319  * Prepare a descriptor for send/rdma using the supplied
320  * convertor. If the convertor references data that is contigous,
321  * the descriptor may simply point to the user buffer. Otherwise,
322  * this routine is responsible for allocating buffer space and
323  * packing if required.
324  *
325  * @param btl (IN)          BTL module
326  * @param endpoint (IN)     BTL peer addressing
327  * @param convertor (IN)    Data type convertor
328  * @param reserve (IN)      Additional bytes requested by upper layer to precede user data
329  * @param size (IN/OUT)     Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
330 */
331 
332 mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
333     struct mca_btl_base_module_t* btl,
334     struct mca_btl_base_endpoint_t* peer,
335     struct opal_convertor_t* convertor,
336     uint8_t order,
337     size_t reserve,
338     size_t* size,
339     uint32_t flags
340 );
341 
/**
 * Dump information about a BTL module and one of its endpoints.
 *
 * @param btl (IN)       BTL module.
 * @param endpoint (IN)  BTL endpoint.
 * @param verbose (IN)   Verbosity setting (NOTE(review): accepted values and
 *                       output destination are not visible in this header).
 */
extern void mca_btl_tcp_dump(struct mca_btl_base_module_t *btl,
                             struct mca_btl_base_endpoint_t *endpoint,
                             int verbose);
346 
/**
 * Fault Tolerance event notification function.
 *
 * @param state (IN)  Checkpoint state.
 * @return            OPAL_SUCCESS or a failure status.
 */
int mca_btl_tcp_ft_event(int state);
353 
/**
 * A blocking send on a non-blocking socket.  Used to send the small
 * amount of connection information that identifies the endpoint.
 *
 * @param sd (IN)    Socket descriptor.
 * @param data (IN)  Buffer holding the bytes to send.
 * @param size (IN)  Number of bytes in @p data.
 */
int mca_btl_tcp_send_blocking(int sd, const void *data, size_t size);
360 
/**
 * A blocking receive that works on both blocking and non-blocking sockets.
 * Used to receive the small amount of connection information that
 * identifies the endpoint.
 *
 * The socket is blocking during the initial handshake (the caller sets up
 * the timeout in that case); otherwise it is non-blocking most of the time.
 *
 * @param sd (IN)     Socket descriptor.
 * @param data (OUT)  Buffer that receives the bytes.
 * @param size (IN)   Number of bytes to receive into @p data.
 */
int mca_btl_tcp_recv_blocking(int sd, void *data, size_t size);
371 
372 END_C_DECLS
373 #endif
374