1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpidi_ch3_impl.h"
7 #ifdef USE_PMI2_API
8 #include "pmi2.h"
9 #else
10 #include "pmi.h"
11 #endif
12 
13 #include "mpidu_sock.h"
14 
15 #include "ch3usock.h"
16 
17 /* Private packet types used only within this file */
18 /* Note that these must be smaller than the PktGeneric type and
19    their MPIDI_CH3_Pkt_type_t values are arbitrary (but must be
20    consistent) */
21 /* FIXME - We need a little security here to avoid having a random port scan
22    crash the process.  Perhaps a "secret" value for each process could be
23    published in the key-val space and subsequently sent in the open pkt. */
/* Open-request packet.  The connecting side fills this in once its socket
   connect has completed and sends it together with its process-group id
   string (see connection_post_send_pkt_and_pgid); the listening side then
   reads pg_id_len bytes of process-group id into conn->pg_id. */
typedef struct
{
    MPIDI_CH3_Pkt_type_t type;  /* MPIDI_CH3I_PKT_SC_OPEN_REQ */
    int pg_id_len;              /* strlen of the sender's process-group id, plus 1 for the NUL */
    int pg_rank;                /* sender's rank in its comm_world */
}
MPIDI_CH3I_Pkt_sc_open_req_t;
31 
/* Open-response packet sent by the listening side in reply to an open
   request or a connect/accept packet. */
typedef struct
{
    MPIDI_CH3_Pkt_type_t type;  /* MPIDI_CH3I_PKT_SC_OPEN_RESP */
    int ack;                    /* TRUE: connection accepted; FALSE: the receiver closes this connection */
}
MPIDI_CH3I_Pkt_sc_open_resp_t;
38 
/* Connect/accept packet sent by MPIDI_CH3I_Connect_to_root_sock's
   connection once the socket connect completes. */
typedef struct
{
    MPIDI_CH3_Pkt_type_t type;  /* MPIDI_CH3I_PKT_SC_CONN_ACCEPT */
    int port_name_tag;          /* tag extracted from the port name; the receiver passes it
                                   to MPIDI_CH3I_Acceptq_enqueue along with the new vc */
}
MPIDI_CH3I_Pkt_sc_conn_accept_t;
45 
46 #ifdef HAVE_NETDB_H
47 #include <netdb.h>
48 #endif
49 #ifdef HAVE_SYS_SOCKET_H
50 /* Include this for AF_INET */
51 #include <sys/socket.h>
52 #endif
53 #ifdef HAVE_ARPA_INET_H
54 /* Include this for inet_pton prototype */
55 #include <arpa/inet.h>
56 #endif
57 
58 /* FIXME: Describe what these routines do */
59 
60 /* FIXME: Clean up use of private packets (open/accept) */
61 
62 /* Partial description:
63    This file contains the routines that are used to create socket connections,
64    including the routines used to encode/decode the description of a connection
65    into/out of the "business card".
66 
   ToDo: change the "host description" to an "interface address" so that
   socket connections are targeted at particular interfaces, not
   compute nodes, and so that the address is in ready-to-use IP address
   format and does not require a gethostbyname lookup.  - Partially done
71  */
72 
73 /*
74  * Manage the connection information that is exported to other processes
75  *
76  */
77 #define MPIDI_CH3I_HOST_DESCRIPTION_KEY  "description"
78 #define MPIDI_CH3I_PORT_KEY              "port"
79 #define MPIDI_CH3I_IFNAME_KEY            "ifname"
80 
81 /*
82  * Routines for establishing a listener socket on the socket set that
83  * is used for all communication.  These should be called from the
84  * channel init and finalize routines.
85  */
86 static int MPIDI_CH3I_listener_port = 0;
87 static MPIDI_CH3I_Connection_t * MPIDI_CH3I_listener_conn = NULL;
88 
89 /* Required for (socket version) upcall to Connect_to_root (see FIXME) */
90 extern MPIDI_CH3I_Sock_set_t MPIDI_CH3I_sock_set;
91 
/*
 * MPIDU_CH3I_SetupListener - allocate the listener connection and start
 * listening on the given socket set.
 *
 * Parameters:
 *     sock_set - socket set on which the listening socket is created
 *
 * On success the file-scope MPIDI_CH3I_listener_conn holds the listening
 * socket and MPIDI_CH3I_listener_port holds the port reported by
 * MPIDI_CH3I_Sock_listen.  If the listen step fails, the connection that was
 * just allocated is released and MPIDI_CH3I_listener_conn is reset to NULL,
 * so no half-initialized listener (with a NULL sock) is left behind.
 *
 * Returns MPI_SUCCESS, or the error code of the step that failed.
 */
int MPIDU_CH3I_SetupListener( MPIDI_CH3I_Sock_set_t sock_set )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_Sock_t sock;

    mpi_errno = MPIDI_CH3I_Connection_alloc(&MPIDI_CH3I_listener_conn);
    if (mpi_errno != MPI_SUCCESS) {
        return mpi_errno;
    }

    MPL_DBG_MSG(MPIDI_CH3_DBG_CONNECT,TYPICAL,
                "Setting listener connect state to CONN_STATE_LISTENING");
    MPIDI_CH3I_listener_conn->sock        = NULL;
    MPIDI_CH3I_listener_conn->vc          = NULL;
    MPIDI_CH3I_listener_conn->state       = CONN_STATE_LISTENING;
    MPIDI_CH3I_listener_conn->send_active = NULL;
    MPIDI_CH3I_listener_conn->recv_active = NULL;

    mpi_errno = MPIDI_CH3I_Sock_listen(sock_set, MPIDI_CH3I_listener_conn,
                                       &MPIDI_CH3I_listener_port, &sock);
    if (mpi_errno != MPI_SUCCESS) {
        /* Connection_alloc allocated both the connection and its pg_id
           buffer; release both so a failed setup does not leak them or
           leave a listener without a socket behind. */
        MPL_free(MPIDI_CH3I_listener_conn->pg_id);
        MPL_free(MPIDI_CH3I_listener_conn);
        MPIDI_CH3I_listener_conn = NULL;
        return mpi_errno;
    }

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Listener port %d",
                  MPIDI_CH3I_listener_port );

    MPIDI_CH3I_listener_conn->sock = sock;

    return mpi_errno;
}
121 
MPIDU_CH3I_ShutdownListener(void)122 int MPIDU_CH3I_ShutdownListener( void )
123 {
124     int mpi_errno;
125     MPID_Progress_state progress_state;
126 
127     MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"Closing listener sock (Post_close)");
128     mpi_errno = MPIDI_CH3I_Sock_post_close(MPIDI_CH3I_listener_conn->sock);
129     if (mpi_errno != MPI_SUCCESS) {
130 	return mpi_errno;
131     }
132 
133     MPID_Progress_start(&progress_state);
134     while(MPIDI_CH3I_listener_conn != NULL)
135     {
136 	mpi_errno = MPID_Progress_wait(&progress_state);
137 
138     }
139     MPID_Progress_end(&progress_state);
140 
141     return mpi_errno;
142 }
143 
144 /* Allocates a connection and the pg_id field for a connection only.
145    Does not initialize any connection fields other than pg_id.
146    Called by routines that create connections, used in this
147    file and in ch3_progress*.c in various channels.
148 */
MPIDI_CH3I_Connection_alloc(MPIDI_CH3I_Connection_t ** connp)149 int MPIDI_CH3I_Connection_alloc(MPIDI_CH3I_Connection_t ** connp)
150 {
151     int mpi_errno = MPI_SUCCESS;
152     MPIDI_CH3I_Connection_t * conn = NULL;
153     int id_sz;
154     int pmi_errno;
155     MPIR_CHKPMEM_DECL(2);
156     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_ALLOC);
157 
158     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_ALLOC);
159 
160     MPIR_CHKPMEM_MALLOC(conn,MPIDI_CH3I_Connection_t*,
161 			sizeof(MPIDI_CH3I_Connection_t),mpi_errno,"conn", MPL_MEM_DYNAMIC);
162 
163     /* FIXME: This size is unchanging, so get it only once (at most);
164        we might prefer for connections to simply point at the single process
165        group to which the remote process belong */
166 #ifdef USE_PMI2_API
167     id_sz = MPID_MAX_JOBID_LEN;
168 #else
169     pmi_errno = PMI_KVS_Get_name_length_max(&id_sz);
170     MPIR_ERR_CHKANDJUMP1(pmi_errno, mpi_errno,MPI_ERR_OTHER,
171 			     "**pmi_get_id_length_max",
172 			     "**pmi_get_id_length_max %d", pmi_errno);
173 #endif
174     MPIR_CHKPMEM_MALLOC(conn->pg_id,char*,id_sz + 1,mpi_errno,"conn->pg_id", MPL_MEM_DYNAMIC);
175     conn->pg_id[0] = 0;           /* Be careful about pg_id in case a later
176 				     error */
177     *connp = conn;
178 
179   fn_exit:
180     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_ALLOC);
181     return mpi_errno;
182   fn_fail:
183     MPIR_CHKPMEM_REAP();
184     goto fn_exit;
185 }
186 
187 
188 /* FIXME: Why does the name include "to_root"?  */
189 
190 /* FIXME: Describe the algorithm for the connection logic */
/*
 * MPIDI_CH3I_Connect_to_root_sock - create a new VC and post a socket
 * connect to the process described by a port name.
 *
 * Parameters:
 *     port_name - port string; the host description, port number and port
 *                 name tag are extracted from it
 *     new_vc    - on success, receives the newly allocated VC, which is in
 *                 state MPIDI_CH3I_VC_STATE_CONNECTING and attached to a
 *                 connection in state CONN_STATE_CONNECT_ACCEPT.  On
 *                 failure it is set to NULL.
 *
 * The port name tag is stored in the connection's packet buffer so that it
 * can be sent to the other side once the connect completes (see
 * MPIDI_CH3_Sockconn_handle_connect_event).
 *
 * On any failure the VC and the connection (including its pg_id buffer)
 * allocated here are released.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Connect_to_root_sock(const char * port_name,
				    MPIDI_VC_t ** new_vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPIDI_CH3I_VC *vcch;
    MPIR_CHKPMEM_DECL(1);
    char host_description[MAX_HOST_DESCRIPTION_LEN];
    int port, port_name_tag;
    MPL_sockaddr_t ifaddr;
    int hasIfaddr = 0;
    MPIDI_CH3I_Connection_t * conn = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_CONNECT_TO_ROOT_SOCK);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_CONNECT_TO_ROOT_SOCK);

    /* First, create a new vc (we may use this to pass to a generic
       connection routine) */
    MPIR_CHKPMEM_MALLOC(vc,MPIDI_VC_t *,sizeof(MPIDI_VC_t),mpi_errno,"vc", MPL_MEM_DYNAMIC);
    /* FIXME - where does this vc get freed? */

    *new_vc = vc;

    /* FIXME: There may need to be an additional routine here, to ensure that the
       channel is initialized for this pair of process groups (this process
       and the remote process to which the vc will connect). */
    MPIDI_VC_Init(vc, NULL, 0);

    MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Connect to root with portstring %s",
		   port_name );

    mpi_errno = MPIDI_CH3I_Sock_get_conninfo_from_bc( port_name, host_description,
						 sizeof(host_description),
						 &port, &ifaddr, &hasIfaddr );
    MPIR_ERR_CHECK(mpi_errno);
    mpi_errno = MPIDI_GetTagFromPort(port_name, &port_name_tag);
    if (mpi_errno != MPI_SUCCESS) {
	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**argstr_port_name_tag");
    }

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CONNECT,VERBOSE,"port tag %d",port_name_tag);

    mpi_errno = MPIDI_CH3I_Connection_alloc(&conn);
    MPIR_ERR_CHECK(mpi_errno);

    /* conn->pg_id is not used for this connection */

    /* FIXME: To avoid this global (MPIDI_CH3I_sock_set) which is
       used only in ch3_progress.c and ch3_progress_connect.c in the channels,
       this should be a call into the channel, asking it to setup the
       socket for a connection and return the connection.  That will
       keep the socket set out of the general ch3 code, even if this
       is the socket utility functions. */
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,VERBOSE,(MPL_DBG_FDEST,
	  "posting connect to host %s, port %d", host_description, port ));
    mpi_errno = MPIDI_CH3I_Sock_post_connect(MPIDI_CH3I_sock_set, conn,
					host_description, port, &conn->sock);
    if (mpi_errno == MPI_SUCCESS)
    {
	MPIDI_CH3I_Pkt_sc_conn_accept_t *acceptpkt =
	    (MPIDI_CH3I_Pkt_sc_conn_accept_t *)&conn->pkt.type;
	vcch = &vc->ch;
	vcch->sock = conn->sock;
	vcch->conn = conn;
	vcch->state = MPIDI_CH3I_VC_STATE_CONNECTING;
	conn->vc = vc;
	MPL_DBG_CONNSTATECHANGE(vc,conn,CONN_STATE_CONNECT_ACCEPT);
	conn->state = CONN_STATE_CONNECT_ACCEPT;
	conn->send_active = NULL;
	conn->recv_active = NULL;

	/* place the port name tag in the pkt that will eventually be sent to
	   the other side */
	acceptpkt->port_name_tag = port_name_tag;
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
	int err_class = MPIR_ERR_GET_CLASS(mpi_errno);

	/* conn->vc has not been set on this path, so report the rank from
	   the local vc (as initialized by MPIDI_VC_Init) instead of
	   dereferencing an uninitialized pointer. */
	if (err_class == MPIDI_CH3I_SOCK_ERR_BAD_HOST)
	{
	    mpi_errno = MPIR_Err_create_code(
		MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER, "**ch3|sock|badhost",
		"**ch3|sock|badhost %s %d %s", conn->pg_id, vc->pg_rank, port_name);
	}
	else if (err_class == MPIDI_CH3I_SOCK_ERR_CONN_FAILED)
	{
	    mpi_errno = MPIR_Err_create_code(
		MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER, "**ch3|sock|connrefused",
		"**ch3|sock|connrefused %s %d %s", conn->pg_id, vc->pg_rank, port_name);
	}
	else
	{
	    MPIR_ERR_POP(mpi_errno);
	}
	/* The connection and the vc are released in fn_fail */
	goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_CONNECT_TO_ROOT_SOCK);
    return mpi_errno;
 fn_fail:
    /* fn_fail is reached only before the connection has been handed to the
       vc, so it is still owned here.  Free the pg_id buffer as well as the
       connection itself. */
    if (conn != NULL) {
	MPL_free(conn->pg_id);
	MPL_free(conn);
    }
    /* The vc is freed by the REAP below; do not hand the caller a dangling
       pointer. */
    *new_vc = NULL;
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
299 
300 /* ------------------------------------------------------------------------- */
301 /* Business card management.  These routines insert or extract connection
302    information when using sockets from the business card */
303 /* ------------------------------------------------------------------------- */
304 
305 /* FIXME: These are small routines; we may want to bring them together
306    into a more specific post-connection-for-sock */
307 
308 /* The host_description should be of length MAX_HOST_DESCRIPTION_LEN */
309 
/*
 * MPIDI_CH3I_Sock_get_conninfo_from_bc - extract the socket connection
 * information from a business card string.
 *
 * Parameters:
 *     bc               - business card string to parse
 *     host_description - receives the value of the "description" key
 *     maxlen           - size of the host_description buffer
 *     port             - receives the integer value of the "port" key
 *     ifaddr           - receives the socket address built from the optional
 *                        "ifname" key (only on non-Windows builds that have
 *                        inet_pton)
 *     hasIfaddr        - set to 0 by this routine
 *
 * NOTE(review): *hasIfaddr is never set to a nonzero value here, even when
 * the "ifname" key is present and converts successfully - confirm whether
 * callers are meant to be told that ifaddr is valid.
 *
 * Returns MPI_SUCCESS, or an MPI error code if the host description or port
 * is missing or malformed, or if the ifname value cannot be converted.
 */
int MPIDI_CH3I_Sock_get_conninfo_from_bc( const char *bc,
				     char *host_description, int maxlen,
				     int *port, MPL_sockaddr_t * ifaddr,
				     int *hasIfaddr )
{
    int mpi_errno = MPI_SUCCESS;
    int rc;
#if !defined(HAVE_WINDOWS_H) && defined(HAVE_INET_PTON)
    char ifname_buf[256];
#endif
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SOCK_GET_CONNINFO_FROM_BC);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_SOCK_GET_CONNINFO_FROM_BC);

    /* Required: host description */
    rc = MPL_str_get_string_arg(bc, MPIDI_CH3I_HOST_DESCRIPTION_KEY,
                                host_description, maxlen);
    /* --BEGIN ERROR HANDLING-- */
    if (rc == MPL_ERR_STR_FAIL) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**argstr_missinghost");
    }
    else if (rc != MPL_SUCCESS) {
        /* MPL_ERR_STR_TRUNCATED or MPL_ERR_STR_NOMEM */
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**argstr_hostd");
    }
    /* --END ERROR HANDLING-- */

    /* Required: port */
    rc = MPL_str_get_int_arg(bc, MPIDI_CH3I_PORT_KEY, port);
    /* --BEGIN ERROR HANDLING-- */
    if (rc == MPL_ERR_STR_FAIL) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**argstr_missingport");
    }
    else if (rc != MPL_SUCCESS) {
        /* MPL_ERR_STR_TRUNCATED or MPL_ERR_STR_NOMEM */
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**argstr_port");
    }
    /* --END ERROR HANDLING-- */

    /* Optional: ifname */
    /* FIXME: This is a hack to allow Windows to continue to use
       the host description string instead of the interface address
       bytes when posting a socket connection.  This should be fixed
       by changing the Sock_post_connect to only accept interface
       address.  Note also that Windows does not have the inet_pton
       routine; the Windows version of this routine will need to
       be identified or written.  See also channels/sock/ch3_progress.c */
    *hasIfaddr = 0;
#if !defined(HAVE_WINDOWS_H) && defined(HAVE_INET_PTON)
    rc = MPL_str_get_string_arg(bc, MPIDI_CH3I_IFNAME_KEY,
                                ifname_buf, sizeof(ifname_buf));
    if (rc == MPL_SUCCESS) {
        /* The key is present, so it must convert to a socket address */
        if (MPL_get_sockaddr(ifname_buf, ifaddr)) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ifnameinvalid");
        }
    }
#endif

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_SOCK_GET_CONNINFO_FROM_BC);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
375 
376 
377 /*  MPIDI_CH3U_Get_business_card_sock - does socket specific portion of
378  *  setting up a business card
379  *
380  *  Parameters:
381  *     bc_val_p     - business card value buffer pointer, updated to the next
382  *                    available location or freed if published.
383  *     val_max_sz_p - ptr to maximum value buffer size reduced by the number
384  *                    of characters written
385  *
386  */
387 
MPIDI_CH3U_Get_business_card_sock(int myRank,char ** bc_val_p,int * val_max_sz_p)388 int MPIDI_CH3U_Get_business_card_sock(int myRank,
389 				      char **bc_val_p, int *val_max_sz_p)
390 {
391     int mpi_errno = MPI_SUCCESS;
392     int str_errno = MPL_SUCCESS;
393     MPL_sockaddr_t ifaddr;
394     char ifnamestr[MAX_HOST_DESCRIPTION_LEN];
395 #ifdef MPL_USE_DBG_LOGGING
396     char *bc_orig = *bc_val_p;
397 #endif
398     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_GET_BUSINESS_CARD_SOCK);
399 
400     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_GET_BUSINESS_CARD_SOCK);
401 
402     MPIDU_CH3U_GetSockInterfaceAddr( myRank, ifnamestr, sizeof(ifnamestr), &ifaddr );
403 
404     str_errno = MPL_str_add_int_arg(bc_val_p, val_max_sz_p,
405                                      MPIDI_CH3I_PORT_KEY, MPIDI_CH3I_listener_port);
406     if (str_errno) {
407         MPIR_ERR_CHKANDJUMP(str_errno == MPL_ERR_STR_NOMEM, mpi_errno, MPI_ERR_OTHER, "**buscard_len");
408         MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
409     }
410 
411     str_errno = MPL_str_add_string_arg(bc_val_p, val_max_sz_p,
412                                         MPIDI_CH3I_HOST_DESCRIPTION_KEY, ifnamestr );
413     if (str_errno) {
414         MPIR_ERR_CHKANDJUMP(str_errno == MPL_ERR_STR_NOMEM, mpi_errno, MPI_ERR_OTHER, "**buscard_len");
415         MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
416     }
417 
418     /* Look up the interface address cooresponding to this host description */
419     /* FIXME: We should start switching to getaddrinfo instead of
420        gethostbyname */
421     /* FIXME: We don't make use of the ifname in Windows in order to
422        provide backward compatibility with the (undocumented) host
423        description string used by the socket connection routine
424        MPIDI_CH3I_Sock_post_connect.  We need to change to an interface-address
425        (already resolved) based description for better scalability and
426        to eliminate reliance on fragile DNS services. Note that this is
427        also more scalable, since the DNS server may serialize address
428        requests.  On most systems, asking for the host info of yourself
429        is resolved locally (i.e., perfectly parallel).  Regrettably, not
430        all systems do this (e.g., some versions of FreeBSD).
431     */
432 #if 0
433 #ifndef HAVE_WINDOWS_H
434     {
435 	struct hostent *info;
436 	char ifname[256];
437 	unsigned char *p;
438 	info = gethostbyname( ifname );
439 	if (info && info->h_addr_list) {
440 	    p = (unsigned char *)(info->h_addr_list[0]);
441 	    MPL_snprintf( ifname, sizeof(ifname), "%u.%u.%u.%u",
442 			   p[0], p[1], p[2], p[3] );
443 	    MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,VERBOSE,"ifname = %s",ifname );
444 	    str_errno = MPL_str_add_string_arg( bc_val_p,
445 						 val_max_sz_p,
446 						 MPIDI_CH3I_IFNAME_KEY,
447 						 ifname );
448             if (str_errno) {
449                 MPIR_ERR_CHKANDJUMP(str_errno == MPL_ERR_STR_NOMEM, mpi_errno, MPI_ERR_OTHER, "**buscard_len");
450                 MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
451             }
452 	}
453     }
454 #endif
455 #endif
456 
457     {
458 	char ifname[256]="";
459         MPL_sockaddr_to_str(&ifaddr, ifname, 256);
460 	if (ifname[0]) {
461 	    MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,VERBOSE,"ifname = %s",ifname );
462 	    str_errno = MPL_str_add_string_arg( bc_val_p,
463 						 val_max_sz_p,
464 						 MPIDI_CH3I_IFNAME_KEY,
465 						 ifname );
466             if (str_errno) {
467                 MPIR_ERR_CHKANDJUMP(str_errno == MPL_ERR_STR_NOMEM, mpi_errno, MPI_ERR_OTHER, "**buscard_len");
468                 MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
469             }
470 	}
471     }
472 
473     MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,TYPICAL,"business card is %s", bc_orig );
474 
475  fn_exit:
476     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_GET_BUSINESS_CARD_SOCK);
477     return mpi_errno;
478  fn_fail:
479     goto fn_exit;
480 }
481 
482 /* ------------------------------------------------------------------------- */
483 /* Below will be/is the code that is used to create a connection and
484  * to handle changes to the state of a connection.
485  */
486 /* ------------------------------------------------------------------------- */
487 static int connection_post_recv_pkt(MPIDI_CH3I_Connection_t * conn);
488 static int connection_post_send_pkt(MPIDI_CH3I_Connection_t * conn);
489 static int connection_post_send_pkt_and_pgid(MPIDI_CH3I_Connection_t * conn);
490 static int connection_post_sendq_req(MPIDI_CH3I_Connection_t * conn);
491 static void connection_destroy(MPIDI_CH3I_Connection_t * conn);
492 
493 /* This routine is called in response to an MPIDI_CH3I_SOCK_OP_ACCEPT event
494    in ch3_progress */
MPIDI_CH3_Sockconn_handle_accept_event(void)495 int MPIDI_CH3_Sockconn_handle_accept_event( void )
496 {
497     int mpi_errno = MPI_SUCCESS;
498     MPIDI_CH3I_Connection_t * conn;
499     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_ACCEPT_EVENT);
500 
501     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_ACCEPT_EVENT);
502 
503     mpi_errno = MPIDI_CH3I_Connection_alloc(&conn);
504     MPIR_ERR_CHECK(mpi_errno);
505     mpi_errno = MPIDI_CH3I_Sock_accept(MPIDI_CH3I_listener_conn->sock,
506 				  MPIDI_CH3I_sock_set, conn, &conn->sock);
507     if (mpi_errno != MPI_SUCCESS) {
508 	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|sock|accept");
509     }
510 
511     conn->vc = NULL;
512     MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_LRECV_PKT);
513     conn->state = CONN_STATE_OPEN_LRECV_PKT;
514     conn->send_active = NULL;
515     conn->recv_active = NULL;
516 
517     mpi_errno = connection_post_recv_pkt(conn);
518     MPIR_ERR_CHECK(mpi_errno);
519 
520  fn_exit:
521     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_ACCEPT_EVENT);
522 
523     return mpi_errno;
524  fn_fail:
525     goto fn_exit;
526 }
527 
/*
 * MPIDI_CH3_Sockconn_handle_connect_event - handle the completion of a
 * socket connect posted by this process.
 *
 * Parameters:
 *     conn        - connection whose connect completed
 *     event_error - MPI_SUCCESS, or the error reported for the connect
 *
 * For a connection in CONN_STATE_CONNECTING or CONN_STATE_DISCARD an
 * open-request packet (carrying this process's pg id length and comm_world
 * rank) is posted together with the pg id.  Otherwise the connection must be
 * in CONN_STATE_CONNECT_ACCEPT and a connect/accept packet carrying the
 * previously stored port name tag is posted.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3_Sockconn_handle_connect_event( MPIDI_CH3I_Connection_t *conn,
					     int event_error )
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNECT_EVENT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNECT_EVENT);

    /* --BEGIN ERROR HANDLING-- */
    if (event_error != MPI_SUCCESS) {
        /* When the connect fails, conn->vc etc. is probably invalid, so all
           we can do is report the failure */
        mpi_errno = event_error;
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|sock|connfailed" );
    }
    /* --END ERROR HANDLING-- */

    if (conn->state != CONN_STATE_CONNECTING && conn->state != CONN_STATE_DISCARD) {
        /* Connect/accept path: the only remaining legal state is
           CONN_STATE_CONNECT_ACCEPT */
        MPIDI_CH3I_Pkt_sc_conn_accept_t *accept_hdr =
            (MPIDI_CH3I_Pkt_sc_conn_accept_t *)&conn->pkt.type;
        int saved_tag;

        MPIR_Assert(conn->state == CONN_STATE_CONNECT_ACCEPT);
        MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_CSEND);
        conn->state = CONN_STATE_OPEN_CSEND;

        /* The packet buffer already holds the port name tag.  In memory
           debugging mode MPIDI_Pkt_init resets the packet contents, so the
           tag is saved across the init and then restored. */
        saved_tag = accept_hdr->port_name_tag;
        MPIDI_Pkt_init(accept_hdr, MPIDI_CH3I_PKT_SC_CONN_ACCEPT);
        accept_hdr->port_name_tag = saved_tag;

        mpi_errno = connection_post_send_pkt(conn);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_INTERN,
                                "**ch3|sock|scconnaccept");
        }
    }
    else {
        /* Open-request path */
        MPIDI_CH3I_Pkt_sc_open_req_t *open_hdr =
            (MPIDI_CH3I_Pkt_sc_open_req_t *)&conn->pkt.type;

        /* A connection in CONN_STATE_DISCARD keeps its state; only a
           connecting one advances to OPEN_CSEND */
        if (conn->state == CONN_STATE_CONNECTING) {
            MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_CSEND);
            conn->state = CONN_STATE_OPEN_CSEND;
        }

        MPIDI_Pkt_init(open_hdr, MPIDI_CH3I_PKT_SC_OPEN_REQ);
        open_hdr->pg_id_len = (int) strlen(MPIDI_Process.my_pg->id) + 1;
        open_hdr->pg_rank = MPIR_Process.comm_world->rank;

        mpi_errno = connection_post_send_pkt_and_pgid(conn);
        MPIR_ERR_CHECK(mpi_errno);
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNECT_EVENT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
589 
/*
 * MPIDI_CH3_Sockconn_handle_close_event - handle the close of a socket.
 *
 * Parameters:
 *     conn - connection whose socket was closed; may be NULL, in which case
 *            nothing is done
 *
 * Behavior by connection state:
 *   CONN_STATE_DISCARD - a close is posted on the socket and the connection
 *                        moves to CONN_STATE_CLOSING; it is destroyed on the
 *                        close event that follows.
 *   CONN_STATE_CLOSING - if this connection is still the one attached to its
 *                        vc, the vc's channel state is reset and
 *                        MPIDI_CH3U_Handle_connection is told the vc has
 *                        terminated; the connection is then destroyed.
 *   otherwise          - must be the listener (CONN_STATE_LISTENING): the
 *                        file-scope listener variables are cleared, progress
 *                        completion is signalled and the connection is
 *                        destroyed.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3_Sockconn_handle_close_event( MPIDI_CH3I_Connection_t * conn )
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CLOSE_EVENT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CLOSE_EVENT);

    /* If the conn pointer is NULL then the close was intentional */
    /* FIXME: What does the above comment mean? */
    if (conn == NULL) {
        goto fn_exit;
    }

    if (conn->state == CONN_STATE_DISCARD) {
        /* Post a close so that the socket is closed and memory leaks are
           avoided; the connection is not destroyed yet */
        MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"CLosing sock (Post_close)");
        conn->state = CONN_STATE_CLOSING;
        mpi_errno = MPIDI_CH3I_Sock_post_close(conn->sock);
        MPIR_ERR_CHECK(mpi_errno);
        goto fn_exit;
    }

    if (conn->state == CONN_STATE_CLOSING) {
        MPIR_Assert(conn->send_active == NULL);
        MPIR_Assert(conn->recv_active == NULL);

        if (conn->vc != NULL) {
            MPIDI_CH3I_VC *chan = &conn->vc->ch;

            conn->sock = MPIDI_CH3I_SOCK_INVALID_SOCK;
            MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_CLOSED);
            conn->state = CONN_STATE_CLOSED;

            /* Only touch the vc's channel data if conn was not the loser
               in a head-to-head resolution. */
            if (chan->conn == conn) {
                MPL_DBG_VCCHSTATECHANGE(conn->vc,VC_STATE_UNCONNECTED);
                chan->state = MPIDI_CH3I_VC_STATE_UNCONNECTED;
                chan->sock  = MPIDI_CH3I_SOCK_INVALID_SOCK;

                /* This step is important; without it, test
                   disconnect_reconnect fails because the vc->ch.conn
                   connection would continue to be used even though its
                   memory has been freed */
                chan->conn = NULL;

                /* Handle_connection takes care of updating the state on
                   the VC */
                mpi_errno = MPIDI_CH3U_Handle_connection(conn->vc, MPIDI_VC_EVENT_TERMINATED);
                MPIR_ERR_CHECK(mpi_errno);
            }
        }

        /* The VC was likely freed in the _Handle_connection call and should
           not be referenced anymore in any case. */
        conn->vc = NULL;
    }
    else {
        /* The listener socket has been closed */
        MPIR_Assert(conn->state == CONN_STATE_LISTENING);
        MPIDI_CH3I_listener_conn = NULL;
        MPIDI_CH3I_listener_port = 0;

        MPIDI_CH3_Progress_signal_completion();
    }

    connection_destroy(conn);

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CLOSE_EVENT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
657 
/* Cycle through the connection setup states.

   Dispatches on the type of the packet currently held in conn->pkt:
     MPIDI_CH3I_PKT_SC_OPEN_REQ    - post a read of the sender's pg id
     MPIDI_CH3I_PKT_SC_CONN_ACCEPT - create a vc for the connection, post an
                                     open response with ack = TRUE and
                                     enqueue the vc on the accept queue
     MPIDI_CH3I_PKT_SC_OPEN_RESP   - either complete the connection or, if
                                     it was refused or is being discarded,
                                     post a close on the socket
   Any other packet type is reported as a fatal internal error.

   Returns MPI_SUCCESS or an MPI error code. */
/* FIXME: separate out the accept and connect sides to make it easier
   to follow the logic */
int MPIDI_CH3_Sockconn_handle_conn_event( MPIDI_CH3I_Connection_t * conn )
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONN_EVENT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONN_EVENT);

    /* FIXME: Is there an assumption about conn->state? */

    if (conn->pkt.type == MPIDI_CH3I_PKT_SC_OPEN_REQ) {
	MPIDI_CH3I_Pkt_sc_open_req_t *openpkt =
	    (MPIDI_CH3I_Pkt_sc_open_req_t *)&conn->pkt.type;
	/* Answer to FIXME: it appears from the control flow that this is
	   the required state */
	MPIR_Assert( conn->state == CONN_STATE_OPEN_LRECV_PKT);
	MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_LRECV_DATA);
	conn->state = CONN_STATE_OPEN_LRECV_DATA;
	/* Read the pg id that accompanies the open request; the length comes
	   from the packet itself.
	   NOTE(review): pg_id_len is not checked here against the size of
	   the conn->pg_id buffer - confirm it is validated elsewhere. */
	mpi_errno = MPIDI_CH3I_Sock_post_read(conn->sock, conn->pg_id,
					 openpkt->pg_id_len,
					 openpkt->pg_id_len, NULL);
	MPIR_ERR_CHECK(mpi_errno);
    }
    else if (conn->pkt.type == MPIDI_CH3I_PKT_SC_CONN_ACCEPT) {
	MPIDI_VC_t *vc;
	MPIDI_CH3I_VC *vcch;
	int port_name_tag;
	/* Both views alias the same packet buffer: the incoming
	   connect/accept packet is read first and then overwritten with the
	   outgoing open response. */
	MPIDI_CH3I_Pkt_sc_conn_accept_t *acceptpkt =
	    (MPIDI_CH3I_Pkt_sc_conn_accept_t *)&conn->pkt.type;
	MPIDI_CH3I_Pkt_sc_open_resp_t *openresp =
	    (MPIDI_CH3I_Pkt_sc_open_resp_t *)&conn->pkt.type;

	vc = (MPIDI_VC_t *) MPL_malloc(sizeof(MPIDI_VC_t), MPL_MEM_ADDRESS);
	/* --BEGIN ERROR HANDLING-- */
	if (vc == NULL) {
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __func__, __LINE__, MPI_ERR_OTHER,
					     "**nomem", NULL);
	    goto fn_fail;
	}
	/* --END ERROR HANDLING-- */
	/* FIXME - where does this vc get freed? */

	MPIDI_VC_Init(vc, NULL, 0);

	vcch = &vc->ch;
	MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_CONNECTING);
	vcch->state = MPIDI_CH3I_VC_STATE_CONNECTING;
	vcch->sock = conn->sock;
	vcch->conn = conn;
	conn->vc   = vc;
	/* Save the tag before the packet buffer is reused for the response */
	port_name_tag = acceptpkt->port_name_tag;

	MPIDI_Pkt_init(openresp, MPIDI_CH3I_PKT_SC_OPEN_RESP);
	openresp->ack = TRUE;

	/* FIXME: Possible ambiguous state (two ways to get to OPEN_LSEND) */
	MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_LSEND);
	conn->state = CONN_STATE_OPEN_LSEND;
	mpi_errno = connection_post_send_pkt(conn);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_INTERN,
				"**ch3|sock|scconnaccept");
	}

	/* ENQUEUE vc */
	MPIDI_CH3I_Acceptq_enqueue(vc, port_name_tag);

    }
    else if (conn->pkt.type == MPIDI_CH3I_PKT_SC_OPEN_RESP) {
	MPIDI_CH3I_Pkt_sc_open_resp_t *openpkt =
	    (MPIDI_CH3I_Pkt_sc_open_resp_t *)&conn->pkt.type;
	/* FIXME: is this the correct assert? */

	if (openpkt->ack && conn->state != CONN_STATE_DISCARD) {
	    /* The other side accepted: the connection is now usable */
	    MPIR_Assert( conn->state == CONN_STATE_OPEN_CRECV );
	    MPIDI_CH3I_VC *vcch = &conn->vc->ch;
	    MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_CONNECTED);
	    conn->state = CONN_STATE_CONNECTED;
	    vcch->state = MPIDI_CH3I_VC_STATE_CONNECTED;
	    MPIR_Assert(vcch->conn == conn);
	    MPIR_Assert(vcch->sock == conn->sock);

	    mpi_errno = connection_post_recv_pkt(conn);
	    MPIR_ERR_CHECK(mpi_errno);
	    mpi_errno = connection_post_sendq_req(conn);
	    if (mpi_errno != MPI_SUCCESS) {
		MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_INTERN,
				    "**ch3|sock|scopenresp");
	    }
	}
	else {
	    MPIDI_CH3I_VC *vcch = &conn->vc->ch;
	    /* FIXME: Should conn->vc be freed? Who allocated? Why not? */
	    /* FIXME: Should probably reduce ref count on conn->vc */
	    /* FIXME: What happens to the state of the associated VC?
	       Why isn't it changed?  Is there an assert here,
	       such as conn->vc->conn != conn (there is another connection
	       chosen for the vc)? */
            /* Answer to FIXME */
            /* Neither freed nor updated. This connection is the loser of
               a head-to-head connection. The VC is still in use, but by
               another socket connection. The refcount is not incremented
               by changing the associated connection. */
	    /* MPIR_Assert( conn->vc->ch.conn != conn ); */
	    /* Set the candidate vc for this connection to NULL (we
	       are discarding this connection because (I think) we
	       are performing a head-to-head connection, and this
	       connection is being rejected in favor of the connection
	       from the other side). */
	    if (vcch->conn == conn) vcch->conn = NULL;
	    MPL_DBG_CONNSTATECHANGE_MSG(conn->vc,conn,CONN_STATE_CLOSING,
					"because ack on OPEN_CRECV was false");
	    conn->vc = NULL;
	    conn->state = CONN_STATE_CLOSING;
	    /* FIXME: What does post close do here? */
            /* Answer to FIXME: */
            /* Since the connection is discarded, the socket is
               no longer needed and should be closed. This is initiated with
               the post close command. It also causes the socket to be
               removed from the socket set, so there is no more polling on
               this socket. */
	    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"CLosing sock (Post_close)");
	    mpi_errno = MPIDI_CH3I_Sock_post_close(conn->sock);
	    MPIR_ERR_CHECK(mpi_errno);
	}
    }
    /* --BEGIN ERROR HANDLING-- */
    else {
	/* Unexpected packet type during connection setup */
	MPL_DBG_STMT(MPIDI_CH3_DBG_CONNECT,VERBOSE,MPIDI_DBG_Print_packet(&conn->pkt));
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __func__, __LINE__, MPI_ERR_INTERN,
					 "**ch3|sock|badpacket", "**ch3|sock|badpacket %d", conn->pkt.type);
	goto fn_fail;
    }
    /* --END ERROR HANDLING-- */


 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONN_EVENT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
801 
802 /* FIXME: This should really be combined with handle_conn_event */
/*
 * MPIDI_CH3_Sockconn_handle_connopen_event - answer an open request that
 * was received on connection 'conn'.
 *
 * On entry conn->pkt is required to hold an open request (its pg_rank field
 * is read) and conn->pg_id names the sender's process group.  The routine
 * looks up that process group and the VC of the sending rank, decides
 * whether to accept the connection, overwrites conn->pkt with an open
 * response whose 'ack' field carries the decision, moves conn to
 * CONN_STATE_OPEN_LSEND and posts the send of the response.
 *
 * Decision rules, in order:
 *   - pg->finalize == 1: refuse.
 *   - the VC has no connection yet: accept.
 *   - head-to-head (the VC already has a connection):
 *       same process group       - accept iff our comm_world rank is lower
 *                                  than the sender's rank;
 *       different process groups - accept iff our pg id compares (strcmp)
 *                                  lower than the sender's pg id.
 * When a head-to-head request is accepted, the VC's previous connection is
 * put into CONN_STATE_DISCARD before the VC is rebound to 'conn'.
 *
 * Returns MPI_SUCCESS, or an MPI error code if the process group cannot be
 * found or the response cannot be posted.
 */
int MPIDI_CH3_Sockconn_handle_connopen_event( MPIDI_CH3I_Connection_t * conn )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_PG_t * pg = NULL;
    int pg_rank;
    int ack;
    MPIDI_VC_t * vc;
    MPIDI_CH3I_VC *vcch;
    MPIDI_CH3I_Connection_t *old_conn;
    /* The request and the response are two views of the same storage
       (conn->pkt): every request field must be read before the response
       is initialized. */
    MPIDI_CH3I_Pkt_sc_open_req_t *openpkt =
        (MPIDI_CH3I_Pkt_sc_open_req_t *)&conn->pkt.type;
    MPIDI_CH3I_Pkt_sc_open_resp_t *openresp =
        (MPIDI_CH3I_Pkt_sc_open_resp_t *)&conn->pkt.type;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNOPEN_EVENT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNOPEN_EVENT);

    /* Look up pg based on conn->pg_id.  Treat both a reported error and a
       missing process group as a lookup failure. */
    mpi_errno = MPIDI_PG_Find(conn->pg_id, &pg);
    if (mpi_errno != MPI_SUCCESS || pg == NULL) {
        MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,
                             "**pglookup",
                             "**pglookup %s", conn->pg_id);
    }

    /* We require that the packet be the open_req type */
    pg_rank = openpkt->pg_rank;
    MPIDI_PG_Get_vc_set_active(pg, pg_rank, &vc);
    MPIR_Assert(vc->pg_rank == pg_rank);

    vcch = &vc->ch;
    old_conn = vcch->conn;

    /* ---- Step 1: decide whether to accept this connection ---- */
    if (pg->finalize == 1) {
        /* pg is marked as finalizing: always refuse */
        ack = FALSE;
    }
    else if (old_conn == NULL) {
        /* no head-to-head connects, accept the connection */
        ack = TRUE;
    }
    else if (pg == MPIDI_Process.my_pg) {
        /* head to head situation; the other process is in the same
           comm_world; just compare the ranks */
        if (MPIR_Process.comm_world->rank < pg_rank) {
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST,
                "vc=%p,conn=%p:Accept head-to-head connection (my process group), discarding vcch->conn=%p",vc,conn, old_conn));
            ack = TRUE;
        }
        else {
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST,
                "vc=%p,conn=%p:Refuse head-to-head connection (my process group)",vc,conn));
            ack = FALSE;
        }
    }
    else {
        /* head to head situation; the two processes are in different
           comm_worlds; compare their unique pg_ids. */
        if (strcmp(MPIDI_Process.my_pg->id, pg->id) < 0) {
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST,
                "vc=%p,conn=%p:Accept head-to-head connection (two process groups), discarding vcch->conn=%p",vc,conn, old_conn));
            ack = TRUE;
        }
        else {
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST,
                "vc=%p,conn=%p:Refuse head-to-head connection (two process groups)",vc,conn));
            ack = FALSE;
        }
    }

    /* ---- Step 2: on accept, bind the VC to this connection ---- */
    if (ack) {
        if (old_conn != NULL) {
            /* mark old connection; it lost the head-to-head resolution */
            MPL_DBG_CONNSTATECHANGE(vc,old_conn,CONN_STATE_DISCARD);
            old_conn->state = CONN_STATE_DISCARD;
        }

        /* accept connection */
        MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_CONNECTING);
        vcch->state = MPIDI_CH3I_VC_STATE_CONNECTING;
        vcch->sock = conn->sock;
        vcch->conn = conn;
        conn->vc = vc;
    }

    /* ---- Step 3: build the response in place and send it ---- */
    MPIDI_Pkt_init(openresp, MPIDI_CH3I_PKT_SC_OPEN_RESP);
    openresp->ack = ack ? TRUE : FALSE;

    MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_LSEND);
    conn->state = CONN_STATE_OPEN_LSEND;
    mpi_errno = connection_post_send_pkt(conn);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_INTERN,
                            "**ch3|sock|open_lrecv_data");
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNOPEN_EVENT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
931 
932 /* FIXME: This routine is called when?  What is valid in conn? */
/*
 * MPIDI_CH3_Sockconn_handle_connwrite - advance the connection handshake
 * once the packet previously posted for sending on 'conn' has gone out.
 *
 * What happens depends on conn->state:
 *   CONN_STATE_OPEN_CSEND - the open request was sent; move to
 *                           CONN_STATE_OPEN_CRECV and post a receive for
 *                           the open response.
 *   CONN_STATE_DISCARD    - post the same receive, but leave the state
 *                           unchanged.
 *   CONN_STATE_OPEN_LSEND - the open response was sent.  If it carried
 *                           ack == TRUE, the connection and its VC become
 *                           CONNECTED, a packet receive is posted and the
 *                           first queued send (if any) is started.
 *                           Otherwise the connection is detached from its
 *                           VC, moved to CONN_STATE_CLOSING and its socket
 *                           is closed.
 *   any other state       - nothing is done.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3_Sockconn_handle_connwrite( MPIDI_CH3I_Connection_t * conn )
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNWRITE);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNWRITE);

    switch (conn->state) {
    case CONN_STATE_OPEN_CSEND:
        /* finished sending open request packet */
        MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_OPEN_CRECV);
        conn->state = CONN_STATE_OPEN_CRECV;
        /* fall through */
    case CONN_STATE_DISCARD:
        /* post receive for open response packet */
        mpi_errno = connection_post_recv_pkt(conn);
        MPIR_ERR_CHECK(mpi_errno);
        break;

    case CONN_STATE_OPEN_LSEND:
    {
        /* finished sending open response packet; conn->pkt still holds it */
        MPIDI_CH3I_Pkt_sc_open_resp_t *resp =
            (MPIDI_CH3I_Pkt_sc_open_resp_t *)&conn->pkt.type;

        if (!(resp->ack == TRUE)) {
            /* head-to-head connections - close this connection */
            MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_CLOSING);
            conn->state = CONN_STATE_CLOSING;

            /* zero out the vc to prevent trouble in _handle_close_event */
            conn->vc = NULL;

            MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"Closing sock2 (Post_close)");
            mpi_errno = MPIDI_CH3I_Sock_post_close(conn->sock);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                    "**sock_post_close");
            }
        }
        else {
            MPIDI_CH3I_VC *ch = &conn->vc->ch;

            /* the connection is established on both the conn and VC side */
            MPL_DBG_CONNSTATECHANGE(conn->vc,conn,CONN_STATE_CONNECTED);
            conn->state = CONN_STATE_CONNECTED;
            MPL_DBG_VCCHSTATECHANGE(conn->vc,VC_STATE_CONNECTED);
            ch->state = MPIDI_CH3I_VC_STATE_CONNECTED;

            /* post receive for packet header */
            mpi_errno = connection_post_recv_pkt(conn);
            MPIR_ERR_CHECK(mpi_errno);

            /* start sending whatever is waiting on the VC's send queue */
            mpi_errno = connection_post_sendq_req(conn);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_INTERN,
                                    "**ch3|sock|openlsend");
            }
        }
        break;
    }

    default:
        /* no handshake work for any other state */
        break;
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_SOCKCONN_HANDLE_CONNWRITE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
997 
998 /* ----------------------------------------------------------------------- */
999 /* FIXME: What does this do? */
/*
 * MPIDI_CH3I_VC_post_sockconnect - begin connecting an unconnected VC.
 *
 * If the VC is in MPIDI_CH3I_VC_STATE_UNCONNECTED it is moved to
 * MPIDI_CH3I_VC_STATE_CONNECTING, its connection string is fetched with
 * MPIDI_PG_GetConnString() and the connect itself is handed to
 * MPIDI_CH3I_Sock_connect().  In any other state the routine does nothing
 * and reports success.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_VC_post_sockconnect(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *ch = &vc->ch;
    char connstr[MPIDI_MAX_KVS_VALUE_LEN];
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_VC_POST_SOCKCONNECT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_VC_POST_SOCKCONNECT);

    if (ch->state != MPIDI_CH3I_VC_STATE_UNCONNECTED) {
        MPL_DBG_MSG_P(MPIDI_CH3_DBG_CONNECT,TYPICAL,"MT: vc=(%p) is already connecting/ed", vc);
        MPL_DBG_MSG(MPIDI_CH3_DBG_CONNECT,TYPICAL,"Aborting posting a connect");
        /*************** MT *****************/
        /* We can get here in three ways:
         * 1) Another thread posted a connect while this thread was
         *    blocked in MPIDI_PG_GetConnString()
         *    (VC state = MPIDI_CH3I_VC_STATE_CONNECTING).
         * 2) Another thread posted a connect and also completed it while
         *    this thread was blocked in MPIDI_PG_GetConnString()
         *    (VC state = MPIDI_CH3I_VC_STATE_CONNECTED).
         * 3) Another thread received a connect from the process we are
         *    connecting to and opened a connection while this thread was
         *    blocked in MPIDI_PG_GetConnString()
         *    (VC state = MPIDI_CH3I_VC_STATE_CONNECTING or
         *     MPIDI_CH3I_VC_STATE_CONNECTED).
         * Bailing out is safe in every case because the other thread
         * handles the connection.  In case 3, should we post a connect
         * before the thread processing the remote connect has set the VC
         * state, the head-to-head resolution code discards one of the two
         * connections.
         */
        mpi_errno = MPI_SUCCESS;
        goto fn_exit;
    }

    /* MPIDI_PG_GetConnString() can block & release the lock for the
     * current thread.  Setting the VC to *CONNECTING first keeps other
     * threads from also trying to obtain the ConnString.
     */
    MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_CONNECTING);
    ch->state = MPIDI_CH3I_VC_STATE_CONNECTING;
    MPL_DBG_MSG_P(MPIDI_CH3_DBG_CONNECT,TYPICAL,"vc=(%p) Going ahead to obtain connstring", vc);

    mpi_errno = MPIDI_PG_GetConnString( vc->pg, vc->pg_rank, connstr, sizeof(connstr));
    MPIR_ERR_CHECK(mpi_errno);

    /* The result of the connect attempt is returned to the caller as is. */
    mpi_errno = MPIDI_CH3I_Sock_connect( vc, connstr, sizeof(connstr) );

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_VC_POST_SOCKCONNECT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
1056 /* end MPIDI_CH3I_VC_post_sockconnect() */
1057 
1058 /* Given a connection string, start the process of creating a socket
1059    connection to that designated interface (on a node).  This routine
1060    is used in MPIDI_CH3I_VC_post_sockconnect.
1061 
1062    vallen = sizeof(val)
1063 */
/*
 * MPIDI_CH3I_Sock_connect - post a socket connect for 'vc' using the
 * connection string 'val'.
 *
 * vc      VC to connect; nothing is done (and MPI_SUCCESS is returned)
 *         unless its channel state is MPIDI_CH3I_VC_STATE_CONNECTING.
 * val     connection string, decoded with
 *         MPIDI_CH3I_Sock_get_conninfo_from_bc() into a host description,
 *         a port and, optionally, an interface address.
 * vallen  size of val; not referenced by this routine.
 *
 * On success a new connection object is allocated, a connect is posted on
 * it, and the connection and the VC are linked to each other with the
 * connection in CONN_STATE_CONNECTING.  If posting the connect fails, the
 * VC is put into MPIDI_CH3I_VC_STATE_FAILED.  On every failure after the
 * allocation the connection object is destroyed.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Sock_connect( MPIDI_VC_t *vc, const char val[], int vallen )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vcch = &vc->ch;
    MPIDI_CH3I_Connection_t * conn = NULL;
    char host_description[MAX_HOST_DESCRIPTION_LEN];
    MPL_sockaddr_t ifaddr;
    int port;
    int hasIfaddr = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SOCK_CONNECT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_SOCK_CONNECT);

    if (vcch->state != MPIDI_CH3I_VC_STATE_CONNECTING) {
        MPL_DBG_MSG_P(MPIDI_CH3_DBG_CONNECT,TYPICAL,"MT: vc=(%p) is already connected", vc);
        MPL_DBG_MSG(MPIDI_CH3_DBG_CONNECT,TYPICAL,"Aborting posting a connect");
        /*************** MT *****************/
        /* Another thread received a connect from the process this thread
         * is connecting to and opened a connection while this thread was
         * blocked in MPIDI_PG_GetConnString()
         * (VC state = MPIDI_CH3I_VC_STATE_CONNECTED).
         * Bailing out is safe: the other thread handles the connection.
         * Should we instead post a connect before that thread has set the
         * VC state, the head-to-head resolution code discards one of the
         * two connections.
         */
        mpi_errno = MPI_SUCCESS;
        goto fn_exit;
    }
    MPL_DBG_MSG_P(MPIDI_CH3_DBG_CONNECT,TYPICAL,"Posting a connect for vc=(%p)", vc);

    /* Decode the connection string. */
    mpi_errno = MPIDI_CH3I_Sock_get_conninfo_from_bc( val, host_description,
                                                      sizeof(host_description),
                                                      &port, &ifaddr, &hasIfaddr );
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_Connection_alloc(&conn);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|sock|connalloc");
    }

    /* FIXME: This is a hack to allow Windows to continue to use
       the host description string instead of the interface address
       bytes when posting a socket connection.  This should be fixed
       by changing the Sock_post_connect to only accept interface
       address. */
#ifndef HAVE_WINDOWS_H
    if (hasIfaddr) {
        mpi_errno = MPIDI_CH3I_Sock_post_connect_ifaddr(MPIDI_CH3I_sock_set,
                                                        conn, &ifaddr, port,
                                                        &conn->sock);
    }
    else
#endif
    {
        mpi_errno = MPIDI_CH3I_Sock_post_connect(MPIDI_CH3I_sock_set, conn,
                                                 host_description, port,
                                                 &conn->sock);
    }

    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_FAILED);
        vcch->state = MPIDI_CH3I_VC_STATE_FAILED;
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, __func__, __LINE__, MPI_ERR_OTHER, "**ch3|sock|postconnect",
                                         "**ch3|sock|postconnect %d %d %s", MPIR_Process.comm_world->rank, vc->pg_rank, val);
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

    /* The connect is in flight: tie the VC and the connection together. */
    MPL_DBG_CONNSTATECHANGE(vc,conn,CONN_STATE_CONNECTING);
    vcch->sock = conn->sock;
    vcch->conn = conn;
    conn->vc = vc;
    conn->state = CONN_STATE_CONNECTING;
    conn->send_active = NULL;
    conn->recv_active = NULL;

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_SOCK_CONNECT);
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* conn is still NULL if the failure happened before the allocation */
    if (conn) {
        connection_destroy(conn);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1159 
1160 
1161 /* FIXME: What does this do? */
1162 /* Guess: Setup a wait-to-read on the socket that was set after the accept
1163    was handled */
1164 /* Wrong guess.  */
/*
 * connection_post_recv_pkt - post a read on conn->sock that fills conn->pkt.
 *
 * The size of conn->pkt is passed for both length arguments of
 * MPIDI_CH3I_Sock_post_read (presumably the minimum and maximum number of
 * bytes to read - confirm against that routine's documentation).
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static int connection_post_recv_pkt(MPIDI_CH3I_Connection_t * conn)
{
    const size_t pkt_size = sizeof(conn->pkt);
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_POST_RECV_PKT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_POST_RECV_PKT);

    mpi_errno = MPIDI_CH3I_Sock_post_read(conn->sock, &conn->pkt,
                                          pkt_size, pkt_size, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* Success and failure share the same exit path. */
 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_POST_RECV_PKT);
    return mpi_errno;
}
1180 
1181 
/*
 * connection_post_send_pkt - post a write on conn->sock that sends conn->pkt.
 *
 * The size of conn->pkt is passed for both length arguments of
 * MPIDI_CH3I_Sock_post_write (presumably the minimum and maximum number of
 * bytes to write - confirm against that routine's documentation).
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static int connection_post_send_pkt(MPIDI_CH3I_Connection_t * conn)
{
    const size_t pkt_size = sizeof(conn->pkt);
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_POST_SEND_PKT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_POST_SEND_PKT);

    MPL_DBG_PKT(conn,&conn->pkt,"connect");
    mpi_errno = MPIDI_CH3I_Sock_post_write(conn->sock, &conn->pkt,
                                           pkt_size, pkt_size, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* Success and failure share the same exit path. */
 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_POST_SEND_PKT);
    return mpi_errno;
}
1198 
/*
 * connection_post_send_pkt_and_pgid - post a vectored write on conn->sock
 * that sends conn->pkt followed by this process's process-group id.
 *
 * conn->iov[0] and conn->iov[1] are overwritten: entry 0 describes
 * conn->pkt, entry 1 describes the id string of MPIDI_Process.my_pg,
 * including its terminating NUL byte.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static int connection_post_send_pkt_and_pgid(MPIDI_CH3I_Connection_t * conn)
{
    int mpi_errno = MPI_SUCCESS;
    void *my_pg_id = (void *) MPIDI_Process.my_pg->id;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_POST_SEND_PKT_AND_PGID);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_POST_SEND_PKT_AND_PGID);

    /* first element: the packet itself */
    conn->iov[0].iov_len = (int) sizeof(conn->pkt);
    conn->iov[0].iov_base = (void *) &conn->pkt;

    /* second element: our pg id, with the terminator (+1) */
    conn->iov[1].iov_len = (int) strlen(my_pg_id) + 1;
    conn->iov[1].iov_base = my_pg_id;

    MPL_DBG_PKT(conn,&conn->pkt,"connect-pgid");
    mpi_errno = MPIDI_CH3I_Sock_post_writev(conn->sock, conn->iov, 2, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* Success and failure share the same exit path. */
 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_POST_SEND_PKT_AND_PGID);
    return mpi_errno;
}
1220 
1221 /* FIXME: This function also used in channels/sock/src/ch3_progress.c */
/*
 * connection_post_sendq_req - start sending the request at the head of the
 * send queue of the VC attached to 'conn'.
 *
 * conn->send_active is set to the head of that queue.  If the queue is
 * empty (head is NULL) nothing is posted; otherwise a vectored write of the
 * request's iov is posted on conn->sock.  conn->vc must not be NULL.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static int connection_post_sendq_req(MPIDI_CH3I_Connection_t * conn)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *ch = &conn->vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_POST_SENDQ_REQ);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_POST_SENDQ_REQ);

    /* pick up the next request on the send queue */
    conn->send_active = MPIDI_CH3I_SendQ_head(ch); /* MT */
    if (conn->send_active == NULL) {
        /* nothing is queued, so there is nothing to post */
        goto fn_exit;
    }

    MPL_DBG_MSG_P(MPIDI_CH3_DBG_CONNECT,TYPICAL,"conn=%p: Posting message from connection send queue", conn );
    mpi_errno = MPIDI_CH3I_Sock_post_writev(conn->sock,
                                            conn->send_active->dev.iov,
                                            conn->send_active->dev.iov_count,
                                            NULL);
    MPIR_ERR_CHECK(mpi_errno);

 fn_exit:
 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_POST_SENDQ_REQ);
    return mpi_errno;
}
1246 
1247 
1248 /* This routine frees all of the memory associated with a connection.
1249    It is named destroy instead of free because routines with name "free"
1250    should have MPI semantics - free means to
1251    decrement reference count and free if reference count is zero */
/*
 * connection_destroy - release a connection object and the pg_id string
 * it owns.
 *
 * conn  connection to release; NULL is accepted and ignored, mirroring
 *       free(NULL).  After the call the pointer must not be used again.
 *
 * Only memory is released here: the routine does not touch conn->sock or
 * conn->vc.
 */
static void connection_destroy(MPIDI_CH3I_Connection_t * conn)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_CONNECTION_DESTROY);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_CONNECTION_DESTROY);

    if (conn != NULL) {
        /* free the owned member before the object that holds it */
        MPL_free(conn->pg_id);
        MPL_free(conn);
    }

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_CONNECTION_DESTROY);
}
1263 
1264 
1265 #ifdef MPL_USE_DBG_LOGGING
MPIDI_CH3_VC_SockGetStateString(struct MPIDI_VC * vc)1266 const char * MPIDI_CH3_VC_SockGetStateString( struct MPIDI_VC *vc )
1267 {
1268     const char *name = "unknown";
1269     static char asdigits[20];
1270     MPIDI_CH3I_VC *vcch = &vc->ch;
1271     int    state = vcch->state;
1272 
1273     switch (state) {
1274     case MPIDI_CH3I_VC_STATE_UNCONNECTED: name = "CH3I_VC_STATE_UNCONNECTED"; break;
1275     case MPIDI_CH3I_VC_STATE_CONNECTING:  name = "CH3I_VC_STATE_CONNECTING"; break;
1276     case MPIDI_CH3I_VC_STATE_CONNECTED:   name = "CH3I_VC_STATE_CONNECTED"; break;
1277     case MPIDI_CH3I_VC_STATE_FAILED:      name = "CH3I_VC_STATE_FAILED"; break;
1278     default:
1279 	MPL_snprintf( asdigits, sizeof(asdigits), "%d", state );
1280 	asdigits[20-1] = 0;
1281 	name = (const char *)asdigits;
1282     }
1283     return name;
1284 }
1285 #endif
1286