1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "hydra.h"
7 #include "bscu.h"
8 
/* Descriptors that HYDT_bscu_wait_for_completion() watches until the
 * demux engine no longer has them registered; slots are overwritten
 * with HYD_FD_CLOSED as they are found closed. The array is released
 * with MPL_free() once the wait completes successfully. */
int *HYD_bscu_fd_list = NULL;
int HYD_bscu_fd_count = 0;      /* number of entries in HYD_bscu_fd_list */

/* Process IDs that HYDT_bscu_wait_for_completion() reaps; a slot is
 * set to -1 once its process has been collected by waitpid(). The
 * array is released with MPL_free() once the wait completes
 * successfully. */
int *HYD_bscu_pid_list = NULL;
int HYD_bscu_pid_count = 0;     /* number of entries in HYD_bscu_pid_list */
13 
HYDT_bscu_wait_for_completion(int timeout)14 HYD_status HYDT_bscu_wait_for_completion(int timeout)
15 {
16     int pgid, pid, ret, count, i, time_elapsed, time_left;
17     struct timeval start, now;
18     HYD_status status = HYD_SUCCESS;
19 
20     HYDU_FUNC_ENTER();
21 
22     /* FIXME: We rely on gettimeofday here. This needs to detect the
23      * timer type available and use that. Probably more of an MPL
24      * functionality than Hydra's. */
25     gettimeofday(&start, NULL);
26 
27     /* Loop till all sockets have closed */
28   restart_wait:
29     while (1) {
30         count = 0;
31         for (i = 0; i < HYD_bscu_fd_count; i++) {
32             if (HYD_bscu_fd_list[i] == HYD_FD_CLOSED)
33                 continue;
34 
35             ret = HYDT_dmx_query_fd_registration(HYD_bscu_fd_list[i]);
36             if (ret) {  /* still registered */
37                 count++;        /* We still need to wait */
38 
39                 gettimeofday(&now, NULL);
40                 time_elapsed = (now.tv_sec - start.tv_sec);     /* Ignore microsec granularity */
41 
42                 time_left = -1;
43                 if (timeout >= 0) {
44                     if (time_elapsed > timeout) {
45 #if defined(HAVE_GETPGID) && defined(HAVE_SETSID)
46                         /* If we are able to get the process group ID,
47                          * send a signal to the entire process
48                          * group */
49                         pgid = getpgid(HYD_bscu_pid_list[i]);
50                         killpg(pgid, SIGKILL);
51 #else
52                         kill(HYD_bscu_pid_list[i], SIGKILL);
53 #endif
54                     } else
55                         time_left = timeout - time_elapsed;
56                 }
57 
58                 status = HYDT_dmx_wait_for_event(time_left);
59                 HYDU_ERR_POP(status, "error waiting for event\n");
60 
61                 /* Check if any processes terminated badly; if they
62                  * did, return an error. */
63                 pid = waitpid(-1, &ret, WNOHANG);
64                 if (pid > 0) {
65                     /* Find the pid and mark it as complete */
66                     for (i = 0; i < HYD_bscu_pid_count; i++)
67                         if (HYD_bscu_pid_list[i] == pid) {
68                             HYD_bscu_pid_list[i] = -1;
69                             break;
70                         }
71 
72                     if (ret) {
73                         HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
74                                             "one of the processes terminated badly; aborting\n");
75                     }
76                 }
77 
78                 goto restart_wait;
79             } else
80                 HYD_bscu_fd_list[i] = HYD_FD_CLOSED;
81         }
82 
83         if (count == 0)
84             break;
85     }
86 
87     /* Loop till all processes have completed */
88     while (1) {
89         count = 0;
90         for (i = 0; i < HYD_bscu_pid_count; i++)
91             if (HYD_bscu_pid_list[i] != -1)
92                 count++;
93 
94         /* If there are no processes to wait, we are done */
95         if (count == 0)
96             break;
97 
98         pid = waitpid(-1, &ret, WNOHANG);
99         if (pid > 0) {
100             /* Find the pid and mark it as complete */
101             for (i = 0; i < HYD_bscu_pid_count; i++)
102                 if (HYD_bscu_pid_list[i] == pid) {
103                     HYD_bscu_pid_list[i] = -1;
104                     break;
105                 }
106         }
107     }
108 
109     MPL_free(HYD_bscu_pid_list);
110     HYD_bscu_pid_list = NULL;
111     HYD_bscu_pid_count = 0;
112 
113     MPL_free(HYD_bscu_fd_list);
114     HYD_bscu_fd_list = NULL;
115     HYD_bscu_fd_count = 0;
116 
117   fn_exit:
118     HYDU_FUNC_EXIT();
119     return status;
120 
121   fn_fail:
122     goto fn_exit;
123 }
124