1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "hydra.h"
7 #include "bscu.h"
8
/* Descriptors being tracked, and how many there are.
 * HYDT_bscu_wait_for_completion() overwrites an entry with
 * HYD_FD_CLOSED once the descriptor is no longer registered, and
 * frees/resets the list when it finishes successfully.  The list is
 * populated by code outside this view. */
int HYD_bscu_fd_count = 0;
int *HYD_bscu_fd_list = NULL;

/* Process IDs being tracked, and how many there are.
 * HYDT_bscu_wait_for_completion() sets an entry to -1 once that
 * process has been reaped, and frees/resets the list when it finishes
 * successfully.  The list is populated by code outside this view. */
int HYD_bscu_pid_count = 0;
int *HYD_bscu_pid_list = NULL;
13
HYDT_bscu_wait_for_completion(int timeout)14 HYD_status HYDT_bscu_wait_for_completion(int timeout)
15 {
16 int pgid, pid, ret, count, i, time_elapsed, time_left;
17 struct timeval start, now;
18 HYD_status status = HYD_SUCCESS;
19
20 HYDU_FUNC_ENTER();
21
22 /* FIXME: We rely on gettimeofday here. This needs to detect the
23 * timer type available and use that. Probably more of an MPL
24 * functionality than Hydra's. */
25 gettimeofday(&start, NULL);
26
27 /* Loop till all sockets have closed */
28 restart_wait:
29 while (1) {
30 count = 0;
31 for (i = 0; i < HYD_bscu_fd_count; i++) {
32 if (HYD_bscu_fd_list[i] == HYD_FD_CLOSED)
33 continue;
34
35 ret = HYDT_dmx_query_fd_registration(HYD_bscu_fd_list[i]);
36 if (ret) { /* still registered */
37 count++; /* We still need to wait */
38
39 gettimeofday(&now, NULL);
40 time_elapsed = (now.tv_sec - start.tv_sec); /* Ignore microsec granularity */
41
42 time_left = -1;
43 if (timeout >= 0) {
44 if (time_elapsed > timeout) {
45 #if defined(HAVE_GETPGID) && defined(HAVE_SETSID)
46 /* If we are able to get the process group ID,
47 * send a signal to the entire process
48 * group */
49 pgid = getpgid(HYD_bscu_pid_list[i]);
50 killpg(pgid, SIGKILL);
51 #else
52 kill(HYD_bscu_pid_list[i], SIGKILL);
53 #endif
54 } else
55 time_left = timeout - time_elapsed;
56 }
57
58 status = HYDT_dmx_wait_for_event(time_left);
59 HYDU_ERR_POP(status, "error waiting for event\n");
60
61 /* Check if any processes terminated badly; if they
62 * did, return an error. */
63 pid = waitpid(-1, &ret, WNOHANG);
64 if (pid > 0) {
65 /* Find the pid and mark it as complete */
66 for (i = 0; i < HYD_bscu_pid_count; i++)
67 if (HYD_bscu_pid_list[i] == pid) {
68 HYD_bscu_pid_list[i] = -1;
69 break;
70 }
71
72 if (ret) {
73 HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
74 "one of the processes terminated badly; aborting\n");
75 }
76 }
77
78 goto restart_wait;
79 } else
80 HYD_bscu_fd_list[i] = HYD_FD_CLOSED;
81 }
82
83 if (count == 0)
84 break;
85 }
86
87 /* Loop till all processes have completed */
88 while (1) {
89 count = 0;
90 for (i = 0; i < HYD_bscu_pid_count; i++)
91 if (HYD_bscu_pid_list[i] != -1)
92 count++;
93
94 /* If there are no processes to wait, we are done */
95 if (count == 0)
96 break;
97
98 pid = waitpid(-1, &ret, WNOHANG);
99 if (pid > 0) {
100 /* Find the pid and mark it as complete */
101 for (i = 0; i < HYD_bscu_pid_count; i++)
102 if (HYD_bscu_pid_list[i] == pid) {
103 HYD_bscu_pid_list[i] = -1;
104 break;
105 }
106 }
107 }
108
109 MPL_free(HYD_bscu_pid_list);
110 HYD_bscu_pid_list = NULL;
111 HYD_bscu_pid_count = 0;
112
113 MPL_free(HYD_bscu_fd_list);
114 HYD_bscu_fd_list = NULL;
115 HYD_bscu_fd_count = 0;
116
117 fn_exit:
118 HYDU_FUNC_EXIT();
119 return status;
120
121 fn_fail:
122 goto fn_exit;
123 }
124