1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "hydra.h"
7 #include "bsci.h"
8 #include "pbs.h"
9 
/* Node list obtained from PBS.  It is filled in by the launch code (through
 * HYDT_bscd_pbs_query_node_list()) when the RMK in use is not PBS, and is
 * searched by find_pbs_node_id().  As a file-scope static it starts out as a
 * null pointer, which the launch code treats as "no translation needed". */
static struct HYD_node *pbs_node_list;
11 
find_pbs_node_id(const char * hostname,int * node_id)12 static HYD_status find_pbs_node_id(const char *hostname, int *node_id)
13 {
14     struct HYD_node *t;
15     HYD_status status = HYD_SUCCESS;
16 
17     HYDU_FUNC_ENTER();
18 
19     *node_id = 0;
20     for (t = pbs_node_list; t; t = t->next) {
21         if (!strcmp(hostname, t->hostname))
22             break;
23         *node_id += t->core_count;
24     }
25 
26     HYDU_ERR_CHKANDJUMP(status, t == NULL, HYD_INTERNAL_ERROR,
27                         "user specified host not in the PBS allocated list\n");
28 
29   fn_exit:
30     HYDU_FUNC_EXIT();
31     return status;
32 
33   fn_fail:
34     goto fn_exit;
35 }
36 
HYDT_bscd_pbs_launch_procs(char ** args,struct HYD_proxy * proxy_list,int use_rmk,int * control_fd)37 HYD_status HYDT_bscd_pbs_launch_procs(char **args, struct HYD_proxy *proxy_list, int use_rmk,
38                                       int *control_fd)
39 {
40     int proxy_count, i, args_count, err, hostid;
41     struct HYD_proxy *proxy;
42     char *targs[HYD_NUM_TMP_STRINGS];
43     HYD_status status = HYD_SUCCESS;
44 
45     HYDU_FUNC_ENTER();
46 
47     /* If the RMK is not PBS, query for the PBS node list, and convert
48      * the user-specified node IDs to PBS node IDs */
49     if (use_rmk == HYD_FALSE || strcmp(HYDT_bsci_info.rmk, "pbs")) {
50         status = HYDT_bscd_pbs_query_node_list(&pbs_node_list);
51         HYDU_ERR_POP(status, "error querying PBS node list\n");
52     }
53 
54     proxy_count = 0;
55     for (proxy = proxy_list; proxy; proxy = proxy->next)
56         proxy_count++;
57 
58     /* Duplicate the args in local copy, targs */
59     for (args_count = 0; args[args_count]; args_count++)
60         targs[args_count] = MPL_strdup(args[args_count]);
61 
62     HYDU_MALLOC_OR_JUMP(HYDT_bscd_pbs_sys->task_id, tm_task_id *, proxy_count * sizeof(tm_task_id),
63                         status);
64     HYDU_MALLOC_OR_JUMP(HYDT_bscd_pbs_sys->spawn_events, tm_event_t *,
65                         proxy_count * sizeof(tm_event_t), status);
66 
67     /* Spawn a process on each allocated node through tm_spawn() which
68      * returns a taskID for the process + a eventID for the
69      * spawning. */
70     hostid = 0;
71     for (i = 0, proxy = proxy_list; proxy; proxy = proxy->next, i++) {
72         if (pbs_node_list) {
73             status = find_pbs_node_id(proxy->node->hostname, &hostid);
74             HYDU_ERR_POP(status, "error finding PBS node ID for host %s\n", proxy->node->hostname);
75         }
76 
77         targs[args_count] = HYDU_int_to_str(i);
78 
79         /* The task_id field is not filled in during tm_spawn(). The
80          * TM library just stores this address and fills it in when
81          * the event is completed by a call to tm_poll(). */
82         if (HYDT_bsci_info.debug) {
83             HYDU_dump(stdout, "Spawn arguments (host id %d): ", hostid);
84 
85             /* NULL terminate the arguments list to pass to
86              * HYDU_print_strlist() */
87             targs[args_count + 1] = NULL;
88             HYDU_print_strlist(targs);
89         }
90 
91         /* The args_count below does not include the possible NULL
92          * termination, as I'm not sure how tm_spawn() handles NULL
93          * arguments. Besides the last NULL string is not needed for
94          * tm_spawn(). */
95         err = tm_spawn(args_count + 1, targs, NULL, hostid, &HYDT_bscd_pbs_sys->task_id[i],
96                        &HYDT_bscd_pbs_sys->spawn_events[i]);
97         HYDU_ERR_CHKANDJUMP(status, err != TM_SUCCESS, HYD_INTERNAL_ERROR,
98                             "tm_spawn() failed with TM error %d\n", err);
99 
100         if (!pbs_node_list)
101             hostid += proxy->node->core_count;
102     }
103     HYDT_bscd_pbs_sys->spawn_count = i;
104 
105   fn_exit:
106     HYDU_FUNC_EXIT();
107     return status;
108 
109   fn_fail:
110     goto fn_exit;
111 }
112