1 /*****************************************************************************\
2  *  src/slurmd/slurmstepd/slurmstepd_job.h  stepd_step_rec_t definition
3  *****************************************************************************
4  *  Copyright (C) 2002-2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Copyright (C) 2013      Intel, Inc.
7  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8  *  Written by Mark Grondona <mgrondona@llnl.gov>.
9  *  CODE-OCEC-09-009. All rights reserved.
10  *
11  *  This file is part of Slurm, a resource management program.
12  *  For details, see <https://slurm.schedmd.com/>.
13  *  Please also read the included file: DISCLAIMER.
14  *
15  *  Slurm is free software; you can redistribute it and/or modify it under
16  *  the terms of the GNU General Public License as published by the Free
17  *  Software Foundation; either version 2 of the License, or (at your option)
18  *  any later version.
19  *
20  *  In addition, as a special exception, the copyright holders give permission
21  *  to link the code of portions of this program with the OpenSSL library under
22  *  certain conditions as described in each individual source file, and
23  *  distribute linked combinations including the two. You must obey the GNU
24  *  General Public License in all respects for all of the code used other than
25  *  OpenSSL. If you modify file(s) with this exception, you may extend this
26  *  exception to your version of the file(s), but you are not obligated to do
27  *  so. If you do not wish to do so, delete this exception statement from your
28  *  version.  If you delete this exception statement from all source files in
29  *  the program, then also delete it here.
30  *
31  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
32  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
33  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
34  *  details.
35  *
36  *  You should have received a copy of the GNU General Public License along
37  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
38  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
39 \*****************************************************************************/
40 
41 #ifndef _SLURMSTEPD_JOB_H
42 #define _SLURMSTEPD_JOB_H
43 
44 #include <pthread.h>
45 #include <pwd.h>
46 
47 #include "src/common/macros.h"
48 #include "src/common/slurm_protocol_api.h"
49 #include "src/common/slurm_protocol_defs.h"
50 #include "src/common/list.h"
51 #include "src/common/eio.h"
52 #include "src/common/env.h"
53 #include "src/common/io_hdr.h"
54 #include "src/common/job_options.h"
55 #include "src/common/stepd_api.h"
56 
/* Fallback value used only when no included header defines MAXHOSTNAMELEN. */
#if !defined(MAXHOSTNAMELEN)
#  define MAXHOSTNAMELEN 64
#endif
60 
61 typedef struct {
62 	unsigned char data[SLURM_IO_KEY_SIZE];
63 } srun_key_t;
64 
65 typedef struct {
66 	srun_key_t *key;	   /* srun key for IO verification         */
67 	slurm_addr_t resp_addr;	   /* response addr for task exit msg      */
68 	slurm_addr_t ioaddr;       /* Address to connect on for normal I/O.
69 				      Spawn IO uses messages to the normal
70 				      resp_addr. */
71 	uint16_t protocol_version; /* protocol_version of the srun */
72 } srun_info_t;
73 
/*
 * State of a single task. The numeric values are spelled out explicitly
 * and run from 0 (INIT) to 3 (COMPLETE) in declaration order.
 */
typedef enum {
	STEPD_STEP_TASK_INIT     = 0,
	STEPD_STEP_TASK_STARTING = 1,
	STEPD_STEP_TASK_RUNNING  = 2,
	STEPD_STEP_TASK_COMPLETE = 3
} stepd_step_task_state_t;
80 
81 typedef struct {
82 	pthread_mutex_t mutex;	    /* mutex to protect task state          */
83 	stepd_step_task_state_t state;  /* task state                       */
84 
85 	int             id;	    /* local task id                        */
86 	uint32_t        gtid;	    /* global task id                       */
87 	pid_t           pid;	    /* task pid                             */
88 
89 	char           *ifname;     /* standard input file name             */
90 	char           *ofname;     /* standard output file name            */
91 	char           *efname;     /* standard error file name             */
92 	int             stdin_fd;   /* standard input file descriptor       */
93 	int             stdout_fd;  /* standard output file descriptor      */
94 	int             stderr_fd;  /* standard error file descriptor       */
95 	int             to_stdin;   /* write file descriptor for task stdin */
96 	int             from_stdout;/* read file descriptor from task stdout*/
97 	int             from_stderr;/* read file descriptor from task stderr*/
98 	eio_obj_t      *in;         /* standard input event IO object       */
99 	eio_obj_t      *out;        /* standard output event IO object      */
100 	eio_obj_t      *err;        /* standard error event IO object       */
101 
102 	bool            killed_by_cmd; /* true if task killed by our signal */
103 	bool            aborted;    /* true if task called abort            */
104 	bool            esent;      /* true if exit status has been sent    */
105 	bool            exited;     /* true if task has exited              */
106 	int             estatus;    /* this task's exit status              */
107 
108 	uint32_t	argc;
109 	char	      **argv;
110 } stepd_step_task_info_t;
111 
/*
 * MPMD (multiple program, multiple data) specifications, needed for Cray.
 * The per-executable arrays are indexed by executable.
 */
typedef struct {
	uint64_t apid;		/* Application ID */
	int num_cmds;		/* Number of executables in the MPMD set */

	char **args;		/* Argument string, one per executable */
	char **command;		/* Command name, one per executable */

	int *first_pe;		/* First rank on this node for each
				 * executable, or -1 if that executable is
				 * not on this node */
	int *start_pe;		/* Starting rank of each executable in set */
	int *total_pe;		/* Total ranks of each executable in set */

	int *placement;		/* NID of each rank (ntasks in length) */
} mpmd_set_t;
124 
125 typedef struct {
126 	slurmstepd_state_t state;	/* Job state			*/
127 	pthread_cond_t state_cond;	/* Job state conditional	*/
128 	pthread_mutex_t state_mutex;	/* Job state mutex		*/
129 	uint32_t       jobid;  /* Current Slurm job id                      */
130 	uint32_t       stepid; /* Current step id (or NO_VAL)               */
131 	uint32_t       array_job_id;  /* job array master job ID            */
132 	uint32_t       array_task_id; /* job array ID                       */
133 	uint32_t       nnodes; /* number of nodes in current job            */
134 	uint32_t       ntasks; /* total number of tasks in current job      */
135 	uint32_t       nodeid; /* relative position of this node in job     */
136 	uint32_t       node_tasks;	/* number of tasks on *this* node   */
137 	uint32_t       het_job_id;	/* Hetjob ID or NO_VAL */
138 	uint32_t       het_job_nnodes;	/* total node count for entire hetjob */
139 	char          *het_job_node_list; /* Hetjob step node list */
140 	uint32_t       het_job_node_offset;/* Hetjob node offset or NO_VAL   */
141 	uint32_t       het_job_ntasks;	/* total task count for entire hetjob */
142 	uint32_t       het_job_offset;	/* Hetjob offset or NO_VAL        */
143 	uint32_t       het_job_step_cnt;  /* number of steps for entire hetjob */
144 	uint32_t       het_job_task_offset;/* Hetjob task offset or NO_VAL   */
145 	uint16_t      *het_job_task_cnts; /* Number of tasks on each node in hetjob */
146 	uint32_t     **het_job_tids;       /* Task IDs on each node of hetjob */
147 	uint32_t      *het_job_tid_offsets;/* map of tasks (by id) to originating hetjob*/
148 	uint16_t      *task_cnts;  /* Number of tasks on each node in job   */
149 	uint32_t       cpus_per_task;	/* number of cpus desired per task  */
150 	uint32_t       debug;  /* debug level for job slurmd                */
151 	uint64_t       job_mem;  /* MB of memory reserved for the job       */
152 	uint64_t       step_mem; /* MB of memory reserved for the step      */
153 	uint16_t       cpus;   /* number of cpus to use for this job        */
154 	uint32_t       argc;   /* number of commandline arguments           */
155 	char         **env;    /* job environment                           */
156 	char         **argv;   /* job argument vector                       */
157 	char          *cwd;    /* path to current working directory         */
158 	task_dist_states_t task_dist;/* -m distribution                     */
159 	char          *node_name; /* node name of node running job
160 				   * needed for front-end systems           */
161 	cpu_bind_type_t cpu_bind_type; /* --cpu-bind=                       */
162 	char          *cpu_bind;       /* binding map for map/mask_cpu      */
163 	mem_bind_type_t mem_bind_type; /* --mem-bind=                       */
164 	char          *mem_bind;       /* binding map for tasks to memory   */
165 	uint16_t accel_bind_type;  /* --accel_bind= */
166 	uint32_t cpu_freq_min; /* Minimum cpu frequency  */
167 	uint32_t cpu_freq_max; /* Maximum cpu frequency  */
168 	uint32_t cpu_freq_gov; /* cpu frequency governor */
169 	dynamic_plugin_data_t *switch_job; /* switch-specific job information     */
170 	uid_t         uid;     /* user id for job                           */
171 	char          *user_name;
172 	/* fields from the launch cred used to support nss_slurm	    */
173 	char *pw_gecos;
174 	char *pw_dir;
175 	char *pw_shell;
176 	gid_t         gid;     /* group ID for job                          */
177 	int           ngids;   /* length of the following gids array        */
178 	char **gr_names;
179 	gid_t        *gids;    /* array of gids for user specified in uid   */
180 	bool           aborted;    /* true if already aborted               */
181 	bool           batch;      /* true if this is a batch job           */
182 	bool           run_prolog; /* true if need to run prolog            */
183 	time_t         timelimit;  /* time at which job must stop           */
184 	uint32_t       profile;	   /* Level of acct_gather_profile          */
185 	char          *task_prolog; /* per-task prolog                      */
186 	char          *task_epilog; /* per-task epilog                      */
187 	stepd_step_task_info_t  **task;  /* array of task information pointers*/
188 	eio_handle_t  *eio;
189 	List 	       sruns; /* List of srun_info_t pointers               */
190 	List           clients; /* List of struct client_io_info pointers   */
191 	List stdout_eio_objs; /* List of objs that gather stdout from tasks */
192 	List stderr_eio_objs; /* List of objs that gather stderr from tasks */
193 	List free_incoming;   /* List of free struct io_buf * for incoming
194 			       * traffic. "incoming" means traffic from srun
195 			       * to the tasks.
196 			       */
197 	List free_outgoing;   /* List of free struct io_buf * for outgoing
198 			       * traffic "outgoing" means traffic from the
199 			       * tasks to srun.
200 			       */
201 	int incoming_count;   /* Count of total incoming message buffers
202 			       * including free_incoming buffers and
203 			       * buffers in use.
204 			       */
205 	int outgoing_count;   /* Count of total outgoing message buffers
206 			       * including free_outgoing buffers and
207 			       * buffers in use.
208 			       */
209 
210 	List outgoing_cache;  /* cache of outgoing stdio messages
211 			       * used when a new client attaches
212 			       */
213 
214 	pthread_t      ioid;  /* pthread id of IO thread                    */
215 	pthread_t      msgid; /* pthread id of message thread               */
216 	eio_handle_t  *msg_handle; /* eio handle for the message thread     */
217 
218 	pid_t          jmgr_pid;     /* job manager pid                     */
219 	pid_t          pgid;         /* process group id for tasks          */
220 	uint32_t       flags;        /* See LAUNCH_* flags defined in slurm_protocol_defs.h */
221 	uint16_t       overcommit;
222 	env_t          *envtp;
223 	uint64_t       cont_id;
224 
225 	char          *batchdir;
226 	jobacctinfo_t *jobacct;
227 	uint8_t        open_mode;	/* stdout/err append or truncate */
228 	job_options_t  options;
229 	uint32_t       resv_id;		/* Cray/BASIL reservation ID	*/
230 	uint16_t       restart_cnt;	/* batch job restart count	*/
231 	char	      *job_alloc_cores;	/* needed by the SPANK cpuset plugin */
232 	char	      *step_alloc_cores;/* needed by the SPANK cpuset plugin */
233 	List           job_gres_list;	/* Needed by GRES plugin */
234 	List           step_gres_list;	/* Needed by GRES plugin */
235 	char          *tres_bind;	/* TRES binding */
236 	char          *tres_freq;	/* TRES frequency */
237 	launch_tasks_request_msg_t *msg; /* When a non-batch step this
238 					  * is the message sent.  DO
239 					  * NOT FREE, IT IS JUST A
240 					  * POINTER. */
241 	mpmd_set_t     *mpmd_set;	/* MPMD specifications for Cray */
242 	uint16_t	job_core_spec;	/* count of specialized cores */
243 	int		non_smp;	/* Set if task IDs are not monotonically
244 					 * increasing across all nodes, set only
245 					 * native Cray systems */
246 	bool		oom_error;	/* step out of memory error */
247 
248 	uint16_t x11;			/* only set for extern step */
249 	int x11_display;		/* display number if x11 forwarding setup */
250 	char *x11_alloc_host;		/* remote host to proxy through */
251 	uint16_t x11_alloc_port;	/* remote port to proxy through */
252 	char *x11_magic_cookie;		/* xauth magic cookie value */
253 	char *x11_target;		/* remote target. unix socket if port == 0 */
254 	uint16_t x11_target_port;	/* remote x11 port to connect back to */
255 	char *x11_xauthority;		/* temporary XAUTHORITY location, or NULL */
256 } stepd_step_rec_t;
257 
258 
259 stepd_step_rec_t * stepd_step_rec_create(launch_tasks_request_msg_t *msg,
260 					 uint16_t protocol_version);
261 stepd_step_rec_t * batch_stepd_step_rec_create(batch_job_launch_msg_t *msg);
262 
263 void stepd_step_rec_destroy(stepd_step_rec_t *job);
264 
265 srun_info_t * srun_info_create(slurm_cred_t *cred, slurm_addr_t *respaddr,
266 			       slurm_addr_t *ioaddr, uint16_t protocol_version);
267 
268 void  srun_info_destroy(srun_info_t *srun);
269 
270 stepd_step_task_info_t * task_info_create(int taskid, int gtaskid,
271 					  char *ifname, char *ofname,
272 					  char *efname);
273 
274 /*
275  *  Return a task info structure corresponding to pid.
276  *   We inline it here so that it can be included from src/common/plugstack.c
277  *   without undefined symbol warnings.
278  */
279 static inline stepd_step_task_info_t *
job_task_info_by_pid(stepd_step_rec_t * job,pid_t pid)280 job_task_info_by_pid (stepd_step_rec_t *job, pid_t pid)
281 {
282 	uint32_t i;
283 
284 	if (!job)
285 		return NULL;
286 
287 	for (i = 0; i < job->node_tasks; i++) {
288 		if (job->task[i]->pid == pid)
289 			return (job->task[i]);
290 	}
291 	return (NULL);
292 }
293 
294 #endif /* !_SLURMSTEPD_JOB_H */
295