1 /*****************************************************************************\ 2 * src/common/stepd_api.h - slurmstepd message API 3 ***************************************************************************** 4 * Copyright (C) 2005-2007 The Regents of the University of California. 5 * Copyright (C) 2008-2010 Lawrence Livermore National Security. 6 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 7 * Written by Christopher Morrone <morrone2@llnl.gov> 8 * CODE-OCEC-09-009. All rights reserved. 9 * 10 * This file is part of Slurm, a resource management program. 11 * For details, see <https://slurm.schedmd.com/>. 12 * Please also read the included file: DISCLAIMER. 13 * 14 * Slurm is free software; you can redistribute it and/or modify it under 15 * the terms of the GNU General Public License as published by the Free 16 * Software Foundation; either version 2 of the License, or (at your option) 17 * any later version. 18 * 19 * In addition, as a special exception, the copyright holders give permission 20 * to link the code of portions of this program with the OpenSSL library under 21 * certain conditions as described in each individual source file, and 22 * distribute linked combinations including the two. You must obey the GNU 23 * General Public License in all respects for all of the code used other than 24 * OpenSSL. If you modify file(s) with this exception, you may extend this 25 * exception to your version of the file(s), but you are not obligated to do 26 * so. If you do not wish to do so, delete this exception statement from your 27 * version. If you delete this exception statement from all source files in 28 * the program, then also delete it here. 29 * 30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 33 * details. 
34 * 35 * You should have received a copy of the GNU General Public License along 36 * with Slurm; if not, write to the Free Software Foundation, Inc., 37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 38 \*****************************************************************************/ 39 40 #ifndef _STEPD_API_H 41 #define _STEPD_API_H 42 43 #include <inttypes.h> 44 45 #include "slurm/slurm.h" 46 #include "src/common/list.h" 47 #include "src/common/slurm_protocol_defs.h" 48 #include "src/common/io_hdr.h" 49 /* Request codes for the slurmstepd message API. Entries marked "Defunct" are still listed; presumably they are kept so the later values keep their numbering (TODO confirm). */ 50 typedef enum { 51 REQUEST_CONNECT = 0, 52 REQUEST_SIGNAL_PROCESS_GROUP, /* Defunct since July 2013 */ 53 REQUEST_SIGNAL_TASK_LOCAL, /* Defunct see REQUEST_SIGNAL_CONTAINER */ 54 REQUEST_SIGNAL_TASK_GLOBAL, /* Defunct see REQUEST_SIGNAL_CONTAINER */ 55 REQUEST_SIGNAL_CONTAINER, 56 REQUEST_STATE, 57 REQUEST_INFO, /* Defunct, See REQUEST_STEP_MEM_LIMITS|UID|NODEID */ 58 REQUEST_ATTACH, 59 REQUEST_PID_IN_CONTAINER, 60 REQUEST_DAEMON_PID, 61 REQUEST_STEP_SUSPEND, 62 REQUEST_STEP_RESUME, 63 REQUEST_STEP_TERMINATE, 64 REQUEST_STEP_COMPLETION, /* Defunct, See REQUEST_STEP_COMPLETION_V2 */ 65 REQUEST_STEP_TASK_INFO, 66 REQUEST_STEP_LIST_PIDS, 67 REQUEST_STEP_RECONFIGURE, 68 REQUEST_STEP_STAT, 69 REQUEST_STEP_COMPLETION_V2, 70 REQUEST_STEP_MEM_LIMITS, 71 REQUEST_STEP_UID, 72 REQUEST_STEP_NODEID, 73 REQUEST_ADD_EXTERN_PID, 74 REQUEST_X11_DISPLAY, 75 REQUEST_GETPW, 76 REQUEST_GETGR, 77 } step_msg_t; 78 /* Job step states; this is the return type of stepd_state(). 
*/ 79 typedef enum { 80 SLURMSTEPD_NOT_RUNNING = 0, 81 SLURMSTEPD_STEP_STARTING, 82 SLURMSTEPD_STEP_RUNNING, 83 SLURMSTEPD_STEP_ENDING 84 } slurmstepd_state_t; 85 86 typedef enum { 87 GETPW_MATCH_USER_AND_PID = 0, /* user must match, pid must belong */ 88 GETPW_MATCH_ALWAYS, /* always return */ 89 GETPW_MATCH_PID, /* only pid must belong */ 90 } stepd_getpw_mode_t; 91 92 typedef enum { 93 GETGR_MATCH_GROUP_AND_PID = 0, /* group must match, pid must belong */ 94 GETGR_MATCH_ALWAYS, /* always return */ 95 GETGR_MATCH_PID, /* only pid must belong */ 96 } 
stepd_getgr_mode_t; 97 /* Step summary; this is the structure returned by stepd_get_info(). */ 98 typedef struct { 99 uid_t uid; 100 uint32_t jobid; 101 uint32_t stepid; 102 uint32_t nodeid; 103 uint16_t protocol_version; 104 uint64_t job_mem_limit; /* job's memory limit, MB */ 105 uint64_t step_mem_limit; /* step's memory limit, MB */ 106 } slurmstepd_info_t; 107 /* Memory limit record passed by pointer to stepd_get_mem_limits(). */ 108 typedef struct { 109 uint64_t job_mem_limit; /* job's memory limit, MB */ 110 uint32_t nodeid; 111 uint64_t step_mem_limit; /* step's memory limit, MB */ 112 } slurmstepd_mem_info_t; 113 /* Per-task record; stepd_task_info() hands back an array of these. */ 114 typedef struct { 115 int id; /* local task id */ 116 uint32_t gtid; /* global task id */ 117 pid_t pid; /* task pid */ 118 bool exited; /* true if task has exited */ 119 int estatus; /* exit status if exited is true*/ 120 } slurmstepd_task_info_t; 121 /* Location of one stepd; stepd_available() returns a List of pointers to these. */ 122 typedef struct step_location { 123 uint32_t jobid; 124 uint32_t stepid; 125 char *nodename; 126 char *directory; 127 uint16_t protocol_version; 128 } step_loc_t; 129 130 131 /* 132 * Cleanup stale stepd domain sockets. 133 */ 134 int stepd_cleanup_sockets(const char *directory, const char *nodename); 135 /* NOTE(review): undocumented in this header; presumably asks the stepd on "fd" to terminate its step (cf. REQUEST_STEP_TERMINATE) - confirm against the implementation. */ 136 int stepd_terminate(int fd, uint16_t protocol_version); 137 138 /* 139 * Connect to a slurmstepd process by way of its unix domain socket. 140 * 141 * Both "directory" and "nodename" may be null, in which case stepd_connect 142 * will attempt to determine them on its own. If you are using multiple 143 * slurmd on one node (unusual outside of development environments), you 144 * will get one of the local NodeNames more-or-less at random. 145 * 146 * Returns a socket descriptor for the opened socket on success, 147 * and -1 on error. Also fills in protocol_version with the version 148 * of the running stepd. 149 */ 150 extern int stepd_connect(const char *directory, const char *nodename, 151 uint32_t jobid, uint32_t stepid, uint16_t *protocol_version); 152 153 154 /* 155 * Connect to a slurmstepd process by way of its unix domain socket. 
156 * 157 * This is specifically intended to be used with nss_slurm to prevent possible 158 * deadlocks. Neither "directory" nor "nodename" may be null; passing null 159 * for either results in an error. Remove this function in 20.11. 160 * 161 * Returns a file descriptor for the opened socket on success alongside the 162 * protocol_version for the stepd, or -1 on error. 163 */ 164 extern int stepd_connect_nss(const char *directory, const char *nodename, 165 uint32_t jobid, uint32_t stepid, 166 uint16_t *protocol_version); 167 168 /* 169 * Retrieve a job step's current state. 170 */ 171 slurmstepd_state_t stepd_state(int fd, uint16_t protocol_version); 172 173 /* 174 * Retrieve slurmstepd_info_t structure for a job step. 175 * 176 * Must be xfree'd by the caller. 177 * NOTE(review): REQUEST_INFO is marked defunct in step_msg_t and this call takes no protocol_version - confirm whether it is still implemented. */ 178 slurmstepd_info_t *stepd_get_info(int fd); 179 180 /* 181 * Send job notification message to a batch job 182 */ 183 int stepd_notify_job(int fd, uint16_t protocol_version, char *message); 184 185 /* 186 * Send a signal to the proctrack container of a job step. 187 */ 188 int stepd_signal_container(int fd, uint16_t protocol_version, int signal, 189 int flags, uid_t uid); 190 191 /* 192 * Attach a client to a running job step. 193 * 194 * On success returns SLURM_SUCCESS and fills in resp->local_pids, 195 * resp->gtids, resp->ntasks, and resp->executable. 196 * 197 * FIXME - The pid/gtid info returned in the "resp" parameter should 198 * probably be moved into a more generic stepd_api call so that 199 * this header does not need to include slurm_protocol_defs.h. 200 */ 201 int stepd_attach(int fd, uint16_t protocol_version, 202 slurm_addr_t *ioaddr, slurm_addr_t *respaddr, 203 void *job_cred_sig, reattach_tasks_response_msg_t *resp); 204 205 /* 206 * Scan for available running slurm step daemons by checking 207 * "directory" for unix domain sockets with names beginning in "nodename". 
208 * 209 * Both "directory" and "nodename" may be null, in which case stepd_available 210 * will attempt to determine them on its own. If you are using multiple 211 * slurmd on one node (unusual outside of development environments), you 212 * will get one of the local NodeNames more-or-less at random. 213 * 214 * Returns a List of pointers to step_loc_t structures. 215 */ 216 extern List stepd_available(const char *directory, const char *nodename); 217 218 /* 219 * Return true if the process with process ID "pid" is found in 220 * the proctrack container of the slurmstepd "step". 221 */ 222 bool stepd_pid_in_container(int fd, uint16_t protocol_version, pid_t pid); 223 224 /* 225 * Add a pid to the "extern" step of a job, meaning add it to the 226 * jobacct_gather and proctrack plugins. 227 */ 228 extern int stepd_add_extern_pid(int fd, uint16_t protocol_version, pid_t pid); 229 230 /* 231 * Fetch the display number if this extern step is providing x11 tunneling. 232 * If temporary XAUTHORITY files are in use, xauthority is set to that path, 233 * otherwise NULL. 234 * Returns 0 to indicate no display forwarded. 235 */ 236 extern int stepd_get_x11_display(int fd, uint16_t protocol_version, 237 char **xauthority); 238 239 /* 240 * Get the 'struct passwd' info for the user running this job step iff 241 * the cluster is running with enable_nss_slurm. 242 */ 243 extern struct passwd *stepd_getpw(int fd, uint16_t protocol_version, 244 int mode, uid_t uid, const char *name); 245 246 extern void xfree_struct_passwd(struct passwd *pwd); 247 248 /* 249 * Get the 'struct group' info for the user running this job step iff 250 * the cluster is running with enable_nss_slurm. 251 * 252 * Returns a NULL-terminated array of 'struct group' elements, with all 253 * fields allocated with xmalloc(). 
254 */ 255 extern struct group **stepd_getgr(int fd, uint16_t protocol_version, 256 int mode, gid_t gid, const char *name); 257 258 extern void xfree_struct_group_array(struct group **grp); 259 260 /* 261 * Return the process ID of the slurmstepd. 262 */ 263 pid_t stepd_daemon_pid(int fd, uint16_t protocol_version); 264 265 /* 266 * Suspend execution of the job step. Only root or SlurmUser is 267 * authorized to use this call. 268 * 269 * Returns SLURM_SUCCESS if successful. On error returns SLURM_ERROR 270 * and sets errno. 271 */ 272 extern int stepd_suspend(int fd, uint16_t protocol_version, 273 suspend_int_msg_t *susp_req, int phase); 274 275 /* 276 * Resume execution of the job step that has been suspended by a 277 * call to stepd_suspend(). Only root or SlurmUser is 278 * authorized to use this call. 279 * 280 * Returns SLURM_SUCCESS if successful. On error returns SLURM_ERROR 281 * and sets errno. 282 */ 283 extern int stepd_resume(int fd, uint16_t protocol_version, 284 suspend_int_msg_t *susp_req, int phase); 285 286 /* 287 * Reconfigure the job step (primarily to allow the stepd to refresh 288 * its log file pointer). 289 * 290 * Returns SLURM_SUCCESS if successful. On error returns SLURM_ERROR 291 * and sets errno. 292 */ 293 int stepd_reconfig(int fd, uint16_t protocol_version); 294 295 /* 296 * Forward the step completion message "sent" to the slurmstepd (purpose inferred from the name and parameter; TODO confirm). 297 * Returns SLURM_SUCCESS if successful. On error returns SLURM_ERROR 298 * and sets errno. 299 */ 300 int stepd_completion(int fd, uint16_t protocol_version, 301 step_complete_msg_t *sent); 302 303 /* 304 * Query accounting statistics for the step identified by "sent" (purpose inferred from the name and parameter types; TODO confirm). 305 * Returns SLURM_SUCCESS on success or SLURM_ERROR on error. 306 * resp receives a jobacctinfo_t which must be freed if SUCCESS. 
307 */ 308 int stepd_stat_jobacct(int fd, uint16_t protocol_version, 309 job_step_id_msg_t *sent, job_step_stat_t *resp); 310 311 /* NOTE(review): undocumented in this header; presumably returns an array of slurmstepd_task_info_t through "task_info" and its length through "task_info_count". Who frees the array is not shown here - confirm. */ 312 int stepd_task_info(int fd, uint16_t protocol_version, 313 slurmstepd_task_info_t **task_info, 314 uint32_t *task_info_count); 315 /* NOTE(review): undocumented in this header; presumably returns the step's pids through "pids_array" and their count through "pids_count". Who frees the array is not shown here - confirm. */ 316 int stepd_list_pids(int fd, uint16_t protocol_version, 317 uint32_t **pids_array, uint32_t *pids_count); 318 319 /* 320 * Get the memory limits of the step 321 * Presumably the limits are written into "stepd_mem_info". NOTE(review): the return convention is not visible here; it is likely SLURM_SUCCESS/SLURM_ERROR like the other int-returning calls in this header, not a uid - confirm against the implementation. 322 */ 323 extern int stepd_get_mem_limits(int fd, uint16_t protocol_version, 324 slurmstepd_mem_info_t *stepd_mem_info); 325 326 /* 327 * Get the uid of the step 328 * Returns uid of the running step if successful. On error returns -1. 329 * 330 * FIXME: BUG: On Linux, uid_t is uint32_t but this can return -1. 331 */ 332 extern uid_t stepd_get_uid(int fd, uint16_t protocol_version); 333 334 /* 335 * Get the nodeid of the stepd 336 * Returns nodeid of the running stepd if successful. On error returns NO_VAL. 337 */ 338 extern uint32_t stepd_get_nodeid(int fd, uint16_t protocol_version); 339 340 #endif /* _STEPD_API_H */ 341