/*****************************************************************************\
 *  job_container_cncu.c - Define job container management functions for
 *                         Cray systems
 *****************************************************************************
 *  Copyright (C) 2013 SchedMD LLC
 *  Written by Morris Jette, SchedMD
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>

#ifdef HAVE_NATIVE_CRAY
#include <job.h>	/* Cray's job module component */
#endif

#include "slurm/slurm_errno.h"
#include "src/common/slurm_xlator.h"
#include "src/slurmd/common/proctrack.h"

#define ADD_FLAGS	0
#define CREATE_FLAGS	0
#define DELETE_FLAGS	0

#define JOB_BUF_SIZE 128

/*
 * These variables are required by the generic plugin interface.  If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *      <application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "job_container" for job container management) and
 * <method> is a description of how this plugin satisfies that application.
 * Slurm will only load a job_container plugin if the plugin_type string
 * has a prefix of "job_container/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char plugin_name[]        = "job_container cncu plugin";
const char plugin_type[]        = "job_container/cncu";
const uint32_t plugin_version   = SLURM_VERSION_NUMBER;

static uint32_t *job_id_array = NULL;
static uint32_t  job_id_count = 0;
static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER;
static char *state_dir = NULL;
static uint64_t debug_flags = 0;

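/*
 * Write job_id_array to the file "job_container_state" in dir_name,
 * replacing any previous copy.  Caller must hold context_lock.
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */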
static int _save_state(char *dir_name)
{
	char *file_name;
	int ret = SLURM_SUCCESS;
	int state_fd;

	if (!dir_name) {
		error("job_container state directory is NULL");
		return SLURM_ERROR;
	}
	file_name = xstrdup_printf("%s/job_container_state", dir_name);
	(void) unlink(file_name);
	state_fd = creat(file_name, 0600);
	if (state_fd < 0) {
		error("Can't save state, error creating file %s %m",
		      file_name);
		ret = SLURM_ERROR;
	} else {
		char  *buf = (char *) job_id_array;
		size_t len = job_id_count * sizeof(uint32_t);
		while (1) {
			int wrote = write(state_fd, buf, len);
			if ((wrote < 0) && (errno == EINTR))
				continue;
			if (wrote == 0)
				break;
			if (wrote < 0) {
				error("Can't save job_container state: %m");
				ret = SLURM_ERROR;
				break;
			}
			buf += wrote;
			len -= wrote;
		}
		close(state_fd);
	}
	xfree(file_name);

	return ret;
}

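/*
 * Read the saved job ID array from "job_container_state" in dir_name into
 * job_id_array and job_id_count.  A missing state file is not treated as
 * an error.  Caller must hold context_lock.
 */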
static int _restore_state(char *dir_name)
{
	char *data = NULL, *file_name = NULL;
	int error_code = SLURM_SUCCESS;
	int state_fd, data_allocated = 0, data_read = 0, data_offset = 0;

	if (!dir_name) {
		error("job_container state directory is NULL");
		return SLURM_ERROR;
	}

	file_name = xstrdup_printf("%s/job_container_state", dir_name);
	state_fd = open(file_name, O_RDONLY);
	if (state_fd >= 0) {
		data_allocated = JOB_BUF_SIZE;
		data = xmalloc(data_allocated);
		while (1) {
			data_read = read(state_fd, data + data_offset,
					 JOB_BUF_SIZE);
			if ((data_read < 0) && (errno == EINTR))
				continue;
			if (data_read < 0) {
				error("Read error on %s, %m", file_name);
				error_code = SLURM_ERROR;
				break;
			} else if (data_read == 0)
				break;
			data_offset    += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close(state_fd);
	} else {
		error("No %s file for %s state recovery",
		      file_name, plugin_type);
		xfree(file_name);
		return SLURM_SUCCESS;
	}

	xfree(file_name);

	if (error_code == SLURM_SUCCESS) {
		job_id_array = (uint32_t *) data;
		job_id_count = data_offset / sizeof(uint32_t);
	}

	return error_code;
}

#ifdef HAVE_NATIVE_CRAY
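/*
 * Log usage statistics (flags plus job, file and IPC object counts) for a
 * Cray compute node reservation.  "type" names the calling operation in
 * the log message.
 */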
static void _stat_reservation(char *type, rid_t resv_id)
{
	struct job_resv_stat buf;
	DEF_TIMERS;

	START_TIMER;

	if (job_stat_reservation(resv_id, &buf)) {
		error("%s: stat(%"PRIu64"): %m", plugin_type, resv_id);
	} else {
		info("%s: %s/stat(%"PRIu64"): flags=%d "
		     "num_jobs=%d num_files=%d num_ipc_objs=%d",
		     plugin_type, type, resv_id, buf.flags, buf.num_jobs,
		     buf.num_files, buf.num_ipc_objs);
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
}
#endif

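/* Refresh cached debug flags when the slurmd configuration is reread */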
extern void container_p_reconfig(void)
{
	debug_flags = slurm_get_debug_flags();
}

/*
 * init() is called when the plugin is loaded, before any other functions
 *	are called.  Put global initialization here.
 */
extern int init(void)
{
	debug_flags = slurm_get_debug_flags();
	if (debug_flags & DEBUG_FLAG_JOB_CONT)
		info("%s loaded", plugin_name);
	else
		debug("%s loaded", plugin_name);

	return SLURM_SUCCESS;
}

/*
 * fini() is called when the plugin is removed. Clear any allocated
 *	storage here.
 */
extern int fini(void)
{
	slurm_mutex_lock(&context_lock);
	xfree(state_dir);
	xfree(job_id_array);
	job_id_count = 0;
	slurm_mutex_unlock(&context_lock);

	return SLURM_SUCCESS;
}

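/*
 * Restore plugin state from dir_name.  If recover is false, previously
 * tracked job IDs are purged rather than recovered.
 */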
extern int container_p_restore(char *dir_name, bool recover)
{
	int i;

	slurm_mutex_lock(&context_lock);
	xfree(state_dir);
	state_dir = xstrdup(dir_name);
	_restore_state(state_dir);
	for (i = 0; i < job_id_count; i++) {
		if (job_id_array[i] == 0)
			continue;
		if (debug_flags & DEBUG_FLAG_JOB_CONT)
			info("%s: %s job(%u)",
			     plugin_type,
			     recover ? "recovered" : "purging",
			     job_id_array[i]);
		if (!recover)
			job_id_array[i] = 0;
	}
	slurm_mutex_unlock(&context_lock);

	return SLURM_SUCCESS;
}

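/*
 * Create a job container for job_id (a compute node reservation on native
 * Cray systems) and record the job ID in the local state file.  An already
 * existing reservation is treated as success.
 */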
extern int container_p_create(uint32_t job_id)
{
#ifdef HAVE_NATIVE_CRAY
	rid_t resv_id = job_id;
	int rc;
#endif
	int i, empty = -1, found = -1;
	DEF_TIMERS;

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_JOB_CONT)
		info("%s: creating(%u)", plugin_type, job_id);
	slurm_mutex_lock(&context_lock);
	for (i = 0; i < job_id_count; i++) {
		if (job_id_array[i] == 0) {
			empty = i;
		} else if (job_id_array[i] == job_id) {
			found = i;
			break;
		}
	}
	if (found == -1) {
		if (empty == -1) {
			empty = job_id_count;
			job_id_count += 4;
			job_id_array = xrealloc(job_id_array,
						sizeof(uint32_t)*job_id_count);
		}
		job_id_array[empty] = job_id;
		_save_state(state_dir);
	}
	slurm_mutex_unlock(&context_lock);

	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else {
		END_TIMER3("container_p_create: saving state took", 3000000);
	}
#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_create_reservation(resv_id, CREATE_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_create: job_create_reservation took",
			   3000000);
	if ((rc == 0) || (errno == EEXIST)) {
		if ((found == -1) && (rc != 0) && (errno == EEXIST)) {
			error("%s: create(%u): Reservation already exists",
			      plugin_type, job_id);
		}
		if (debug_flags & DEBUG_FLAG_JOB_CONT)
			_stat_reservation("create", resv_id);
		return SLURM_SUCCESS;
	}
	error("%s: create(%u): %m", plugin_type, job_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif
}

/* Add proctrack container (PAGG) to a job container */
extern int container_p_add_cont(uint32_t job_id, uint64_t cont_id)
{
#ifdef HAVE_NATIVE_CRAY
	jid_t cjob_id = cont_id;
	rid_t resv_id = job_id;
	int rc;
	DEF_TIMERS;
#endif

	if (debug_flags & DEBUG_FLAG_JOB_CONT) {
		info("%s: adding cont(%u.%"PRIu64")",
		     plugin_type, job_id, cont_id);
	}

#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_add_cont: job_attach_reservation took",
			   3000000);
	if ((rc != 0) && (errno == ENOENT)) {	/* Log and retry */
		if (debug_flags & DEBUG_FLAG_JOB_CONT)
			info("%s: add(%u.%"PRIu64"): No reservation found, "
			     "no big deal, this is probably the first time "
			     "this was called.  We will just create a new one.",
			     plugin_type, job_id, cont_id);
		START_TIMER;
		rc = job_create_reservation(resv_id, CREATE_FLAGS);
		rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS);
		if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
			END_TIMER;
			INFO_LINE("call took: %s", TIME_STR);
		} else
			END_TIMER3("container_p_add_cont: "
				   "job_(create&attach)_reservation took",
				   3000000);
	}

	if ((rc == 0) || (errno == EBUSY)) {
		if (rc) {
			/* EBUSY - job ID already attached to a reservation
			 * Duplicate adds can be generated by prolog/epilog */
			debug2("%s: add(%u.%"PRIu64"): %m",
			       plugin_type, job_id, cont_id);
		} else if (debug_flags & DEBUG_FLAG_JOB_CONT)
			_stat_reservation("add", resv_id);
		return SLURM_SUCCESS;
	}
	error("%s: add(%u.%"PRIu64"): %m", plugin_type, job_id, cont_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif
}

/* Add a process to a job container, create the proctrack container to add */
extern int container_p_join(uint32_t job_id, uid_t uid)
{
	stepd_step_rec_t job;
	int rc;
	pid_t pid = getpid();
	DEF_TIMERS;

	START_TIMER;

	if (debug_flags & DEBUG_FLAG_JOB_CONT) {
		info("%s: adding pid(%u.%u)",
		     plugin_type, job_id, (uint32_t) pid);
	}
	memset(&job, 0, sizeof(stepd_step_rec_t));
	job.jmgr_pid = pid;
	job.uid = uid;

	/*
	 * container_g_join() is called only from forked processes, set the
	 * proctrack_forked global bool to inform proctrack/cray_aries we are
	 * forked.
	 */
	proctrack_forked = true;
	if (proctrack_g_create(&job) != SLURM_SUCCESS) {
		error("%s: proctrack_g_create job(%u)", plugin_type, job_id);
		return SLURM_ERROR;
	}

	proctrack_g_add(&job, pid);

	rc = container_p_add_cont(job_id, job.cont_id);

	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	}

	return rc;
}

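/*
 * Remove job_id from the local state and end the corresponding Cray
 * reservation on native Cray systems.  A reservation that is already gone
 * or already ending is not treated as a fatal error.
 */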
extern int container_p_delete(uint32_t job_id)
{
#ifdef HAVE_NATIVE_CRAY
	rid_t resv_id = job_id;
	DEF_TIMERS;
	int rc;
#endif
	int i, found = -1;
	bool job_id_change = false;

	if (debug_flags & DEBUG_FLAG_JOB_CONT)
		info("%s: deleting(%u)", plugin_type, job_id);
	slurm_mutex_lock(&context_lock);
	for (i = 0; i < job_id_count; i++) {
		if (job_id_array[i] == job_id) {
			job_id_array[i] = 0;
			job_id_change = true;
			found = i;
		}
	}
	if (found == -1)
		info("%s: no job for delete(%u)", plugin_type, job_id);
	if (job_id_change)
		_save_state(state_dir);
	slurm_mutex_unlock(&context_lock);
#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_end_reservation(resv_id, DELETE_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_delete: job_end_reservation took",
			   3000000);
	if (rc == 0)
		return SLURM_SUCCESS;
	if ((errno == ENOENT) || (errno == EINPROGRESS) || (errno == EALREADY))
		return SLURM_SUCCESS;	/* Not fatal error */
	error("%s: delete(%u): %m", plugin_type, job_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif
}