1 /*****************************************************************************\
2 * job_container_cncu.c - Define job container management functions for
3 * Cray systems
4 *****************************************************************************
5 * Copyright (C) 2013 SchedMD LLC
6 * Written by Morris Jette, SchedMD
7 *
8 * This file is part of Slurm, a resource management program.
9 * For details, see <https://slurm.schedmd.com/>.
10 * Please also read the included file: DISCLAIMER.
11 *
12 * Slurm is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 * In addition, as a special exception, the copyright holders give permission
18 * to link the code of portions of this program with the OpenSSL library under
19 * certain conditions as described in each individual source file, and
20 * distribute linked combinations including the two. You must obey the GNU
21 * General Public License in all respects for all of the code used other than
22 * OpenSSL. If you modify file(s) with this exception, you may extend this
23 * exception to your version of the file(s), but you are not obligated to do
24 * so. If you do not wish to do so, delete this exception statement from your
25 * version. If you delete this exception statement from all source files in
26 * the program, then also delete it here.
27 *
28 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
31 * details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with Slurm; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 \*****************************************************************************/
37
38 #include "config.h"
39
40 #include <fcntl.h>
41 #include <sys/stat.h>
42 #include <sys/types.h>
43
44 #ifdef HAVE_NATIVE_CRAY
45 #include <job.h> /* Cray's job module component */
46 #endif
47
48 #include "slurm/slurm_errno.h"
49 #include "src/common/slurm_xlator.h"
50 #include "src/slurmd/common/proctrack.h"
51
52 #define ADD_FLAGS 0
53 #define CREATE_FLAGS 0
54 #define DELETE_FLAGS 0
55
56 #define JOB_BUF_SIZE 128
57
58 /*
59 * These variables are required by the generic plugin interface. If they
60 * are not found in the plugin, the plugin loader will ignore it.
61 *
62 * plugin_name - a string giving a human-readable description of the
63 * plugin. There is no maximum length, but the symbol must refer to
64 * a valid string.
65 *
66 * plugin_type - a string suggesting the type of the plugin or its
67 * applicability to a particular form of data or method of data handling.
68 * If the low-level plugin API is used, the contents of this string are
69 * unimportant and may be anything. Slurm uses the higher-level plugin
70 * interface which requires this string to be of the form
71 *
72 * <application>/<method>
73 *
74 * where <application> is a description of the intended application of
75 * the plugin (e.g., "task" for task control) and <method> is a description
76 * of how this plugin satisfies that application. Slurm will only load
77 * a task plugin if the plugin_type string has a prefix of "task/".
78 *
79 * plugin_version - an unsigned 32-bit integer containing the Slurm version
80 * (major.minor.micro combined into a single number).
81 */
82 const char plugin_name[] = "job_container cncu plugin";
83 const char plugin_type[] = "job_container/cncu";
84 const uint32_t plugin_version = SLURM_VERSION_NUMBER;
85
86 static uint32_t *job_id_array = NULL;
87 static uint32_t job_id_count = 0;
88 static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER;
89 static char *state_dir = NULL;
90 static uint64_t debug_flags = 0;
91
_save_state(char * dir_name)92 static int _save_state(char *dir_name)
93 {
94 char *file_name;
95 int ret = SLURM_SUCCESS;
96 int state_fd;
97
98 if (!dir_name) {
99 error("job_container state directory is NULL");
100 return SLURM_ERROR;
101 }
102 file_name = xstrdup_printf("%s/job_container_state", dir_name);
103 (void) unlink(file_name);
104 state_fd = creat(file_name, 0600);
105 if (state_fd < 0) {
106 error("Can't save state, error creating file %s %m",
107 file_name);
108 ret = SLURM_ERROR;
109 } else {
110 char *buf = (char *) job_id_array;
111 size_t len = job_id_count * sizeof(uint32_t);
112 while (1) {
113 int wrote = write(state_fd, buf, len);
114 if ((wrote < 0) && (errno == EINTR))
115 continue;
116 if (wrote == 0)
117 break;
118 if (wrote < 0) {
119 error("Can't save job_container state: %m");
120 ret = SLURM_ERROR;
121 break;
122 }
123 buf += wrote;
124 len -= wrote;
125 }
126 close(state_fd);
127 }
128 xfree(file_name);
129
130 return ret;
131 }
132
_restore_state(char * dir_name)133 static int _restore_state(char *dir_name)
134 {
135 char *data = NULL, *file_name = NULL;
136 int error_code = SLURM_SUCCESS;
137 int state_fd, data_allocated = 0, data_read = 0, data_offset = 0;
138
139 if (!dir_name) {
140 error("job_container state directory is NULL");
141 return SLURM_ERROR;
142 }
143
144 file_name = xstrdup_printf("%s/job_container_state", dir_name);
145 state_fd = open (file_name, O_RDONLY);
146 if (state_fd >= 0) {
147 data_allocated = JOB_BUF_SIZE;
148 data = xmalloc(data_allocated);
149 while (1) {
150 data_read = read(state_fd, data + data_offset,
151 JOB_BUF_SIZE);
152 if ((data_read < 0) && (errno == EINTR))
153 continue;
154 if (data_read < 0) {
155 error ("Read error on %s, %m", file_name);
156 error_code = SLURM_ERROR;
157 break;
158 } else if (data_read == 0)
159 break;
160 data_offset += data_read;
161 data_allocated += data_read;
162 xrealloc(data, data_allocated);
163 }
164 close(state_fd);
165 } else {
166 error("No %s file for %s state recovery",
167 file_name, plugin_type);
168 xfree(file_name);
169 return SLURM_SUCCESS;
170 }
171
172 xfree(file_name);
173
174 if (error_code == SLURM_SUCCESS) {
175 job_id_array = (uint32_t *) data;
176 job_id_count = data_offset / sizeof(uint32_t);
177 }
178
179 return error_code;
180 }
181
182 #ifdef HAVE_NATIVE_CRAY
/*
 * Log statistics for a Cray CNCU reservation (native Cray builds only).
 *
 * type    - short label for the log line (callers pass "create" or "add")
 * resv_id - Cray reservation ID (in this plugin always derived from the
 *           Slurm job ID)
 *
 * Failures from job_stat_reservation() are logged and otherwise ignored;
 * this function is purely informational.
 */
static void _stat_reservation(char *type, rid_t resv_id)
{
	struct job_resv_stat buf;
	DEF_TIMERS;

	START_TIMER;

	if (job_stat_reservation(resv_id, &buf)) {
		error("%s: stat(%"PRIu64"): %m", plugin_type, resv_id);
	} else {
		info("%s: %s/stat(%"PRIu64"): flags=%d "
		     "num_jobs=%d num_files=%d num_ipc_objs=%d",
		     plugin_type, type, resv_id, buf.flags, buf.num_jobs,
		     buf.num_files, buf.num_ipc_objs);
	}
	END_TIMER;
	/* Optionally report how long the Cray library call took */
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
}
202 #endif
203
/* Called on reconfigure; refresh the cached DebugFlags value. */
extern void container_p_reconfig(void)
{
	debug_flags = slurm_get_debug_flags();
}
208
209 /*
210 * init() is called when the plugin is loaded, before any other functions
211 * are called. Put global initialization here.
212 */
init(void)213 extern int init(void)
214 {
215 debug_flags = slurm_get_debug_flags();
216 if (debug_flags & DEBUG_FLAG_JOB_CONT)
217 info("%s loaded", plugin_name);
218 else
219 debug("%s loaded", plugin_name);
220
221 return SLURM_SUCCESS;
222 }
223
224 /*
225 * fini() is called when the plugin is removed. Clear any allocated
226 * storage here.
227 */
fini(void)228 extern int fini(void)
229 {
230 slurm_mutex_lock(&context_lock);
231 xfree(state_dir);
232 xfree(job_id_array);
233 job_id_count = 0;
234 slurm_mutex_unlock(&context_lock);
235
236 return SLURM_SUCCESS;
237 }
238
container_p_restore(char * dir_name,bool recover)239 extern int container_p_restore(char *dir_name, bool recover)
240 {
241 int i;
242
243 slurm_mutex_lock(&context_lock);
244 xfree(state_dir);
245 state_dir = xstrdup(dir_name);
246 _restore_state(state_dir);
247 for (i = 0; i < job_id_count; i++) {
248 if (job_id_array[i] == 0)
249 continue;
250 if (debug_flags & DEBUG_FLAG_JOB_CONT)
251 info("%s: %s job(%u)",
252 plugin_type,
253 recover ? "recovered" : "purging",
254 job_id_array[i]);
255 if (!recover)
256 job_id_array[i] = 0;
257 }
258 slurm_mutex_unlock(&context_lock);
259
260 return SLURM_SUCCESS;
261 }
262
/*
 * Create a job container for job_id.
 *
 * Records the job ID in the plugin's table (persisted via _save_state)
 * unless it is already present, then on native Cray systems creates the
 * matching CNCU reservation. Returns SLURM_SUCCESS or SLURM_ERROR.
 */
extern int container_p_create(uint32_t job_id)
{
#ifdef HAVE_NATIVE_CRAY
	rid_t resv_id = job_id;	/* Cray reservation ID mirrors the job ID */
	int rc;
#endif
	int i, empty = -1, found = -1;
	DEF_TIMERS;

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_JOB_CONT)
		info("%s: creating(%u)", plugin_type, job_id);
	slurm_mutex_lock(&context_lock);
	/* Scan for an existing entry, remembering a free (zero) slot */
	for (i = 0; i < job_id_count; i++) {
		if (job_id_array[i] == 0) {
			empty = i;
		} else if (job_id_array[i] == job_id) {
			found = i;
			break;
		}
	}
	if (found == -1) {
		if (empty == -1) {
			/* Table full: grow it by 4 entries */
			empty = job_id_count;
			job_id_count += 4;
			job_id_array = xrealloc(job_id_array,
						sizeof(uint32_t)*job_id_count);
		}
		job_id_array[empty] = job_id;
		_save_state(state_dir);
	}
	slurm_mutex_unlock(&context_lock);

	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else {
		END_TIMER3("container_p_create: saving state took", 3000000);
	}
#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_create_reservation(resv_id, CREATE_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_create: job_create_reservation took",
			   3000000);
	/* EEXIST is acceptable; only log it when we did not already know
	 * about this job (found == -1) */
	if ((rc == 0) || (errno == EEXIST)) {
		if ((found == -1) && (rc != 0) && (errno == EEXIST)) {
			error("%s: create(%u): Reservation already exists",
			      plugin_type, job_id);
		}
		if (debug_flags & DEBUG_FLAG_JOB_CONT)
			_stat_reservation("create", resv_id);
		return SLURM_SUCCESS;
	}
	error("%s: create(%u): %m", plugin_type, job_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif
}
326
327 /* Add proctrack container (PAGG) to a job container */
/*
 * Add proctrack container (PAGG) cont_id to the job container for job_id.
 *
 * On native Cray systems this attaches the PAGG job ID to the CNCU
 * reservation. If no reservation exists yet (ENOENT) one is created and
 * the attach is retried. EBUSY (already attached, e.g. duplicate adds
 * from prolog/epilog) is treated as success.
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
extern int container_p_add_cont(uint32_t job_id, uint64_t cont_id)
{
#ifdef HAVE_NATIVE_CRAY
	jid_t cjob_id = cont_id;
	rid_t resv_id = job_id;
	int rc;
	DEF_TIMERS;
#endif

	if (debug_flags & DEBUG_FLAG_JOB_CONT) {
		info("%s: adding cont(%u.%"PRIu64")",
		     plugin_type, job_id, cont_id);
	}

#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_add_cont: job_attach_reservation took",
			   3000000);
	if ((rc != 0) && (errno == ENOENT)) {	/* Log and retry */
		if (debug_flags & DEBUG_FLAG_JOB_CONT)
			info("%s: add(%u.%"PRIu64"): No reservation found, "
			     "no big deal, this is probably the first time "
			     "this was called. We will just create a new one.",
			     plugin_type, job_id, cont_id);
		START_TIMER;
		/* Create the missing reservation, then retry the attach;
		 * only the attach result (rc) is checked below */
		rc = job_create_reservation(resv_id, CREATE_FLAGS);
		rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS);
		if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
			END_TIMER;
			INFO_LINE("call took: %s", TIME_STR);
		} else
			END_TIMER3("container_p_add_cont: "
				   "job_(create&attach)_reservation took",
				   3000000);
	}

	if ((rc == 0) || (errno == EBUSY)) {
		if (rc) {
			/* EBUSY - job ID already attached to a reservation
			 * Duplicate adds can be generated by prolog/epilog */
			debug2("%s: add(%u.%"PRIu64"): %m",
			       plugin_type, job_id, cont_id);
		} else if (debug_flags & DEBUG_FLAG_JOB_CONT)
			_stat_reservation("add", resv_id);
		return SLURM_SUCCESS;
	}
	error("%s: add(%u.%"PRIu64"): %m", plugin_type, job_id, cont_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif
}
385
386 /* Add a process to a job container, create the proctrack container to add */
container_p_join(uint32_t job_id,uid_t uid)387 extern int container_p_join(uint32_t job_id, uid_t uid)
388 {
389 stepd_step_rec_t job;
390 int rc;
391 pid_t pid = getpid();
392 DEF_TIMERS;
393
394 START_TIMER;
395
396 if (debug_flags & DEBUG_FLAG_JOB_CONT) {
397 info("%s: adding pid(%u.%u)",
398 plugin_type, job_id, (uint32_t) pid);
399 }
400 memset(&job, 0, sizeof(stepd_step_rec_t));
401 job.jmgr_pid = pid;
402 job.uid = uid;
403
404 /*
405 * container_g_join() is called only from forked processes, set the
406 * proctrack_forked global bool to inform proctrack/cray_aries we are
407 * forked.
408 */
409 proctrack_forked = true;
410 if (proctrack_g_create(&job) != SLURM_SUCCESS) {
411 error("%s: proctrack_g_create job(%u)", plugin_type,job_id);
412 return SLURM_ERROR;
413 }
414
415 proctrack_g_add(&job, pid);
416
417 rc = container_p_add_cont(job_id, job.cont_id);
418
419 if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
420 END_TIMER;
421 INFO_LINE("call took: %s", TIME_STR);
422 }
423
424 return rc;
425 }
426
/*
 * Delete the job container for job_id.
 *
 * Clears every matching entry from the job ID table (persisting the
 * change), then on native Cray systems ends the CNCU reservation.
 * ENOENT/EINPROGRESS/EALREADY from job_end_reservation() are treated as
 * success. Returns SLURM_SUCCESS or SLURM_ERROR.
 */
extern int container_p_delete(uint32_t job_id)
{
#ifdef HAVE_NATIVE_CRAY
	rid_t resv_id = job_id;
	DEF_TIMERS;
	int rc;
#endif
	int i, found = -1;
	bool job_id_change = false;

	if (debug_flags & DEBUG_FLAG_JOB_CONT)
		info("%s: deleting(%u)", plugin_type, job_id);
	slurm_mutex_lock(&context_lock);
	/* Clear all matching slots (no break: duplicates are possible) */
	for (i = 0; i < job_id_count; i++) {
		if (job_id_array[i] == job_id) {
			job_id_array[i] = 0;
			job_id_change = true;
			found = i;
		}
	}
	if (found == -1)
		info("%s: no job for delete(%u)", plugin_type, job_id);
	if (job_id_change)
		_save_state(state_dir);
	slurm_mutex_unlock(&context_lock);
#ifdef HAVE_NATIVE_CRAY
	START_TIMER;
	rc = job_end_reservation(resv_id, DELETE_FLAGS);
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		END_TIMER;
		INFO_LINE("call took: %s", TIME_STR);
	} else
		END_TIMER3("container_p_delete: job_end_reservation took",
			   3000000);
	if (rc == 0)
		return SLURM_SUCCESS;
	/* Reservation already gone or teardown in progress: not fatal */
	if ((errno == ENOENT) || (errno == EINPROGRESS) || (errno == EALREADY))
		return SLURM_SUCCESS; /* Not fatal error */
	error("%s: delete(%u): %m", plugin_type, job_id);
	return SLURM_ERROR;
#else
	return SLURM_SUCCESS;
#endif

}
472