1 /*****************************************************************************\
2 * jobcomp_filetxt.c - text file slurm job completion logging plugin.
3 *****************************************************************************
4 * Copyright (C) 2003 The Regents of the University of California.
5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6 * Written by Morris Jette <jette1@llnl.gov> et. al.
7 * CODE-OCEC-09-009. All rights reserved.
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 \*****************************************************************************/
38
39 #include "config.h"
40
41 #include <fcntl.h>
42 #include <grp.h>
43 #include <inttypes.h>
44 #include <pwd.h>
45 #include <unistd.h>
46
47 #include "src/common/slurm_protocol_defs.h"
48 #include "src/common/slurm_jobcomp.h"
49 #include "src/common/parse_time.h"
50 #include "src/common/slurm_time.h"
51 #include "src/common/uid.h"
52 #include "filetxt_jobcomp_process.h"
53
54 /*
55 * These variables are required by the generic plugin interface. If they
56 * are not found in the plugin, the plugin loader will ignore it.
57 *
58 * plugin_name - a string giving a human-readable description of the
59 * plugin. There is no maximum length, but the symbol must refer to
60 * a valid string.
61 *
62 * plugin_type - a string suggesting the type of the plugin or its
63 * applicability to a particular form of data or method of data handling.
64 * If the low-level plugin API is used, the contents of this string are
65 * unimportant and may be anything. Slurm uses the higher-level plugin
66 * interface which requires this string to be of the form
67 *
68 * <application>/<method>
69 *
70 * where <application> is a description of the intended application of
71 * the plugin (e.g., "jobcomp" for Slurm job completion logging) and <method>
72 * is a description of how this plugin satisfies that application. Slurm will
73 * only load job completion logging plugins if the plugin_type string has a
74 * prefix of "jobcomp/".
75 *
76 * plugin_version - an unsigned 32-bit integer containing the Slurm version
77 * (major.minor.micro combined into a single number).
78 */
79 const char plugin_name[] = "Job completion text file logging plugin";
80 const char plugin_type[] = "jobcomp/filetxt";
81 const uint32_t plugin_version = SLURM_VERSION_NUMBER;
82
/*
 * printf-style template for one job completion record (one text line).
 * Arguments, in order: job id, user name, uid, group name, gid, job name,
 * job state, partition, time limit, start time, end time, node list,
 * node count, CPU count, working directory, reservation name, GRES, account,
 * QOS, wckey, cluster, submit time, eligible time, array-id text (may be
 * empty), het-job text (may be empty), derived exit code, exit code and the
 * select plugin text.
 */
#define JOB_FORMAT \
	"JobId=%lu UserId=%s(%lu) GroupId=%s(%lu) " \
	"Name=%s JobState=%s Partition=%s " \
	"TimeLimit=%s StartTime=%s EndTime=%s " \
	"NodeList=%s NodeCnt=%u ProcCnt=%u " \
	"WorkDir=%s ReservationName=%s Gres=%s Account=%s QOS=%s " \
	"WcKey=%s Cluster=%s " \
	"SubmitTime=%s EligibleTime=%s%s%s " \
	"DerivedExitCode=%s ExitCode=%s %s\n"
88
/* Log file state; file_lock serializes opening of and writing to the file. */
static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;
static char *log_name = NULL;	/* path of the log file, NULL until one is set */
static int job_comp_fd = -1;	/* descriptor open on log_name, -1 when closed */
93
/*
 * Copy the name of user_id into user_name, writing at most buf_size bytes
 * including the terminator. The most recent lookup is kept in static storage
 * (seeded with uid 0 / "root") so that repeated requests for the same uid do
 * not call uid_to_string() again.
 */
static void _get_user_name(uint32_t user_id, char *user_name, int buf_size)
{
	static uint32_t last_uid = 0;
	static char last_name[32] = "root";

	if (last_uid != user_id) {
		char *looked_up = uid_to_string((uid_t) user_id);

		snprintf(last_name, sizeof(last_name), "%s", looked_up);
		xfree(looked_up);
		last_uid = user_id;
	}
	snprintf(user_name, buf_size, "%s", last_name);
}
109
/*
 * Copy the name of group_id into group_name, writing at most buf_size bytes
 * including the terminator. The most recent lookup is kept in static storage
 * (seeded with gid 0 / "root") so that repeated requests for the same gid do
 * not call gid_to_string() again.
 */
static void _get_group_name(uint32_t group_id, char *group_name, int buf_size)
{
	static uint32_t last_gid = 0;
	static char last_name[32] = "root";

	if (last_gid != group_id) {
		char *looked_up = gid_to_string((gid_t) group_id);

		snprintf(last_name, sizeof(last_name), "%s", looked_up);
		xfree(looked_up);
		last_gid = group_id;
	}
	snprintf(group_name, buf_size, "%s", last_name);
}
125
126 /*
127 * init() is called when the plugin is loaded, before any other functions
128 * are called. Put global initialization here.
129 */
init(void)130 int init ( void )
131 {
132 return SLURM_SUCCESS;
133 }
134
fini(void)135 int fini ( void )
136 {
137 if (job_comp_fd >= 0)
138 close(job_comp_fd);
139 xfree(log_name);
140 return SLURM_SUCCESS;
141 }
142
143 /*
144 * The remainder of this file implements the standard Slurm job completion
145 * logging API.
146 */
147
slurm_jobcomp_set_location(char * location)148 extern int slurm_jobcomp_set_location ( char * location )
149 {
150 int rc = SLURM_SUCCESS;
151
152 if (location == NULL) {
153 return SLURM_ERROR;
154 }
155 xfree(log_name);
156 log_name = xstrdup(location);
157
158 slurm_mutex_lock( &file_lock );
159 if (job_comp_fd >= 0)
160 close(job_comp_fd);
161 job_comp_fd = open(location, O_WRONLY | O_CREAT | O_APPEND, 0644);
162 if (job_comp_fd == -1) {
163 fatal("open %s: %m", location);
164 rc = SLURM_ERROR;
165 } else
166 fchmod(job_comp_fd, 0644);
167 slurm_mutex_unlock( &file_lock );
168 return rc;
169 }
170
/*
 * Format *time as local time in ISO8601 form, YYYY-MM-DDTHH:MM:SS.
 * This is a variation of slurm_make_time_str() in src/common/parse_time.h
 * that uses the ISO8601 format by default.
 *
 * time   - time to format; a value of 0 yields "Unknown"
 * string - output buffer
 * size   - size of string in bytes
 *
 * "Unknown" is also written when the time cannot be converted or the
 * formatted text does not fit, so the buffer always holds a terminated
 * string on return. Nothing is written if string is NULL or size is not
 * positive.
 */
static void _make_time_str(time_t *time, char *string, int size)
{
	struct tm time_tm;

	if ((string == NULL) || (size <= 0))
		return;

	/* strftime() returns 0 and leaves the buffer indeterminate when the
	 * result does not fit, so every failure falls through to "Unknown". */
	if ((*time == (time_t) 0) ||
	    (localtime_r(time, &time_tm) == NULL) ||
	    (strftime(string, size, "%FT%T", &time_tm) == 0)) {
		snprintf(string, size, "Unknown");
	}
}
185
slurm_jobcomp_log_record(job_record_t * job_ptr)186 extern int slurm_jobcomp_log_record(job_record_t *job_ptr)
187 {
188 int rc = SLURM_SUCCESS, tmp_int, tmp_int2;
189 char job_rec[1024];
190 char usr_str[32], grp_str[32], start_str[32], end_str[32], lim_str[32];
191 char *resv_name, *gres, *account, *qos, *wckey, *cluster;
192 char *exit_code_str = NULL, *derived_ec_str = NULL;
193 char submit_time[32], eligible_time[32], array_id[64], het_id[64];
194 char select_buf[128], *state_string, *work_dir;
195 size_t offset = 0, tot_size, wrote;
196 uint32_t job_state;
197 uint32_t time_limit;
198
199 if ((log_name == NULL) || (job_comp_fd < 0)) {
200 error("JobCompLoc log file %s not open", log_name);
201 return SLURM_ERROR;
202 }
203
204 slurm_mutex_lock( &file_lock );
205 _get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
206 _get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));
207
208 if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
209 time_limit = job_ptr->part_ptr->max_time;
210 else
211 time_limit = job_ptr->time_limit;
212 if (time_limit == INFINITE)
213 strcpy(lim_str, "UNLIMITED");
214 else {
215 snprintf(lim_str, sizeof(lim_str), "%lu",
216 (unsigned long) time_limit);
217 }
218
219 if (job_ptr->job_state & JOB_RESIZING) {
220 time_t now = time(NULL);
221 state_string = job_state_string(job_ptr->job_state);
222 if (job_ptr->resize_time) {
223 _make_time_str(&job_ptr->resize_time, start_str,
224 sizeof(start_str));
225 } else {
226 _make_time_str(&job_ptr->start_time, start_str,
227 sizeof(start_str));
228 }
229 _make_time_str(&now, end_str, sizeof(end_str));
230 } else {
231 /* Job state will typically have JOB_COMPLETING or JOB_RESIZING
232 * flag set when called. We remove the flags to get the eventual
233 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
234 job_state = job_ptr->job_state & JOB_STATE_BASE;
235 state_string = job_state_string(job_state);
236 if (job_ptr->resize_time) {
237 _make_time_str(&job_ptr->resize_time, start_str,
238 sizeof(start_str));
239 } else if (job_ptr->start_time > job_ptr->end_time) {
240 /* Job cancelled while pending and
241 * expected start time is in the future. */
242 snprintf(start_str, sizeof(start_str), "Unknown");
243 } else {
244 _make_time_str(&job_ptr->start_time, start_str,
245 sizeof(start_str));
246 }
247 _make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
248 }
249
250 if (job_ptr->details && job_ptr->details->work_dir)
251 work_dir = job_ptr->details->work_dir;
252 else
253 work_dir = "unknown";
254
255 if (job_ptr->resv_name && job_ptr->resv_name[0])
256 resv_name = job_ptr->resv_name;
257 else
258 resv_name = "";
259
260 if (job_ptr->gres_req && job_ptr->gres_req[0])
261 gres = job_ptr->gres_req;
262 else
263 gres = "";
264
265 if (job_ptr->account && job_ptr->account[0])
266 account = job_ptr->account;
267 else
268 account = "";
269
270 if (job_ptr->qos_ptr != NULL) {
271 qos = job_ptr->qos_ptr->name;
272 } else
273 qos = "";
274
275 if (job_ptr->wckey && job_ptr->wckey[0])
276 wckey = job_ptr->wckey;
277 else
278 wckey = "";
279
280 if (job_ptr->assoc_ptr != NULL)
281 cluster = job_ptr->assoc_ptr->cluster;
282 else
283 cluster = "unknown";
284
285 if (job_ptr->details && job_ptr->details->submit_time) {
286 _make_time_str(&job_ptr->details->submit_time,
287 submit_time, sizeof(submit_time));
288 } else {
289 snprintf(submit_time, sizeof(submit_time), "unknown");
290 }
291
292 if (job_ptr->details && job_ptr->details->begin_time) {
293 _make_time_str(&job_ptr->details->begin_time,
294 eligible_time, sizeof(eligible_time));
295 } else {
296 snprintf(eligible_time, sizeof(eligible_time), "unknown");
297 }
298
299 if (job_ptr->array_task_id != NO_VAL) {
300 snprintf(array_id, sizeof(array_id),
301 " ArrayJobId=%u ArrayTaskId=%u",
302 job_ptr->array_job_id, job_ptr->array_task_id);
303 } else {
304 array_id[0] = '\0';
305 }
306
307 if (job_ptr->het_job_id) {
308 snprintf(het_id, sizeof(het_id),
309 " HetJobId=%u HetJobOffset=%u",
310 job_ptr->het_job_id, job_ptr->het_job_offset);
311 } else {
312 het_id[0] = '\0';
313 }
314
315 tmp_int = tmp_int2 = 0;
316 if (job_ptr->derived_ec == NO_VAL)
317 ;
318 else if (WIFSIGNALED(job_ptr->derived_ec))
319 tmp_int2 = WTERMSIG(job_ptr->derived_ec);
320 else if (WIFEXITED(job_ptr->derived_ec))
321 tmp_int = WEXITSTATUS(job_ptr->derived_ec);
322 xstrfmtcat(derived_ec_str, "%d:%d", tmp_int, tmp_int2);
323
324 tmp_int = tmp_int2 = 0;
325 if (job_ptr->exit_code == NO_VAL)
326 ;
327 else if (WIFSIGNALED(job_ptr->exit_code))
328 tmp_int2 = WTERMSIG(job_ptr->exit_code);
329 else if (WIFEXITED(job_ptr->exit_code))
330 tmp_int = WEXITSTATUS(job_ptr->exit_code);
331 xstrfmtcat(exit_code_str, "%d:%d", tmp_int, tmp_int2);
332
333 select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
334 select_buf, sizeof(select_buf), SELECT_PRINT_MIXED);
335
336 snprintf(job_rec, sizeof(job_rec), JOB_FORMAT,
337 (unsigned long) job_ptr->job_id, usr_str,
338 (unsigned long) job_ptr->user_id, grp_str,
339 (unsigned long) job_ptr->group_id, job_ptr->name,
340 state_string, job_ptr->partition, lim_str, start_str,
341 end_str, job_ptr->nodes, job_ptr->node_cnt,
342 job_ptr->total_cpus, work_dir, resv_name, gres, account, qos,
343 wckey, cluster, submit_time, eligible_time, array_id, het_id,
344 derived_ec_str, exit_code_str, select_buf);
345 tot_size = strlen(job_rec);
346
347 while (offset < tot_size) {
348 wrote = write(job_comp_fd, job_rec + offset,
349 tot_size - offset);
350 if (wrote == -1) {
351 if (errno == EAGAIN)
352 continue;
353 else {
354 rc = SLURM_ERROR;
355 break;
356 }
357 }
358 offset += wrote;
359 }
360 xfree(derived_ec_str);
361 xfree(exit_code_str);
362 slurm_mutex_unlock( &file_lock );
363 return rc;
364 }
365
366 /*
367 * get info from the database
368 * in/out job_list List of job_rec_t *
369 * note List needs to be freed when called
370 */
slurm_jobcomp_get_jobs(slurmdb_job_cond_t * job_cond)371 extern List slurm_jobcomp_get_jobs(slurmdb_job_cond_t *job_cond)
372 {
373 return filetxt_jobcomp_process_get_jobs(job_cond);
374 }
375
376 /*
377 * expire old info from the database
378 */
slurm_jobcomp_archive(slurmdb_archive_cond_t * arch_cond)379 extern int slurm_jobcomp_archive(slurmdb_archive_cond_t *arch_cond)
380 {
381 return filetxt_jobcomp_process_archive(arch_cond);
382 }
383