1 /*****************************************************************************\
2  *  jobcomp_filetxt.c - text file slurm job completion logging plugin.
3  *****************************************************************************
4  *  Copyright (C) 2003 The Regents of the University of California.
5  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6  *  Written by Morris Jette <jette1@llnl.gov> et. al.
7  *  CODE-OCEC-09-009. All rights reserved.
8  *
9  *  This file is part of Slurm, a resource management program.
10  *  For details, see <https://slurm.schedmd.com/>.
11  *  Please also read the included file: DISCLAIMER.
12  *
13  *  Slurm is free software; you can redistribute it and/or modify it under
14  *  the terms of the GNU General Public License as published by the Free
15  *  Software Foundation; either version 2 of the License, or (at your option)
16  *  any later version.
17  *
18  *  In addition, as a special exception, the copyright holders give permission
19  *  to link the code of portions of this program with the OpenSSL library under
20  *  certain conditions as described in each individual source file, and
21  *  distribute linked combinations including the two. You must obey the GNU
22  *  General Public License in all respects for all of the code used other than
23  *  OpenSSL. If you modify file(s) with this exception, you may extend this
24  *  exception to your version of the file(s), but you are not obligated to do
25  *  so. If you do not wish to do so, delete this exception statement from your
26  *  version.  If you delete this exception statement from all source files in
27  *  the program, then also delete it here.
28  *
29  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
32  *  details.
33  *
34  *  You should have received a copy of the GNU General Public License along
35  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
36  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
37 \*****************************************************************************/
38 
39 #include "config.h"
40 
41 #include <fcntl.h>
42 #include <grp.h>
43 #include <inttypes.h>
44 #include <pwd.h>
45 #include <unistd.h>
46 
47 #include "src/common/slurm_protocol_defs.h"
48 #include "src/common/slurm_jobcomp.h"
49 #include "src/common/parse_time.h"
50 #include "src/common/slurm_time.h"
51 #include "src/common/uid.h"
52 #include "filetxt_jobcomp_process.h"
53 
54 /*
55  * These variables are required by the generic plugin interface.  If they
56  * are not found in the plugin, the plugin loader will ignore it.
57  *
58  * plugin_name - a string giving a human-readable description of the
59  * plugin.  There is no maximum length, but the symbol must refer to
60  * a valid string.
61  *
62  * plugin_type - a string suggesting the type of the plugin or its
63  * applicability to a particular form of data or method of data handling.
64  * If the low-level plugin API is used, the contents of this string are
65  * unimportant and may be anything.  Slurm uses the higher-level plugin
66  * interface which requires this string to be of the form
67  *
68  *	<application>/<method>
69  *
70  * where <application> is a description of the intended application of
71  * the plugin (e.g., "jobcomp" for Slurm job completion logging) and <method>
72  * is a description of how this plugin satisfies that application.  Slurm will
73  * only load job completion logging plugins if the plugin_type string has a
74  * prefix of "jobcomp/".
75  *
76  * plugin_version - an unsigned 32-bit integer containing the Slurm version
77  * (major.minor.micro combined into a single number).
78  */
79 const char plugin_name[]       	= "Job completion text file logging plugin";
80 const char plugin_type[]       	= "jobcomp/filetxt";
81 const uint32_t plugin_version	= SLURM_VERSION_NUMBER;
82 
83 #define JOB_FORMAT "JobId=%lu UserId=%s(%lu) GroupId=%s(%lu) Name=%s JobState=%s Partition=%s "\
84 		"TimeLimit=%s StartTime=%s EndTime=%s NodeList=%s NodeCnt=%u ProcCnt=%u "\
85 		"WorkDir=%s ReservationName=%s Gres=%s Account=%s QOS=%s "\
86 		"WcKey=%s Cluster=%s SubmitTime=%s EligibleTime=%s%s%s "\
87 		"DerivedExitCode=%s ExitCode=%s %s\n"
88 
89 /* File descriptor used for logging */
90 static pthread_mutex_t  file_lock = PTHREAD_MUTEX_INITIALIZER;
91 static char *           log_name  = NULL;
92 static int              job_comp_fd = -1;
93 
/*
 * Get the user name for the given user_id.
 *
 * The name is copied into user_name, truncated to fit buf_size bytes.  The
 * result of the most recent lookup is kept in static storage, so consecutive
 * calls for the same uid skip uid_to_string().  The cache starts out holding
 * uid 0 -> "root", and names longer than the 32-byte cache are truncated.
 */
static void
_get_user_name(uint32_t user_id, char *user_name, int buf_size)
{
	static uint32_t last_uid = 0;
	static char last_name[32] = "root";

	if (last_uid != user_id) {
		char *looked_up = uid_to_string((uid_t) user_id);

		snprintf(last_name, sizeof(last_name), "%s", looked_up);
		xfree(looked_up);
		last_uid = user_id;
	}
	snprintf(user_name, buf_size, "%s", last_name);
}
109 
/*
 * Get the group name for the given group_id.
 *
 * The name is copied into group_name, truncated to fit buf_size bytes.  The
 * result of the most recent lookup is kept in static storage, so consecutive
 * calls for the same gid skip gid_to_string().  The cache starts out holding
 * gid 0 -> "root", and names longer than the 32-byte cache are truncated.
 */
static void
_get_group_name(uint32_t group_id, char *group_name, int buf_size)
{
	static uint32_t last_gid = 0;
	static char last_name[32] = "root";

	if (last_gid != group_id) {
		char *looked_up = gid_to_string((gid_t) group_id);

		snprintf(last_name, sizeof(last_name), "%s", looked_up);
		xfree(looked_up);
		last_gid = group_id;
	}
	snprintf(group_name, buf_size, "%s", last_name);
}
125 
126 /*
127  * init() is called when the plugin is loaded, before any other functions
128  * are called.  Put global initialization here.
129  */
init(void)130 int init ( void )
131 {
132 	return SLURM_SUCCESS;
133 }
134 
fini(void)135 int fini ( void )
136 {
137 	if (job_comp_fd >= 0)
138 		close(job_comp_fd);
139 	xfree(log_name);
140 	return SLURM_SUCCESS;
141 }
142 
143 /*
144  * The remainder of this file implements the standard Slurm job completion
145  * logging API.
146  */
147 
slurm_jobcomp_set_location(char * location)148 extern int slurm_jobcomp_set_location ( char * location )
149 {
150 	int rc = SLURM_SUCCESS;
151 
152 	if (location == NULL) {
153 		return SLURM_ERROR;
154 	}
155 	xfree(log_name);
156 	log_name = xstrdup(location);
157 
158 	slurm_mutex_lock( &file_lock );
159 	if (job_comp_fd >= 0)
160 		close(job_comp_fd);
161 	job_comp_fd = open(location, O_WRONLY | O_CREAT | O_APPEND, 0644);
162 	if (job_comp_fd == -1) {
163 		fatal("open %s: %m", location);
164 		rc = SLURM_ERROR;
165 	} else
166 		fchmod(job_comp_fd, 0644);
167 	slurm_mutex_unlock( &file_lock );
168 	return rc;
169 }
170 
/*
 * This is a variation of slurm_make_time_str() in src/common/parse_time.h.
 * This version uses ISO8601 format by default.
 *
 * time   - time to format, in local time
 * string - output buffer
 * size   - size of the output buffer in bytes
 *
 * Writes "YYYY-MM-DDTHH:MM:SS" into string.  "Unknown" is written instead
 * when time is NULL or zero, when the conversion to local time fails, or
 * when the formatted time does not fit into the buffer.  Nothing is written
 * if string is NULL or size is not positive.
 */
static void _make_time_str(time_t *time, char *string, int size)
{
	struct tm time_tm;

	if (!string || (size <= 0))
		return;

	/*
	 * strftime() returns 0 and leaves the buffer contents unspecified
	 * when the result does not fit, so every failure falls through to
	 * the "Unknown" placeholder below.
	 */
	if (time && (*time != (time_t) 0) &&
	    localtime_r(time, &time_tm) &&
	    strftime(string, (size_t) size, "%FT%T", &time_tm))
		return;

	snprintf(string, (size_t) size, "Unknown");
}
185 
slurm_jobcomp_log_record(job_record_t * job_ptr)186 extern int slurm_jobcomp_log_record(job_record_t *job_ptr)
187 {
188 	int rc = SLURM_SUCCESS, tmp_int, tmp_int2;
189 	char job_rec[1024];
190 	char usr_str[32], grp_str[32], start_str[32], end_str[32], lim_str[32];
191 	char *resv_name, *gres, *account, *qos, *wckey, *cluster;
192 	char *exit_code_str = NULL, *derived_ec_str = NULL;
193 	char submit_time[32], eligible_time[32], array_id[64], het_id[64];
194 	char select_buf[128], *state_string, *work_dir;
195 	size_t offset = 0, tot_size, wrote;
196 	uint32_t job_state;
197 	uint32_t time_limit;
198 
199 	if ((log_name == NULL) || (job_comp_fd < 0)) {
200 		error("JobCompLoc log file %s not open", log_name);
201 		return SLURM_ERROR;
202 	}
203 
204 	slurm_mutex_lock( &file_lock );
205 	_get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
206 	_get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));
207 
208 	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
209 		time_limit = job_ptr->part_ptr->max_time;
210 	else
211 		time_limit = job_ptr->time_limit;
212 	if (time_limit == INFINITE)
213 		strcpy(lim_str, "UNLIMITED");
214 	else {
215 		snprintf(lim_str, sizeof(lim_str), "%lu",
216 			 (unsigned long) time_limit);
217 	}
218 
219 	if (job_ptr->job_state & JOB_RESIZING) {
220 		time_t now = time(NULL);
221 		state_string = job_state_string(job_ptr->job_state);
222 		if (job_ptr->resize_time) {
223 			_make_time_str(&job_ptr->resize_time, start_str,
224 				       sizeof(start_str));
225 		} else {
226 			_make_time_str(&job_ptr->start_time, start_str,
227 				       sizeof(start_str));
228 		}
229 		_make_time_str(&now, end_str, sizeof(end_str));
230 	} else {
231 		/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
232 		 * flag set when called. We remove the flags to get the eventual
233 		 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
234 		job_state = job_ptr->job_state & JOB_STATE_BASE;
235 		state_string = job_state_string(job_state);
236 		if (job_ptr->resize_time) {
237 			_make_time_str(&job_ptr->resize_time, start_str,
238 				       sizeof(start_str));
239 		} else if (job_ptr->start_time > job_ptr->end_time) {
240 			/* Job cancelled while pending and
241 			 * expected start time is in the future. */
242 			snprintf(start_str, sizeof(start_str), "Unknown");
243 		} else {
244 			_make_time_str(&job_ptr->start_time, start_str,
245 				       sizeof(start_str));
246 		}
247 		_make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
248 	}
249 
250 	if (job_ptr->details && job_ptr->details->work_dir)
251 		work_dir = job_ptr->details->work_dir;
252 	else
253 		work_dir = "unknown";
254 
255 	if (job_ptr->resv_name && job_ptr->resv_name[0])
256 		resv_name = job_ptr->resv_name;
257 	else
258 		resv_name = "";
259 
260 	if (job_ptr->gres_req && job_ptr->gres_req[0])
261 		gres = job_ptr->gres_req;
262 	else
263 		gres = "";
264 
265 	if (job_ptr->account && job_ptr->account[0])
266 		account = job_ptr->account;
267 	else
268 		account = "";
269 
270 	if (job_ptr->qos_ptr != NULL) {
271 		qos = job_ptr->qos_ptr->name;
272 	} else
273 		qos = "";
274 
275 	if (job_ptr->wckey && job_ptr->wckey[0])
276 		wckey = job_ptr->wckey;
277 	else
278 		wckey = "";
279 
280 	if (job_ptr->assoc_ptr != NULL)
281 		cluster = job_ptr->assoc_ptr->cluster;
282 	else
283 		cluster = "unknown";
284 
285 	if (job_ptr->details && job_ptr->details->submit_time) {
286 		_make_time_str(&job_ptr->details->submit_time,
287 			       submit_time, sizeof(submit_time));
288 	} else {
289 		snprintf(submit_time, sizeof(submit_time), "unknown");
290 	}
291 
292 	if (job_ptr->details && job_ptr->details->begin_time) {
293 		_make_time_str(&job_ptr->details->begin_time,
294 			       eligible_time, sizeof(eligible_time));
295 	} else {
296 		snprintf(eligible_time, sizeof(eligible_time), "unknown");
297 	}
298 
299 	if (job_ptr->array_task_id != NO_VAL) {
300 		snprintf(array_id, sizeof(array_id),
301 			 " ArrayJobId=%u ArrayTaskId=%u",
302 			 job_ptr->array_job_id, job_ptr->array_task_id);
303 	} else {
304 		array_id[0] = '\0';
305 	}
306 
307 	if (job_ptr->het_job_id) {
308 		snprintf(het_id, sizeof(het_id),
309 			 " HetJobId=%u HetJobOffset=%u",
310 			 job_ptr->het_job_id, job_ptr->het_job_offset);
311 	} else {
312 		het_id[0] = '\0';
313 	}
314 
315 	tmp_int = tmp_int2 = 0;
316 	if (job_ptr->derived_ec == NO_VAL)
317 		;
318 	else if (WIFSIGNALED(job_ptr->derived_ec))
319 		tmp_int2 = WTERMSIG(job_ptr->derived_ec);
320 	else if (WIFEXITED(job_ptr->derived_ec))
321 		tmp_int = WEXITSTATUS(job_ptr->derived_ec);
322 	xstrfmtcat(derived_ec_str, "%d:%d", tmp_int, tmp_int2);
323 
324 	tmp_int = tmp_int2 = 0;
325 	if (job_ptr->exit_code == NO_VAL)
326 		;
327 	else if (WIFSIGNALED(job_ptr->exit_code))
328 		tmp_int2 = WTERMSIG(job_ptr->exit_code);
329 	else if (WIFEXITED(job_ptr->exit_code))
330 		tmp_int = WEXITSTATUS(job_ptr->exit_code);
331 	xstrfmtcat(exit_code_str, "%d:%d", tmp_int, tmp_int2);
332 
333 	select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
334 		select_buf, sizeof(select_buf), SELECT_PRINT_MIXED);
335 
336 	snprintf(job_rec, sizeof(job_rec), JOB_FORMAT,
337 		 (unsigned long) job_ptr->job_id, usr_str,
338 		 (unsigned long) job_ptr->user_id, grp_str,
339 		 (unsigned long) job_ptr->group_id, job_ptr->name,
340 		 state_string, job_ptr->partition, lim_str, start_str,
341 		 end_str, job_ptr->nodes, job_ptr->node_cnt,
342 		 job_ptr->total_cpus, work_dir, resv_name, gres, account, qos,
343 		 wckey, cluster, submit_time, eligible_time, array_id, het_id,
344 		 derived_ec_str, exit_code_str, select_buf);
345 	tot_size = strlen(job_rec);
346 
347 	while (offset < tot_size) {
348 		wrote = write(job_comp_fd, job_rec + offset,
349 			tot_size - offset);
350 		if (wrote == -1) {
351 			if (errno == EAGAIN)
352 				continue;
353 			else {
354 				rc = SLURM_ERROR;
355 				break;
356 			}
357 		}
358 		offset += wrote;
359 	}
360 	xfree(derived_ec_str);
361 	xfree(exit_code_str);
362 	slurm_mutex_unlock( &file_lock );
363 	return rc;
364 }
365 
366 /*
367  * get info from the database
368  * in/out job_list List of job_rec_t *
369  * note List needs to be freed when called
370  */
slurm_jobcomp_get_jobs(slurmdb_job_cond_t * job_cond)371 extern List slurm_jobcomp_get_jobs(slurmdb_job_cond_t *job_cond)
372 {
373 	return filetxt_jobcomp_process_get_jobs(job_cond);
374 }
375 
376 /*
377  * expire old info from the database
378  */
slurm_jobcomp_archive(slurmdb_archive_cond_t * arch_cond)379 extern int slurm_jobcomp_archive(slurmdb_archive_cond_t *arch_cond)
380 {
381 	return filetxt_jobcomp_process_archive(arch_cond);
382 }
383