1 /*****************************************************************************\
2  *  slurm_acct_gather.c - generic interface needed for some
3  *                        acct_gather plugins.
4  *****************************************************************************
5  *  Copyright (C) 2013 SchedMD LLC.
6  *  Written by Danny Auble <da@schedmd.com>
7  *
8  *  This file is part of Slurm, a resource management program.
9  *  For details, see <https://slurm.schedmd.com/>.
10  *  Please also read the included file: DISCLAIMER.
11  *
12  *  Slurm is free software; you can redistribute it and/or modify it under
13  *  the terms of the GNU General Public License as published by the Free
14  *  Software Foundation; either version 2 of the License, or (at your option)
15  *  any later version.
16  *
17  *  In addition, as a special exception, the copyright holders give permission
18  *  to link the code of portions of this program with the OpenSSL library under
19  *  certain conditions as described in each individual source file, and
20  *  distribute linked combinations including the two. You must obey the GNU
21  *  General Public License in all respects for all of the code used other than
22  *  OpenSSL. If you modify file(s) with this exception, you may extend this
23  *  exception to your version of the file(s), but you are not obligated to do
24  *  so. If you do not wish to do so, delete this exception statement from your
25  *  version.  If you delete this exception statement from all source files in
26  *  the program, then also delete it here.
27  *
28  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
31  *  details.
32  *
33  *  You should have received a copy of the GNU General Public License along
34  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
35  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
36 \*****************************************************************************/
37 
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/stat.h>

#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/slurm_acct_gather.h"
#include "slurm_acct_gather_energy.h"
#include "slurm_acct_gather_interconnect.h"
#include "slurm_acct_gather_filesystem.h"
#include "src/common/xstring.h"
48 
49 static bool acct_gather_suspended = false;
50 static pthread_mutex_t suspended_mutex = PTHREAD_MUTEX_INITIALIZER;
51 static pthread_mutex_t conf_mutex = PTHREAD_MUTEX_INITIALIZER;
52 static Buf acct_gather_options_buf = NULL;
53 static bool inited = 0;
54 
/*
 * Parse a leading base-10 integer from a string.
 *
 * IN my_str - string to parse, may be NULL. Anything following the number
 *             (e.g. ",energy=30") is ignored.
 * RET the parsed value, or -1 when my_str is NULL, does not start with a
 *     number, or holds a number that does not fit in an int.
 */
static int _get_int(const char *my_str)
{
	char *end = NULL;
	long value;

	if (!my_str)
		return -1;

	errno = 0;
	value = strtol(my_str, &end, 10);

	/* means no numbers */
	if (my_str == end)
		return -1;

	/*
	 * strtol() reports overflow of long through ERANGE, and a long that
	 * is wider than int would otherwise be silently truncated by the
	 * return type (turning e.g. 4294967296 into 0). Treat both as invalid.
	 */
	if ((errno == ERANGE) || (value > INT_MAX) || (value < INT_MIN))
		return -1;

	return (int) value;
}
70 
/*
 * Hand a parsed acct_gather.conf table to each acct_gather plugin type.
 * conf_mutex is held for the duration of the plugin calls.
 *
 * IN tbl - hash table holding the parsed options
 * RET sum of the return codes of the individual *_g_conf_set() calls
 */
static int _process_tbl(s_p_hashtbl_t *tbl)
{
	int status;

	slurm_mutex_lock(&conf_mutex);

	/* handle acct_gather.conf in each plugin */
	status = acct_gather_energy_g_conf_set(tbl);
	status += acct_gather_profile_g_conf_set(tbl);
	status += acct_gather_interconnect_g_conf_set(tbl);
	status += acct_gather_filesystem_g_conf_set(tbl);
	/*********************************************************************/
	/* ADD MORE HERE AND FREE MEMORY IN acct_gather_conf_destroy() BELOW */
	/*********************************************************************/

	slurm_mutex_unlock(&conf_mutex);

	return status;
}
88 
acct_gather_conf_init(void)89 extern int acct_gather_conf_init(void)
90 {
91 	s_p_hashtbl_t *tbl = NULL;
92 	char *conf_path = NULL;
93 	s_p_options_t *full_options = NULL;
94 	int full_options_cnt = 0, i;
95 	struct stat buf;
96 	int rc = SLURM_SUCCESS;
97 
98 	if (inited)
99 		return SLURM_SUCCESS;
100 	inited = 1;
101 
102 	/* get options from plugins using acct_gather.conf */
103 
104 	rc += acct_gather_energy_g_conf_options(&full_options,
105 						&full_options_cnt);
106 	rc += acct_gather_profile_g_conf_options(&full_options,
107 						 &full_options_cnt);
108 	rc += acct_gather_interconnect_g_conf_options(&full_options,
109 						      &full_options_cnt);
110 	rc += acct_gather_filesystem_g_conf_options(&full_options,
111 						    &full_options_cnt);
112 	/* ADD MORE HERE */
113 
114 	/* for the NULL at the end */
115 	xrealloc(full_options,
116 		 ((full_options_cnt + 1) * sizeof(s_p_options_t)));
117 
118 	tbl = s_p_hashtbl_create(full_options);
119 
120 	/**************************************************/
121 
122 	/* Get the acct_gather.conf path and validate the file */
123 	conf_path = get_extra_conf_path("acct_gather.conf");
124 	if ((conf_path == NULL) || (stat(conf_path, &buf) == -1)) {
125 		debug2("No acct_gather.conf file (%s)", conf_path);
126 	} else {
127 		debug2("Reading acct_gather.conf file %s", conf_path);
128 
129 		if (s_p_parse_file(tbl, NULL, conf_path, false) ==
130 		    SLURM_ERROR) {
131 			fatal("Could not open/read/parse acct_gather.conf file "
132 			      "%s.  Many times this is because you have "
133 			      "defined options for plugins that are not "
134 			      "loaded.  Please check your slurm.conf file "
135 			      "and make sure the plugins for the options "
136 			      "listed are loaded.",
137 			      conf_path);
138 		}
139 	}
140 
141 	rc += _process_tbl(tbl);
142 
143 	acct_gather_options_buf = s_p_pack_hashtbl(
144 		tbl, full_options, full_options_cnt);
145 
146 	for (i=0; i<full_options_cnt; i++)
147 		xfree(full_options[i].key);
148 	xfree(full_options);
149 	xfree(conf_path);
150 
151 	s_p_hashtbl_destroy(tbl);
152 
153 	return rc;
154 }
155 
acct_gather_write_conf(int fd)156 extern int acct_gather_write_conf(int fd)
157 {
158 	int len;
159 
160 	acct_gather_conf_init();
161 
162 	slurm_mutex_lock(&conf_mutex);
163 	len = get_buf_offset(acct_gather_options_buf);
164 	safe_write(fd, &len, sizeof(int));
165 	safe_write(fd, get_buf_data(acct_gather_options_buf), len);
166 	slurm_mutex_unlock(&conf_mutex);
167 
168 	return 0;
169 
170 rwfail:
171 	slurm_mutex_unlock(&conf_mutex);
172 	return -1;
173 }
174 
acct_gather_read_conf(int fd)175 extern int acct_gather_read_conf(int fd)
176 {
177 	int len;
178 	s_p_hashtbl_t *tbl;
179 
180 	safe_read(fd, &len, sizeof(int));
181 
182 	acct_gather_options_buf = init_buf(len);
183 	safe_read(fd, acct_gather_options_buf->head, len);
184 
185 	if (!(tbl = s_p_unpack_hashtbl(acct_gather_options_buf)))
186 		return SLURM_ERROR;
187 
188 	/*
189 	 * We need to set inited before calling _process_tbl or we will get
190 	 * deadlock since the other acct_gather_* plugins will call
191 	 * acct_gather_init().
192 	 */
193 	inited = true;
194 	(void)_process_tbl(tbl);
195 
196 	s_p_hashtbl_destroy(tbl);
197 
198 	return SLURM_SUCCESS;
199 rwfail:
200 	return SLURM_ERROR;
201 }
202 
acct_gather_reconfig(void)203 extern int acct_gather_reconfig(void)
204 {
205 	acct_gather_conf_destroy();
206 	slurm_mutex_init(&conf_mutex);
207 	acct_gather_conf_init();
208 
209 	return SLURM_SUCCESS;
210 }
211 
acct_gather_conf_destroy(void)212 extern int acct_gather_conf_destroy(void)
213 {
214 	int rc, rc2;
215 
216 	if (!inited)
217 		return SLURM_SUCCESS;
218 
219 	inited = false;
220 
221 	rc = acct_gather_energy_fini();
222 
223 	rc2 = acct_gather_filesystem_fini();
224 	rc = MAX(rc, rc2);
225 	rc2 = acct_gather_interconnect_fini();
226 	rc = MAX(rc, rc2);
227 	rc2 = acct_gather_profile_fini();
228 	rc = MAX(rc, rc2);
229 
230 	FREE_NULL_BUFFER(acct_gather_options_buf);
231 
232 	slurm_mutex_destroy(&conf_mutex);
233 	return rc;
234 }
235 
acct_gather_conf_values(void)236 extern List acct_gather_conf_values(void)
237 {
238 	List acct_list = list_create(destroy_config_key_pair);
239 
240 	/* get acct_gather.conf in each plugin */
241 	slurm_mutex_lock(&conf_mutex);
242 	acct_gather_profile_g_conf_values(&acct_list);
243 	acct_gather_interconnect_g_conf_values(&acct_list);
244 	acct_gather_energy_g_conf_values(&acct_list);
245 	acct_gather_filesystem_g_conf_values(&acct_list);
246 	/* ADD MORE HERE */
247 	slurm_mutex_unlock(&conf_mutex);
248 	/******************************************/
249 
250 	list_sort(acct_list, (ListCmpF) sort_key_pairs);
251 
252 	return acct_list;
253 }
254 
acct_gather_parse_freq(int type,char * freq)255 extern int acct_gather_parse_freq(int type, char *freq)
256 {
257 	int freq_int = -1;
258 	char *sub_str = NULL;
259 
260 	if (!freq)
261 		return freq_int;
262 
263 	switch (type) {
264 	case PROFILE_ENERGY:
265 		if ((sub_str = xstrcasestr(freq, "energy=")))
266 			freq_int = _get_int(sub_str + 7);
267 		break;
268 	case PROFILE_TASK:
269 		/* backwards compatibility for when the freq was only
270 		   for task.
271 		*/
272 		freq_int = _get_int(freq);
273 		if ((freq_int == -1)
274 		    && (sub_str = xstrcasestr(freq, "task=")))
275 			freq_int = _get_int(sub_str + 5);
276 		break;
277 	case PROFILE_FILESYSTEM:
278 		if ((sub_str = xstrcasestr(freq, "filesystem=")))
279 			freq_int = _get_int(sub_str + 11);
280 		break;
281 	case PROFILE_NETWORK:
282 		if ((sub_str = xstrcasestr(freq, "network=")))
283 			freq_int = _get_int(sub_str + 8);
284 		break;
285 	default:
286 		fatal("Unhandled profile option %d please update "
287 		      "slurm_acct_gather.c "
288 		      "(acct_gather_parse_freq)", type);
289 	}
290 
291 	return freq_int;
292 }
293 
acct_gather_check_acct_freq_task(uint64_t job_mem_lim,char * acctg_freq)294 extern int acct_gather_check_acct_freq_task(uint64_t job_mem_lim,
295 					    char *acctg_freq)
296 {
297 	int task_freq;
298 	static uint32_t acct_freq_task = NO_VAL;
299 
300 	if (acct_freq_task == NO_VAL) {
301 		char *acct_freq = slurm_get_jobacct_gather_freq();
302 		int i = acct_gather_parse_freq(PROFILE_TASK, acct_freq);
303 		xfree(acct_freq);
304 
305 		/* If the value is -1 lets set the freq to something
306 		   really high so we don't check this again.
307 		*/
308 		if (i == -1)
309 			acct_freq_task = NO_VAL16;
310 		else
311 			acct_freq_task = i;
312 	}
313 
314 	if (!job_mem_lim || !acct_freq_task)
315 		return 0;
316 
317 	task_freq = acct_gather_parse_freq(PROFILE_TASK, acctg_freq);
318 
319 	if (task_freq == -1)
320 		return 0;
321 
322 	if (task_freq == 0) {
323 		error("Can't turn accounting frequency off.  "
324 		      "We need it to monitor memory usage.");
325 		slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ);
326 		return 1;
327 	} else if (task_freq > acct_freq_task) {
328 		error("Can't set frequency to %d, it is higher than %u.  "
329 		      "We need it to be at least at this level to "
330 		      "monitor memory usage.",
331 		      task_freq, acct_freq_task);
332 		slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ);
333 		return 1;
334 	}
335 
336 	return 0;
337 }
338 
acct_gather_suspend_poll(void)339 extern void acct_gather_suspend_poll(void)
340 {
341 	slurm_mutex_lock(&suspended_mutex);
342 	acct_gather_suspended = true;
343 	slurm_mutex_unlock(&suspended_mutex);
344 }
345 
acct_gather_resume_poll(void)346 extern void acct_gather_resume_poll(void)
347 {
348 	slurm_mutex_lock(&suspended_mutex);
349 	acct_gather_suspended = false;
350 	slurm_mutex_unlock(&suspended_mutex);
351 }
352 
acct_gather_suspend_test(void)353 extern bool acct_gather_suspend_test(void)
354 {
355 	bool rc;
356 	slurm_mutex_lock(&suspended_mutex);
357 	rc = acct_gather_suspended;
358 	slurm_mutex_unlock(&suspended_mutex);
359 	return rc;
360 }
361