1 /*****************************************************************************\
2 * slurm_acct_gather.c - generic interface needed for some
3 * acct_gather plugins.
4 *****************************************************************************
5 * Copyright (C) 2013 SchedMD LLC.
6 * Written by Danny Auble <da@schedmd.com>
7 *
8 * This file is part of Slurm, a resource management program.
9 * For details, see <https://slurm.schedmd.com/>.
10 * Please also read the included file: DISCLAIMER.
11 *
12 * Slurm is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 * In addition, as a special exception, the copyright holders give permission
18 * to link the code of portions of this program with the OpenSSL library under
19 * certain conditions as described in each individual source file, and
20 * distribute linked combinations including the two. You must obey the GNU
21 * General Public License in all respects for all of the code used other than
22 * OpenSSL. If you modify file(s) with this exception, you may extend this
23 * exception to your version of the file(s), but you are not obligated to do
24 * so. If you do not wish to do so, delete this exception statement from your
25 * version. If you delete this exception statement from all source files in
26 * the program, then also delete it here.
27 *
28 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
31 * details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with Slurm; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 \*****************************************************************************/
37
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/stat.h>

#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/slurm_acct_gather.h"
#include "slurm_acct_gather_energy.h"
#include "slurm_acct_gather_interconnect.h"
#include "slurm_acct_gather_filesystem.h"
#include "src/common/xstring.h"
48
49 static bool acct_gather_suspended = false;
50 static pthread_mutex_t suspended_mutex = PTHREAD_MUTEX_INITIALIZER;
51 static pthread_mutex_t conf_mutex = PTHREAD_MUTEX_INITIALIZER;
52 static Buf acct_gather_options_buf = NULL;
53 static bool inited = 0;
54
/*
 * Parse the base-10 integer at the start of a string.
 *
 * IN my_str - text to parse (may be NULL); anything following the number
 *	is ignored
 * RET the parsed value, or -1 if my_str is NULL, does not start with a
 *	number, or the number does not fit in an int
 */
static int _get_int(const char *my_str)
{
	char *end = NULL;
	long value;

	if (!my_str)
		return -1;

	errno = 0;
	value = strtol(my_str, &end, 10);

	/* means no numbers */
	if (my_str == end)
		return -1;

	/*
	 * strtol() yields a long. Narrowing an out-of-range result to int
	 * would silently turn e.g. "4294967296" into 0, so treat anything
	 * that cannot be represented as invalid instead.
	 */
	if ((errno == ERANGE) || (value > INT_MAX) || (value < INT_MIN))
		return -1;

	return (int) value;
}
70
/*
 * Hand the acct_gather.conf option table to each acct_gather plugin type
 * (energy, profile, interconnect, filesystem), holding conf_mutex for the
 * whole pass.
 *
 * IN tbl - option table to pass to the plugins
 * RET sum of the values returned by the individual conf_set calls
 */
static int _process_tbl(s_p_hashtbl_t *tbl)
{
	int status;

	slurm_mutex_lock(&conf_mutex);

	/* handle acct_gather.conf in each plugin */
	status = acct_gather_energy_g_conf_set(tbl);
	status += acct_gather_profile_g_conf_set(tbl);
	status += acct_gather_interconnect_g_conf_set(tbl);
	status += acct_gather_filesystem_g_conf_set(tbl);
	/*********************************************************************/
	/* ADD MORE HERE AND FREE MEMORY IN acct_gather_conf_destroy() BELOW */
	/*********************************************************************/

	slurm_mutex_unlock(&conf_mutex);

	return status;
}
88
acct_gather_conf_init(void)89 extern int acct_gather_conf_init(void)
90 {
91 s_p_hashtbl_t *tbl = NULL;
92 char *conf_path = NULL;
93 s_p_options_t *full_options = NULL;
94 int full_options_cnt = 0, i;
95 struct stat buf;
96 int rc = SLURM_SUCCESS;
97
98 if (inited)
99 return SLURM_SUCCESS;
100 inited = 1;
101
102 /* get options from plugins using acct_gather.conf */
103
104 rc += acct_gather_energy_g_conf_options(&full_options,
105 &full_options_cnt);
106 rc += acct_gather_profile_g_conf_options(&full_options,
107 &full_options_cnt);
108 rc += acct_gather_interconnect_g_conf_options(&full_options,
109 &full_options_cnt);
110 rc += acct_gather_filesystem_g_conf_options(&full_options,
111 &full_options_cnt);
112 /* ADD MORE HERE */
113
114 /* for the NULL at the end */
115 xrealloc(full_options,
116 ((full_options_cnt + 1) * sizeof(s_p_options_t)));
117
118 tbl = s_p_hashtbl_create(full_options);
119
120 /**************************************************/
121
122 /* Get the acct_gather.conf path and validate the file */
123 conf_path = get_extra_conf_path("acct_gather.conf");
124 if ((conf_path == NULL) || (stat(conf_path, &buf) == -1)) {
125 debug2("No acct_gather.conf file (%s)", conf_path);
126 } else {
127 debug2("Reading acct_gather.conf file %s", conf_path);
128
129 if (s_p_parse_file(tbl, NULL, conf_path, false) ==
130 SLURM_ERROR) {
131 fatal("Could not open/read/parse acct_gather.conf file "
132 "%s. Many times this is because you have "
133 "defined options for plugins that are not "
134 "loaded. Please check your slurm.conf file "
135 "and make sure the plugins for the options "
136 "listed are loaded.",
137 conf_path);
138 }
139 }
140
141 rc += _process_tbl(tbl);
142
143 acct_gather_options_buf = s_p_pack_hashtbl(
144 tbl, full_options, full_options_cnt);
145
146 for (i=0; i<full_options_cnt; i++)
147 xfree(full_options[i].key);
148 xfree(full_options);
149 xfree(conf_path);
150
151 s_p_hashtbl_destroy(tbl);
152
153 return rc;
154 }
155
acct_gather_write_conf(int fd)156 extern int acct_gather_write_conf(int fd)
157 {
158 int len;
159
160 acct_gather_conf_init();
161
162 slurm_mutex_lock(&conf_mutex);
163 len = get_buf_offset(acct_gather_options_buf);
164 safe_write(fd, &len, sizeof(int));
165 safe_write(fd, get_buf_data(acct_gather_options_buf), len);
166 slurm_mutex_unlock(&conf_mutex);
167
168 return 0;
169
170 rwfail:
171 slurm_mutex_unlock(&conf_mutex);
172 return -1;
173 }
174
acct_gather_read_conf(int fd)175 extern int acct_gather_read_conf(int fd)
176 {
177 int len;
178 s_p_hashtbl_t *tbl;
179
180 safe_read(fd, &len, sizeof(int));
181
182 acct_gather_options_buf = init_buf(len);
183 safe_read(fd, acct_gather_options_buf->head, len);
184
185 if (!(tbl = s_p_unpack_hashtbl(acct_gather_options_buf)))
186 return SLURM_ERROR;
187
188 /*
189 * We need to set inited before calling _process_tbl or we will get
190 * deadlock since the other acct_gather_* plugins will call
191 * acct_gather_init().
192 */
193 inited = true;
194 (void)_process_tbl(tbl);
195
196 s_p_hashtbl_destroy(tbl);
197
198 return SLURM_SUCCESS;
199 rwfail:
200 return SLURM_ERROR;
201 }
202
acct_gather_reconfig(void)203 extern int acct_gather_reconfig(void)
204 {
205 acct_gather_conf_destroy();
206 slurm_mutex_init(&conf_mutex);
207 acct_gather_conf_init();
208
209 return SLURM_SUCCESS;
210 }
211
acct_gather_conf_destroy(void)212 extern int acct_gather_conf_destroy(void)
213 {
214 int rc, rc2;
215
216 if (!inited)
217 return SLURM_SUCCESS;
218
219 inited = false;
220
221 rc = acct_gather_energy_fini();
222
223 rc2 = acct_gather_filesystem_fini();
224 rc = MAX(rc, rc2);
225 rc2 = acct_gather_interconnect_fini();
226 rc = MAX(rc, rc2);
227 rc2 = acct_gather_profile_fini();
228 rc = MAX(rc, rc2);
229
230 FREE_NULL_BUFFER(acct_gather_options_buf);
231
232 slurm_mutex_destroy(&conf_mutex);
233 return rc;
234 }
235
acct_gather_conf_values(void)236 extern List acct_gather_conf_values(void)
237 {
238 List acct_list = list_create(destroy_config_key_pair);
239
240 /* get acct_gather.conf in each plugin */
241 slurm_mutex_lock(&conf_mutex);
242 acct_gather_profile_g_conf_values(&acct_list);
243 acct_gather_interconnect_g_conf_values(&acct_list);
244 acct_gather_energy_g_conf_values(&acct_list);
245 acct_gather_filesystem_g_conf_values(&acct_list);
246 /* ADD MORE HERE */
247 slurm_mutex_unlock(&conf_mutex);
248 /******************************************/
249
250 list_sort(acct_list, (ListCmpF) sort_key_pairs);
251
252 return acct_list;
253 }
254
acct_gather_parse_freq(int type,char * freq)255 extern int acct_gather_parse_freq(int type, char *freq)
256 {
257 int freq_int = -1;
258 char *sub_str = NULL;
259
260 if (!freq)
261 return freq_int;
262
263 switch (type) {
264 case PROFILE_ENERGY:
265 if ((sub_str = xstrcasestr(freq, "energy=")))
266 freq_int = _get_int(sub_str + 7);
267 break;
268 case PROFILE_TASK:
269 /* backwards compatibility for when the freq was only
270 for task.
271 */
272 freq_int = _get_int(freq);
273 if ((freq_int == -1)
274 && (sub_str = xstrcasestr(freq, "task=")))
275 freq_int = _get_int(sub_str + 5);
276 break;
277 case PROFILE_FILESYSTEM:
278 if ((sub_str = xstrcasestr(freq, "filesystem=")))
279 freq_int = _get_int(sub_str + 11);
280 break;
281 case PROFILE_NETWORK:
282 if ((sub_str = xstrcasestr(freq, "network=")))
283 freq_int = _get_int(sub_str + 8);
284 break;
285 default:
286 fatal("Unhandled profile option %d please update "
287 "slurm_acct_gather.c "
288 "(acct_gather_parse_freq)", type);
289 }
290
291 return freq_int;
292 }
293
acct_gather_check_acct_freq_task(uint64_t job_mem_lim,char * acctg_freq)294 extern int acct_gather_check_acct_freq_task(uint64_t job_mem_lim,
295 char *acctg_freq)
296 {
297 int task_freq;
298 static uint32_t acct_freq_task = NO_VAL;
299
300 if (acct_freq_task == NO_VAL) {
301 char *acct_freq = slurm_get_jobacct_gather_freq();
302 int i = acct_gather_parse_freq(PROFILE_TASK, acct_freq);
303 xfree(acct_freq);
304
305 /* If the value is -1 lets set the freq to something
306 really high so we don't check this again.
307 */
308 if (i == -1)
309 acct_freq_task = NO_VAL16;
310 else
311 acct_freq_task = i;
312 }
313
314 if (!job_mem_lim || !acct_freq_task)
315 return 0;
316
317 task_freq = acct_gather_parse_freq(PROFILE_TASK, acctg_freq);
318
319 if (task_freq == -1)
320 return 0;
321
322 if (task_freq == 0) {
323 error("Can't turn accounting frequency off. "
324 "We need it to monitor memory usage.");
325 slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ);
326 return 1;
327 } else if (task_freq > acct_freq_task) {
328 error("Can't set frequency to %d, it is higher than %u. "
329 "We need it to be at least at this level to "
330 "monitor memory usage.",
331 task_freq, acct_freq_task);
332 slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ);
333 return 1;
334 }
335
336 return 0;
337 }
338
acct_gather_suspend_poll(void)339 extern void acct_gather_suspend_poll(void)
340 {
341 slurm_mutex_lock(&suspended_mutex);
342 acct_gather_suspended = true;
343 slurm_mutex_unlock(&suspended_mutex);
344 }
345
acct_gather_resume_poll(void)346 extern void acct_gather_resume_poll(void)
347 {
348 slurm_mutex_lock(&suspended_mutex);
349 acct_gather_suspended = false;
350 slurm_mutex_unlock(&suspended_mutex);
351 }
352
acct_gather_suspend_test(void)353 extern bool acct_gather_suspend_test(void)
354 {
355 bool rc;
356 slurm_mutex_lock(&suspended_mutex);
357 rc = acct_gather_suspended;
358 slurm_mutex_unlock(&suspended_mutex);
359 return rc;
360 }
361