1 /*****************************************************************************\
2 * as_mysql_job.c - functions dealing with jobs and job steps.
3 *****************************************************************************
4 *
5 * Copyright (C) 2004-2007 The Regents of the University of California.
6 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
7 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8 * Written by Danny Auble <da@llnl.gov>
9 *
10 * This file is part of Slurm, a resource management program.
11 * For details, see <https://slurm.schedmd.com/>.
12 * Please also read the included file: DISCLAIMER.
13 *
14 * Slurm is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 * In addition, as a special exception, the copyright holders give permission
20 * to link the code of portions of this program with the OpenSSL library under
21 * certain conditions as described in each individual source file, and
22 * distribute linked combinations including the two. You must obey the GNU
23 * General Public License in all respects for all of the code used other than
24 * OpenSSL. If you modify file(s) with this exception, you may extend this
25 * exception to your version of the file(s), but you are not obligated to do
26 * so. If you do not wish to do so, delete this exception statement from your
27 * version. If you delete this exception statement from all source files in
28 * the program, then also delete it here.
29 *
30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
33 * details.
34 *
35 * You should have received a copy of the GNU General Public License along
36 * with Slurm; if not, write to the Free Software Foundation, Inc.,
37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
38 \*****************************************************************************/
39
40 #include "as_mysql_job.h"
41 #include "as_mysql_jobacct_process.h"
42 #include "as_mysql_usage.h"
43 #include "as_mysql_wckey.h"
44
45 #include "src/common/assoc_mgr.h"
46 #include "src/common/gres.h"
47 #include "src/common/node_select.h"
48 #include "src/common/parse_time.h"
49 #include "src/common/slurm_jobacct_gather.h"
50 #include "src/common/slurm_time.h"
51
/*
 * One wckey id substitution noted while modifying jobs.  Entries are
 * collected by as_mysql_modify_job() and later used to rewrite the wckey
 * usage tables of the given cluster.
 */
typedef struct {
	char *cluster;	/* cluster name; assigned from the job record, not copied */
	uint32_t new;	/* wckey id the job was switched to */
	uint32_t old;	/* wckey id the job carried before the change */
} id_switch_t;
57
_find_id_switch(void * x,void * key)58 static int _find_id_switch(void *x, void *key)
59 {
60 id_switch_t *id_switch = (id_switch_t *)x;
61 uint32_t id = *(uint32_t *)key;
62
63 if (id_switch->old == id)
64 return 1;
65 return 0;
66 }
67
_average_tres_usage(uint32_t * tres_ids,uint64_t * tres_cnts,int tres_cnt,int tasks)68 static char *_average_tres_usage(uint32_t *tres_ids, uint64_t *tres_cnts,
69 int tres_cnt, int tasks)
70 {
71 char *ret_str = NULL;
72 int i;
73
74 /*
75 * Don't return NULL here, we need a blank string or we will print
76 * '(null)' in the database which really isn't what we want.
77 */
78 if (!tasks)
79 return xstrdup("");
80
81 for (i = 0; i < tres_cnt; i++) {
82 if (tres_cnts[i] == INFINITE64)
83 continue;
84 xstrfmtcat(ret_str, "%s%u=%"PRIu64,
85 ret_str ? "," : "",
86 tres_ids[i], tres_cnts[i] / (uint64_t)tasks);
87 }
88
89 if (!ret_str)
90 ret_str = xstrdup("");
91 return ret_str;
92 }
93
94 /* Used in job functions for getting the database index based off the
95 * submit time and job. 0 is returned if none is found
96 */
_get_db_index(mysql_conn_t * mysql_conn,time_t submit,uint32_t jobid)97 static uint64_t _get_db_index(mysql_conn_t *mysql_conn,
98 time_t submit, uint32_t jobid)
99 {
100 MYSQL_RES *result = NULL;
101 MYSQL_ROW row;
102 uint64_t db_index = 0;
103 char *query = xstrdup_printf("select job_db_inx from \"%s_%s\" where "
104 "time_submit=%d and id_job=%u",
105 mysql_conn->cluster_name, job_table,
106 (int)submit, jobid);
107
108 if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
109 xfree(query);
110 return 0;
111 }
112 xfree(query);
113
114 row = mysql_fetch_row(result);
115 if (!row) {
116 mysql_free_result(result);
117 debug4("We can't get a db_index for this combo, "
118 "time_submit=%d and id_job=%u. "
119 "We must not have heard about the start yet, "
120 "no big deal, we will get one right after this.",
121 (int)submit, jobid);
122 return 0;
123 }
124 db_index = slurm_atoull(row[0]);
125 mysql_free_result(result);
126
127 return db_index;
128 }
129
/*
 * Look up the user name stored with an association id.
 *
 * No cache of the associations is kept around for this; the database is
 * simply queried for the user name of the association id.  Since this
 * should be a fairly rare case that isn't too bad.
 *
 * IN mysql_conn - connection to query
 * IN cluster    - cluster whose association table is searched
 * IN associd    - association id to look up
 * RET xstrdup()ed user name the caller must xfree(), or NULL when the
 *     query failed, no row matched, or the user column is NULL or empty
 */
static char *_get_user_from_associd(mysql_conn_t *mysql_conn,
				    char *cluster, uint32_t associd)
{
	char *user = NULL;
	char *query = NULL;
	MYSQL_RES *result = NULL;
	MYSQL_ROW row;

	query = xstrdup_printf("select user from \"%s_%s\" where id_assoc=%u",
			       cluster, assoc_table, associd);

	debug4("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);
	result = mysql_db_query_ret(mysql_conn, query, 0);
	xfree(query);
	if (!result)
		return NULL;

	/*
	 * A SQL NULL comes back as a NULL pointer in the row, so check the
	 * column pointer before looking at its first character.
	 */
	if ((row = mysql_fetch_row(result)) && row[0] && row[0][0])
		user = xstrdup(row[0]);

	mysql_free_result(result);

	return user;
}
164
/*
 * Find the id of the wckey a job runs under, creating the wckey when it is
 * not known yet.
 *
 * IN mysql_conn - connection to use
 * IN/OUT name   - wckey name; when *name is NULL it is replaced by an
 *                 allocated "*<default wckey>" (or just "*") built from the
 *                 user's default wckey
 * IN uid        - not used by this function
 * IN cluster    - cluster the wckey belongs to
 * IN associd    - association id used to find the user name
 * RET wckey id, or 0 when wckeys are not tracked or the user could not be
 *     resolved
 */
static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name,
			     uid_t uid, char *cluster, uint32_t associd)
{
	slurmdb_wckey_rec_t wckey_rec;
	char *user = NULL;

	if (!slurm_get_track_wckey())
		return 0;

	/*
	 * Here we are looking for the wckeyid; if it doesn't exist we will
	 * create one.  We don't need to check if it is good or not.  Right
	 * now this is the only place things are created.  We do this only on
	 * a job start, not on a job submit, since we don't want to slow down
	 * getting the db_index back to the controller.
	 *
	 * uids can't be relied on here (someone could not have their uid in
	 * the system yet), so the user name is taken from the associd first.
	 */
	user = _get_user_from_associd(mysql_conn, cluster, associd);
	if (!user) {
		error("No user for associd %u", associd);
		return 0;
	}

	/* No name given: fall back to the user's default key. */
	if (!*name) {
		slurmdb_user_rec_t user_rec;

		memset(&user_rec, 0, sizeof(slurmdb_user_rec_t));
		user_rec.uid = NO_VAL;
		user_rec.name = user;
		if (assoc_mgr_fill_in_user(mysql_conn, &user_rec,
					   1, NULL, false) != SLURM_SUCCESS) {
			error("No user by name of %s assoc %u",
			      user, associd);
			xfree(user);
			return 0;
		}

		if (user_rec.default_wckey)
			*name = xstrdup_printf("*%s", user_rec.default_wckey);
		else
			*name = xstrdup_printf("*");
	}

	memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t));
	wckey_rec.name = *name;
	wckey_rec.uid = NO_VAL;
	wckey_rec.user = user;
	wckey_rec.cluster = cluster;

	if (assoc_mgr_fill_in_wckey(mysql_conn, &wckey_rec,
				    ACCOUNTING_ENFORCE_WCKEYS,
				    NULL, false) != SLURM_SUCCESS) {
		/*
		 * The wckey is unknown, so add it.  The check that this is
		 * the slurm user has already happened before getting here.
		 */
		List to_add = list_create(slurmdb_destroy_wckey_rec);
		slurmdb_wckey_rec_t *new_key =
			xmalloc(sizeof(slurmdb_wckey_rec_t));

		new_key->name = xstrdup(*name);
		new_key->user = xstrdup(user);
		new_key->cluster = xstrdup(cluster);
		list_append(to_add, new_key);

		if ((*name)[0] == '*') {
			/* make sure the non * wckey has been added */
			wckey_rec.name = (*name) + 1;
			if (assoc_mgr_fill_in_wckey(
				    mysql_conn, &wckey_rec,
				    ACCOUNTING_ENFORCE_WCKEYS,
				    NULL, false) != SLURM_SUCCESS) {
				new_key = xmalloc(sizeof(slurmdb_wckey_rec_t));
				new_key->name = xstrdup(wckey_rec.name);
				new_key->user = xstrdup(user);
				new_key->cluster = xstrdup(cluster);
				/* the plain key goes in front of the * one */
				list_prepend(to_add, new_key);
			}
			wckey_rec.name = *name;
		}

		if (as_mysql_add_wckeys(mysql_conn,
					slurm_get_slurm_user_id(),
					to_add) == SLURM_SUCCESS)
			acct_storage_p_commit(mysql_conn, 1);

		/* If that worked lets get it */
		assoc_mgr_fill_in_wckey(mysql_conn, &wckey_rec,
					ACCOUNTING_ENFORCE_WCKEYS,
					NULL, false);

		FREE_NULL_LIST(to_add);
	}

	xfree(user);

	return wckey_rec.id;
}
279
280 /* extern functions */
281
/*
 * as_mysql_job_start - record a job in the cluster's job table.
 *
 * When job_ptr->db_index is 0 a row is inserted (or, on a duplicate key,
 * updated in place) and the id handed back by the insert is stored in
 * job_ptr->db_index.  Otherwise the row with that job_db_inx is updated.
 * A resizing job first has its previous record completed and is then
 * recorded again as a new row.
 *
 * IN mysql_conn  - database connection; cluster_name selects the tables
 * IN/OUT job_ptr - job to record; db_index (and wckey, through
 *                  _get_wckeyid()) may be changed here
 * RET SLURM_SUCCESS on success, ESLURM_DB_CONNECTION if the connection
 *     check fails, SLURM_ERROR or the return code of the last query
 *     otherwise
 */
extern int as_mysql_job_start(mysql_conn_t *mysql_conn, job_record_t *job_ptr)
{
	int rc = SLURM_SUCCESS;
	char *nodes = NULL, *jname = NULL;
	int track_steps = 0;
	char *partition = NULL;
	char *query = NULL;
	int reinit = 0;
	time_t begin_time, check_time, start_time, submit_time;
	uint32_t wckeyid = 0;
	uint32_t job_state;
	/* only jobs that are part of an array carry a task id */
	uint32_t array_task_id =
		(job_ptr->array_job_id) ? job_ptr->array_task_id : NO_VAL;
	/* remembered here because db_index is reset below for resizing jobs */
	uint64_t job_db_inx = job_ptr->db_index;
	job_array_struct_t *array_recs = job_ptr->array_recs;
	/*
	 * NOTE(review): never assigned in this function, so every
	 * "tres_alloc_str" test below is false and only
	 * job_ptr->tres_alloc_str is ever written.
	 */
	char *tres_alloc_str = NULL;

	/* a job needs either a submit time or a resize time to be stored */
	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as_mysql_job_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("%s: called", __func__);

	job_state = job_ptr->job_state;

	/* a resized job uses the resize time for all three times */
	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* If the reason is WAIT_ARRAY_TASK_LIMIT we don't want to
	 * give the pending jobs an eligible time since it will add
	 * time to accounting where as these jobs aren't able to run
	 * until later so mark it as such.
	 */
	if (job_ptr->state_reason == WAIT_ARRAY_TASK_LIMIT)
		begin_time = INFINITE;

	/* Since we need a new db_inx make sure the old db_inx
	 * removed.  This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* Without a db_index, try to look it up so the previous
		 * record can still be ended. */
		if (!job_ptr->db_index) {
			error("We don't have a db_index for job %u, "
			      "this should only happen when resizing "
			      "jobs and the database interface was down.",
			      job_ptr->job_id);
			job_ptr->db_index = _get_db_index(mysql_conn,
							  job_ptr->details->
							  submit_time,
							  job_ptr->job_id);
		}

		/* If we have a db_index lets end the previous record. */
		if (job_ptr->db_index)
			as_mysql_job_complete(mysql_conn, job_ptr);

		job_state &= (~JOB_RESIZING);
		/* forces the insert path below, creating a new record */
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	/* rollup_lock is held while global_last_rollup is read or written;
	 * every path out of this if/else releases it */
	slurm_mutex_lock(&rollup_lock);
	if (check_time < global_last_rollup) {
		MYSQL_RES *result = NULL;
		MYSQL_ROW row;

		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf("select job_db_inx "
				       "from \"%s_%s\" where id_job=%u and "
				       "time_submit=%ld and time_eligible=%ld "
				       "and time_start=%ld;",
				       mysql_conn->cluster_name,
				       job_table, job_ptr->job_id,
				       submit_time, begin_time, start_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		if (!(result =
		      mysql_db_query_ret(mysql_conn, query, 0))) {
			xfree(query);
			slurm_mutex_unlock(&rollup_lock);
			return SLURM_ERROR;
		}
		xfree(query);
		/* a matching row means these times are already recorded,
		 * so the rollup times are left alone */
		if ((row = mysql_fetch_row(result))) {
			mysql_free_result(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			slurm_mutex_unlock(&rollup_lock);
			goto no_rollup_change;
		}
		mysql_free_result(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else
			debug("Need to reroll usage from %s Job %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&rollup_lock);

		/* If the times here are later than the daily_rollup
		   or monthly rollup it isn't a big deal since they
		   are always shrunk down to the beginning of each
		   time period.
		*/
		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, check_time,
				       check_time, check_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		/* rc is overwritten by the job query further down */
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

no_rollup_change:

	/* a job without a name is stored as "allocation" with step tracking */
	if (job_ptr->name && job_ptr->name[0])
		jname = job_ptr->name;
	else {
		jname = "allocation";
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	/* Grab the wckey once to make sure it is placed. */
	if (job_ptr->assoc_id && (!job_ptr->db_index || job_ptr->wckey))
		wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey,
				       job_ptr->user_id,
				       mysql_conn->cluster_name,
				       job_ptr->assoc_id);

	/* for non-pending jobs the partition record's name is preferred
	 * over the job's partition string */
	if (!IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr)
		partition = job_ptr->part_ptr->name;
	else if (job_ptr->partition)
		partition = job_ptr->partition;

	if (!job_ptr->db_index) {
		/*
		 * No record yet: build an "insert ... on duplicate key
		 * update" statement.  The optional columns are appended in
		 * three passes (column names, values, update assignments);
		 * the column name and value passes must test the same
		 * conditions in the same order.
		 */
		query = xstrdup_printf(
			"insert into \"%s_%s\" "
			"(id_job, mod_time, id_array_job, id_array_task, "
			"het_job_id, het_job_offset, "
			"id_assoc, id_qos, id_user, "
			"id_group, nodelist, id_resv, timelimit, "
			"time_eligible, time_submit, time_start, "
			"job_name, track_steps, state, priority, cpus_req, "
			"nodes_alloc, mem_req, flags, state_reason_prev",
			mysql_conn->cluster_name, job_table);

		/* pass 1: names of the optional columns */
		if (wckeyid)
			xstrcat(query, ", id_wckey");
		if (job_ptr->mcs_label)
			xstrcat(query, ", mcs_label");
		if (job_ptr->account)
			xstrcat(query, ", account");
		if (partition)
			xstrcat(query, ", `partition`");
		if (job_ptr->wckey)
			xstrcat(query, ", wckey");
		/* NOTE(review): node_inx is filled from job_ptr->network;
		 * presumably the caller passes the node index string in that
		 * field - confirm against the caller */
		if (job_ptr->network)
			xstrcat(query, ", node_inx");
		if (job_ptr->gres_req)
			xstrcat(query, ", gres_req");
		if (job_ptr->gres_alloc)
			xstrcat(query, ", gres_alloc");
		if (array_recs && array_recs->task_id_str)
			xstrcat(query, ", array_task_str, array_max_tasks, "
				"array_task_pending");
		else
			xstrcat(query, ", array_task_str, array_task_pending");

		if (job_ptr->tres_alloc_str || tres_alloc_str)
			xstrcat(query, ", tres_alloc");
		if (job_ptr->tres_req_str)
			xstrcat(query, ", tres_req");
		if (job_ptr->details->work_dir)
			xstrcat(query, ", work_dir");
		if (job_ptr->details->features)
			xstrcat(query, ", constraints");

		/* values of the fixed columns, in the order listed above */
		xstrfmtcat(query,
			   ") values (%u, UNIX_TIMESTAMP(), "
			   "%u, %u, %u, %u, %u, %u, %u, %u, "
			   "'%s', %u, %u, %ld, %ld, %ld, "
			   "'%s', %u, %u, %u, %u, %u, %"PRIu64", %u, %u",
			   job_ptr->job_id,
			   job_ptr->array_job_id, array_task_id,
			   job_ptr->het_job_id, job_ptr->het_job_offset,
			   job_ptr->assoc_id, job_ptr->qos_id,
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   begin_time, submit_time, start_time,
			   jname, track_steps, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_nodes,
			   job_ptr->details->pn_min_memory,
			   job_ptr->db_flags,
			   job_ptr->state_reason_prev_db);

		/* pass 2: values of the optional columns */
		if (wckeyid)
			xstrfmtcat(query, ", %u", wckeyid);
		if (job_ptr->mcs_label)
			xstrfmtcat(query, ", '%s'", job_ptr->mcs_label);
		if (job_ptr->account)
			xstrfmtcat(query, ", '%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", '%s'", partition);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", '%s'", job_ptr->wckey);
		if (job_ptr->network)
			xstrfmtcat(query, ", '%s'", job_ptr->network);
		if (job_ptr->gres_req)
			xstrfmtcat(query, ", '%s'", job_ptr->gres_req);
		if (job_ptr->gres_alloc)
			xstrfmtcat(query, ", '%s'", job_ptr->gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, ", '%s', %u, %u",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrcat(query, ", NULL, 0");

		if (tres_alloc_str)
			xstrfmtcat(query, ", '%s'", tres_alloc_str);
		else if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, ", '%s'", job_ptr->tres_alloc_str);
		if (job_ptr->tres_req_str)
			xstrfmtcat(query, ", '%s'", job_ptr->tres_req_str);
		if (job_ptr->details->work_dir)
			xstrfmtcat(query, ", '%s'",
				   job_ptr->details->work_dir);
		if (job_ptr->details->features)
			xstrfmtcat(query, ", '%s'",
				   job_ptr->details->features);

		/* pass 3: assignments used when the row already exists;
		 * LAST_INSERT_ID(job_db_inx) makes the existing row's id the
		 * one reported back for the statement */
		xstrfmtcat(query,
			   ") on duplicate key update "
			   "job_db_inx=LAST_INSERT_ID(job_db_inx), "
			   "id_assoc=%u, id_user=%u, id_group=%u, "
			   "nodelist='%s', id_resv=%u, timelimit=%u, "
			   "time_submit=%ld, time_eligible=%ld, "
			   "time_start=%ld, mod_time=UNIX_TIMESTAMP(), "
			   "job_name='%s', track_steps=%u, id_qos=%u, "
			   "state=greatest(state, %u), priority=%u, "
			   "cpus_req=%u, nodes_alloc=%u, "
			   "mem_req=%"PRIu64", id_array_job=%u, id_array_task=%u, "
			   "het_job_id=%u, het_job_offset=%u, flags=%u, "
			   "state_reason_prev=%u",
			   job_ptr->assoc_id, job_ptr->user_id,
			   job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   submit_time, begin_time, start_time,
			   jname, track_steps, job_ptr->qos_id, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_nodes,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id, array_task_id,
			   job_ptr->het_job_id, job_ptr->het_job_offset,
			   job_ptr->db_flags,
			   job_ptr->state_reason_prev_db);

		if (wckeyid)
			xstrfmtcat(query, ", id_wckey=%u", wckeyid);
		if (job_ptr->mcs_label)
			xstrfmtcat(query, ", mcs_label='%s'",
				   job_ptr->mcs_label);
		if (job_ptr->account)
			xstrfmtcat(query, ", account='%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", `partition`='%s'", partition);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey);
		if (job_ptr->network)
			xstrfmtcat(query, ", node_inx='%s'", job_ptr->network);
		if (job_ptr->gres_req)
			xstrfmtcat(query, ", gres_req='%s'", job_ptr->gres_req);
		if (job_ptr->gres_alloc)
			xstrfmtcat(query, ", gres_alloc='%s'",
				   job_ptr->gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, ", array_task_str='%s', "
				   "array_max_tasks=%u, array_task_pending=%u",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrfmtcat(query, ", array_task_str=NULL, "
				   "array_task_pending=0");

		if (tres_alloc_str)
			xstrfmtcat(query, ", tres_alloc='%s'", tres_alloc_str);
		else if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, ", tres_alloc='%s'",
				   job_ptr->tres_alloc_str);
		if (job_ptr->tres_req_str)
			xstrfmtcat(query, ", tres_req='%s'",
				   job_ptr->tres_req_str);
		if (job_ptr->details->work_dir)
			xstrfmtcat(query, ", work_dir='%s'",
				   job_ptr->details->work_dir);
		if (job_ptr->details->features)
			xstrfmtcat(query, ", constraints='%s'",
				   job_ptr->details->features);

		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	try_again:
		/* a 0 id is treated as failure; the statement is retried
		 * exactly once after re-checking the connection */
		if (!(job_ptr->db_index = mysql_db_insert_ret_id(
			      mysql_conn, query))) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				/* reconnect */
				check_connection(mysql_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
	} else {
		/* The record already exists: update it by job_db_inx.  Each
		 * optional assignment carries its own trailing ", " because
		 * the fixed assignments are appended last. */
		query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ",
				       mysql_conn->cluster_name,
				       job_table, nodes);

		if (wckeyid)
			xstrfmtcat(query, "id_wckey=%u, ", wckeyid);
		if (job_ptr->mcs_label)
			xstrfmtcat(query, "mcs_label='%s', ",
				   job_ptr->mcs_label);
		if (job_ptr->account)
			xstrfmtcat(query, "account='%s', ", job_ptr->account);
		if (partition)
			xstrfmtcat(query, "`partition`='%s', ", partition);
		if (job_ptr->wckey)
			xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey);
		if (job_ptr->network)
			xstrfmtcat(query, "node_inx='%s', ", job_ptr->network);
		if (job_ptr->gres_req)
			xstrfmtcat(query, "gres_req='%s', ",
				   job_ptr->gres_req);
		if (job_ptr->gres_alloc)
			xstrfmtcat(query, "gres_alloc='%s', ",
				   job_ptr->gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, "array_task_str='%s', "
				   "array_max_tasks=%u, "
				   "array_task_pending=%u, ",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrfmtcat(query, "array_task_str=NULL, "
				   "array_task_pending=0, ");

		if (tres_alloc_str)
			xstrfmtcat(query, "tres_alloc='%s', ", tres_alloc_str);
		else if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, "tres_alloc='%s', ",
				   job_ptr->tres_alloc_str);
		if (job_ptr->tres_req_str)
			xstrfmtcat(query, "tres_req='%s', ",
				   job_ptr->tres_req_str);
		if (job_ptr->details->work_dir)
			xstrfmtcat(query, "work_dir='%s', ",
				   job_ptr->details->work_dir);
		if (job_ptr->details->features)
			xstrfmtcat(query, "constraints='%s', ",
				   job_ptr->details->features);

		xstrfmtcat(query, "time_start=%ld, job_name='%s', "
			   "state=greatest(state, %u), "
			   "nodes_alloc=%u, id_qos=%u, "
			   "id_assoc=%u, id_resv=%u, "
			   "timelimit=%u, mem_req=%"PRIu64", "
			   "id_array_job=%u, id_array_task=%u, "
			   "het_job_id=%u, het_job_offset=%u, "
			   "flags=%u, state_reason_prev=%u, "
			   "time_eligible=%ld, mod_time=UNIX_TIMESTAMP() "
			   "where job_db_inx=%"PRIu64,
			   start_time, jname, job_state,
			   job_ptr->total_nodes, job_ptr->qos_id,
			   job_ptr->assoc_id,
			   job_ptr->resv_id, job_ptr->time_limit,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id, array_task_id,
			   job_ptr->het_job_id, job_ptr->het_job_offset,
			   job_ptr->db_flags, job_ptr->state_reason_prev_db,
			   begin_time, job_ptr->db_index);

		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		rc = mysql_db_query(mysql_conn, query);
	}

	/* now we will reset all the steps */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* FIXME : Verify this is still needed */
		/* uses the db index the job had on entry, not the new one */
		if (IS_JOB_SUSPENDED(job_ptr))
			as_mysql_suspend(mysql_conn, job_db_inx, job_ptr);
	}

	xfree(tres_alloc_str);
	xfree(query);

	return rc;
}
740
as_mysql_modify_job(mysql_conn_t * mysql_conn,uint32_t uid,slurmdb_job_cond_t * job_cond,slurmdb_job_rec_t * job)741 extern List as_mysql_modify_job(mysql_conn_t *mysql_conn, uint32_t uid,
742 slurmdb_job_cond_t *job_cond,
743 slurmdb_job_rec_t *job)
744 {
745 List ret_list = NULL;
746 int rc = SLURM_SUCCESS;
747 char *object = NULL;
748 char *vals = NULL, *cond_char = NULL;
749 time_t now = time(NULL);
750 char *user_name = NULL;
751 List job_list = NULL;
752 slurmdb_job_rec_t *job_rec;
753 ListIterator itr;
754 List id_switch_list = NULL;
755 id_switch_t *id_switch;
756
757 if (!job_cond || !job) {
758 error("we need something to change");
759 return NULL;
760 } else if (check_connection(mysql_conn) != SLURM_SUCCESS)
761 return NULL;
762
763 if (job->derived_ec != NO_VAL)
764 xstrfmtcat(vals, ", derived_ec=%u", job->derived_ec);
765
766 if (job->derived_es)
767 xstrfmtcat(vals, ", derived_es='%s'", job->derived_es);
768
769 if (job->system_comment)
770 xstrfmtcat(vals, ", system_comment='%s'",
771 job->system_comment);
772
773 if (job->wckey)
774 xstrfmtcat(vals, ", wckey='%s'", job->wckey);
775
776 if (!vals) {
777 errno = SLURM_NO_CHANGE_IN_DATA;
778 error("No change specified for job modification");
779 return NULL;
780 }
781 job_cond->flags |= JOBCOND_FLAG_NO_STEP;
782 job_cond->flags |= JOBCOND_FLAG_DBD_UID;
783 job_cond->flags |= JOBCOND_FLAG_NO_DEFAULT_USAGE;
784
785 job_list = as_mysql_jobacct_process_get_jobs(mysql_conn, uid, job_cond);
786
787 if (!job_list || !list_count(job_list)) {
788 errno = SLURM_NO_CHANGE_IN_DATA;
789 if (debug_flags & DEBUG_FLAG_DB_JOB)
790 DB_DEBUG(mysql_conn->conn,
791 "%s: Job(s) not found\n",
792 __func__);
793 xfree(vals);
794 FREE_NULL_LIST(job_list);
795 return NULL;
796 }
797
798 user_name = uid_to_string((uid_t) uid);
799
800 itr = list_iterator_create(job_list);
801 while ((job_rec = list_next(itr))) {
802 char tmp_char[25];
803 char *vals_mod = NULL;
804
805 if ((uid != job_rec->uid) &&
806 !is_user_min_admin_level(mysql_conn, uid,
807 SLURMDB_ADMIN_OPERATOR)) {
808 errno = ESLURM_ACCESS_DENIED;
809 rc = SLURM_ERROR;
810 break;
811 }
812
813 slurm_make_time_str(&job_rec->submit,
814 tmp_char, sizeof(tmp_char));
815
816 xstrfmtcat(cond_char, "job_db_inx=%"PRIu64, job_rec->db_index);
817 object = xstrdup_printf("%u submitted at %s",
818 job_rec->jobid, tmp_char);
819
820 if (!ret_list)
821 ret_list = list_create(xfree_ptr);
822 list_append(ret_list, object);
823
824 /*
825 * Grab the wckey id to update the job now.
826 */
827 if (job->wckey) {
828 uint32_t wckeyid = _get_wckeyid(mysql_conn,
829 &job->wckey,
830 job_rec->uid,
831 job_rec->cluster,
832 job_rec->associd);
833 if (!wckeyid) {
834 rc = SLURM_ERROR;
835 break;
836 }
837 vals_mod = xstrdup_printf("%s, id_wckey='%u'",
838 vals, wckeyid);
839 id_switch = NULL;
840 if (!id_switch_list)
841 id_switch_list = list_create(xfree_ptr);
842 else {
843 id_switch = list_find_first(
844 id_switch_list,
845 _find_id_switch,
846 &job_rec->wckeyid);
847 }
848
849 if (!id_switch) {
850 id_switch = xmalloc(sizeof(id_switch_t));
851 id_switch->cluster = job_rec->cluster;
852 id_switch->old = job_rec->wckeyid;
853 id_switch->new = wckeyid;
854 list_append(id_switch_list, id_switch);
855 }
856 } else
857 vals_mod = vals;
858
859 rc = modify_common(mysql_conn, DBD_MODIFY_JOB, now, user_name,
860 job_table, cond_char, vals_mod,
861 job_rec->cluster);
862 xfree(cond_char);
863
864 if (job->wckey)
865 xfree(vals_mod);
866
867 if (rc != SLURM_SUCCESS)
868 break;
869 }
870 list_iterator_destroy(itr);
871
872 xfree(vals);
873 xfree(user_name);
874
875 if (rc == SLURM_ERROR) {
876 error("Couldn't modify job(s)");
877 FREE_NULL_LIST(ret_list);
878 ret_list = NULL;
879 } else if (id_switch_list) {
880 struct tm hour_tm;
881 time_t usage_start, usage_end;
882 char *time_str = NULL;
883 char *query = NULL;
884
885 if (!job_cond->usage_end)
886 job_cond->usage_end = now;
887
888 if (!localtime_r(&job_cond->usage_end, &hour_tm)) {
889 error("Couldn't get localtime from end %ld",
890 job_cond->usage_end);
891 FREE_NULL_LIST(ret_list);
892 ret_list = NULL;
893 goto endit;
894 }
895 hour_tm.tm_sec = 0;
896 hour_tm.tm_min = 0;
897
898 usage_end = slurm_mktime(&hour_tm);
899
900 if (!job_cond->usage_start)
901 usage_start = 0;
902 else {
903 if (!localtime_r(&job_cond->usage_start, &hour_tm)) {
904 error("Couldn't get localtime from start %ld",
905 job_cond->usage_start);
906 FREE_NULL_LIST(ret_list);
907 ret_list = NULL;
908 goto endit;
909 }
910 hour_tm.tm_sec = 0;
911 hour_tm.tm_min = 0;
912
913 usage_start = slurm_mktime(&hour_tm);
914 }
915
916 time_str = xstrdup_printf(
917 "(time_start < %ld && time_start >= %ld)",
918 usage_end, usage_start);
919
920 itr = list_iterator_create(id_switch_list);
921 while ((id_switch = list_next(itr))) {
922 char *use_table = NULL;
923
924 for (int i = 0; i < 3; i++) {
925 switch (i) {
926 case 0:
927 use_table = wckey_hour_table;
928 break;
929 case 1:
930 use_table = wckey_day_table;
931 break;
932 case 2:
933 use_table = wckey_month_table;
934 break;
935 }
936
937 use_table = xstrdup_printf(
938 "%s_%s",
939 id_switch->cluster, use_table);
940 /*
941 * Move any of the new id lines into the old id.
942 */
943 query = xstrdup_printf(
944 "insert into \"%s\" (creation_time, mod_time, id, id_tres, time_start, alloc_secs) "
945 "select creation_time, %ld, %u, id_tres, time_start, @ASUM:=SUM(alloc_secs) from \"%s\" where (id=%u || id=%u) && %s group by id_tres, time_start on duplicate key update alloc_secs=@ASUM;",
946 use_table,
947 now, id_switch->old, use_table,
948 id_switch->new, id_switch->old,
949 time_str);
950
951 /* Delete all traces of the new id */
952 xstrfmtcat(query,
953 "delete from \"%s\" where id=%u && %s;",
954 use_table, id_switch->new, time_str);
955
956 /* Now we just need to switch the ids */
957 xstrfmtcat(query,
958 "update \"%s\" set mod_time=%ld, id=%u where id=%u && %s;",
959 use_table, now, id_switch->new, id_switch->old, time_str);
960
961
962 xfree(use_table);
963 if (debug_flags & DEBUG_FLAG_DB_JOB)
964 DB_DEBUG(mysql_conn->conn,
965 "query\n%s", query);
966 rc = mysql_db_query(mysql_conn, query);
967 xfree(query);
968 if (rc != SLURM_SUCCESS)
969 break;
970 }
971 if (rc != SLURM_SUCCESS) {
972 FREE_NULL_LIST(ret_list);
973 ret_list = NULL;
974 break;
975 }
976 }
977 list_iterator_destroy(itr);
978 xfree(time_str);
979 }
980 endit:
981 FREE_NULL_LIST(job_list);
982 FREE_NULL_LIST(id_switch_list);
983 return ret_list;
984 }
985
/*
 * as_mysql_job_complete - record the end (or a resize) of a job in the
 *	cluster's job table.
 *
 * IN mysql_conn - database connection; cluster_name selects the tables.
 * IN/OUT job_ptr - job being ended.  end_time is set to start_time when it
 *	was zero but the job had started; db_index is filled in when it was
 *	zero and a row could be looked up or created.
 * RET SLURM_ERROR when the job has neither a db_index nor a usable submit or
 *	resize time, ESLURM_DB_CONNECTION when check_connection() fails,
 *	SLURM_SUCCESS when the job never started or could not be added (so
 *	the caller does not retry forever), otherwise the result of the
 *	update query.
 */
extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
				 job_record_t *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS, job_state;
	time_t end_time;
	uint32_t exit_code = 0;

	if (!job_ptr->db_index
	    && ((!job_ptr->details || !job_ptr->details->submit_time)
		&& !job_ptr->resize_time)) {
		error("as_mysql_job_complete: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("%s() called", __func__);

	if (IS_JOB_RESIZING(job_ptr)) {
		/* A resize closes the current record at the resize time. */
		end_time = job_ptr->resize_time;
		job_state = JOB_RESIZING;
	} else {
		if (job_ptr->end_time == 0) {
			if (job_ptr->start_time) {
				error("%s: We are trying to end a job (%u) with no end time, setting it to the start time (%ld) of the job.",
				      __func__,
				      job_ptr->job_id,
				      (long) job_ptr->start_time);
				job_ptr->end_time = job_ptr->start_time;
			} else {
				error("%s: job %u never started",
				      __func__, job_ptr->job_id);

				/* If we get an error with this just
				 * fall through to avoid an infinite loop */
				return SLURM_SUCCESS;
			}
		}
		end_time = job_ptr->end_time;

		if (IS_JOB_REQUEUED(job_ptr))
			job_state = JOB_REQUEUE;
		else if (IS_JOB_REVOKED(job_ptr))
			job_state = JOB_REVOKED;
		else
			job_state = job_ptr->job_state & JOB_STATE_BASE;
	}

	/*
	 * If this record ends before the last rollup, pull the rollup
	 * markers back so the period gets processed again.  The in-memory
	 * marker must match the value written to the table, so use the
	 * computed end_time (which is the resize time for a resizing job)
	 * rather than job_ptr->end_time.
	 */
	slurm_mutex_lock(&rollup_lock);
	if (end_time < global_last_rollup) {
		global_last_rollup = end_time;
		slurm_mutex_unlock(&rollup_lock);

		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, (long) end_time,
				       (long) end_time, (long) end_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		/* Best effort: a failure here does not fail the completion. */
		(void) mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

	if (!job_ptr->db_index) {
		/*
		 * The submit time is only needed for this lookup.  The guard
		 * at the top of the function guarantees that, with no
		 * db_index, either resize_time or details->submit_time is
		 * set, so details is not dereferenced when it is NULL.
		 */
		time_t submit_time = job_ptr->resize_time ?
			job_ptr->resize_time : job_ptr->details->submit_time;

		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id))) {
			/* Comment is overloaded in job_start to be
			   the block_id, so we will need to store this
			   for later.
			*/
			char *comment = job_ptr->comment;
			job_ptr->comment = NULL;
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				job_ptr->comment = comment;
				error("couldn't add job %u at job completion",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
			job_ptr->comment = comment;
		}
	}

	query = xstrdup_printf("update \"%s_%s\" set "
			       "mod_time=UNIX_TIMESTAMP(), "
			       "time_end=%ld, state=%d",
			       mysql_conn->cluster_name, job_table,
			       (long) end_time, job_state);

	if (job_ptr->derived_ec != NO_VAL)
		xstrfmtcat(query, ", derived_ec=%u", job_ptr->derived_ec);

	if (job_ptr->tres_alloc_str)
		xstrfmtcat(query, ", tres_alloc='%s'", job_ptr->tres_alloc_str);

	/*
	 * NOTE(review): comment, admin_comment and system_comment are placed
	 * verbatim between single quotes; nothing in this function escapes
	 * embedded quotes.  Confirm the strings are sanitized before they get
	 * here, or escape them with the project's quoting helper.
	 */
	if (job_ptr->comment)
		xstrfmtcat(query, ", derived_es='%s'", job_ptr->comment);

	if (job_ptr->admin_comment)
		xstrfmtcat(query, ", admin_comment='%s'",
			   job_ptr->admin_comment);

	if (job_ptr->system_comment)
		xstrfmtcat(query, ", system_comment='%s'",
			   job_ptr->system_comment);

	exit_code = job_ptr->exit_code;
	if (exit_code == 1) {
		/* This wasn't signaled, it was set by Slurm so don't
		 * treat it like a signal.
		 */
		exit_code = 256;
	}

	xstrfmtcat(query,
		   ", exit_code=%d, kill_requid=%d where job_db_inx=%"PRIu64";",
		   exit_code, job_ptr->requid,
		   job_ptr->db_index);

	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	return rc;
}
1135
/*
 * as_mysql_step_start - insert (or refresh) the step table row for a step
 *	that is starting.
 *
 * IN mysql_conn - database connection; cluster_name selects the tables.
 * IN/OUT step_ptr - step being started.  job_ptr->db_index is filled in when
 *	it was zero and a job row could be looked up or created;
 *	tres_alloc_str is given a one-CPU/one-node default for a batch step
 *	that has none.
 * RET SLURM_ERROR when the job has neither a db_index nor a usable submit or
 *	resize time, ESLURM_DB_CONNECTION when check_connection() fails,
 *	SLURM_SUCCESS when the job could not be added (so the caller does not
 *	retry forever), otherwise the result of the insert query.
 */
extern int as_mysql_step_start(mysql_conn_t *mysql_conn,
			       step_record_t *step_ptr)
{
	int tasks = 0, nodes = 0, task_dist = 0;
	int rc = SLURM_SUCCESS;
	char temp_bit[BUF_SIZE];
	char *node_list = NULL;
	char *node_inx = NULL;
	time_t start_time;
	char *query = NULL;

	if (!step_ptr->job_ptr->db_index
	    && ((!step_ptr->job_ptr->details
		 || !step_ptr->job_ptr->details->submit_time)
		&& !step_ptr->job_ptr->resize_time)) {
		error("as_mysql_step_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	/*
	 * For a resized job the step cannot start before the resize time;
	 * otherwise use the step's own start time.
	 */
	start_time = step_ptr->start_time;
	if (step_ptr->job_ptr->resize_time &&
	    (step_ptr->job_ptr->resize_time > start_time))
		start_time = step_ptr->job_ptr->resize_time;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (slurmdbd_conf) {
		if (step_ptr->job_ptr->details)
			tasks = step_ptr->job_ptr->details->num_tasks;
		else
			tasks = step_ptr->cpu_count;
		node_list = step_ptr->job_ptr->nodes;
		nodes = step_ptr->step_layout->node_cnt;
		task_dist = step_ptr->step_layout->task_dist;
		node_inx = step_ptr->network;
	} else if (step_ptr->step_id == SLURM_BATCH_SCRIPT) {
		if (step_ptr->step_node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   step_ptr->step_node_bitmap);
		}
		/*
		 * We overload tres_per_node with the node name of where the
		 * script was running.
		 */
		node_list = step_ptr->tres_per_node;
		nodes = tasks = 1;
		/* Default allocation for a batch step: one CPU on one node. */
		if (!step_ptr->tres_alloc_str)
			xstrfmtcat(step_ptr->tres_alloc_str,
				   "%u=%u,%u=%u",
				   TRES_CPU, 1,
				   TRES_NODE, 1);
	} else {
		if (step_ptr->step_node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   step_ptr->step_node_bitmap);
		}

		if (!step_ptr->step_layout
		    || !step_ptr->step_layout->task_cnt) {
			if (step_ptr->cpu_count)
				tasks = step_ptr->cpu_count;
			else {
				/*
				 * Fall back to the CPU count of the step's
				 * TRES, then the job's TRES, then the job's
				 * node count.  Compare at full width before
				 * narrowing to int.
				 */
				uint64_t cpu_cnt =
					slurmdb_find_tres_count_in_string(
						step_ptr->tres_alloc_str,
						TRES_CPU);
				if (cpu_cnt == INFINITE64)
					cpu_cnt =
					slurmdb_find_tres_count_in_string(
						step_ptr->job_ptr->
						tres_alloc_str,
						TRES_CPU);
				if (cpu_cnt == INFINITE64)
					tasks = step_ptr->job_ptr->
						total_nodes;
				else
					tasks = cpu_cnt;
			}

			nodes = step_ptr->job_ptr->total_nodes;
			node_list = step_ptr->job_ptr->nodes;
		} else {
			tasks = step_ptr->step_layout->task_cnt;
			nodes = step_ptr->step_layout->node_cnt;
			task_dist = step_ptr->step_layout->task_dist;
			node_list = step_ptr->step_layout->node_list;
		}
	}

	if (!step_ptr->job_ptr->db_index) {
		/*
		 * The submit time is only needed for this lookup.  The guard
		 * at the top of the function guarantees that, with no
		 * db_index, either resize_time or details->submit_time is
		 * set, so details is not dereferenced when it is NULL.
		 */
		time_t submit_time = step_ptr->job_ptr->resize_time ?
			step_ptr->job_ptr->resize_time :
			step_ptr->job_ptr->details->submit_time;

		if (!(step_ptr->job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    step_ptr->job_ptr->job_id))) {
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(mysql_conn, step_ptr->job_ptr)
			    == SLURM_ERROR) {
				error("couldn't add job %u at step start",
				      step_ptr->job_ptr->job_id);
				return SLURM_SUCCESS;
			}
		}
	}

	/* The stepid could be -2 so use %d not %u */
	query = xstrdup_printf(
		"insert into \"%s_%s\" (job_db_inx, id_step, time_start, "
		"step_name, state, tres_alloc, "
		"nodes_alloc, task_cnt, nodelist, node_inx, "
		"task_dist, req_cpufreq, req_cpufreq_min, req_cpufreq_gov) "
		"values (%"PRIu64", %d, %d, '%s', %d, '%s', %d, %d, "
		"'%s', '%s', %d, %u, %u, %u) "
		"on duplicate key update "
		"nodes_alloc=%d, task_cnt=%d, time_end=0, state=%d, "
		"nodelist='%s', node_inx='%s', task_dist=%d, "
		"req_cpufreq=%u, req_cpufreq_min=%u, req_cpufreq_gov=%u,"
		"tres_alloc='%s';",
		mysql_conn->cluster_name, step_table,
		step_ptr->job_ptr->db_index,
		step_ptr->step_id,
		(int)start_time, step_ptr->name,
		JOB_RUNNING, step_ptr->tres_alloc_str,
		nodes, tasks, node_list, node_inx, task_dist,
		step_ptr->cpu_freq_max, step_ptr->cpu_freq_min,
		step_ptr->cpu_freq_gov, nodes, tasks, JOB_RUNNING,
		node_list, node_inx, task_dist, step_ptr->cpu_freq_max,
		step_ptr->cpu_freq_min, step_ptr->cpu_freq_gov,
		step_ptr->tres_alloc_str);
	if (debug_flags & DEBUG_FLAG_DB_STEP)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	return rc;
}
1277
/*
 * as_mysql_step_complete - record the end of a step in the step table and
 *	refresh the owning job's tres_alloc.
 *
 * IN mysql_conn - database connection; cluster_name selects the tables.
 * IN/OUT step_ptr - step being ended.  requid is set to -1 when the step is
 *	classified here as a clean completion; job_ptr->db_index is filled in
 *	when it was zero and a job row could be looked up or created.
 * RET SLURM_ERROR when the job has neither a db_index nor a usable submit or
 *	resize time, ESLURM_DB_CONNECTION when check_connection() fails,
 *	SLURM_SUCCESS when the job could not be added (so the caller does not
 *	retry forever), otherwise the first failure among the step update
 *	and the job update, or SLURM_SUCCESS when both succeed.
 */
extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
				  step_record_t *step_ptr)
{
	time_t now;
	uint16_t comp_status;
	int tasks = 0;
	struct jobacctinfo *jobacct = (struct jobacctinfo *)step_ptr->jobacct;
	char *query = NULL;
	int rc = SLURM_SUCCESS;
	uint32_t exit_code = 0;

	if (!step_ptr->job_ptr->db_index
	    && ((!step_ptr->job_ptr->details
		 || !step_ptr->job_ptr->details->submit_time)
		&& !step_ptr->job_ptr->resize_time)) {
		error("as_mysql_step_complete: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (slurmdbd_conf) {
		now = step_ptr->job_ptr->end_time;
		if (step_ptr->job_ptr->details)
			tasks = step_ptr->job_ptr->details->num_tasks;
		else
			tasks = step_ptr->cpu_count;
	} else if (step_ptr->step_id == SLURM_BATCH_SCRIPT) {
		now = time(NULL);
		tasks = 1;
	} else {
		now = time(NULL);
		if (!step_ptr->step_layout
		    || !step_ptr->step_layout->task_cnt) {
			if (step_ptr->cpu_count)
				tasks = step_ptr->cpu_count;
			else {
				/*
				 * Fall back to the CPU count of the step's
				 * TRES, then the job's TRES, then the job's
				 * node count.  Compare at full width before
				 * narrowing to int.
				 */
				uint64_t cpu_cnt =
					slurmdb_find_tres_count_in_string(
						step_ptr->tres_alloc_str,
						TRES_CPU);
				if (cpu_cnt == INFINITE64)
					cpu_cnt =
					slurmdb_find_tres_count_in_string(
						step_ptr->job_ptr->
						tres_alloc_str,
						TRES_CPU);
				if (cpu_cnt == INFINITE64)
					tasks = step_ptr->job_ptr->
						total_nodes;
				else
					tasks = cpu_cnt;
			}
		} else
			tasks = step_ptr->step_layout->task_cnt;
	}

	/*
	 * A step whose base state is still below JOB_COMPLETE gets its final
	 * state derived from the exit code.
	 */
	exit_code = step_ptr->exit_code;
	comp_status = step_ptr->state & JOB_STATE_BASE;
	if (comp_status < JOB_COMPLETE) {
		if (exit_code == SIG_OOM) {
			comp_status = JOB_OOM;
		} else if (WIFSIGNALED(exit_code)) {
			comp_status = JOB_CANCELLED;
		} else if (exit_code)
			comp_status = JOB_FAILED;
		else {
			step_ptr->requid = -1;
			comp_status = JOB_COMPLETE;
		}
	}

	if (!step_ptr->job_ptr->db_index) {
		/*
		 * The submit time is only needed for this lookup.  The guard
		 * at the top of the function guarantees that, with no
		 * db_index, either resize_time or details->submit_time is
		 * set, so details is not dereferenced when it is NULL.
		 */
		time_t submit_time = step_ptr->job_ptr->resize_time ?
			step_ptr->job_ptr->resize_time :
			step_ptr->job_ptr->details->submit_time;

		if (!(step_ptr->job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    step_ptr->job_ptr->job_id))) {
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(mysql_conn, step_ptr->job_ptr)
			    == SLURM_ERROR) {
				error("couldn't add job %u "
				      "at step completion",
				      step_ptr->job_ptr->job_id);
				return SLURM_SUCCESS;
			}
		}
	}

	query = xstrdup_printf(
		"update \"%s_%s\" set time_end=%d, state=%u, "
		"kill_requid=%d, exit_code=%d",
		mysql_conn->cluster_name, step_table, (int)now,
		comp_status,
		step_ptr->requid,
		exit_code);

	if (jobacct) {
		slurmdb_stats_t stats;

		memset(&stats, 0, sizeof(slurmdb_stats_t));

		/* figure out the ave of the totals sent */
		if (tasks > 0) {
			stats.tres_usage_in_ave =
				_average_tres_usage(jobacct->tres_ids,
						    jobacct->tres_usage_in_tot,
						    jobacct->tres_count,
						    tasks);
			stats.tres_usage_out_ave =
				_average_tres_usage(jobacct->tres_ids,
						    jobacct->tres_usage_out_tot,
						    jobacct->tres_count,
						    tasks);
		}

		/*
		 * We can't trust the assoc_mgr here as the tres may have
		 * changed, we have to go off what was sent us. We can just use
		 * the _average_tres_usage to do this by dividing by 1.
		 */
#define STEP_TRES_USAGE_COPY(field)					\
		stats.field = _average_tres_usage(jobacct->tres_ids,	\
						  jobacct->field,	\
						  jobacct->tres_count, 1)
		STEP_TRES_USAGE_COPY(tres_usage_in_max);
		STEP_TRES_USAGE_COPY(tres_usage_in_max_nodeid);
		STEP_TRES_USAGE_COPY(tres_usage_in_max_taskid);
		STEP_TRES_USAGE_COPY(tres_usage_in_min);
		STEP_TRES_USAGE_COPY(tres_usage_in_min_nodeid);
		STEP_TRES_USAGE_COPY(tres_usage_in_min_taskid);
		STEP_TRES_USAGE_COPY(tres_usage_in_tot);
		STEP_TRES_USAGE_COPY(tres_usage_out_max);
		STEP_TRES_USAGE_COPY(tres_usage_out_max_nodeid);
		STEP_TRES_USAGE_COPY(tres_usage_out_max_taskid);
		STEP_TRES_USAGE_COPY(tres_usage_out_min);
		STEP_TRES_USAGE_COPY(tres_usage_out_min_nodeid);
		STEP_TRES_USAGE_COPY(tres_usage_out_min_taskid);
		STEP_TRES_USAGE_COPY(tres_usage_out_tot);
#undef STEP_TRES_USAGE_COPY

		xstrfmtcat(query,
			   ", user_sec=%u, user_usec=%u, "
			   "sys_sec=%u, sys_usec=%u, "
			   "act_cpufreq=%u, consumed_energy=%"PRIu64", "
			   "tres_usage_in_ave='%s', "
			   "tres_usage_out_ave='%s', "
			   "tres_usage_in_max='%s', "
			   "tres_usage_in_max_taskid='%s', "
			   "tres_usage_in_max_nodeid='%s', "
			   "tres_usage_in_min='%s', "
			   "tres_usage_in_min_taskid='%s', "
			   "tres_usage_in_min_nodeid='%s', "
			   "tres_usage_in_tot='%s', "
			   "tres_usage_out_max='%s', "
			   "tres_usage_out_max_taskid='%s', "
			   "tres_usage_out_max_nodeid='%s', "
			   "tres_usage_out_min='%s', "
			   "tres_usage_out_min_taskid='%s', "
			   "tres_usage_out_min_nodeid='%s', "
			   "tres_usage_out_tot='%s'",
			   /* user seconds */
			   jobacct->user_cpu_sec,
			   /* user microseconds */
			   jobacct->user_cpu_usec,
			   /* system seconds */
			   jobacct->sys_cpu_sec,
			   /* system microsecs */
			   jobacct->sys_cpu_usec,
			   jobacct->act_cpufreq,
			   jobacct->energy.consumed_energy,
			   stats.tres_usage_in_ave,
			   stats.tres_usage_out_ave,
			   stats.tres_usage_in_max,
			   stats.tres_usage_in_max_taskid,
			   stats.tres_usage_in_max_nodeid,
			   stats.tres_usage_in_min,
			   stats.tres_usage_in_min_taskid,
			   stats.tres_usage_in_min_nodeid,
			   stats.tres_usage_in_tot,
			   stats.tres_usage_out_max,
			   stats.tres_usage_out_max_taskid,
			   stats.tres_usage_out_max_nodeid,
			   stats.tres_usage_out_min,
			   stats.tres_usage_out_min_taskid,
			   stats.tres_usage_out_min_nodeid,
			   stats.tres_usage_out_tot);

		slurmdb_free_slurmdb_stats_members(&stats);
	}

	/* id_step has to be %d here to handle the -2 -1 for the batch and
	   extern steps.  Don't change it to a %u.
	*/
	xstrfmtcat(query,
		   " where job_db_inx=%"PRIu64" and id_step=%d",
		   step_ptr->job_ptr->db_index, step_ptr->step_id);
	if (debug_flags & DEBUG_FLAG_DB_STEP)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	/* Refresh tres_alloc on the job row from the job's current string. */
	if (step_ptr->job_ptr->tres_alloc_str) {
		int job_rc;

		query = xstrdup_printf(
			"update \"%s_%s\" set tres_alloc='%s' where "
			"job_db_inx=%"PRIu64,
			mysql_conn->cluster_name, job_table,
			step_ptr->job_ptr->tres_alloc_str,
			step_ptr->job_ptr->db_index);
		if (debug_flags & DEBUG_FLAG_DB_STEP)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		job_rc = mysql_db_query(mysql_conn, query);
		xfree(query);

		/*
		 * Do not let a successful job update hide a failed step
		 * update: report the first failure.
		 */
		if (rc == SLURM_SUCCESS)
			rc = job_rc;
	}

	return rc;
}
1540
/*
 * as_mysql_suspend - record a suspend or resume of a job in the job,
 *	suspend and step tables.
 *
 * IN mysql_conn - database connection; cluster_name selects the tables.
 * IN old_db_inx - db index the job had before a resize; required (non-zero)
 *	when the job is resizing, otherwise unused.
 * IN/OUT job_ptr - job being suspended or resumed.  db_index is filled in
 *	when it was zero and a row could be looked up or created.
 * RET ESLURM_DB_CONNECTION when check_connection() fails, SLURM_ERROR when
 *	the job has no db_index and no way to derive a submit time or when a
 *	resizing job comes without old_db_inx, SLURM_SUCCESS when the job
 *	could not be added (so the caller does not retry forever), otherwise
 *	the result of the last query run.
 */
extern int as_mysql_suspend(mysql_conn_t *mysql_conn, uint64_t old_db_inx,
			    job_record_t *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS;
	uint64_t job_db_inx;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (!job_ptr->db_index) {
		/*
		 * The submit time is only needed for this lookup, so work it
		 * out here and never dereference a NULL details pointer.
		 */
		time_t submit_time;

		if (job_ptr->resize_time)
			submit_time = job_ptr->resize_time;
		else if (job_ptr->details)
			submit_time = job_ptr->details->submit_time;
		else {
			error("as_mysql_suspend: "
			      "Not inputing this job, it has no submit time.");
			return SLURM_ERROR;
		}

		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id))) {
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				error("couldn't suspend job %u",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
		}
	}

	if (IS_JOB_RESIZING(job_ptr)) {
		if (!old_db_inx) {
			error("No old db inx given for job %u cluster %s, "
			      "can't update suspend table.",
			      job_ptr->job_id, mysql_conn->cluster_name);
			return SLURM_ERROR;
		}
		job_db_inx = old_db_inx;
		/* Close the open suspend record of the pre-resize job row. */
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%"PRIu64" && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_db_inx);

	} else
		job_db_inx = job_ptr->db_index;

	/* use job_db_inx for this one since we want to update the
	   suspend time of the job before it was resized.
	*/
	xstrfmtcat(query,
		   "update \"%s_%s\" set time_suspended=%d-time_suspended, "
		   "state=%d where job_db_inx=%"PRIu64";",
		   mysql_conn->cluster_name, job_table,
		   (int)job_ptr->suspend_time,
		   job_ptr->job_state & JOB_STATE_BASE,
		   job_db_inx);
	/* Open a suspend record when suspending, close it when resuming. */
	if (IS_JOB_SUSPENDED(job_ptr))
		xstrfmtcat(query,
			   "insert into \"%s_%s\" (job_db_inx, id_assoc, "
			   "time_start, time_end) "
			   "values (%"PRIu64", %u, %d, 0);",
			   mysql_conn->cluster_name, suspend_table,
			   job_ptr->db_index, job_ptr->assoc_id,
			   (int)job_ptr->suspend_time);
	else
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%"PRIu64" && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_ptr->db_index);
	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);

	rc = mysql_db_query(mysql_conn, query);

	xfree(query);
	if (rc != SLURM_ERROR) {
		/*
		 * Apply the same suspend accounting to the job's steps that
		 * have not ended.
		 * NOTE(review): the step state is written from the full
		 * job_state, while the job row above uses
		 * job_state & JOB_STATE_BASE -- confirm this is intended.
		 */
		xstrfmtcat(query,
			   "update \"%s_%s\" set "
			   "time_suspended=%d-time_suspended, "
			   "state=%d where job_db_inx=%"PRIu64" and time_end=0",
			   mysql_conn->cluster_name, step_table,
			   (int)job_ptr->suspend_time,
			   job_ptr->job_state, job_ptr->db_index);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	}

	return rc;
}
1635
/*
 * as_mysql_flush_jobs_on_cluster - close every job on the cluster that has
 *	no end time yet, along with its steps and open suspend records.
 *
 * IN mysql_conn - database connection; cluster_name selects the tables.
 * IN event_time - time written as the end time (and used to settle
 *	time_suspended for jobs found in the suspended state).
 * RET ESLURM_DB_CONNECTION when check_connection() fails, SLURM_ERROR when
 *	the lookup of open jobs fails, SLURM_SUCCESS when there is nothing to
 *	flush, otherwise the result of the combined update query.
 */
extern int as_mysql_flush_jobs_on_cluster(
	mysql_conn_t *mysql_conn, time_t event_time)
{
	MYSQL_RES *open_jobs = NULL;
	MYSQL_ROW job_row;
	char *sql = NULL;
	char *all_inx = NULL;	/* "job_db_inx in (...)" for every open job */
	char *susp_inx = NULL;	/* same, restricted to suspended jobs */
	int rc = SLURM_SUCCESS;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	/*
	 * Collect the index and state of each job without an end time; they
	 * drive the clean up of the job, step and suspend tables below.
	 */
	sql = xstrdup_printf(
		"select distinct t1.job_db_inx, t1.state from \"%s_%s\" "
		"as t1 where t1.time_end=0;",
		mysql_conn->cluster_name, job_table);
	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", sql);
	open_jobs = mysql_db_query_ret(mysql_conn, sql, 0);
	xfree(sql);
	if (!open_jobs)
		return SLURM_ERROR;

	while ((job_row = mysql_fetch_row(open_jobs))) {
		int state = slurm_atoul(job_row[1]);

		/* The first entry opens the list, later ones are appended. */
		if (state == JOB_SUSPENDED)
			xstrfmtcat(susp_inx, "%s%s",
				   susp_inx ? ", " : "job_db_inx in (",
				   job_row[0]);

		xstrfmtcat(all_inx, "%s%s",
			   all_inx ? ", " : "job_db_inx in (",
			   job_row[0]);
	}
	mysql_free_result(open_jobs);

	if (susp_inx) {
		xstrfmtcat(susp_inx, ")");
		/* Settle the suspended time on the jobs and their steps... */
		xstrfmtcat(sql,
			   "update \"%s_%s\" set "
			   "time_suspended=%ld-time_suspended "
			   "where %s;",
			   mysql_conn->cluster_name, job_table,
			   event_time, susp_inx);
		xstrfmtcat(sql,
			   "update \"%s_%s\" set "
			   "time_suspended=%ld-time_suspended "
			   "where %s;",
			   mysql_conn->cluster_name, step_table,
			   event_time, susp_inx);
		/* ...and close their open suspend records. */
		xstrfmtcat(sql,
			   "update \"%s_%s\" set time_end=%ld where (%s) "
			   "&& time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   event_time, susp_inx);
		xfree(susp_inx);
	}

	if (all_inx) {
		xstrfmtcat(all_inx, ")");
		/* Mark every open job and its steps cancelled at event_time. */
		xstrfmtcat(sql,
			   "update \"%s_%s\" set state=%d, "
			   "time_end=%ld where %s;",
			   mysql_conn->cluster_name, job_table,
			   JOB_CANCELLED, event_time, all_inx);
		xstrfmtcat(sql,
			   "update \"%s_%s\" set state=%d, "
			   "time_end=%ld where %s;",
			   mysql_conn->cluster_name, step_table,
			   JOB_CANCELLED, event_time, all_inx);
		xfree(all_inx);
	}

	/* Nothing was open: no statement was built, so there is no work. */
	if (!sql)
		return rc;

	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", sql);

	rc = mysql_db_query(mysql_conn, sql);
	xfree(sql);

	return rc;
}
1730