1 /*****************************************************************************\
2 * acct_policy.c - Enforce accounting policy
3 *****************************************************************************
4 * Copyright (C) 2008 Lawrence Livermore National Security.
5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6 * Written by Morris Jette <jette1@llnl.gov>
7 * CODE-OCEC-09-009. All rights reserved.
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 \*****************************************************************************/
38
39 #include "slurm/slurm_errno.h"
40
41 #include "src/common/assoc_mgr.h"
42 #include "src/common/slurm_accounting_storage.h"
43
44 #include "src/slurmctld/slurmctld.h"
45 #include "src/slurmctld/acct_policy.h"
46 #include "src/common/node_select.h"
47 #include "src/common/slurm_priority.h"
48
49 #define _DEBUG 0
50
51 enum {
52 ACCT_POLICY_ADD_SUBMIT,
53 ACCT_POLICY_REM_SUBMIT,
54 ACCT_POLICY_JOB_BEGIN,
55 ACCT_POLICY_JOB_FINI
56 };
57
58 typedef enum {
59 TRES_USAGE_OKAY,
60 TRES_USAGE_CUR_EXCEEDS_LIMIT,
61 TRES_USAGE_REQ_EXCEEDS_LIMIT,
62 TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
63 } acct_policy_tres_usage_t;
64
65 typedef struct het_job_limits {
66 slurmdb_assoc_rec_t *assoc_ptr;
67 job_record_t *job_ptr;
68 slurmdb_qos_rec_t *qos_ptr_1;
69 slurmdb_qos_rec_t *qos_ptr_2;
70 } het_job_limits_t;
71
72 /*
73 * Update a job's allocated node count to reflect only nodes that are not
74 * already allocated to this association. Needed to enforce GrpNode limit.
75 */
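/*
 * Illustrative example (hypothetical numbers): if the job's allocated node
 * bitmap covers nodes 0-3 and grp_node_bitmap already contains nodes 2-5,
 * bit_overlap() reports 2 shared nodes, so the unique count drops from 4
 * to 2 and only those 2 new nodes count against the GrpNode limit.
 */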
76 static void _get_unique_job_node_cnt(job_record_t *job_ptr,
77 bitstr_t *grp_node_bitmap,
78 uint64_t *node_cnt)
79 {
80 xassert(node_cnt);
81 #if _DEBUG
82 char node_bitstr[64];
83 if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap) {
84 bit_fmt(node_bitstr, sizeof(node_bitstr),
85 job_ptr->job_resrcs->node_bitmap);
86 info("%s: %pJ job_resrcs->node_bitmap:%s", __func__, job_ptr,
87 node_bitstr);
88 } else {
89 info("%s: %pJ job_resrcs->node_bitmap:NULL", __func__,
90 job_ptr);
91 }
92
93 if (grp_node_bitmap) {
94 bit_fmt(node_bitstr, sizeof(node_bitstr), grp_node_bitmap);
95 info("%s: object grp_node_bitmap:%s", __func__,
96 node_bitstr);
97 } else {
98 info("%s: object grp_node_bitmap:NULL", __func__);
99 }
100 #endif
101
102 if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap &&
103 grp_node_bitmap) {
104 uint64_t overlap_cnt = bit_overlap(
105 job_ptr->job_resrcs->node_bitmap, grp_node_bitmap);
106 if (overlap_cnt) {
107 uint64_t init_cnt = bit_set_count(
108 job_ptr->job_resrcs->node_bitmap);
109 *node_cnt = init_cnt - overlap_cnt;
110 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRES_NODE)
111 info("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
112 __func__, job_ptr, init_cnt, *node_cnt);
113 }
114 } else if (job_ptr->details && job_ptr->details->req_node_bitmap &&
115 grp_node_bitmap) {
116 uint64_t overlap_cnt = bit_overlap(
117 job_ptr->details->req_node_bitmap, grp_node_bitmap);
118 if (overlap_cnt <= *node_cnt) {
119 *node_cnt -= overlap_cnt;
120 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRES_NODE)
121 info("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
122 __func__, job_ptr, *node_cnt + overlap_cnt, *node_cnt);
123 }
124 }
125 }
126
127 /*
128 * Update node allocation information for a job being started.
129 * This includes grp_node_bitmap, grp_node_job_cnt and
130 * grp_used_tres[TRES_ARRAY_NODE] of an object (qos, assoc, etc).
131 */
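/*
 * Note: grp_node_job_cnt holds a per-node count of running jobs so that
 * overlapping allocations are handled correctly; e.g. two jobs sharing a
 * node leave that node's count at 2, and the node stays set in
 * grp_node_bitmap until both jobs have finished.
 */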
132 static void _add_usage_node_bitmap(job_record_t *job_ptr,
133 bitstr_t **grp_node_bitmap,
134 uint16_t **grp_node_job_cnt,
135 uint64_t *grp_used_tres)
136 {
137 static int node_cnt = -1;
138 int i, i_first, i_last;
139
140 xassert(grp_node_bitmap);
141 xassert(grp_node_job_cnt);
142 xassert(grp_used_tres);
143
144 if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) {
145 if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
146 /*
147 * Hetjobs reach here as part of testing before any
148 * resource allocation. See _het_job_limit_check()
149 * in src/plugins/sched/backfill/backfill.c
150 */
151 } else if (job_ptr->node_cnt == 0) {
152 /* Zero size jobs OK to create/destroy burst buffers */
153 } else {
154 error("%s: %pJ lacks allocated node bitmap", __func__,
155 job_ptr);
156 }
157 return;
158 }
159 if (*grp_node_bitmap)
160 bit_or(*grp_node_bitmap, job_ptr->job_resrcs->node_bitmap);
161 else
162 *grp_node_bitmap = bit_copy(job_ptr->job_resrcs->node_bitmap);
163
164 if (!*grp_node_job_cnt) {
165 if (node_cnt == -1)
166 node_cnt = bit_size(*grp_node_bitmap);
167 *grp_node_job_cnt = xcalloc(node_cnt, sizeof(uint16_t));
168 }
169
170 i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
171 if (i_first == -1)
172 i_last = -2;
173 else
174 i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
175 for (i = i_first; i <= i_last; i++) {
176 if (bit_test(job_ptr->job_resrcs->node_bitmap, i))
177 (*grp_node_job_cnt)[i]++;
178 }
179 *grp_used_tres = bit_set_count(*grp_node_bitmap);
180 }
181
182 /*
183 * Update node allocation information for a job being completed.
184 * This includes grp_node_bitmap, grp_node_job_cnt and
185 * grp_used_tres[TRES_ARRAY_NODE] of an object (qos, assoc, etc).
186 */
187 static void _rm_usage_node_bitmap(job_record_t *job_ptr,
188 bitstr_t *grp_node_bitmap,
189 uint16_t *grp_node_job_cnt,
190 uint64_t *grp_used_tres)
191 {
192 int i, i_first, i_last;
193
194 xassert(grp_used_tres);
195
196 if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) {
197 if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
198 /*
199 * Hetjobs reach here as part of testing before any
200 * resource allocation. See _het_job_limit_check()
201 * in src/plugins/sched/backfill/backfill.c
202 */
203 } else if (job_ptr->node_cnt == 0) {
204 /* Zero size jobs OK to create/destroy burst buffers */
205 } else {
206 error("%s: %pJ lacks allocated node bitmap", __func__,
207 job_ptr);
208 }
209 return;
210 }
211 if (!grp_node_bitmap) {
212 error("%s: grp_node_bitmap is NULL", __func__);
213 return;
214 }
215 if (!grp_node_job_cnt) {
216 error("%s: grp_node_job_cnt is NULL", __func__);
217 return;
218 }
219 i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
220 if (i_first == -1)
221 i_last = -2;
222 else
223 i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
224 for (i = i_first; i <= i_last; i++) {
225 if (!bit_test(job_ptr->job_resrcs->node_bitmap, i))
226 continue;
227 if (--grp_node_job_cnt[i] == 0)
228 bit_clear(grp_node_bitmap, i);
229 }
230 *grp_used_tres = bit_set_count(grp_node_bitmap);
231 }
232
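/*
 * Map a generic "UNK" wait reason plus a TRES array position to the
 * TRES-specific wait reason, e.g. (TRES_ARRAY_CPU, WAIT_QOS_GRP_UNK)
 * yields WAIT_QOS_GRP_CPU. Combinations without a specific reason fall
 * through and the generic reason is returned unchanged.
 */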
233 static int _get_tres_state_reason(int tres_pos, int unk_reason)
234 {
235 switch (tres_pos) {
236 case TRES_ARRAY_CPU:
237 switch (unk_reason) {
238 case WAIT_ASSOC_GRP_UNK:
239 return WAIT_ASSOC_GRP_CPU;
240 case WAIT_ASSOC_GRP_UNK_MIN:
241 return WAIT_ASSOC_GRP_CPU_MIN;
242 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
243 return WAIT_ASSOC_GRP_CPU_RUN_MIN;
244 case WAIT_ASSOC_MAX_UNK_PER_JOB:
245 return WAIT_ASSOC_MAX_CPU_PER_JOB;
246 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
247 return WAIT_ASSOC_MAX_CPU_MINS_PER_JOB;
248 case WAIT_ASSOC_MAX_UNK_PER_NODE:
249 return WAIT_ASSOC_MAX_CPU_PER_NODE;
250 case WAIT_QOS_GRP_UNK:
251 return WAIT_QOS_GRP_CPU;
252 case WAIT_QOS_GRP_UNK_MIN:
253 return WAIT_QOS_GRP_CPU_MIN;
254 case WAIT_QOS_GRP_UNK_RUN_MIN:
255 return WAIT_QOS_GRP_CPU_RUN_MIN;
256 case WAIT_QOS_MAX_UNK_PER_JOB:
257 return WAIT_QOS_MAX_CPU_PER_JOB;
258 case WAIT_QOS_MAX_UNK_PER_NODE:
259 return WAIT_QOS_MAX_CPU_PER_NODE;
260 case WAIT_QOS_MAX_UNK_PER_ACCT:
261 return WAIT_QOS_MAX_CPU_PER_ACCT;
262 case WAIT_QOS_MAX_UNK_PER_USER:
263 return WAIT_QOS_MAX_CPU_PER_USER;
264 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
265 return WAIT_QOS_MAX_CPU_MINS_PER_JOB;
266 case WAIT_QOS_MIN_UNK:
267 return WAIT_QOS_MIN_CPU;
268 default:
269 return unk_reason;
270 break;
271 }
272 break;
273 case TRES_ARRAY_MEM:
274 switch (unk_reason) {
275 case WAIT_ASSOC_GRP_UNK:
276 return WAIT_ASSOC_GRP_MEM;
277 case WAIT_ASSOC_GRP_UNK_MIN:
278 return WAIT_ASSOC_GRP_MEM_MIN;
279 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
280 return WAIT_ASSOC_GRP_MEM_RUN_MIN;
281 case WAIT_ASSOC_MAX_UNK_PER_JOB:
282 return WAIT_ASSOC_MAX_MEM_PER_JOB;
283 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
284 return WAIT_ASSOC_MAX_MEM_MINS_PER_JOB;
285 case WAIT_ASSOC_MAX_UNK_PER_NODE:
286 return WAIT_ASSOC_MAX_MEM_PER_NODE;
287 case WAIT_QOS_GRP_UNK:
288 return WAIT_QOS_GRP_MEM;
289 case WAIT_QOS_GRP_UNK_MIN:
290 return WAIT_QOS_GRP_MEM_MIN;
291 case WAIT_QOS_GRP_UNK_RUN_MIN:
292 return WAIT_QOS_GRP_MEM_RUN_MIN;
293 case WAIT_QOS_MAX_UNK_PER_JOB:
294 return WAIT_QOS_MAX_MEM_PER_JOB;
295 case WAIT_QOS_MAX_UNK_PER_NODE:
296 return WAIT_QOS_MAX_MEM_PER_NODE;
297 case WAIT_QOS_MAX_UNK_PER_ACCT:
298 return WAIT_QOS_MAX_MEM_PER_ACCT;
299 case WAIT_QOS_MAX_UNK_PER_USER:
300 return WAIT_QOS_MAX_MEM_PER_USER;
301 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
302 return WAIT_QOS_MAX_MEM_MINS_PER_JOB;
303 case WAIT_QOS_MIN_UNK:
304 return WAIT_QOS_MIN_MEM;
305 default:
306 return unk_reason;
307 break;
308 }
309 break;
310 case TRES_ARRAY_ENERGY:
311 switch (unk_reason) {
312 case WAIT_ASSOC_GRP_UNK:
313 return WAIT_ASSOC_GRP_ENERGY;
314 case WAIT_ASSOC_GRP_UNK_MIN:
315 return WAIT_ASSOC_GRP_ENERGY_MIN;
316 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
317 return WAIT_ASSOC_GRP_ENERGY_RUN_MIN;
318 case WAIT_ASSOC_MAX_UNK_PER_JOB:
319 return WAIT_ASSOC_MAX_ENERGY_PER_JOB;
320 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
321 return WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB;
322 case WAIT_ASSOC_MAX_UNK_PER_NODE:
323 return WAIT_ASSOC_MAX_ENERGY_PER_NODE;
324 case WAIT_QOS_GRP_UNK:
325 return WAIT_QOS_GRP_ENERGY;
326 case WAIT_QOS_GRP_UNK_MIN:
327 return WAIT_QOS_GRP_ENERGY_MIN;
328 case WAIT_QOS_GRP_UNK_RUN_MIN:
329 return WAIT_QOS_GRP_ENERGY_RUN_MIN;
330 case WAIT_QOS_MAX_UNK_PER_JOB:
331 return WAIT_QOS_MAX_ENERGY_PER_JOB;
332 case WAIT_QOS_MAX_UNK_PER_NODE:
333 return WAIT_QOS_MAX_ENERGY_PER_NODE;
334 case WAIT_QOS_MAX_UNK_PER_ACCT:
335 return WAIT_QOS_MAX_ENERGY_PER_ACCT;
336 case WAIT_QOS_MAX_UNK_PER_USER:
337 return WAIT_QOS_MAX_ENERGY_PER_USER;
338 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
339 return WAIT_QOS_MAX_ENERGY_MINS_PER_JOB;
340 case WAIT_QOS_MIN_UNK:
341 return WAIT_QOS_MIN_ENERGY;
342 default:
343 return unk_reason;
344 break;
345 }
346 break;
347 case TRES_ARRAY_NODE:
348 switch (unk_reason) {
349 case WAIT_ASSOC_GRP_UNK:
350 return WAIT_ASSOC_GRP_NODE;
351 case WAIT_ASSOC_GRP_UNK_MIN:
352 return WAIT_ASSOC_GRP_NODE_MIN;
353 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
354 return WAIT_ASSOC_GRP_NODE_RUN_MIN;
355 case WAIT_ASSOC_MAX_UNK_PER_JOB:
356 return WAIT_ASSOC_MAX_NODE_PER_JOB;
357 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
358 return WAIT_ASSOC_MAX_NODE_MINS_PER_JOB;
359 case WAIT_QOS_GRP_UNK:
360 return WAIT_QOS_GRP_NODE;
361 case WAIT_QOS_GRP_UNK_MIN:
362 return WAIT_QOS_GRP_NODE_MIN;
363 case WAIT_QOS_GRP_UNK_RUN_MIN:
364 return WAIT_QOS_GRP_NODE_RUN_MIN;
365 case WAIT_QOS_MAX_UNK_PER_JOB:
366 return WAIT_QOS_MAX_NODE_PER_JOB;
367 case WAIT_QOS_MAX_UNK_PER_ACCT:
368 return WAIT_QOS_MAX_NODE_PER_ACCT;
369 case WAIT_QOS_MAX_UNK_PER_USER:
370 return WAIT_QOS_MAX_NODE_PER_USER;
371 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
372 return WAIT_QOS_MAX_NODE_MINS_PER_JOB;
373 case WAIT_QOS_MIN_UNK:
374 return WAIT_QOS_MIN_NODE;
375 default:
376 return unk_reason;
377 break;
378 }
379 break;
380 case TRES_ARRAY_BILLING:
381 switch (unk_reason) {
382 case WAIT_ASSOC_GRP_UNK:
383 return WAIT_ASSOC_GRP_BILLING;
384 case WAIT_ASSOC_GRP_UNK_MIN:
385 return WAIT_ASSOC_GRP_BILLING_MIN;
386 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
387 return WAIT_ASSOC_GRP_BILLING_RUN_MIN;
388 case WAIT_ASSOC_MAX_UNK_PER_JOB:
389 return WAIT_ASSOC_MAX_BILLING_PER_JOB;
390 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
391 return WAIT_ASSOC_MAX_BILLING_MINS_PER_JOB;
392 case WAIT_ASSOC_MAX_UNK_PER_NODE:
393 return WAIT_ASSOC_MAX_BILLING_PER_NODE;
394 case WAIT_QOS_GRP_UNK:
395 return WAIT_QOS_GRP_BILLING;
396 case WAIT_QOS_GRP_UNK_MIN:
397 return WAIT_QOS_GRP_BILLING_MIN;
398 case WAIT_QOS_GRP_UNK_RUN_MIN:
399 return WAIT_QOS_GRP_BILLING_RUN_MIN;
400 case WAIT_QOS_MAX_UNK_PER_JOB:
401 return WAIT_QOS_MAX_BILLING_PER_JOB;
402 case WAIT_QOS_MAX_UNK_PER_NODE:
403 return WAIT_QOS_MAX_BILLING_PER_NODE;
404 case WAIT_QOS_MAX_UNK_PER_ACCT:
405 return WAIT_QOS_MAX_BILLING_PER_ACCT;
406 case WAIT_QOS_MAX_UNK_PER_USER:
407 return WAIT_QOS_MAX_BILLING_PER_USER;
408 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
409 return WAIT_QOS_MAX_BILLING_MINS_PER_JOB;
410 case WAIT_QOS_MIN_UNK:
411 return WAIT_QOS_MIN_BILLING;
412 default:
413 return unk_reason;
414 break;
415 }
416 break;
417 default:
418 if (!xstrcmp("gres", assoc_mgr_tres_array[tres_pos]->type))
419 switch (unk_reason) {
420 case WAIT_ASSOC_GRP_UNK:
421 return WAIT_ASSOC_GRP_GRES;
422 case WAIT_ASSOC_GRP_UNK_MIN:
423 return WAIT_ASSOC_GRP_GRES_MIN;
424 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
425 return WAIT_ASSOC_GRP_GRES_RUN_MIN;
426 case WAIT_ASSOC_MAX_UNK_PER_JOB:
427 return WAIT_ASSOC_MAX_GRES_PER_JOB;
428 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
429 return WAIT_ASSOC_MAX_GRES_MINS_PER_JOB;
430 case WAIT_ASSOC_MAX_UNK_PER_NODE:
431 return WAIT_ASSOC_MAX_GRES_PER_NODE;
432 case WAIT_QOS_GRP_UNK:
433 return WAIT_QOS_GRP_GRES;
434 case WAIT_QOS_GRP_UNK_MIN:
435 return WAIT_QOS_GRP_GRES_MIN;
436 case WAIT_QOS_GRP_UNK_RUN_MIN:
437 return WAIT_QOS_GRP_GRES_RUN_MIN;
438 case WAIT_QOS_MAX_UNK_PER_JOB:
439 return WAIT_QOS_MAX_GRES_PER_JOB;
440 case WAIT_QOS_MAX_UNK_PER_NODE:
441 return WAIT_QOS_MAX_GRES_PER_NODE;
442 case WAIT_QOS_MAX_UNK_PER_ACCT:
443 return WAIT_QOS_MAX_GRES_PER_ACCT;
444 case WAIT_QOS_MAX_UNK_PER_USER:
445 return WAIT_QOS_MAX_GRES_PER_USER;
446 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
447 return WAIT_QOS_MAX_GRES_MINS_PER_JOB;
448 case WAIT_QOS_MIN_UNK:
449 return WAIT_QOS_MIN_GRES;
450 default:
451 return unk_reason;
452 break;
453 }
454 else if (!xstrcmp("license",
455 assoc_mgr_tres_array[tres_pos]->type))
456 switch (unk_reason) {
457 case WAIT_ASSOC_GRP_UNK:
458 return WAIT_ASSOC_GRP_LIC;
459 case WAIT_ASSOC_GRP_UNK_MIN:
460 return WAIT_ASSOC_GRP_LIC_MIN;
461 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
462 return WAIT_ASSOC_GRP_LIC_RUN_MIN;
463 case WAIT_ASSOC_MAX_UNK_PER_JOB:
464 return WAIT_ASSOC_MAX_LIC_PER_JOB;
465 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
466 return WAIT_ASSOC_MAX_LIC_MINS_PER_JOB;
467 case WAIT_QOS_GRP_UNK:
468 return WAIT_QOS_GRP_LIC;
469 case WAIT_QOS_GRP_UNK_MIN:
470 return WAIT_QOS_GRP_LIC_MIN;
471 case WAIT_QOS_GRP_UNK_RUN_MIN:
472 return WAIT_QOS_GRP_LIC_RUN_MIN;
473 case WAIT_QOS_MAX_UNK_PER_JOB:
474 return WAIT_QOS_MAX_LIC_PER_JOB;
475 case WAIT_QOS_MAX_UNK_PER_ACCT:
476 return WAIT_QOS_MAX_LIC_PER_ACCT;
477 case WAIT_QOS_MAX_UNK_PER_USER:
478 return WAIT_QOS_MAX_LIC_PER_USER;
479 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
480 return WAIT_QOS_MAX_LIC_MINS_PER_JOB;
481 case WAIT_QOS_MIN_UNK:
482 return WAIT_QOS_MIN_LIC;
483 default:
484 return unk_reason;
485 break;
486 }
487 else if (!xstrcmp("bb", assoc_mgr_tres_array[tres_pos]->type))
488 switch (unk_reason) {
489 case WAIT_ASSOC_GRP_UNK:
490 return WAIT_ASSOC_GRP_BB;
491 case WAIT_ASSOC_GRP_UNK_MIN:
492 return WAIT_ASSOC_GRP_BB_MIN;
493 case WAIT_ASSOC_GRP_UNK_RUN_MIN:
494 return WAIT_ASSOC_GRP_BB_RUN_MIN;
495 case WAIT_ASSOC_MAX_UNK_PER_JOB:
496 return WAIT_ASSOC_MAX_BB_PER_JOB;
497 case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
498 return WAIT_ASSOC_MAX_BB_MINS_PER_JOB;
499 case WAIT_ASSOC_MAX_UNK_PER_NODE:
500 return WAIT_ASSOC_MAX_BB_PER_NODE;
501 case WAIT_QOS_GRP_UNK:
502 return WAIT_QOS_GRP_BB;
503 case WAIT_QOS_GRP_UNK_MIN:
504 return WAIT_QOS_GRP_BB_MIN;
505 case WAIT_QOS_GRP_UNK_RUN_MIN:
506 return WAIT_QOS_GRP_BB_RUN_MIN;
507 case WAIT_QOS_MAX_UNK_PER_JOB:
508 return WAIT_QOS_MAX_BB_PER_JOB;
509 case WAIT_QOS_MAX_UNK_PER_NODE:
510 return WAIT_QOS_MAX_BB_PER_NODE;
511 case WAIT_QOS_MAX_UNK_PER_ACCT:
512 return WAIT_QOS_MAX_BB_PER_ACCT;
513 case WAIT_QOS_MAX_UNK_PER_USER:
514 return WAIT_QOS_MAX_BB_PER_USER;
515 case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
516 return WAIT_QOS_MAX_BB_MINS_PER_JOB;
517 case WAIT_QOS_MIN_UNK:
518 return WAIT_QOS_MIN_BB;
519 default:
520 return unk_reason;
521 break;
522 }
523 break;
524 }
525
526 return unk_reason;
527 }
528
529 static int _find_used_limits_for_acct(void *x, void *key)
530 {
531 slurmdb_used_limits_t *used_limits = (slurmdb_used_limits_t *)x;
532 char *account = (char *)key;
533
534 if (!xstrcmp(account, used_limits->acct))
535 return 1;
536
537 return 0;
538 }
539
540 static int _find_used_limits_for_user(void *x, void *key)
541 {
542 slurmdb_used_limits_t *used_limits = (slurmdb_used_limits_t *)x;
543 uint32_t user_id = *(uint32_t *)key;
544
545 if (used_limits->uid == user_id)
546 return 1;
547
548 return 0;
549 }
550
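/*
 * Verify that job_ptr->assoc_ptr still matches the job's assoc_id and
 * user; if the cached pointer is stale, look the association up again via
 * assoc_mgr_fill_in_assoc() using the job's account, partition and uid.
 */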
551 static bool _valid_job_assoc(job_record_t *job_ptr)
552 {
553 slurmdb_assoc_rec_t assoc_rec;
554
555 if ((job_ptr->assoc_ptr == NULL) ||
556 (job_ptr->assoc_ptr->id != job_ptr->assoc_id) ||
557 (job_ptr->assoc_ptr->uid != job_ptr->user_id)) {
558 error("Invalid assoc_ptr for %pJ", job_ptr);
559 memset(&assoc_rec, 0, sizeof(slurmdb_assoc_rec_t));
560
561 assoc_rec.acct = job_ptr->account;
562 if (job_ptr->part_ptr)
563 assoc_rec.partition = job_ptr->part_ptr->name;
564 assoc_rec.uid = job_ptr->user_id;
565
566 if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
567 accounting_enforce,
568 &job_ptr->assoc_ptr, false)) {
569 info("%s: invalid account or partition for uid=%u %pJ",
570 __func__, job_ptr->user_id, job_ptr);
571 return false;
572 }
573 job_ptr->assoc_id = assoc_rec.id;
574 }
575 return true;
576 }
577
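/*
 * Adjust the usage counters of a single QOS (submit counts, running job
 * counts, per-TRES usage and node bitmaps) for one of the ACCT_POLICY_*
 * events above, tracked at the QOS, per-account and per-user levels.
 */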
578 static void _qos_adjust_limit_usage(int type, job_record_t *job_ptr,
579 slurmdb_qos_rec_t *qos_ptr,
580 uint64_t *used_tres_run_secs,
581 uint32_t job_cnt)
582 {
583 slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
584 int i;
585
586 if (!qos_ptr || !job_ptr->assoc_ptr)
587 return;
588
589 used_limits_a = acct_policy_get_acct_used_limits(
590 &qos_ptr->usage->acct_limit_list,
591 job_ptr->assoc_ptr->acct);
592
593 used_limits = acct_policy_get_user_used_limits(
594 &qos_ptr->usage->user_limit_list,
595 job_ptr->user_id);
596
597 switch (type) {
598 case ACCT_POLICY_ADD_SUBMIT:
599 qos_ptr->usage->grp_used_submit_jobs += job_cnt;
600 used_limits->submit_jobs += job_cnt;
601 used_limits_a->submit_jobs += job_cnt;
602 break;
603 case ACCT_POLICY_REM_SUBMIT:
604 if (qos_ptr->usage->grp_used_submit_jobs >= job_cnt)
605 qos_ptr->usage->grp_used_submit_jobs -= job_cnt;
606 else {
607 qos_ptr->usage->grp_used_submit_jobs = 0;
608 debug2("acct_policy_remove_job_submit: "
609 "grp_submit_jobs underflow for qos %s",
610 qos_ptr->name);
611 }
612
613 if (used_limits->submit_jobs >= job_cnt)
614 used_limits->submit_jobs -= job_cnt;
615 else {
616 used_limits->submit_jobs = 0;
617 debug2("acct_policy_remove_job_submit: "
618 "used_submit_jobs underflow for "
619 "qos %s user %d",
620 qos_ptr->name, used_limits->uid);
621 }
622
623 if (used_limits_a->submit_jobs >= job_cnt)
624 used_limits_a->submit_jobs -= job_cnt;
625 else {
626 used_limits_a->submit_jobs = 0;
627 debug2("acct_policy_remove_job_submit: "
628 "used_submit_jobs underflow for "
629 "qos %s account %s",
630 qos_ptr->name, used_limits_a->acct);
631 }
632
633 break;
634 case ACCT_POLICY_JOB_BEGIN:
635 qos_ptr->usage->grp_used_jobs++;
636 for (i=0; i<slurmctld_tres_cnt; i++) {
637 /* tres_alloc_cnt for ENERGY is currently accounted for after
638  * the fact, so don't add it here or you will get underflows
639  * when you remove it. If this ever changes, this will probably
640  * have to be moved to a new TRES array.
641 */
642 if (i == TRES_ARRAY_ENERGY)
643 continue;
644 if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
645 continue;
646
647 used_limits->tres[i] += job_ptr->tres_alloc_cnt[i];
648 used_limits_a->tres[i] += job_ptr->tres_alloc_cnt[i];
649
650 qos_ptr->usage->grp_used_tres[i] +=
651 job_ptr->tres_alloc_cnt[i];
652 qos_ptr->usage->grp_used_tres_run_secs[i] +=
653 used_tres_run_secs[i];
654 debug2("acct_policy_job_begin: after adding %pJ, qos %s grp_used_tres_run_secs(%s) is %"PRIu64,
655 job_ptr, qos_ptr->name,
656 assoc_mgr_tres_name_array[i],
657 qos_ptr->usage->grp_used_tres_run_secs[i]);
658 }
659
660 used_limits->jobs++;
661 used_limits_a->jobs++;
662
663 _add_usage_node_bitmap(
664 job_ptr,
665 &qos_ptr->usage->grp_node_bitmap,
666 &qos_ptr->usage->grp_node_job_cnt,
667 &qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
668
669 _add_usage_node_bitmap(
670 job_ptr,
671 &used_limits->node_bitmap,
672 &used_limits->node_job_cnt,
673 &used_limits->tres[TRES_ARRAY_NODE]);
674
675 _add_usage_node_bitmap(
676 job_ptr,
677 &used_limits_a->node_bitmap,
678 &used_limits_a->node_job_cnt,
679 &used_limits_a->tres[TRES_ARRAY_NODE]);
680 break;
681 case ACCT_POLICY_JOB_FINI:
682 /*
683  * If tres_alloc_cnt doesn't exist, then ACCT_POLICY_JOB_BEGIN
684  * was never called, so there is nothing to clean up because
685  * nothing was ever set up.
686 */
687 if (!job_ptr->tres_alloc_cnt)
688 break;
689 qos_ptr->usage->grp_used_jobs--;
690 if ((int32_t)qos_ptr->usage->grp_used_jobs < 0) {
691 qos_ptr->usage->grp_used_jobs = 0;
692 debug2("acct_policy_job_fini: used_jobs "
693 "underflow for qos %s", qos_ptr->name);
694 }
695
696 for (i=0; i<slurmctld_tres_cnt; i++) {
697 if (i == TRES_ARRAY_ENERGY)
698 continue;
699
700 if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
701 continue;
702
703 if (job_ptr->tres_alloc_cnt[i] >
704 qos_ptr->usage->grp_used_tres[i]) {
705 qos_ptr->usage->grp_used_tres[i] = 0;
706 debug2("acct_policy_job_fini: "
707 "grp_used_tres(%s) "
708 "underflow for QOS %s",
709 assoc_mgr_tres_name_array[i],
710 qos_ptr->name);
711 } else
712 qos_ptr->usage->grp_used_tres[i] -=
713 job_ptr->tres_alloc_cnt[i];
714
715 if (job_ptr->tres_alloc_cnt[i] > used_limits->tres[i]) {
716 used_limits->tres[i] = 0;
717 debug2("acct_policy_job_fini: "
718 "used_limits->tres(%s) "
719 "underflow for qos %s user %u",
720 assoc_mgr_tres_name_array[i],
721 qos_ptr->name, used_limits->uid);
722 } else
723 used_limits->tres[i] -=
724 job_ptr->tres_alloc_cnt[i];
725
726 if (job_ptr->tres_alloc_cnt[i] >
727 used_limits_a->tres[i]) {
728 used_limits_a->tres[i] = 0;
729 debug2("acct_policy_job_fini: "
730 "used_limits->tres(%s) "
731 "underflow for qos %s account %s",
732 assoc_mgr_tres_name_array[i],
733 qos_ptr->name, used_limits_a->acct);
734 } else
735 used_limits_a->tres[i] -=
736 job_ptr->tres_alloc_cnt[i];
737 }
738
739 if (used_limits->jobs)
740 used_limits->jobs--;
741 else
742 debug2("acct_policy_job_fini: used_jobs "
743 "underflow for qos %s user %d",
744 qos_ptr->name, used_limits->uid);
745
746 if (used_limits_a->jobs)
747 used_limits_a->jobs--;
748 else
749 debug2("acct_policy_job_fini: used_jobs "
750 "underflow for qos %s account %s",
751 qos_ptr->name, used_limits_a->acct);
752
753 _rm_usage_node_bitmap(
754 job_ptr,
755 qos_ptr->usage->grp_node_bitmap,
756 qos_ptr->usage->grp_node_job_cnt,
757 &qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
758
759 _rm_usage_node_bitmap(
760 job_ptr,
761 used_limits->node_bitmap,
762 used_limits->node_job_cnt,
763 &used_limits->tres[TRES_ARRAY_NODE]);
764
765 _rm_usage_node_bitmap(
766 job_ptr,
767 used_limits_a->node_bitmap,
768 used_limits_a->node_job_cnt,
769 &used_limits_a->tres[TRES_ARRAY_NODE]);
770 break;
771 default:
772 error("acct_policy: qos unknown type %d", type);
773 break;
774 }
775
776 }
777
778 static int _find_qos_part(void *x, void *key)
779 {
780 if ((slurmdb_qos_rec_t *) x == (slurmdb_qos_rec_t *) key)
781 return 1; /* match */
782
783 return 0;
784 }
785
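/*
 * Apply an ACCT_POLICY_* event to every object that tracks this job's
 * usage: the job's QOS (and any partition QOS when the job was submitted
 * to multiple partitions), then the job's association and all of its
 * parent associations.
 */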
786 static void _adjust_limit_usage(int type, job_record_t *job_ptr)
787 {
788 slurmdb_assoc_rec_t *assoc_ptr = NULL;
789 assoc_mgr_lock_t locks =
790 { .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
791 uint64_t used_tres_run_secs[slurmctld_tres_cnt];
792 int i;
793 uint32_t job_cnt = 1;
794
795 memset(used_tres_run_secs, 0, sizeof(uint64_t) * slurmctld_tres_cnt);
796
797 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
798 || !_valid_job_assoc(job_ptr))
799 return;
800
801 if (type == ACCT_POLICY_JOB_FINI)
802 priority_g_job_end(job_ptr);
803 else if (type == ACCT_POLICY_JOB_BEGIN) {
804 uint64_t time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
805 /* take into account usage factor */
806 if (job_ptr->qos_ptr &&
807 (job_ptr->qos_ptr->usage_factor >= 0))
808 time_limit_secs *= job_ptr->qos_ptr->usage_factor;
809 for (i = 0; i < slurmctld_tres_cnt; i++) {
810 if (i == TRES_ARRAY_ENERGY)
811 continue;
812 if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
813 continue;
814
815 used_tres_run_secs[i] =
816 job_ptr->tres_alloc_cnt[i] * time_limit_secs;
817 }
818 } else if (((type == ACCT_POLICY_ADD_SUBMIT) ||
819 (type == ACCT_POLICY_REM_SUBMIT)) &&
820 job_ptr->array_recs && job_ptr->array_recs->task_cnt)
821 job_cnt = job_ptr->array_recs->task_cnt;
822
823 assoc_mgr_lock(&locks);
824
825 /*
826 * This handles removal of the accrual_cnt pending on
827 * state. We do not want to call this on add submit as it could push
828  * other pending jobs waiting in line for the limit. The main call to
829 * this that handles the initial call happens in build_job_queue().
830 */
831 if (type != ACCT_POLICY_ADD_SUBMIT)
832 acct_policy_handle_accrue_time(job_ptr, true);
833
834 /*
835  * If we have submitted to multiple partitions we need to handle all of
836  * them on submit, and also on remove if the job was cancelled before it ran
837 * (!job_ptr->tres_alloc_str).
838 */
839 if (((type == ACCT_POLICY_ADD_SUBMIT) ||
840 (type == ACCT_POLICY_REM_SUBMIT)) &&
841 job_ptr->part_ptr_list &&
842 (IS_JOB_PENDING(job_ptr) || !job_ptr->tres_alloc_str)) {
843 bool job_first = false;
844 ListIterator part_itr;
845 part_record_t *part_ptr;
846 List part_qos_list = NULL;
847
848 if (job_ptr->qos_ptr &&
849 (((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->flags
850 & QOS_FLAG_OVER_PART_QOS))
851 job_first = true;
852
853 if (job_first) {
854 _qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
855 used_tres_run_secs, job_cnt);
856 part_qos_list = list_create(NULL);
857 list_push(part_qos_list, job_ptr->qos_ptr);
858 }
859
860 part_itr = list_iterator_create(job_ptr->part_ptr_list);
861 while ((part_ptr = list_next(part_itr))) {
862 if (!part_ptr->qos_ptr)
863 continue;
864 if (!part_qos_list)
865 part_qos_list = list_create(NULL);
866 /*
867 * Don't adjust usage to this partition's qos if
868 * it's the same as the qos of another partition
869 * that we already handled.
870 */
871 if (list_find_first(part_qos_list, _find_qos_part,
872 part_ptr->qos_ptr))
873 continue;
874 list_push(part_qos_list, part_ptr->qos_ptr);
875 _qos_adjust_limit_usage(type, job_ptr,
876 part_ptr->qos_ptr,
877 used_tres_run_secs, job_cnt);
878 }
879 list_iterator_destroy(part_itr);
880
881 /*
882 * Don't adjust usage to this job's qos if
883 * it's the same as the qos of a partition
884 * that we already handled.
885 */
886 if (!job_first && job_ptr->qos_ptr &&
887 (!part_qos_list ||
888 !list_find_first(part_qos_list, _find_qos_part,
889 job_ptr->qos_ptr)))
890 _qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
891 used_tres_run_secs, job_cnt);
892
893 FREE_NULL_LIST(part_qos_list);
894 } else {
895 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
896
897 /*
898  * Here, if the job is starting and we had a part_ptr_list
899  * beforehand, we need to remove the submit count from every
900  * partition qos other than the one we are actually going to run on.
901 */
902 if ((type == ACCT_POLICY_JOB_BEGIN) &&
903 job_ptr->part_ptr_list) {
904 ListIterator part_itr;
905 part_record_t *part_ptr;
906 List part_qos_list = list_create(NULL);
907
908 if (job_ptr->qos_ptr)
909 list_push(part_qos_list, job_ptr->qos_ptr);
910 if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr &&
911 job_ptr->qos_ptr != job_ptr->part_ptr->qos_ptr)
912 list_push(part_qos_list,
913 job_ptr->part_ptr->qos_ptr);
914
915 part_itr = list_iterator_create(job_ptr->part_ptr_list);
916 while ((part_ptr = list_next(part_itr))) {
917 if (!part_ptr->qos_ptr)
918 continue;
919
920 /*
921 * Don't adjust usage to this partition's qos if
922 * it's the same as the qos of another partition
923 * that we already handled.
924 */
925 if (list_find_first(part_qos_list,
926 _find_qos_part,
927 part_ptr->qos_ptr))
928 continue;
929 _qos_adjust_limit_usage(ACCT_POLICY_REM_SUBMIT,
930 job_ptr,
931 part_ptr->qos_ptr,
932 used_tres_run_secs,
933 job_cnt);
934 }
935 list_iterator_destroy(part_itr);
936 FREE_NULL_LIST(part_qos_list);
937 }
938
939 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
940
941 _qos_adjust_limit_usage(type, job_ptr, qos_ptr_1,
942 used_tres_run_secs, job_cnt);
943 _qos_adjust_limit_usage(type, job_ptr, qos_ptr_2,
944 used_tres_run_secs, job_cnt);
945 }
946
947 assoc_ptr = job_ptr->assoc_ptr;
948 while (assoc_ptr) {
949 switch (type) {
950 case ACCT_POLICY_ADD_SUBMIT:
951 assoc_ptr->usage->used_submit_jobs += job_cnt;
952 break;
953 case ACCT_POLICY_REM_SUBMIT:
954 if (assoc_ptr->usage->used_submit_jobs)
955 assoc_ptr->usage->used_submit_jobs -= job_cnt;
956 else
957 debug2("acct_policy_remove_job_submit: "
958 "used_submit_jobs underflow for "
959 "account %s",
960 assoc_ptr->acct);
961 break;
962 case ACCT_POLICY_JOB_BEGIN:
963 assoc_ptr->usage->used_jobs++;
964 _add_usage_node_bitmap(
965 job_ptr,
966 &assoc_ptr->usage->grp_node_bitmap,
967 &assoc_ptr->usage->grp_node_job_cnt,
968 &assoc_ptr->usage->
969 grp_used_tres[TRES_ARRAY_NODE]);
970
971 for (i = 0; i < slurmctld_tres_cnt; i++) {
972 if (i == TRES_ARRAY_ENERGY)
973 continue;
974 if (job_ptr->tres_alloc_cnt[i] ==
975 NO_CONSUME_VAL64)
976 continue;
977
978 if (i != TRES_ARRAY_NODE) {
979 assoc_ptr->usage->grp_used_tres[i] +=
980 job_ptr->tres_alloc_cnt[i];
981 }
982 assoc_ptr->usage->grp_used_tres_run_secs[i] +=
983 used_tres_run_secs[i];
984 debug2("acct_policy_job_begin: after adding %pJ, assoc %u(%s/%s/%s) grp_used_tres_run_secs(%s) is %"PRIu64,
985 job_ptr, assoc_ptr->id, assoc_ptr->acct,
986 assoc_ptr->user, assoc_ptr->partition,
987 assoc_mgr_tres_name_array[i],
988 assoc_ptr->usage->
989 grp_used_tres_run_secs[i]);
990 }
991 break;
992 case ACCT_POLICY_JOB_FINI:
993 if (assoc_ptr->usage->used_jobs)
994 assoc_ptr->usage->used_jobs--;
995 else
996 debug2("acct_policy_job_fini: used_jobs "
997 "underflow for account %s",
998 assoc_ptr->acct);
999 _rm_usage_node_bitmap(
1000 job_ptr,
1001 assoc_ptr->usage->grp_node_bitmap,
1002 assoc_ptr->usage->grp_node_job_cnt,
1003 &assoc_ptr->usage->
1004 grp_used_tres[TRES_ARRAY_NODE]);
1005 for (i = 0; i < slurmctld_tres_cnt; i++) {
1006 if ((i == TRES_ARRAY_ENERGY) ||
1007 (i == TRES_ARRAY_NODE))
1008 continue;
1009 if (job_ptr->tres_alloc_cnt[i] ==
1010 NO_CONSUME_VAL64)
1011 continue;
1012
1013 if (job_ptr->tres_alloc_cnt[i] >
1014 assoc_ptr->usage->grp_used_tres[i]) {
1015 assoc_ptr->usage->grp_used_tres[i] = 0;
1016 debug2("acct_policy_job_fini: "
1017 "grp_used_tres(%s) "
1018 "underflow for assoc "
1019 "%u(%s/%s/%s)",
1020 assoc_mgr_tres_name_array[i],
1021 assoc_ptr->id, assoc_ptr->acct,
1022 assoc_ptr->user,
1023 assoc_ptr->partition);
1024 } else {
1025 assoc_ptr->usage->grp_used_tres[i] -=
1026 job_ptr->tres_alloc_cnt[i];
1027 }
1028 }
1029
1030 break;
1031 default:
1032 error("acct_policy: association unknown type %d", type);
1033 break;
1034 }
1035 /* now handle all the group limits of the parents */
1036 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
1037 }
1038 assoc_mgr_unlock(&locks);
1039 }
1040
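/*
 * If the job did not request a time limit (NO_VAL), derive one: prefer
 * the limit's max time, fall back to the partition's MaxTime, otherwise
 * INFINITE, and flag limit_set_time. If a prior limit already set the
 * time, cap it at limit_max_time.
 */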
1041 static void _set_time_limit(uint32_t *time_limit, uint32_t part_max_time,
1042 uint32_t limit_max_time, uint16_t *limit_set_time)
1043 {
1044 if ((*time_limit) == NO_VAL) {
1045 if (limit_max_time)
1046 (*time_limit) = limit_max_time;
1047 else if (part_max_time != INFINITE)
1048 (*time_limit) = part_max_time;
1049 else
1050 (*time_limit) = INFINITE;
1051
1052 if (limit_set_time)
1053 (*limit_set_time) = 1;
1054 } else if (limit_set_time && (*limit_set_time) &&
1055 ((*time_limit) > limit_max_time))
1056 (*time_limit) = limit_max_time;
1057 }
1058
1059 static void _qos_alter_job(job_record_t *job_ptr,
1060 slurmdb_qos_rec_t *qos_ptr,
1061 uint64_t *used_tres_run_secs,
1062 uint64_t *new_used_tres_run_secs)
1063 {
1064 int i;
1065
1066 if (!qos_ptr || !job_ptr)
1067 return;
1068
1069 for (i=0; i<slurmctld_tres_cnt; i++) {
1070 if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
1071 continue;
1072 qos_ptr->usage->grp_used_tres_run_secs[i] -=
1073 used_tres_run_secs[i];
1074 qos_ptr->usage->grp_used_tres_run_secs[i] +=
1075 new_used_tres_run_secs[i];
1076 debug2("altering %pJ QOS %s got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
1077 job_ptr, qos_ptr->name,
1078 qos_ptr->usage->grp_used_tres_run_secs[i],
1079 used_tres_run_secs[i],
1080 new_used_tres_run_secs[i]);
1081 }
1082 }
1083
1084 /*
1085  * _validate_tres_limits_for_assoc - validate the TRES requested against the
1086  * limits of an association as well as a QOS, skipping any limit an admin set
1087 *
1088  * OUT - tres_pos - if false is returned, position in the array of the failed limit
1089  * IN - job_tres_array - count of various TRES requested by the job
1090  * IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
1091  * IN - assoc_tres_array - TRES limits from an association (Grp, Max, Min)
1092  * IN - qos_tres_array - TRES limits the QOS has imposed already
1093  * IN - admin_set_limit_tres_array - limits that have been overridden
1094  *      by an admin
1095  * IN - strict_checking - whether a limit needs to be enforced now or not
1096  * IN - update_call - whether this is an update call rather than a create call
1097  * IN - max_limit - if true the limits are MAX limits, otherwise they are MIN limits
1098 *
1099 * RET - True if no limit is violated, false otherwise with tres_pos
1100 * being set to the position of the failed limit.
1101 */
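/*
 * Illustrative example (hypothetical configuration): with an association
 * limit of 64 for the CPU TRES and a job requesting 128 CPUs, the check
 * fails and tres_pos is left at the CPU position, assuming no admin
 * override and no QOS limit already covering that TRES.
 */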
1102 static bool _validate_tres_limits_for_assoc(
1103 int *tres_pos,
1104 uint64_t *job_tres_array,
1105 uint64_t divisor,
1106 uint64_t *assoc_tres_array,
1107 uint64_t *qos_tres_array,
1108 uint16_t *admin_set_limit_tres_array,
1109 bool strict_checking,
1110 bool update_call, bool max_limit)
1111 {
1112 int i;
1113 uint64_t job_tres;
1114
1115 if (!strict_checking)
1116 return true;
1117
1118 for (i = 0; i < g_tres_count; i++) {
1119 (*tres_pos) = i;
1120
1121 if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT)
1122 || (qos_tres_array[i] != INFINITE64)
1123 || (assoc_tres_array[i] == INFINITE64)
1124 || (!job_tres_array[i] && !update_call))
1125 continue;
1126
1127 job_tres = job_tres_array[i];
1128
1129 if (divisor)
1130 job_tres /= divisor;
1131
1132 if (max_limit) {
1133 if (job_tres > assoc_tres_array[i])
1134 return false;
1135 } else if (job_tres < assoc_tres_array[i])
1136 return false;
1137 }
1138
1139 return true;
1140 }
1141
1142
1143 /*
1144  * _validate_tres_limits_for_qos - validate the TRES requested against the
1145  * limits of a QOS, skipping any limit an admin set
1146 *
1147  * OUT - tres_pos - if false is returned, position in the array of the failed limit
1148 * IN - job_tres_array - count of various TRES requested by the job
1149 * IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
1150 * IN - grp_tres_array - Grp TRES limits from QOS
1151 * IN - max_tres_array - Max/Min TRES limits from QOS
1152 * IN/OUT - out_grp_tres_array - Grp TRES limits QOS has imposed already,
1153 * if a new limit is found the limit is filled in.
1154 * IN/OUT - out_max_tres_array - Max/Min TRES limits QOS has imposed already,
1155 * if a new limit is found the limit is filled in.
1156  * IN - admin_set_limit_tres_array - limits that have been overridden
1157  *      by an admin
1158  * IN - strict_checking - whether a limit needs to be enforced now or not
1159  * IN - max_limit - if true the limits are MAX limits, otherwise they are MIN limits
1160 *
1161 * RET - True if no limit is violated, false otherwise with tres_pos
1162 * being set to the position of the failed limit.
1163 */
1164 static bool _validate_tres_limits_for_qos(
1165 int *tres_pos,
1166 uint64_t *job_tres_array,
1167 uint64_t divisor,
1168 uint64_t *grp_tres_array,
1169 uint64_t *max_tres_array,
1170 uint64_t *out_grp_tres_array,
1171 uint64_t *out_max_tres_array,
1172 uint16_t *admin_set_limit_tres_array,
1173 bool strict_checking, bool max_limit)
1174 {
1175 uint64_t max_tres_limit, out_max_tres_limit;
1176 int i;
1177 uint64_t job_tres;
1178
1179 if (!strict_checking)
1180 return true;
1181
1182 for (i = 0; i < g_tres_count; i++) {
1183 (*tres_pos) = i;
1184 if (grp_tres_array) {
1185 max_tres_limit = MIN(grp_tres_array[i],
1186 max_tres_array[i]);
1187 out_max_tres_limit = MIN(out_grp_tres_array[i],
1188 out_max_tres_array[i]);
1189 } else {
1190 max_tres_limit = max_tres_array[i];
1191 out_max_tres_limit = out_max_tres_array[i];
1192 }
1193
1194 /* we don't need to look at this limit */
1195 if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT)
1196 || (out_max_tres_limit != INFINITE64)
1197 || (max_tres_limit == INFINITE64)
1198 || (job_tres_array[i] && (job_tres_array[i] == NO_VAL64)))
1199 continue;
1200
1201 out_max_tres_array[i] = max_tres_array[i];
1202
1203 job_tres = job_tres_array[i];
1204
1205 if (divisor)
1206 job_tres /= divisor;
1207
1208 if (out_grp_tres_array && grp_tres_array) {
1209 if (out_grp_tres_array[i] == INFINITE64)
1210 out_grp_tres_array[i] = grp_tres_array[i];
1211
1212 if (max_limit) {
1213 if (job_tres > grp_tres_array[i])
1214 return false;
1215 } else if (job_tres < grp_tres_array[i])
1216 return false;
1217 }
1218
1219 if (max_limit) {
1220 if (job_tres > max_tres_array[i])
1221 return false;
1222 } else if (job_tres < max_tres_array[i])
1223 return false;
1224 }
1225
1226 return true;
1227 }
1228
1229 /* Only check the time_limits if the admin didn't set
1230 * the timelimit.
1231 * It is important we look at these even if strict_checking
1232 * isn't set so we get the correct time_limit from the job.
1233 */
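/*
 * Example of the derived time cap (illustrative numbers): with a max_limit
 * of 10000 TRES-minutes and a tres_req_cnt of 100 CPUs, max_time_limit
 * becomes 10000 / 100 = 100 minutes, which is then fed through
 * _set_time_limit() before being compared against the job's request.
 */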
1234 static bool _validate_time_limit(uint32_t *time_limit_in,
1235 uint32_t part_max_time,
1236 uint64_t tres_req_cnt,
1237 uint64_t max_limit,
1238 void *out_max_limit,
1239 uint16_t *limit_set_time,
1240 bool strict_checking,
1241 bool is64)
1242 {
1243 uint32_t max_time_limit;
1244 uint64_t out_max_64 = *(uint64_t *)out_max_limit;
1245 uint32_t out_max_32 = *(uint32_t *)out_max_limit;
1246
1247 if (!tres_req_cnt || (((*time_limit_in) != NO_VAL) &&
1248 (!strict_checking ||
1249 (*limit_set_time) == ADMIN_SET_LIMIT)))
1250 return true;
1251
1252 if (is64) {
1253 if ((out_max_64 != INFINITE64) ||
1254 (max_limit == INFINITE64) ||
1255 (tres_req_cnt == NO_VAL64))
1256 return true;
1257 } else {
1258 if ((out_max_32 != INFINITE) ||
1259 ((uint32_t)max_limit == INFINITE) ||
1260 ((uint32_t)tres_req_cnt == NO_VAL))
1261 return true;
1262 }
1263
1264 max_time_limit = (uint32_t)(max_limit / tres_req_cnt);
1265
1266 _set_time_limit(time_limit_in, part_max_time, max_time_limit,
1267 limit_set_time);
1268
1269 if (is64)
1270 (*(uint64_t *)out_max_limit) = max_limit;
1271 else
1272 (*(uint32_t *)out_max_limit) = (uint32_t)max_limit;
1273
1274 if ((*time_limit_in) > max_time_limit)
1275 return false;
1276
1277 return true;
1278 }
1279
1280 /*
1281  * _validate_tres_time_limits - validate the TRES requested
1282  * against the limits of an association as well as a QOS, skipping any
1283  * limit an admin set
1284  *
1285  * OUT - tres_pos - if false is returned, position in the array of the failed limit
1286 * IN/OUT - time_limit_in - Job's time limit, set and returned based off limits
1287 * if none is given.
1288 * IN - part_max_time - Job's partition max time limit
1289 * IN - job_tres_array - count of various TRES requested by the job
1290 * IN - max_tres_array - Max TRES limits of association/QOS
1291 * OUT - out_max_tres_array - Max TRES limits as set by the various TRES
1292  * OUT - limit_set_time - set if the time_limit was derived from a QOS/Assoc
1293  *       limit rather than requested by the user or set by an admin.
1294 * IN strict_checking - If a limit needs to be enforced now or not.
1295 *
1296 * RET - True if no limit is violated, false otherwise with tres_pos
1297 * being set to the position of the failed limit.
1298 */
1299 static bool _validate_tres_time_limits(
1300 int *tres_pos,
1301 uint32_t *time_limit_in,
1302 uint32_t part_max_time,
1303 uint64_t *job_tres_array,
1304 uint64_t *max_tres_array,
1305 uint64_t *out_max_tres_array,
1306 uint16_t *limit_set_time,
1307 bool strict_checking)
1308 {
1309 int i;
1310 // uint32_t max_time_limit;
1311
1312 if (!strict_checking || (*limit_set_time) == ADMIN_SET_LIMIT)
1313 return true;
1314
1315 for (i = 0; i < g_tres_count; i++) {
1316 (*tres_pos) = i;
1317
1318 if (!_validate_time_limit(time_limit_in, part_max_time,
1319 job_tres_array[i],
1320 max_tres_array[i],
1321 &out_max_tres_array[i],
1322 limit_set_time,
1323 strict_checking, true))
1324 return false;
1325 /* if ((out_max_tres_array[i] != INFINITE64) || */
1326 /* (max_tres_array[i] == INFINITE64) || */
1327 /* (job_tres_array[i] == NO_VAL64) || */
1328 /* (job_tres_array[i] == 0)) */
1329 /* continue; */
1330
1331 /* max_time_limit = (uint32_t)(max_tres_array[i] / */
1332 /* job_tres_array[i]); */
1333
1334 /* _set_time_limit(time_limit_in, */
1335 /* part_max_time, max_time_limit, */
1336 /* limit_set_time); */
1337
1338 /* out_max_tres_array[i] = max_tres_array[i]; */
1339
1340 /* if ((*time_limit_in) > max_time_limit) */
1341 /* return false; */
1342 }
1343
1344 return true;
1345 }
1346
1347 /*
1348 * _validate_tres_usage_limits - validate the TRES requested against
1349 * specified limits; when checking for safe limits, also take into
1350 * consideration already used and currently running TRES resources
1351 *
1352 * OUT - tres_pos - if function returns other than TRES_USAGE_OKAY,
1353 * position in TRES array of failed limit
1354 * IN - tres_limit_array - count of various TRES limits to check against
1355 * OUT - out_tres_limit_array - optional; assigned values from tres_limit_array
1356 * when out_tres_limit_set is true,
1357 * skipped when any of:
1358 * 1) admin_limit_set is set and is an admin
1359 * limit
1360 * 2) out_tres_limit_array is set and its value
1361 * has been changed since initially being set
1362 * to INFINITE64
1363 * 3) tres_limit_array is INFINITE64
1364 * IN - tres_req_cnt - must be set when safe_limits is true; the following
1365 * is checked with tres_req_cnt:
1366 * 1) tres_req_cnt > tres_limit_array,
1367 * return TRES_USAGE_REQ_EXCEEDS_LIMIT
1368 * 2) when tres_usage is set:
1369 * (tres_req_cnt + tres_usage) >
1370 * (tres_limit_array - curr_usage),
1371 * return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
1372 * curr_usage will be 0 when not passed
1373 * IN - tres_usage - TRES (currently running if curr_usage is set, already used
1374 * otherwise) optional; This value is used primarily only if
1375 * safe_limits is true. It will be added to tres_req_cnt to
1376 * count as extra time to observe, see tres_req_cnt section
1377 * above for tres_usage interaction
1378 * IN - curr_usage - TRES (already used) optional; when set, check if:
1379 * 1) curr_usage > tres_limit_array
1380 * return TRES_USAGE_CUR_EXCEEDS_LIMIT
1381 * 2) when safe_limits is true, see tres_req_cnt section
1382 * above for curr_usage interaction
1383 * IN - admin_limit_set - limits that have been overridden by an admin, see
1384 * out_tres_limit_array section above for interaction
1385 * IN - safe_limits - requires tres_req_cnt when true; see tres_req_cnt
1386 * section above for interaction
1387 * IN - out_tres_limit_set - out_tres_limit_array is set as described above
1388 * when true; out_tres_limit_array is not modified when false
1389 * RET - TRES_USAGE_OKAY if no limit is violated, otherwise one of the other
1390 * acct_policy_tres_usage_t enumerations with tres_pos being set to the
1391 * position of the failed limit.
1392 */
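/*
 * Worked example of the safe-limits branch (illustrative numbers): with a
 * limit of 1000 TRES-minutes, curr_usage of 400 already consumed and
 * tres_usage of 300 held by running jobs, a request (tres_req_cnt) of 400
 * is rejected with TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE because
 * 400 + 300 > 1000 - 400.
 */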
1393 static acct_policy_tres_usage_t _validate_tres_usage_limits(
1394 int *tres_pos,
1395 uint64_t *tres_limit_array,
1396 uint64_t *out_tres_limit_array,
1397 uint64_t *tres_req_cnt,
1398 uint64_t *tres_usage,
1399 uint64_t *curr_usage,
1400 uint16_t *admin_limit_set,
1401 bool safe_limits,
1402 bool out_tres_limit_set)
1403 {
1404 int i;
1405 uint64_t usage = 0;
1406
1407 xassert(tres_limit_array);
1408
1409 for (i = 0; i < g_tres_count; i++) {
1410 (*tres_pos) = i;
1411
1412 if ((admin_limit_set &&
1413 admin_limit_set[i] == ADMIN_SET_LIMIT) ||
1414 (out_tres_limit_array &&
1415 out_tres_limit_array[i] != INFINITE64) ||
1416 (tres_limit_array[i] == INFINITE64))
1417 continue;
1418
1419 if (out_tres_limit_set && out_tres_limit_array)
1420 out_tres_limit_array[i] = tres_limit_array[i];
1421
1422 if (curr_usage && (curr_usage[i] >= tres_limit_array[i]))
1423 return TRES_USAGE_CUR_EXCEEDS_LIMIT;
1424
1425 if (safe_limits) {
1426 xassert(tres_req_cnt);
1427 if (tres_req_cnt[i] > tres_limit_array[i])
1428 return TRES_USAGE_REQ_EXCEEDS_LIMIT;
1429
1430 if (curr_usage)
1431 usage = curr_usage[i];
1432 if (tres_usage &&
1433 ((tres_req_cnt[i] + tres_usage[i]) >
1434 (tres_limit_array[i] - usage)))
1435 return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE;
1436 }
1437 }
1438
1439 return TRES_USAGE_OKAY;
1440 }
1441
1442 /*
1443  * _validate_tres_usage_limits_for_qos - validate the TRES requested
1444  * against the limits of a QOS, skipping any limit an admin set
1445  *
1446  * OUT - tres_pos - position in the array of the failed limit when the check fails
1447 * IN - tres_limit_array - TRES limits from an association
1448 * IN/OUT - out_tres_limit_array - TRES limits QOS has imposed already, if a new
1449 * limit is found the limit is filled in.
1450 * IN - tres_req_cnt - TRES requested from the job
1451 * IN - tres_usage - TRES usage from the QOS (in minutes)
1452 * IN - curr_usage - TRES usage in use right now by the QOS (running jobs)
1453 * IN - admin_limit_set - TRES limits that have been overridden by an admin
1454 * IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
1455 *
1456  * RET - TRES_USAGE_OKAY if no limit is violated, otherwise one of the other
1457  * acct_policy_tres_usage_t enumerations with tres_pos set to the failed limit.
1458 */
1459 static acct_policy_tres_usage_t _validate_tres_usage_limits_for_qos(
1460 int *tres_pos,
1461 uint64_t *tres_limit_array,
1462 uint64_t *out_tres_limit_array,
1463 uint64_t *tres_req_cnt,
1464 uint64_t *tres_usage,
1465 uint64_t *curr_usage,
1466 uint16_t *admin_limit_set,
1467 bool safe_limits)
1468 {
1469 return _validate_tres_usage_limits(tres_pos,
1470 tres_limit_array,
1471 out_tres_limit_array,
1472 tres_req_cnt,
1473 tres_usage,
1474 curr_usage,
1475 admin_limit_set,
1476 safe_limits,
1477 true);
1478 }
1479
1480 /*
1481  * _validate_tres_usage_limits_for_assoc - validate the TRES requested
1482  * against the limits of an association as well as a QOS, skipping any
1483  * limit an admin set
1484  *
1485  * OUT - tres_pos - position in the array of the failed limit when the check fails
1486 * IN - tres_limit_array - TRES limits from an association
1487 * IN - qos_tres_limit_array - TRES limits QOS has imposed already
1488 * IN - tres_req_cnt - TRES requested from the job
1489 * IN - tres_usage - TRES usage from the association (in minutes)
1490 * IN - curr_usage - TRES usage in use right now by the assoc (running jobs)
1491 * IN - admin_limit_set - TRES limits that have been overridden by an admin
1492 * IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
1493 *
1494  * RET - TRES_USAGE_OKAY if no limit is violated, otherwise one of the other
1495  * acct_policy_tres_usage_t enumerations with tres_pos set to the failed limit.
1496 */
1497 static acct_policy_tres_usage_t _validate_tres_usage_limits_for_assoc(
1498 int *tres_pos,
1499 uint64_t *tres_limit_array,
1500 uint64_t *qos_tres_limit_array,
1501 uint64_t *tres_req_cnt,
1502 uint64_t *tres_usage,
1503 uint64_t *curr_usage,
1504 uint16_t *admin_limit_set,
1505 bool safe_limits)
1506 {
1507 return _validate_tres_usage_limits(tres_pos,
1508 tres_limit_array,
1509 qos_tres_limit_array,
1510 tres_req_cnt,
1511 tres_usage,
1512 curr_usage,
1513 admin_limit_set,
1514 safe_limits,
1515 false);
1516 }
1517
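/*
 * Validate a job request against a single QOS: per-account, per-user,
 * group and per-job TRES limits plus the related time limits. Limits
 * already recorded in qos_out_ptr are skipped so that a lower priority
 * QOS (e.g. a partition QOS) does not re-apply a limit that a higher
 * priority QOS has already imposed.
 */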
1518 static int _qos_policy_validate(job_desc_msg_t *job_desc,
1519 slurmdb_assoc_rec_t *assoc_ptr,
1520 part_record_t *part_ptr,
1521 slurmdb_qos_rec_t *qos_ptr,
1522 slurmdb_qos_rec_t *qos_out_ptr,
1523 uint32_t *reason,
1524 acct_policy_limit_set_t *acct_policy_limit_set,
1525 bool update_call,
1526 char *user_name,
1527 int job_cnt,
1528 bool strict_checking)
1529 {
1530 int rc = true;
1531 int tres_pos = 0;
1532
1533 if (!qos_ptr || !qos_out_ptr)
1534 return rc;
1535
1536 if (!_validate_tres_limits_for_qos(&tres_pos,
1537 job_desc->tres_req_cnt, 0,
1538 NULL,
1539 qos_ptr->max_tres_pa_ctld,
1540 NULL,
1541 qos_out_ptr->max_tres_pa_ctld,
1542 acct_policy_limit_set->tres,
1543 strict_checking, 1)) {
1544 if (job_desc->tres_req_cnt[tres_pos] >
1545 qos_ptr->max_tres_pa_ctld[tres_pos]) {
1546 if (reason)
1547 *reason = _get_tres_state_reason(
1548 tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
1549
1550 debug2("job submit for user %s(%u): "
1551 "min tres(%s) request %"PRIu64" exceeds "
1552 "per-acct max tres limit %"PRIu64" for qos '%s'",
1553 user_name,
1554 job_desc->user_id,
1555 assoc_mgr_tres_name_array[tres_pos],
1556 job_desc->tres_req_cnt[tres_pos],
1557 qos_ptr->max_tres_pa_ctld[tres_pos],
1558 qos_ptr->name);
1559 rc = false;
1560 goto end_it;
1561 }
1562 }
1563
1564 if (!_validate_tres_limits_for_qos(&tres_pos,
1565 job_desc->tres_req_cnt, 0,
1566 qos_ptr->grp_tres_ctld,
1567 qos_ptr->max_tres_pu_ctld,
1568 qos_out_ptr->grp_tres_ctld,
1569 qos_out_ptr->max_tres_pu_ctld,
1570 acct_policy_limit_set->tres,
1571 strict_checking, 1)) {
1572 if (job_desc->tres_req_cnt[tres_pos] >
1573 qos_ptr->max_tres_pu_ctld[tres_pos]) {
1574 if (reason)
1575 *reason = _get_tres_state_reason(
1576 tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
1577
1578 debug2("job submit for user %s(%u): "
1579 "min tres(%s) request %"PRIu64" exceeds "
1580 "per-user max tres limit %"PRIu64" for qos '%s'",
1581 user_name,
1582 job_desc->user_id,
1583 assoc_mgr_tres_name_array[tres_pos],
1584 job_desc->tres_req_cnt[tres_pos],
1585 qos_ptr->max_tres_pu_ctld[tres_pos],
1586 qos_ptr->name);
1587 rc = false;
1588 goto end_it;
1589 } else if (job_desc->tres_req_cnt[tres_pos] >
1590 qos_ptr->grp_tres_ctld[tres_pos]) {
1591 if (reason)
1592 *reason = _get_tres_state_reason(
1593 tres_pos, WAIT_QOS_GRP_UNK);
1594
1595 debug2("job submit for user %s(%u): "
1596 "min tres(%s) request %"PRIu64" exceeds "
1597 "group max tres limit %"PRIu64" for qos '%s'",
1598 user_name,
1599 job_desc->user_id,
1600 assoc_mgr_tres_name_array[tres_pos],
1601 job_desc->tres_req_cnt[tres_pos],
1602 qos_ptr->grp_tres_ctld[tres_pos],
1603 qos_ptr->name);
1604 rc = false;
1605 goto end_it;
1606 }
1607 }
1608
1609 /* for validation we don't need to look at
1610 * qos_ptr->grp_jobs.
1611 */
1612
1613 if ((qos_out_ptr->grp_submit_jobs == INFINITE) &&
1614 (qos_ptr->grp_submit_jobs != INFINITE)) {
1615
1616 qos_out_ptr->grp_submit_jobs = qos_ptr->grp_submit_jobs;
1617
1618 if ((qos_ptr->usage->grp_used_submit_jobs + job_cnt)
1619 > qos_ptr->grp_submit_jobs) {
1620 if (reason)
1621 *reason = WAIT_QOS_GRP_SUB_JOB;
1622 debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1623 user_name,
1624 job_desc->user_id,
1625 qos_ptr->grp_submit_jobs,
1626 qos_ptr->usage->grp_used_submit_jobs, job_cnt,
1627 qos_ptr->name);
1628 rc = false;
1629 goto end_it;
1630 }
1631 }
1632
1633 /* Only check the time_limits if the admin didn't set the timelimit.
1634 * It is important we look at these even if strict_checking
1635 * isn't set so we get the correct time_limit from the job.
1636 */
1637 if (acct_policy_limit_set->time != ADMIN_SET_LIMIT) {
1638 if (!_validate_tres_time_limits(
1639 &tres_pos,
1640 &job_desc->time_limit,
1641 part_ptr->max_time,
1642 job_desc->tres_req_cnt,
1643 qos_ptr->max_tres_mins_pj_ctld,
1644 qos_out_ptr->max_tres_mins_pj_ctld,
1645 &acct_policy_limit_set->time,
1646 strict_checking)) {
1647 if (reason)
1648 *reason = _get_tres_state_reason(
1649 tres_pos,
1650 WAIT_QOS_MAX_UNK_MINS_PER_JOB);
1651 debug2("job submit for user %s(%u): "
1652 "tres(%s) time limit request %"PRIu64" "
1653 "exceeds max per-job limit %"PRIu64" "
1654 "for qos '%s'",
1655 user_name,
1656 job_desc->user_id,
1657 assoc_mgr_tres_name_array[tres_pos],
1658 ((uint64_t)job_desc->time_limit *
1659 job_desc->tres_req_cnt[tres_pos]),
1660 qos_ptr->max_tres_mins_pj_ctld[tres_pos],
1661 qos_ptr->name);
1662 rc = false;
1663 goto end_it;
1664 }
1665
1666 if (!_validate_tres_time_limits(
1667 &tres_pos,
1668 &job_desc->time_limit,
1669 part_ptr->max_time,
1670 job_desc->tres_req_cnt,
1671 qos_ptr->grp_tres_mins_ctld,
1672 qos_out_ptr->grp_tres_mins_ctld,
1673 &acct_policy_limit_set->time,
1674 strict_checking)) {
1675 if (reason)
1676 *reason = _get_tres_state_reason(
1677 tres_pos, WAIT_QOS_GRP_UNK_MIN);
1678 debug2("job submit for user %s(%u): "
1679 "tres(%s) time limit request %"PRIu64" "
1680 "exceeds group max limit %"PRIu64" "
1681 "for qos '%s'",
1682 user_name,
1683 job_desc->user_id,
1684 assoc_mgr_tres_name_array[tres_pos],
1685 ((uint64_t)job_desc->time_limit *
1686 job_desc->tres_req_cnt[tres_pos]),
1687 qos_ptr->grp_tres_mins_ctld[tres_pos],
1688 qos_ptr->name);
1689 rc = false;
1690 goto end_it;
1691 }
1692
1693 if (!_validate_tres_time_limits(
1694 &tres_pos,
1695 &job_desc->time_limit,
1696 part_ptr->max_time,
1697 job_desc->tres_req_cnt,
1698 qos_ptr->grp_tres_run_mins_ctld,
1699 qos_out_ptr->grp_tres_run_mins_ctld,
1700 &acct_policy_limit_set->time,
1701 strict_checking)) {
1702 if (reason)
1703 *reason = _get_tres_state_reason(
1704 tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
1705 debug2("job submit for user %s(%u): "
1706 "tres(%s) time limit request %"PRIu64" "
1707 "exceeds group max running limit %"PRIu64" "
1708 "for qos '%s'",
1709 user_name,
1710 job_desc->user_id,
1711 assoc_mgr_tres_name_array[tres_pos],
1712 ((uint64_t)job_desc->time_limit *
1713 job_desc->tres_req_cnt[tres_pos]),
1714 qos_ptr->grp_tres_run_mins_ctld[tres_pos],
1715 qos_ptr->name);
1716 rc = false;
1717 goto end_it;
1718 }
1719
1720 if ((qos_out_ptr->max_wall_pj == INFINITE) &&
1721 (qos_ptr->max_wall_pj != INFINITE) &&
1722 (!update_call || (job_desc->time_limit != NO_VAL))) {
1723 _set_time_limit(&job_desc->time_limit,
1724 part_ptr->max_time,
1725 qos_ptr->max_wall_pj,
1726 &acct_policy_limit_set->time);
1727 qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;
1728
1729 if (strict_checking
1730 && job_desc->time_limit > qos_ptr->max_wall_pj) {
1731 if (reason)
1732 *reason = WAIT_QOS_MAX_WALL_PER_JOB;
1733 debug2("job submit for user %s(%u): "
1734 "time limit %u exceeds qos max %u",
1735 user_name,
1736 job_desc->user_id,
1737 job_desc->time_limit,
1738 qos_ptr->max_wall_pj);
1739 rc = false;
1740 goto end_it;
1741 }
1742 }
1743
1744 if ((qos_out_ptr->grp_wall == INFINITE) &&
1745 (qos_ptr->grp_wall != INFINITE) &&
1746 (!update_call || (job_desc->time_limit != NO_VAL))) {
1747 _set_time_limit(&job_desc->time_limit,
1748 part_ptr->max_time,
1749 qos_ptr->grp_wall,
1750 &acct_policy_limit_set->time);
1751
1752 qos_out_ptr->grp_wall = qos_ptr->grp_wall;
1753
1754 if (strict_checking
1755 && job_desc->time_limit > qos_ptr->grp_wall) {
1756 if (reason)
1757 *reason = WAIT_QOS_GRP_WALL;
1758 debug2("job submit for user %s(%u): "
1759 "time limit %u exceeds qos grp max %u",
1760 user_name,
1761 job_desc->user_id,
1762 job_desc->time_limit,
1763 qos_ptr->grp_wall);
1764 rc = false;
1765 goto end_it;
1766 }
1767 }
1768 }
1769
1770 if (!_validate_tres_limits_for_qos(&tres_pos,
1771 job_desc->tres_req_cnt, 0,
1772 NULL,
1773 qos_ptr->max_tres_pj_ctld,
1774 NULL,
1775 qos_out_ptr->max_tres_pj_ctld,
1776 acct_policy_limit_set->tres,
1777 strict_checking, 1)) {
1778 if (reason)
1779 *reason = _get_tres_state_reason(
1780 tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
1781
1782 debug2("job submit for user %s(%u): "
1783 "min tres(%s) request %"PRIu64" exceeds "
1784 "per-job max tres limit %"PRIu64" for qos '%s'",
1785 user_name,
1786 job_desc->user_id,
1787 assoc_mgr_tres_name_array[tres_pos],
1788 job_desc->tres_req_cnt[tres_pos],
1789 qos_ptr->max_tres_pj_ctld[tres_pos],
1790 qos_ptr->name);
1791 rc = false;
1792 goto end_it;
1793 }
1794
1795 if (!_validate_tres_limits_for_qos(&tres_pos,
1796 job_desc->tres_req_cnt,
1797 job_desc->tres_req_cnt[
1798 TRES_ARRAY_NODE],
1799 NULL,
1800 qos_ptr->max_tres_pn_ctld,
1801 NULL,
1802 qos_out_ptr->max_tres_pn_ctld,
1803 acct_policy_limit_set->tres,
1804 strict_checking, 1)) {
1805 if (reason)
1806 *reason = _get_tres_state_reason(
1807 tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);
1808
1809 debug2("job submit for user %s(%u): "
1810 "min tres(%s) request %"PRIu64" exceeds "
1811 "per-node max tres limit %"PRIu64" for qos '%s'",
1812 user_name,
1813 job_desc->user_id,
1814 assoc_mgr_tres_name_array[tres_pos],
1815 job_desc->tres_req_cnt[tres_pos] /
1816 job_desc->tres_req_cnt[TRES_ARRAY_NODE],
1817 qos_ptr->max_tres_pn_ctld[tres_pos],
1818 qos_ptr->name);
1819 rc = false;
1820 goto end_it;
1821 }
1822
1823 /* for validation we don't need to look at
1824 * qos_ptr->max_jobs.
1825 */
1826
1827 if ((qos_out_ptr->max_submit_jobs_pa == INFINITE) &&
1828 (qos_ptr->max_submit_jobs_pa != INFINITE)) {
1829 slurmdb_used_limits_t *used_limits =
1830 acct_policy_get_acct_used_limits(
1831 &qos_ptr->usage->acct_limit_list,
1832 assoc_ptr->acct);
1833
1834 qos_out_ptr->max_submit_jobs_pa = qos_ptr->max_submit_jobs_pa;
1835
1836 if ((used_limits->submit_jobs + job_cnt) >
1837 qos_ptr->max_submit_jobs_pa) {
1838 if (reason)
1839 *reason = WAIT_QOS_MAX_SUB_JOB_PER_ACCT;
1840 debug2("job submit for account %s: qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1841 assoc_ptr->acct,
1842 qos_ptr->max_submit_jobs_pa,
1843 used_limits->submit_jobs, job_cnt,
1844 qos_ptr->name);
1845 rc = false;
1846 goto end_it;
1847 }
1848 }
1849
1850 if ((qos_out_ptr->max_submit_jobs_pu == INFINITE) &&
1851 (qos_ptr->max_submit_jobs_pu != INFINITE)) {
1852 slurmdb_used_limits_t *used_limits =
1853 acct_policy_get_user_used_limits(
1854 &qos_ptr->usage->user_limit_list,
1855 job_desc->user_id);
1856
1857 qos_out_ptr->max_submit_jobs_pu = qos_ptr->max_submit_jobs_pu;
1858
1859 if ((used_limits->submit_jobs + job_cnt) >
1860 qos_ptr->max_submit_jobs_pu) {
1861 if (reason)
1862 *reason = WAIT_QOS_MAX_SUB_JOB;
1863 debug2("job submit for user %s(%u): qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1864 user_name,
1865 job_desc->user_id,
1866 qos_ptr->max_submit_jobs_pu,
1867 used_limits->submit_jobs, job_cnt,
1868 qos_ptr->name);
1869 rc = false;
1870 goto end_it;
1871 }
1872 }
1873
1874 if (!_validate_tres_limits_for_qos(&tres_pos,
1875 job_desc->tres_req_cnt, 0,
1876 NULL,
1877 qos_ptr->min_tres_pj_ctld,
1878 NULL,
1879 qos_out_ptr->min_tres_pj_ctld,
1880 acct_policy_limit_set->tres,
1881 strict_checking, 0)) {
1882 if (reason)
1883 *reason = _get_tres_state_reason(
1884 tres_pos, WAIT_QOS_MIN_UNK);
1885
1886 debug2("job submit for user %s(%u): "
1887 		       "min tres(%s) request %"PRIu64" is less than "
1888 		       "per-job min tres limit %"PRIu64" for qos '%s'",
1889 user_name,
1890 job_desc->user_id,
1891 assoc_mgr_tres_name_array[tres_pos],
1892 job_desc->tres_req_cnt[tres_pos],
1893 qos_ptr->min_tres_pj_ctld[tres_pos],
1894 qos_ptr->name);
1895 rc = false;
1896 goto end_it;
1897 }
1898
1899 end_it:
1900 return rc;
1901 }
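/*
 * Editor's note (summary added for review, not in the original source):
 * the submit-time checks above run per QOS in a fixed order: per-account
 * and per-user TRES maxima, group TRES maxima, group submit-job count,
 * TRES time limits (per-job minutes, group minutes, group running
 * minutes), max wall per job, group wall, per-job and per-node TRES
 * maxima, per-account and per-user submit-job counts, and finally the
 * per-job TRES minima.  Each limit that is applied is also recorded in
 * qos_out_ptr so that a lower-priority QOS checked afterwards skips it.
 */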
1902
1903 static int _qos_job_runnable_pre_select(job_record_t *job_ptr,
1904 slurmdb_qos_rec_t *qos_ptr,
1905 slurmdb_qos_rec_t *qos_out_ptr)
1906 {
1907 uint32_t wall_mins;
1908 uint32_t time_limit = NO_VAL;
1909 int rc = true;
1910 slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
1911 bool safe_limits = false;
1912 slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
1913
1914 if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
1915 return rc;
1916
1917 /*
1918 * check to see if we should be using safe limits, if so we
1919 * will only start a job if there are sufficient remaining
1920 * cpu-minutes for it to run to completion
1921 */
1922 if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
1923 safe_limits = true;
1924
1925 wall_mins = qos_ptr->usage->grp_used_wall / 60;
1926
1927 used_limits_a = acct_policy_get_acct_used_limits(
1928 &qos_ptr->usage->acct_limit_list,
1929 assoc_ptr->acct);
1930
1931 used_limits = acct_policy_get_user_used_limits(
1932 &qos_ptr->usage->user_limit_list,
1933 job_ptr->user_id);
1934
1935
1936 /* we don't need to check grp_tres_mins here */
1937
1938 /* we don't need to check grp_tres here */
1939
1940 /* we don't need to check grp_mem here */
1941 if ((qos_out_ptr->grp_jobs == INFINITE) &&
1942 (qos_ptr->grp_jobs != INFINITE)) {
1943
1944 qos_out_ptr->grp_jobs = qos_ptr->grp_jobs;
1945
1946 if (qos_ptr->usage->grp_used_jobs >= qos_ptr->grp_jobs) {
1947 xfree(job_ptr->state_desc);
1948 job_ptr->state_reason = WAIT_QOS_GRP_JOB;
1949 debug2("%pJ being held, the job is at or exceeds group max jobs limit %u with %u for QOS %s",
1950 job_ptr, qos_ptr->grp_jobs,
1951 qos_ptr->usage->grp_used_jobs, qos_ptr->name);
1952
1953 rc = false;
1954 goto end_it;
1955 }
1956 }
1957
1958 /* we don't need to check grp_tres_run_mins here */
1959
1960 /* we don't need to check grp_nodes here */
1961
1962 /* we don't need to check submit_jobs here */
1963
1964 if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
1965 && (qos_out_ptr->grp_wall == INFINITE)
1966 && (qos_ptr->grp_wall != INFINITE)) {
1967 if (time_limit == NO_VAL) {
1968 time_limit = job_ptr->time_limit;
1969 _set_time_limit(&time_limit,
1970 job_ptr->part_ptr->max_time,
1971 MIN(qos_ptr->grp_wall,
1972 qos_ptr->max_wall_pj),
1973 &job_ptr->limit_set.time);
1974
1975 /* Account for usage factor, if necessary */
1976 if ((job_ptr->qos_ptr &&
1977 (job_ptr->qos_ptr->flags &
1978 QOS_FLAG_USAGE_FACTOR_SAFE) &&
1979 (job_ptr->qos_ptr->usage_factor >= 0)) &&
1980 ((time_limit != INFINITE) ||
1981 (job_ptr->qos_ptr->usage_factor < 1.0))) {
1982 time_limit *= job_ptr->qos_ptr->usage_factor;
1983 }
1984 }
1985
1986 qos_out_ptr->grp_wall = qos_ptr->grp_wall;
1987
1988 if (wall_mins >= qos_ptr->grp_wall) {
1989 xfree(job_ptr->state_desc);
1990 job_ptr->state_reason = WAIT_QOS_GRP_WALL;
1991 debug2("%pJ being held, the job is at or exceeds group wall limit %u with %u for QOS %s",
1992 job_ptr, qos_ptr->grp_wall,
1993 wall_mins, qos_ptr->name);
1994 rc = false;
1995 goto end_it;
1996 } else if (safe_limits &&
1997 ((wall_mins + time_limit) > qos_ptr->grp_wall)) {
1998 xfree(job_ptr->state_desc);
1999 job_ptr->state_reason = WAIT_QOS_GRP_WALL;
2000 debug2("%pJ being held, the job request will exceed group wall limit %u if ran with %u for QOS %s",
2001 job_ptr, qos_ptr->grp_wall,
2002 wall_mins + time_limit, qos_ptr->name);
2003 rc = false;
2004 goto end_it;
2005 }
2006 }
2007
2008 /* we don't need to check max_tres_mins_pj here */
2009
2010 /* we don't need to check max_tres_pj here */
2011
2012 /* we don't need to check max_tres_pn here */
2013
2014 /* we don't need to check min_tres_pj here */
2015
2016 /* we don't need to check max_tres_pa here */
2017
2018 /* we don't need to check max_tres_pu here */
2019
2020 if ((qos_out_ptr->max_jobs_pa == INFINITE)
2021 && (qos_ptr->max_jobs_pa != INFINITE)) {
2022
2023 qos_out_ptr->max_jobs_pa = qos_ptr->max_jobs_pa;
2024
2025 if (used_limits_a->jobs >= qos_ptr->max_jobs_pa) {
2026 xfree(job_ptr->state_desc);
2027 job_ptr->state_reason =
2028 WAIT_QOS_MAX_JOB_PER_ACCT;
2029 debug2("%pJ being held, the job is at or exceeds max jobs per-acct (%s) limit %u with %u for QOS %s",
2030 job_ptr, used_limits_a->acct,
2031 qos_ptr->max_jobs_pa,
2032 used_limits_a->jobs, qos_ptr->name);
2033 rc = false;
2034 goto end_it;
2035 }
2036 }
2037
2038 if ((qos_out_ptr->max_jobs_pu == INFINITE)
2039 && (qos_ptr->max_jobs_pu != INFINITE)) {
2040
2041 qos_out_ptr->max_jobs_pu = qos_ptr->max_jobs_pu;
2042
2043 if (used_limits->jobs >= qos_ptr->max_jobs_pu) {
2044 xfree(job_ptr->state_desc);
2045 job_ptr->state_reason =
2046 WAIT_QOS_MAX_JOB_PER_USER;
2047 debug2("%pJ being held, the job is at or exceeds max jobs per-user limit %u with %u for QOS %s",
2048 job_ptr, qos_ptr->max_jobs_pu,
2049 used_limits->jobs, qos_ptr->name);
2050 rc = false;
2051 goto end_it;
2052 }
2053 }
2054
2055 /* we don't need to check submit_jobs_pa here */
2056
2057 /* we don't need to check submit_jobs_pu here */
2058
2059 /*
2060 * if the QOS limits have changed since job
2061 	 * submission and the job can no longer run, then kill it
2062 */
2063 if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
2064 && (qos_out_ptr->max_wall_pj == INFINITE)
2065 && (qos_ptr->max_wall_pj != INFINITE)) {
2066 if (time_limit == NO_VAL) {
2067 time_limit = job_ptr->time_limit;
2068 _set_time_limit(&time_limit,
2069 job_ptr->part_ptr->max_time,
2070 qos_ptr->max_wall_pj,
2071 &job_ptr->limit_set.time);
2072 }
2073
2074 /* Account for usage factor, if necessary */
2075 if ((job_ptr->qos_ptr &&
2076 (job_ptr->qos_ptr->flags &
2077 QOS_FLAG_USAGE_FACTOR_SAFE) &&
2078 (job_ptr->qos_ptr->usage_factor >= 0)) &&
2079 ((time_limit != INFINITE) ||
2080 (job_ptr->qos_ptr->usage_factor < 1.0))) {
2081 time_limit *= job_ptr->qos_ptr->usage_factor;
2082 }
2083
2084 qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;
2085
2086 if (time_limit > qos_out_ptr->max_wall_pj) {
2087 xfree(job_ptr->state_desc);
2088 job_ptr->state_reason =
2089 WAIT_QOS_MAX_WALL_PER_JOB;
2090 debug2("%pJ being held, time limit %u exceeds QOS max wall pj %u",
2091 job_ptr, time_limit, qos_out_ptr->max_wall_pj);
2092 rc = false;
2093 goto end_it;
2094 }
2095 }
2096 end_it:
2097
2098 return rc;
2099 }
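/*
 * Editor's note (illustrative sketch, not in the original source): the
 * qos_out_ptr record acts as a scoreboard.  Every field starts at INFINITE
 * and is overwritten the first time the matching limit is enforced, so when
 * the caller runs this check for both the job QOS and the partition QOS the
 * second call silently skips limits the first one already covered, roughly:
 *
 *	slurmdb_qos_rec_t qos_out;
 *	slurmdb_init_qos_rec(&qos_out, 0, INFINITE);
 *	assoc_mgr_set_qos_tres_cnt(&qos_out);
 *	if (_qos_job_runnable_pre_select(job_ptr, qos_ptr_1, &qos_out) &&
 *	    _qos_job_runnable_pre_select(job_ptr, qos_ptr_2, &qos_out))
 *		;	/- neither QOS holds the job at this stage -/
 *
 * (The real caller also holds the assoc_mgr read locks while doing this.)
 */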
2100
2101 static int _qos_job_runnable_post_select(job_record_t *job_ptr,
2102 slurmdb_qos_rec_t *qos_ptr,
2103 slurmdb_qos_rec_t *qos_out_ptr,
2104 uint64_t *tres_req_cnt,
2105 uint64_t *job_tres_time_limit)
2106 {
2107 uint64_t tres_usage_mins[slurmctld_tres_cnt];
2108 uint64_t tres_run_mins[slurmctld_tres_cnt];
2109 uint64_t orig_node_cnt;
2110 slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
2111 bool safe_limits = false;
2112 int rc = true, i, tres_pos = 0;
2113 acct_policy_tres_usage_t tres_usage;
2114 slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
2115 double usage_factor = 1.0;
2116
2117 if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
2118 return rc;
2119
2120 /*
2121 	 * check to see if we should be using safe limits, if so we will only
2122 	 * start a job if there are sufficient remaining cpu-minutes
2123 * for it to run to completion
2124 */
2125 if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
2126 safe_limits = true;
2127
2128 /* clang needs this memset to avoid a warning */
2129 memset(tres_run_mins, 0, sizeof(tres_run_mins));
2130 memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
2131 if (job_ptr->qos_ptr &&
2132 (job_ptr->qos_ptr->usage_factor >= 0))
2133 usage_factor = job_ptr->qos_ptr->usage_factor;
2134 for (i=0; i<slurmctld_tres_cnt; i++) {
2135 tres_run_mins[i] =
2136 qos_ptr->usage->grp_used_tres_run_secs[i] / 60;
2137 tres_usage_mins[i] =
2138 (uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);
2139
2140 /*
2141 * Clear usage if factor is 0 so that jobs can run. Otherwise
2142 * multiplying can cause more jobs to be run than the limit
2143 * allows (e.g. usagefactor=.5).
2144 */
2145 if (usage_factor == 0.0) {
2146 tres_run_mins[i] *= usage_factor;
2147 tres_usage_mins[i] *= usage_factor;
2148 }
2149 }
2150
2151 used_limits_a = acct_policy_get_acct_used_limits(
2152 &qos_ptr->usage->acct_limit_list,
2153 assoc_ptr->acct);
2154
2155 used_limits = acct_policy_get_user_used_limits(
2156 &qos_ptr->usage->user_limit_list,
2157 job_ptr->user_id);
2158
2159 tres_usage = _validate_tres_usage_limits_for_qos(
2160 &tres_pos, qos_ptr->grp_tres_mins_ctld,
2161 qos_out_ptr->grp_tres_mins_ctld, job_tres_time_limit,
2162 tres_run_mins, tres_usage_mins, job_ptr->limit_set.tres,
2163 safe_limits);
2164 switch (tres_usage) {
2165 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2166 xfree(job_ptr->state_desc);
2167 job_ptr->state_reason = _get_tres_state_reason(
2168 tres_pos, WAIT_QOS_GRP_UNK_MIN);
2169 debug2("%pJ being held, QOS %s group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
2170 job_ptr, qos_ptr->name,
2171 assoc_mgr_tres_name_array[tres_pos],
2172 qos_ptr->grp_tres_mins_ctld[tres_pos],
2173 tres_usage_mins[tres_pos]);
2174 rc = false;
2175 goto end_it;
2176 break;
2177 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2178 xfree(job_ptr->state_desc);
2179 job_ptr->state_reason = _get_tres_state_reason(
2180 tres_pos, WAIT_QOS_GRP_UNK_MIN);
2181 debug2("%pJ being held, the job is requesting more than allowed with QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2182 job_ptr, qos_ptr->name,
2183 assoc_mgr_tres_name_array[tres_pos],
2184 qos_ptr->grp_tres_mins_ctld[tres_pos],
2185 job_tres_time_limit[tres_pos]);
2186 rc = false;
2187 goto end_it;
2188 break;
2189 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2190 /*
2191 * If we're using safe limits start
2192 * the job only if there are
2193 * sufficient cpu-mins left such that
2194 * it will run to completion without
2195 * being killed
2196 */
2197 xfree(job_ptr->state_desc);
2198 job_ptr->state_reason = _get_tres_state_reason(
2199 tres_pos, WAIT_QOS_GRP_UNK_MIN);
2200 debug2("%pJ being held, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
2201 job_ptr, qos_ptr->name,
2202 assoc_mgr_tres_name_array[tres_pos],
2203 qos_ptr->grp_tres_mins_ctld[tres_pos],
2204 qos_ptr->grp_tres_mins_ctld[tres_pos] -
2205 tres_usage_mins[tres_pos],
2206 job_tres_time_limit[tres_pos],
2207 tres_run_mins[tres_pos],
2208 tres_req_cnt[tres_pos]);
2209 rc = false;
2210 goto end_it;
2211 break;
2212 case TRES_USAGE_OKAY:
2213 /* all good */
2214 break;
2215 }
2216
2217 /*
2218 	 * If the job's TRES limits weren't administratively set and the QOS
2219 	 * has group TRES limits, hold the job if its request (counting only
2220 	 * nodes not already charged to the group) would exceed those limits
2221 */
2222 orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2223 _get_unique_job_node_cnt(job_ptr, qos_ptr->usage->grp_node_bitmap,
2224 &tres_req_cnt[TRES_ARRAY_NODE]);
2225 tres_usage = _validate_tres_usage_limits_for_qos(
2226 &tres_pos,
2227 qos_ptr->grp_tres_ctld, qos_out_ptr->grp_tres_ctld,
2228 tres_req_cnt, qos_ptr->usage->grp_used_tres,
2229 NULL, job_ptr->limit_set.tres, true);
2230 tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2231 switch (tres_usage) {
2232 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2233 /* not possible because the curr_usage sent in is NULL */
2234 break;
2235 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2236 xfree(job_ptr->state_desc);
2237 job_ptr->state_reason = _get_tres_state_reason(
2238 tres_pos, WAIT_QOS_GRP_UNK);
2239 debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
2240 job_ptr, qos_ptr->name,
2241 assoc_mgr_tres_name_array[tres_pos],
2242 tres_req_cnt[tres_pos],
2243 qos_ptr->grp_tres_ctld[tres_pos]);
2244 rc = false;
2245 goto end_it;
2246 break;
2247 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2248 xfree(job_ptr->state_desc);
2249 job_ptr->state_reason = _get_tres_state_reason(
2250 tres_pos, WAIT_QOS_GRP_UNK);
2251 debug2("%pJ being held, if allowed the job request will exceed QOS %s group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2252 job_ptr, qos_ptr->name,
2253 assoc_mgr_tres_name_array[tres_pos],
2254 qos_ptr->grp_tres_ctld[tres_pos],
2255 qos_ptr->usage->grp_used_tres[tres_pos],
2256 tres_req_cnt[tres_pos]);
2257 rc = false;
2258 goto end_it;
2259 case TRES_USAGE_OKAY:
2260 /* all good */
2261 break;
2262 }
2263
2264 /* we don't need to check grp_jobs here */
2265
2266 tres_usage = _validate_tres_usage_limits_for_qos(
2267 &tres_pos,
2268 qos_ptr->grp_tres_run_mins_ctld,
2269 qos_out_ptr->grp_tres_run_mins_ctld,
2270 job_tres_time_limit, tres_run_mins, NULL, NULL, true);
2271 switch (tres_usage) {
2272 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2273 /* not possible because the curr_usage sent in is NULL */
2274 break;
2275 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2276 xfree(job_ptr->state_desc);
2277 job_ptr->state_reason = _get_tres_state_reason(
2278 tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
2279 debug2("%pJ is being held, QOS %s group max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
2280 job_ptr, qos_ptr->name,
2281 assoc_mgr_tres_name_array[tres_pos],
2282 job_tres_time_limit[tres_pos],
2283 qos_ptr->grp_tres_run_mins_ctld[tres_pos]);
2284 rc = false;
2285 goto end_it;
2286 break;
2287 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2288 xfree(job_ptr->state_desc);
2289 job_ptr->state_reason = _get_tres_state_reason(
2290 tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
2291 debug2("%pJ being held, if allowed the job request will exceed QOS %s group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2292 job_ptr, qos_ptr->name,
2293 assoc_mgr_tres_name_array[tres_pos],
2294 qos_ptr->grp_tres_run_mins_ctld[tres_pos],
2295 tres_run_mins[tres_pos],
2296 job_tres_time_limit[tres_pos]);
2297 rc = false;
2298 goto end_it;
2299 break;
2300 case TRES_USAGE_OKAY:
2301 /* all good */
2302 break;
2303 }
2304
2305 /* we don't need to check submit_jobs here */
2306
2307 /* we don't need to check grp_wall here */
2308
2309 if (!_validate_tres_limits_for_qos(&tres_pos,
2310 job_tres_time_limit, 0,
2311 NULL,
2312 qos_ptr->max_tres_mins_pj_ctld,
2313 NULL,
2314 qos_out_ptr->max_tres_mins_pj_ctld,
2315 job_ptr->limit_set.tres,
2316 1, 1)) {
2317 xfree(job_ptr->state_desc);
2318 job_ptr->state_reason = _get_tres_state_reason(
2319 tres_pos, WAIT_QOS_MAX_UNK_MINS_PER_JOB);
2320 debug2("%pJ being held, the job is requesting more than allowed with QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2321 job_ptr, qos_ptr->name,
2322 assoc_mgr_tres_name_array[tres_pos],
2323 qos_ptr->max_tres_mins_pj_ctld[tres_pos],
2324 job_tres_time_limit[tres_pos]);
2325 rc = false;
2326 goto end_it;
2327 }
2328
2329 if (!_validate_tres_limits_for_qos(&tres_pos,
2330 tres_req_cnt, 0,
2331 NULL,
2332 qos_ptr->max_tres_pj_ctld,
2333 NULL,
2334 qos_out_ptr->max_tres_pj_ctld,
2335 job_ptr->limit_set.tres,
2336 1, 1)) {
2337 xfree(job_ptr->state_desc);
2338 job_ptr->state_reason = _get_tres_state_reason(
2339 tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
2340 debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" exceeds max tres limit %"PRIu64,
2341 job_ptr, qos_ptr->name,
2342 assoc_mgr_tres_name_array[tres_pos],
2343 tres_req_cnt[tres_pos],
2344 qos_ptr->max_tres_pj_ctld[tres_pos]);
2345 rc = false;
2346 goto end_it;
2347 }
2348
2349 if (!_validate_tres_limits_for_qos(&tres_pos,
2350 tres_req_cnt,
2351 tres_req_cnt[TRES_ARRAY_NODE],
2352 NULL,
2353 qos_ptr->max_tres_pn_ctld,
2354 NULL,
2355 qos_out_ptr->max_tres_pn_ctld,
2356 job_ptr->limit_set.tres,
2357 1, 1)) {
2358 uint64_t req_per_node;
2359 xfree(job_ptr->state_desc);
2360 job_ptr->state_reason = _get_tres_state_reason(
2361 tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);
2362 req_per_node = tres_req_cnt[tres_pos];
2363 if (tres_req_cnt[TRES_ARRAY_NODE] > 1)
2364 req_per_node /= tres_req_cnt[TRES_ARRAY_NODE];
2365 debug2("%pJ is being held, QOS %s min tres(%s) per node request %"PRIu64" exceeds max tres limit %"PRIu64,
2366 job_ptr, qos_ptr->name,
2367 assoc_mgr_tres_name_array[tres_pos],
2368 req_per_node,
2369 qos_ptr->max_tres_pn_ctld[tres_pos]);
2370 rc = false;
2371 goto end_it;
2372 }
2373
2374 if (!_validate_tres_limits_for_qos(&tres_pos,
2375 tres_req_cnt, 0,
2376 NULL,
2377 qos_ptr->min_tres_pj_ctld,
2378 NULL,
2379 qos_out_ptr->min_tres_pj_ctld,
2380 job_ptr->limit_set.tres,
2381 1, 0)) {
2382 xfree(job_ptr->state_desc);
2383 job_ptr->state_reason = _get_tres_state_reason(
2384 tres_pos, WAIT_QOS_MIN_UNK);
2385 		debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" is less than min tres limit %"PRIu64,
2386 job_ptr, qos_ptr->name,
2387 assoc_mgr_tres_name_array[tres_pos],
2388 tres_req_cnt[tres_pos],
2389 qos_ptr->min_tres_pj_ctld[tres_pos]);
2390 rc = false;
2391 goto end_it;
2392 }
2393
2394 orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2395 _get_unique_job_node_cnt(job_ptr, used_limits_a->node_bitmap,
2396 &tres_req_cnt[TRES_ARRAY_NODE]);
2397 tres_usage = _validate_tres_usage_limits_for_qos(
2398 &tres_pos,
2399 qos_ptr->max_tres_pa_ctld, qos_out_ptr->max_tres_pa_ctld,
2400 tres_req_cnt, used_limits_a->tres,
2401 NULL, job_ptr->limit_set.tres, true);
2402 tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2403 switch (tres_usage) {
2404 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2405 /* not possible because the curr_usage sent in is NULL */
2406 break;
2407 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2408 /*
2409 * Hold the job if it exceeds the per-acct
2410 * TRES limit for the given QOS
2411 */
2412 xfree(job_ptr->state_desc);
2413 job_ptr->state_reason = _get_tres_state_reason(
2414 tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
2415 debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per account (%s) limit %"PRIu64,
2416 job_ptr, qos_ptr->name,
2417 assoc_mgr_tres_name_array[tres_pos],
2418 tres_req_cnt[tres_pos],
2419 used_limits_a->acct,
2420 qos_ptr->max_tres_pa_ctld[tres_pos]);
2421 rc = false;
2422 goto end_it;
2423 break;
2424 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2425 /*
2426 		 * Hold the job if the account has exceeded the QOS per-account
2427 		 * TRES limit with its current usage
2428 */
2429 xfree(job_ptr->state_desc);
2430 job_ptr->state_reason = _get_tres_state_reason(
2431 tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
2432 debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per account (%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2433 job_ptr, qos_ptr->name,
2434 assoc_mgr_tres_name_array[tres_pos],
2435 used_limits_a->acct,
2436 qos_ptr->max_tres_pa_ctld[tres_pos],
2437 used_limits_a->tres[tres_pos],
2438 tres_req_cnt[tres_pos]);
2439 rc = false;
2440 goto end_it;
2441 case TRES_USAGE_OKAY:
2442 /* all good */
2443 break;
2444 }
2445
2446 orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2447 _get_unique_job_node_cnt(job_ptr, used_limits->node_bitmap,
2448 &tres_req_cnt[TRES_ARRAY_NODE]);
2449 tres_usage = _validate_tres_usage_limits_for_qos(
2450 &tres_pos,
2451 qos_ptr->max_tres_pu_ctld, qos_out_ptr->max_tres_pu_ctld,
2452 tres_req_cnt, used_limits->tres,
2453 NULL, job_ptr->limit_set.tres, true);
2454 tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2455 switch (tres_usage) {
2456 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2457 /* not possible because the curr_usage sent in is NULL */
2458 break;
2459 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2460 /*
2461 * Hold the job if it exceeds the per-user
2462 * TRES limit for the given QOS
2463 */
2464 xfree(job_ptr->state_desc);
2465 job_ptr->state_reason = _get_tres_state_reason(
2466 tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
2467 debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per user limit %"PRIu64,
2468 job_ptr, qos_ptr->name,
2469 assoc_mgr_tres_name_array[tres_pos],
2470 tres_req_cnt[tres_pos],
2471 qos_ptr->max_tres_pu_ctld[tres_pos]);
2472 rc = false;
2473 goto end_it;
2474 break;
2475 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2476 /*
2477 * Hold the job if the user has exceeded the QOS
2478 * per-user TRES limit with their current usage
2479 */
2480 xfree(job_ptr->state_desc);
2481 job_ptr->state_reason = _get_tres_state_reason(
2482 tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
2483 debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per user limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2484 job_ptr, qos_ptr->name,
2485 assoc_mgr_tres_name_array[tres_pos],
2486 qos_ptr->max_tres_pu_ctld[tres_pos],
2487 used_limits->tres[tres_pos],
2488 tres_req_cnt[tres_pos]);
2489 rc = false;
2490 goto end_it;
2491 case TRES_USAGE_OKAY:
2492 /* all good */
2493 break;
2494 }
2495
2496 /* We do not need to check max_jobs_pa here */
2497
2498 /* We do not need to check max_jobs_pu here */
2499
2500 /* we don't need to check submit_jobs_pa here */
2501
2502 /* we don't need to check submit_jobs_pu here */
2503
2504 /* we don't need to check max_wall_pj here */
2505
2506 end_it:
2507 if (!rc)
2508 job_ptr->qos_blocking_ptr = qos_ptr;
2509
2510 return rc;
2511 }
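/*
 * Editor's note (not in the original source): this post-select pass covers
 * the QOS limits that depend on the sizes chosen by the select plugin:
 * group TRES / TRES-minutes / running TRES-minutes usage plus the per-job,
 * per-node, per-account and per-user TRES maxima and the per-job minima.
 * Node counts are first reduced by _get_unique_job_node_cnt() so nodes the
 * group (or account/user) is already charged for are not counted twice.
 * As a rough example, with GrpTRES=cpu=100, 80 CPUs already in use and a
 * 32-CPU request, the group check returns TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
 * and the job is held with a WAIT_QOS_GRP_* reason.
 */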
2512
2513 static int _qos_job_time_out(job_record_t *job_ptr,
2514 slurmdb_qos_rec_t *qos_ptr,
2515 slurmdb_qos_rec_t *qos_out_ptr,
2516 uint64_t *job_tres_usage_mins)
2517 {
2518 uint64_t tres_usage_mins[slurmctld_tres_cnt];
2519 uint32_t wall_mins;
2520 int rc = true, tres_pos = 0, i;
2521 acct_policy_tres_usage_t tres_usage;
2522 time_t now = time(NULL);
2523
2524 if (!qos_ptr || !qos_out_ptr)
2525 return rc;
2526
2527 /*
2528 * The idea here is for QOS to trump what an association has set for
2529 	 * a limit, so if an association has a wall limit of 10 mins and the QOS
2530 	 * has 20 mins set, and the job has been running for 11 minutes, it
2531 	 * continues until 20.
2532 */
2533 /* clang needs this memset to avoid a warning */
2534 memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
2535 for (i = 0; i < slurmctld_tres_cnt; i++)
2536 tres_usage_mins[i] =
2537 (uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);
2538 wall_mins = qos_ptr->usage->grp_used_wall / 60;
2539
2540 tres_usage = _validate_tres_usage_limits_for_qos(
2541 &tres_pos, qos_ptr->grp_tres_mins_ctld,
2542 qos_out_ptr->grp_tres_mins_ctld, NULL,
2543 NULL, tres_usage_mins, NULL, false);
2544 switch (tres_usage) {
2545 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2546 last_job_update = now;
2547 info("%pJ timed out, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64"",
2548 job_ptr, qos_ptr->name,
2549 assoc_mgr_tres_name_array[tres_pos],
2550 qos_ptr->grp_tres_mins_ctld[tres_pos],
2551 tres_usage_mins[tres_pos]);
2552 job_ptr->state_reason = FAIL_TIMEOUT;
2553 rc = false;
2554 goto end_it;
2555 break;
2556 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2557 /* not possible safe_limits is 0 */
2558 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2559 /* not possible safe_limits is 0 */
2560 case TRES_USAGE_OKAY:
2561 /* all good */
2562 break;
2563 }
2564
2565 if ((qos_out_ptr->grp_wall == INFINITE)
2566 && (qos_ptr->grp_wall != INFINITE)) {
2567
2568 qos_out_ptr->grp_wall = qos_ptr->grp_wall;
2569
2570 if (wall_mins >= qos_ptr->grp_wall) {
2571 last_job_update = now;
2572 info("%pJ timed out, the job is at or exceeds QOS %s's group wall limit of %u with %u",
2573 job_ptr, qos_ptr->name,
2574 qos_ptr->grp_wall, wall_mins);
2575 job_ptr->state_reason = FAIL_TIMEOUT;
2576 rc = false;
2577 goto end_it;
2578 }
2579 }
2580
2581 tres_usage = _validate_tres_usage_limits_for_qos(
2582 &tres_pos, qos_ptr->max_tres_mins_pj_ctld,
2583 qos_out_ptr->max_tres_mins_pj_ctld, job_tres_usage_mins,
2584 NULL, NULL, NULL, true);
2585 switch (tres_usage) {
2586 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2587 /* not possible curr_usage is NULL */
2588 break;
2589 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2590 last_job_update = now;
2591 info("%pJ timed out, the job is at or exceeds QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2592 job_ptr, qos_ptr->name,
2593 assoc_mgr_tres_name_array[tres_pos],
2594 qos_ptr->max_tres_mins_pj_ctld[tres_pos],
2595 job_tres_usage_mins[tres_pos]);
2596 job_ptr->state_reason = FAIL_TIMEOUT;
2597 rc = false;
2598 goto end_it;
2599 break;
2600 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2601 /* not possible tres_usage is NULL */
2602 case TRES_USAGE_OKAY:
2603 /* all good */
2604 break;
2605 }
2606
2607 end_it:
2608 return rc;
2609 }
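/*
 * Editor's note (not in the original source): this is the QOS half of the
 * running-job timeout check.  It fails (and sets FAIL_TIMEOUT) when the
 * group TRES-minutes usage, the group wall clock, or the job's own
 * TRES-minutes usage has reached the corresponding QOS limit.  For example,
 * with GrpWall=1200 and grp_used_wall of 72000 seconds, wall_mins is 1200,
 * which is at the limit, so the job is treated as timed out.
 */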
2610
2611 /*
2612 * acct_policy_add_job_submit - Note that a job has been submitted for
2613 * accounting policy purposes.
2614 */
2615 extern void acct_policy_add_job_submit(job_record_t *job_ptr)
2616 {
2617 _adjust_limit_usage(ACCT_POLICY_ADD_SUBMIT, job_ptr);
2618 }
2619
2620 /*
2621 * acct_policy_remove_job_submit - Note that a job has finished (might
2622 * not have started or been allocated resources) for accounting
2623 * policy purposes.
2624 */
2625 extern void acct_policy_remove_job_submit(job_record_t *job_ptr)
2626 {
2627 _adjust_limit_usage(ACCT_POLICY_REM_SUBMIT, job_ptr);
2628 }
2629
2630 /*
2631 * acct_policy_job_begin - Note that a job is starting for accounting
2632 * policy purposes.
2633 */
2634 extern void acct_policy_job_begin(job_record_t *job_ptr)
2635 {
2636 _adjust_limit_usage(ACCT_POLICY_JOB_BEGIN, job_ptr);
2637 }
2638
2639 /*
2640 * acct_policy_job_fini - Note that a job is completing for accounting
2641 * policy purposes.
2642 */
2643 extern void acct_policy_job_fini(job_record_t *job_ptr)
2644 {
2645 /* if end_time_exp == NO_VAL this has already happened */
2646 if (job_ptr->end_time_exp != (time_t)NO_VAL)
2647 _adjust_limit_usage(ACCT_POLICY_JOB_FINI, job_ptr);
2648 else
2649 		debug2("We have already run the job_fini for %pJ", job_ptr);
2650 }
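/*
 * Editor's note (not in the original source): these four thin wrappers map
 * the job lifecycle onto _adjust_limit_usage(): submission adds to the
 * submit-job counters, job start adds to the running-job/TRES counters,
 * job completion releases them again, and removal of a (possibly never
 * started) job releases its submit count.  The end_time_exp guard in
 * acct_policy_job_fini() keeps a job's usage from being subtracted twice.
 */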
2651
2652 extern void acct_policy_alter_job(job_record_t *job_ptr,
2653 uint32_t new_time_limit)
2654 {
2655 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
2656 slurmdb_assoc_rec_t *assoc_ptr = NULL;
2657 assoc_mgr_lock_t locks =
2658 { .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
2659 uint64_t used_tres_run_secs[slurmctld_tres_cnt];
2660 uint64_t new_used_tres_run_secs[slurmctld_tres_cnt];
2661 uint64_t time_limit_secs, new_time_limit_secs;
2662 int i;
2663
2664 if (!IS_JOB_RUNNING(job_ptr) || (job_ptr->time_limit == new_time_limit))
2665 return;
2666
2667 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
2668 || !_valid_job_assoc(job_ptr))
2669 return;
2670
2671 time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
2672 new_time_limit_secs = (uint64_t)new_time_limit * 60;
2673
2674 /* clang needs these memset to avoid a warning */
2675 memset(used_tres_run_secs, 0, sizeof(used_tres_run_secs));
2676 memset(new_used_tres_run_secs, 0, sizeof(new_used_tres_run_secs));
2677 for (i=0; i<slurmctld_tres_cnt; i++) {
2678 if (i == TRES_ARRAY_ENERGY)
2679 continue;
2680 if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
2681 continue;
2682
2683 used_tres_run_secs[i] =
2684 job_ptr->tres_alloc_cnt[i] * time_limit_secs;
2685 new_used_tres_run_secs[i] =
2686 job_ptr->tres_alloc_cnt[i] * new_time_limit_secs;
2687 }
2688
2689 assoc_mgr_lock(&locks);
2690
2691 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
2692
2693 _qos_alter_job(job_ptr, qos_ptr_1,
2694 used_tres_run_secs, new_used_tres_run_secs);
2695 _qos_alter_job(job_ptr, qos_ptr_2,
2696 used_tres_run_secs, new_used_tres_run_secs);
2697
2698 assoc_ptr = job_ptr->assoc_ptr;
2699 while (assoc_ptr) {
2700 for (i=0; i<slurmctld_tres_cnt; i++) {
2701 if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
2702 continue;
2703 assoc_ptr->usage->grp_used_tres_run_secs[i] -=
2704 used_tres_run_secs[i];
2705 assoc_ptr->usage->grp_used_tres_run_secs[i] +=
2706 new_used_tres_run_secs[i];
2707 debug2("altering %pJ assoc %u(%s/%s/%s) got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
2708 job_ptr, assoc_ptr->id, assoc_ptr->acct,
2709 assoc_ptr->user, assoc_ptr->partition,
2710 assoc_ptr->usage->grp_used_tres_run_secs[i],
2711 used_tres_run_secs[i],
2712 new_used_tres_run_secs[i]);
2713 }
2714
2715 /* now handle all the group limits of the parents */
2716 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2717 }
2718 assoc_mgr_unlock(&locks);
2719 }
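/*
 * Editor's note (illustrative arithmetic, not in the original source): for
 * a running job whose time limit changes, the code above swaps the old
 * reservation of "TRES count x time-limit seconds" for the new one at the
 * QOS level (via _qos_alter_job()) and along the whole association chain.
 * E.g. a job holding 4 CPUs whose limit drops from 60 to 30 minutes removes
 * 4 * 3600 = 14400 CPU-seconds and adds 4 * 1800 = 7200 CPU-seconds to
 * grp_used_tres_run_secs; energy and NO_CONSUME TRES are skipped.
 */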
2720
2721 static void _get_prio_thresh(uint32_t *prio_thresh, uint32_t in_thresh)
2722 {
2723 /*
2724 * If we already set prio_thresh then call it good.
2725 * If in_thresh is INFINITE we don't have a limit
2726 */
2727 if ((*prio_thresh) || (in_thresh == INFINITE))
2728 return;
2729
2730 *prio_thresh = in_thresh;
2731 }
2732
2733 static void _get_accrue_create_cnt(uint32_t *max_jobs_accrue, int *create_cnt,
2734 uint32_t in_accrue, uint32_t in_used)
2735 {
2736 /*
2737 * If we already set max_jobs_accrue then call it good.
2738 * If in_accrue is INFINITE we don't have a limit
2739 */
2740 if ((*max_jobs_accrue != INFINITE) || (in_accrue == INFINITE))
2741 return;
2742
2743 *max_jobs_accrue = in_accrue;
2744 if (*max_jobs_accrue > in_used)
2745 *create_cnt = *max_jobs_accrue - in_used;
2746 else
2747 *create_cnt = 0;
2748
2749 return;
2750 }
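/*
 * Editor's note (worked example, not in the original source): this helper
 * only acts on the first finite accrue limit it sees (while *max_jobs_accrue
 * is still INFINITE).  With in_accrue = 8 and in_used = 5 it leaves room for
 * 3 more jobs to start accruing age priority (*create_cnt = 3); with
 * in_used >= 8 the count is clamped to 0 rather than going negative.
 */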
2751
2752 static void _add_accrue_time_internal(slurmdb_assoc_rec_t *assoc_ptr,
2753 slurmdb_qos_rec_t *qos_ptr_1,
2754 slurmdb_used_limits_t *used_limits_a1,
2755 slurmdb_used_limits_t *used_limits_u1,
2756 slurmdb_qos_rec_t *qos_ptr_2,
2757 slurmdb_used_limits_t *used_limits_a2,
2758 slurmdb_used_limits_t *used_limits_u2,
2759 int cnt)
2760 {
2761 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2762 info("%s: Adding %d to assoc_ptr %p (%p %p %p %p %p %p)",
2763 __func__, cnt, assoc_ptr, qos_ptr_1, used_limits_a1,
2764 used_limits_u1, qos_ptr_2, used_limits_a2,
2765 used_limits_u2);
2766 }
2767
2768 if (qos_ptr_1)
2769 qos_ptr_1->usage->accrue_cnt += cnt;
2770 if (used_limits_a1)
2771 used_limits_a1->accrue_cnt += cnt;
2772 if (used_limits_u1)
2773 used_limits_u1->accrue_cnt += cnt;
2774
2775 if (qos_ptr_2)
2776 qos_ptr_2->usage->accrue_cnt += cnt;
2777 if (used_limits_a2)
2778 used_limits_a2->accrue_cnt += cnt;
2779 if (used_limits_u2)
2780 used_limits_u2->accrue_cnt += cnt;
2781
2782 while (assoc_ptr) {
2783 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2784 info("assoc_id %u(%s/%s/%s/%p) added %d count %d",
2785 assoc_ptr->id, assoc_ptr->acct,
2786 assoc_ptr->user, assoc_ptr->partition,
2787 assoc_ptr->usage, cnt,
2788 assoc_ptr->usage->accrue_cnt);
2789 }
2790 assoc_ptr->usage->accrue_cnt += cnt;
2791 /* now go up the hierarchy */
2792 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2793 }
2794 }
2795
2796 static void _remove_accrue_time_internal(slurmdb_assoc_rec_t *assoc_ptr,
2797 slurmdb_qos_rec_t *qos_ptr_1,
2798 slurmdb_used_limits_t *used_limits_a1,
2799 slurmdb_used_limits_t *used_limits_u1,
2800 slurmdb_qos_rec_t *qos_ptr_2,
2801 slurmdb_used_limits_t *used_limits_a2,
2802 slurmdb_used_limits_t *used_limits_u2,
2803 int cnt)
2804 {
2805 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2806 info("%s: Removing %d from assoc_ptr %p (%p %p %p %p %p %p)",
2807 __func__, cnt, assoc_ptr, qos_ptr_1, used_limits_a1,
2808 used_limits_u1, qos_ptr_2, used_limits_a2,
2809 used_limits_u2);
2810 }
2811
2812 if (qos_ptr_1) {
2813 if (qos_ptr_1->usage->accrue_cnt >= cnt)
2814 qos_ptr_1->usage->accrue_cnt -= cnt;
2815 else {
2816 error("%s: QOS %s accrue_cnt underflow",
2817 __func__, qos_ptr_1->name);
2818 qos_ptr_1->usage->accrue_cnt = 0;
2819 }
2820 }
2821
2822 if (used_limits_a1) {
2823 if (used_limits_a1->accrue_cnt >= cnt)
2824 used_limits_a1->accrue_cnt -= cnt;
2825 else {
2826 if (qos_ptr_1) {
2827 error("%s: QOS %s acct %s accrue_cnt underflow",
2828 __func__, qos_ptr_1->name,
2829 used_limits_a1->acct);
2830 }
2831 used_limits_a1->accrue_cnt = 0;
2832 }
2833 }
2834
2835 if (used_limits_u1) {
2836 if (used_limits_u1->accrue_cnt >= cnt)
2837 used_limits_u1->accrue_cnt -= cnt;
2838 else {
2839 if (qos_ptr_1) {
2840 error("%s: QOS %s user %u accrue_cnt underflow",
2841 __func__, qos_ptr_1->name,
2842 used_limits_u1->uid);
2843 }
2844 used_limits_u1->accrue_cnt = 0;
2845 }
2846 }
2847
2848 if (qos_ptr_2) {
2849 		if (qos_ptr_2->usage->accrue_cnt >= cnt)
2850 qos_ptr_2->usage->accrue_cnt -= cnt;
2851 else {
2852 error("%s: QOS %s accrue_cnt underflow",
2853 __func__, qos_ptr_2->name);
2854 qos_ptr_2->usage->accrue_cnt = 0;
2855 }
2856 }
2857
2858 if (used_limits_a2) {
2859 if (used_limits_a2->accrue_cnt >= cnt)
2860 used_limits_a2->accrue_cnt -= cnt;
2861 else {
2862 if (qos_ptr_2) {
2863 error("%s: QOS %s acct %s accrue_cnt underflow",
2864 __func__, qos_ptr_2->name,
2865 used_limits_a2->acct);
2866 }
2867 used_limits_a2->accrue_cnt = 0;
2868 }
2869 }
2870
2871 if (used_limits_u2) {
2872 if (used_limits_u2->accrue_cnt >= cnt)
2873 used_limits_u2->accrue_cnt -= cnt;
2874 else {
2875 			if (qos_ptr_2) {
2876 				error("%s: QOS %s user %u accrue_cnt underflow",
2877 				      __func__, qos_ptr_2->name,
2878 				      used_limits_u2->uid);
2879 }
2880 used_limits_u2->accrue_cnt = 0;
2881 }
2882 }
2883
2884 while (assoc_ptr) {
2885 if (assoc_ptr->usage->accrue_cnt >= cnt) {
2886 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2887 info("assoc_id %u(%s/%s/%s/%p) removed %d count %d",
2888 assoc_ptr->id, assoc_ptr->acct,
2889 assoc_ptr->user, assoc_ptr->partition,
2890 assoc_ptr->usage, cnt,
2891 assoc_ptr->usage->accrue_cnt);
2892 }
2893 assoc_ptr->usage->accrue_cnt -= cnt;
2894 } else {
2895 error("%s: assoc_id %u(%s/%s/%s) accrue_cnt underflow",
2896 __func__, assoc_ptr->id,
2897 assoc_ptr->acct,
2898 assoc_ptr->user,
2899 assoc_ptr->partition);
2900 assoc_ptr->usage->accrue_cnt = 0;
2901 }
2902 /* now go up the hierarchy */
2903 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2904 }
2905 }
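/*
 * Editor's note (not in the original source): _add_accrue_time_internal()
 * and _remove_accrue_time_internal() are intended to be symmetric.  The
 * remove path additionally guards every counter (QOS, per-account and
 * per-user used_limits, and each association up the tree) against
 * underflow: if accrue_cnt were 1 and cnt were 2, an error is logged and
 * the counter is reset to 0 instead of wrapping the unsigned value.
 */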
2906
2907 static bool _acct_policy_validate(job_desc_msg_t *job_desc,
2908 part_record_t *part_ptr,
2909 slurmdb_assoc_rec_t *assoc_in,
2910 slurmdb_qos_rec_t *qos_ptr_1,
2911 slurmdb_qos_rec_t *qos_ptr_2,
2912 uint32_t *reason,
2913 acct_policy_limit_set_t *
2914 acct_policy_limit_set,
2915 bool update_call)
2916 {
2917 slurmdb_qos_rec_t qos_rec;
2918 slurmdb_assoc_rec_t *assoc_ptr = assoc_in;
2919 int parent = 0, job_cnt = 1;
2920 char *user_name = NULL;
2921 bool rc = true;
2922 assoc_mgr_lock_t locks =
2923 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
2924 bool strict_checking;
2925
2926 xassert(acct_policy_limit_set);
2927
2928 if (!assoc_ptr) {
2929 error("acct_policy_validate: no assoc_ptr given for job.");
2930 return false;
2931 }
2932 user_name = assoc_ptr->user;
2933
2934 if (job_desc->array_bitmap)
2935 job_cnt = bit_set_count(job_desc->array_bitmap);
2936
2937 slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
2938
2939 assoc_mgr_lock(&locks);
2940
2941 assoc_mgr_set_qos_tres_cnt(&qos_rec);
2942
2943 if (qos_ptr_1) {
2944 strict_checking = (qos_ptr_1->flags & QOS_FLAG_DENY_LIMIT);
2945 if (qos_ptr_2 && !strict_checking)
2946 strict_checking =
2947 qos_ptr_2->flags & QOS_FLAG_DENY_LIMIT;
2948
2949 if (!(rc = _qos_policy_validate(
2950 job_desc, assoc_ptr, part_ptr,
2951 qos_ptr_1, &qos_rec,
2952 reason, acct_policy_limit_set, update_call,
2953 user_name, job_cnt, strict_checking)))
2954 goto end_it;
2955 if (!(rc = _qos_policy_validate(
2956 job_desc, assoc_ptr,
2957 part_ptr, qos_ptr_2, &qos_rec,
2958 reason, acct_policy_limit_set, update_call,
2959 user_name, job_cnt, strict_checking)))
2960 goto end_it;
2961
2962 } else /*
2963 * We don't have a QOS to determine if we should fail or not, so
2964 * we will go with strict_checking by default.
2965 */
2966 strict_checking = true;
2967
2968 while (assoc_ptr) {
2969 int tres_pos = 0;
2970
2971 if (!_validate_tres_limits_for_assoc(
2972 &tres_pos, job_desc->tres_req_cnt, 0,
2973 assoc_ptr->grp_tres_ctld,
2974 qos_rec.grp_tres_ctld,
2975 acct_policy_limit_set->tres,
2976 strict_checking, update_call, 1)) {
2977 if (reason)
2978 *reason = _get_tres_state_reason(
2979 tres_pos, WAIT_ASSOC_GRP_UNK);
2980
2981 debug2("job submit for user %s(%u): "
2982 "min tres(%s) request %"PRIu64" exceeds "
2983 "group max tres limit %"PRIu64" for account %s",
2984 user_name,
2985 job_desc->user_id,
2986 assoc_mgr_tres_name_array[tres_pos],
2987 job_desc->tres_req_cnt[tres_pos],
2988 assoc_ptr->grp_tres_ctld[tres_pos],
2989 assoc_ptr->acct);
2990 rc = false;
2991 break;
2992 }
2993
2994 /* for validation we don't need to look at
2995 * assoc_ptr->grp_jobs.
2996 */
2997
2998 if ((qos_rec.grp_submit_jobs == INFINITE) &&
2999 (assoc_ptr->grp_submit_jobs != INFINITE) &&
3000 ((assoc_ptr->usage->used_submit_jobs + job_cnt)
3001 > assoc_ptr->grp_submit_jobs)) {
3002 if (reason)
3003 *reason = WAIT_ASSOC_GRP_SUB_JOB;
3004 debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
3005 user_name,
3006 job_desc->user_id,
3007 assoc_ptr->grp_submit_jobs,
3008 assoc_ptr->usage->used_submit_jobs, job_cnt,
3009 assoc_ptr->acct);
3010 rc = false;
3011 break;
3012 }
3013
3014 tres_pos = 0;
3015 if (!update_call && !_validate_tres_time_limits(
3016 &tres_pos,
3017 &job_desc->time_limit,
3018 part_ptr->max_time,
3019 job_desc->tres_req_cnt,
3020 assoc_ptr->grp_tres_mins_ctld,
3021 qos_rec.grp_tres_mins_ctld,
3022 &acct_policy_limit_set->time,
3023 strict_checking)) {
3024 if (reason)
3025 *reason = _get_tres_state_reason(
3026 tres_pos,
3027 WAIT_ASSOC_GRP_UNK_MIN);
3028 debug2("job submit for user %s(%u): "
3029 "tres(%s) time limit request %"PRIu64" "
3030 "exceeds group max limit %"PRIu64" "
3031 "for account '%s'",
3032 user_name,
3033 job_desc->user_id,
3034 assoc_mgr_tres_name_array[tres_pos],
3035 ((uint64_t)job_desc->time_limit *
3036 job_desc->tres_req_cnt[tres_pos]),
3037 assoc_ptr->
3038 grp_tres_mins_ctld[tres_pos],
3039 assoc_ptr->acct);
3040 rc = false;
3041 goto end_it;
3042 }
3043
3044 tres_pos = 0;
3045 if (!update_call && !_validate_tres_time_limits(
3046 &tres_pos,
3047 &job_desc->time_limit,
3048 part_ptr->max_time,
3049 job_desc->tres_req_cnt,
3050 assoc_ptr->grp_tres_run_mins_ctld,
3051 qos_rec.grp_tres_run_mins_ctld,
3052 &acct_policy_limit_set->time,
3053 strict_checking)) {
3054 if (reason)
3055 *reason = _get_tres_state_reason(
3056 tres_pos,
3057 WAIT_ASSOC_GRP_UNK_RUN_MIN);
3058 debug2("job submit for user %s(%u): "
3059 "tres(%s) time limit request %"PRIu64" "
3060 "exceeds group max running "
3061 "limit %"PRIu64" for account '%s'",
3062 user_name,
3063 job_desc->user_id,
3064 assoc_mgr_tres_name_array[tres_pos],
3065 ((uint64_t)job_desc->time_limit *
3066 job_desc->tres_req_cnt[tres_pos]),
3067 assoc_ptr->
3068 grp_tres_run_mins_ctld[tres_pos],
3069 assoc_ptr->acct);
3070 rc = false;
3071 goto end_it;
3072 }
3073
3074 if (!update_call && !_validate_time_limit(
3075 &job_desc->time_limit,
3076 part_ptr->max_time,
3077 1,
3078 assoc_ptr->grp_wall,
3079 &qos_rec.grp_wall,
3080 &acct_policy_limit_set->time,
3081 strict_checking, false)) {
3082 if (reason)
3083 *reason = WAIT_ASSOC_GRP_WALL;
3084 debug2("job submit for user %s(%u): "
3085 "time limit %u exceeds max group %u for "
3086 "account '%s'",
3087 user_name,
3088 job_desc->user_id,
3089 job_desc->time_limit,
3090 assoc_ptr->grp_wall,
3091 assoc_ptr->acct);
3092 rc = false;
3093 break;
3094 }
3095
3096 /* We don't need to look at the regular limits for
3097 		 * parents since we have pre-propagated them, so just
3098 * continue with the next parent
3099 */
3100 if (parent) {
3101 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3102 continue;
3103 }
3104
3105 /* for validation we don't need to look at
3106 * assoc_ptr->max_cpu_mins_pj.
3107 */
3108
3109 tres_pos = 0;
3110 if (!_validate_tres_limits_for_assoc(
3111 &tres_pos, job_desc->tres_req_cnt, 0,
3112 assoc_ptr->max_tres_ctld,
3113 qos_rec.max_tres_pj_ctld,
3114 acct_policy_limit_set->tres,
3115 strict_checking, update_call, 1)) {
3116 if (reason)
3117 *reason = _get_tres_state_reason(
3118 tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
3119
3120 debug2("job submit for user %s(%u): "
3121 "min tres(%s) request %"PRIu64" exceeds "
3122 "max tres limit %"PRIu64" for account %s",
3123 user_name,
3124 job_desc->user_id,
3125 assoc_mgr_tres_name_array[tres_pos],
3126 job_desc->tres_req_cnt[tres_pos],
3127 assoc_ptr->max_tres_ctld[tres_pos],
3128 assoc_ptr->acct);
3129 rc = false;
3130 break;
3131 }
3132
3133 tres_pos = 0;
3134 if (!_validate_tres_limits_for_assoc(
3135 &tres_pos, job_desc->tres_req_cnt,
3136 job_desc->tres_req_cnt[TRES_ARRAY_NODE],
3137 assoc_ptr->max_tres_pn_ctld,
3138 qos_rec.max_tres_pn_ctld,
3139 acct_policy_limit_set->tres,
3140 strict_checking, update_call, 1)) {
3141 if (reason)
3142 *reason = _get_tres_state_reason(
3143 tres_pos,
3144 WAIT_ASSOC_MAX_UNK_PER_NODE);
3145
3146 debug2("job submit for user %s(%u): "
3147 "min tres(%s) request %"PRIu64" exceeds "
3148 "max tres limit %"PRIu64" per node "
3149 "for account %s",
3150 user_name,
3151 job_desc->user_id,
3152 assoc_mgr_tres_name_array[tres_pos],
3153 job_desc->tres_req_cnt[tres_pos] /
3154 job_desc->tres_req_cnt[TRES_ARRAY_NODE],
3155 assoc_ptr->max_tres_pn_ctld[tres_pos],
3156 assoc_ptr->acct);
3157 rc = false;
3158 break;
3159 }
3160
3161 /* for validation we don't need to look at
3162 * assoc_ptr->max_jobs.
3163 */
3164
3165 if ((qos_rec.max_submit_jobs_pa == INFINITE) &&
3166 (qos_rec.max_submit_jobs_pu == INFINITE) &&
3167 (assoc_ptr->max_submit_jobs != INFINITE) &&
3168 ((assoc_ptr->usage->used_submit_jobs + job_cnt)
3169 > assoc_ptr->max_submit_jobs)) {
3170 if (reason)
3171 *reason = WAIT_ASSOC_MAX_SUB_JOB;
3172 debug2("job submit for user %s(%u): account max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
3173 user_name,
3174 job_desc->user_id,
3175 assoc_ptr->max_submit_jobs,
3176 assoc_ptr->usage->used_submit_jobs, job_cnt,
3177 assoc_ptr->acct);
3178 rc = false;
3179 break;
3180 }
3181
3182 if (!update_call && !_validate_tres_time_limits(
3183 &tres_pos,
3184 &job_desc->time_limit,
3185 part_ptr->max_time,
3186 job_desc->tres_req_cnt,
3187 assoc_ptr->max_tres_mins_ctld,
3188 qos_rec.max_tres_mins_pj_ctld,
3189 &acct_policy_limit_set->time,
3190 strict_checking)) {
3191 if (reason)
3192 *reason = _get_tres_state_reason(
3193 tres_pos,
3194 WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
3195 debug2("job submit for user %s(%u): "
3196 "tres(%s) time limit request %"PRIu64" "
3197 "exceeds max per-job limit %"PRIu64" "
3198 "for account '%s'",
3199 user_name,
3200 job_desc->user_id,
3201 assoc_mgr_tres_name_array[tres_pos],
3202 ((uint64_t)job_desc->time_limit *
3203 job_desc->tres_req_cnt[tres_pos]),
3204 assoc_ptr->max_tres_mins_ctld[tres_pos],
3205 assoc_ptr->acct);
3206 rc = false;
3207 break;
3208 }
3209
3210 if (!update_call && !_validate_time_limit(
3211 &job_desc->time_limit,
3212 part_ptr->max_time,
3213 1,
3214 assoc_ptr->max_wall_pj,
3215 &qos_rec.max_wall_pj,
3216 &acct_policy_limit_set->time,
3217 strict_checking, false)) {
3218 if (reason)
3219 *reason = WAIT_ASSOC_MAX_WALL_PER_JOB;
3220 debug2("job submit for user %s(%u): "
3221 "time limit %u exceeds max %u for "
3222 "account '%s'",
3223 user_name,
3224 job_desc->user_id,
3225 job_desc->time_limit,
3226 assoc_ptr->max_wall_pj,
3227 assoc_ptr->acct);
3228 rc = false;
3229 break;
3230 }
3231
3232 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3233 parent = 1;
3234 }
3235 end_it:
3236 assoc_mgr_unlock(&locks);
3237 slurmdb_free_qos_rec_members(&qos_rec);
3238
3239 return rc;
3240 }
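/*
 * Editor's note (not in the original source): validation order in
 * _acct_policy_validate() is the two QOS records first (ordered by
 * acct_policy_set_qos_order(), with qos_rec recording which limits a QOS
 * already covered), then the association and each of its parents reached
 * through usage->parent_assoc_ptr.  Once "parent" is set only the group
 * limits are rechecked, since the per-job association limits are
 * pre-propagated down the hierarchy.
 */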
3241
3242 /*
3243 * acct_policy_validate - validate that a job request can be satisfied without
3244 * exceeding any association or QOS limit.
3245 * job_desc IN - job descriptor being submitted
3246 * part_ptr IN - pointer to (one) partition to which the job is being submitted
3247 * assoc_in IN - pointer to association to which the job is being submitted
3248 * qos_ptr IN - pointer to QOS to which the job is being submitted
3249  * reason OUT - if non-NULL, set to the reason for rejecting the job
3250 * acct_policy_limit_set IN/OUT - limits set for the job, pre-allocated storage
3251 * is filled in by acct_policy_validate
3252 * update_call IN - true if request to update existing job request
3253 * RET true if valid
3254 */
3255 extern bool acct_policy_validate(job_desc_msg_t *job_desc,
3256 part_record_t *part_ptr,
3257 slurmdb_assoc_rec_t *assoc_in,
3258 slurmdb_qos_rec_t *qos_ptr,
3259 uint32_t *reason,
3260 acct_policy_limit_set_t *acct_policy_limit_set,
3261 bool update_call)
3262 {
3263 slurmdb_qos_rec_t *qos_ptr_1 = NULL, *qos_ptr_2 = NULL;
3264 job_record_t job_rec;
3265 bool rc;
3266 assoc_mgr_lock_t locks =
3267 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3268
3269 assoc_mgr_lock(&locks);
3270 job_rec.qos_ptr = qos_ptr;
3271 job_rec.part_ptr = part_ptr;
3272 acct_policy_set_qos_order(&job_rec, &qos_ptr_1, &qos_ptr_2);
3273 assoc_mgr_unlock(&locks);
3274 rc = _acct_policy_validate(job_desc, part_ptr, assoc_in,
3275 qos_ptr_1, qos_ptr_2, reason,
3276 acct_policy_limit_set, update_call);
3277 return rc;
3278 }
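/*
 * Editor's note (minimal caller sketch, not in the original source; the
 * surrounding error handling is assumed):
 *
 *	uint32_t reason = 0;
 *	acct_policy_limit_set_t limit_set = { 0 };
 *	limit_set.tres = xmalloc(sizeof(uint16_t) * slurmctld_tres_cnt);
 *	if (!acct_policy_validate(&job_desc, part_ptr, assoc_ptr, qos_ptr,
 *				  &reason, &limit_set, false))
 *		info("submission rejected, wait reason %u", reason);
 *	xfree(limit_set.tres);
 *
 * The wrapper itself only resolves the QOS evaluation order with
 * acct_policy_set_qos_order() under the assoc_mgr read locks before
 * delegating to _acct_policy_validate().
 */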
3279
3280 /*
3281 * acct_policy_validate_het_job - validate that a hetjob as a whole (all
3282 * components at once) can be satisfied without exceeding any association
3283  * limit. Build a list of every job's association and QOS information, then combine
3284 * usage information for every job sharing an association and test that against
3285 * the appropriate limit.
3286 *
3287 * NOTE: This test is imperfect. Each job actually has up to 3 sets of limits
3288 * to test (association, job QOS and partition QOS). Ideally each would be tested
3289 * independently, but that is complicated due to QOS limits overriding the
3290 * association limits and the ability to have 3 sets of limits for each job.
3291 * This only tests the association limit for each hetjob component based
3292 * upon that component's job and partition QOS.
3293 *
3294 * NOTE: That a hetjob passes this test does not mean that it will be able
3295  * to run. For example, this test assumes resource allocation at the CPU level.
3296 * If each task is allocated one core, with 2 CPUs, then the CPU limit test
3297 * would not be accurate.
3298 *
3299 * submit_job_list IN - list of job_record_t entries (already created)
3300 * RET true if valid
3301 */
3302 extern bool acct_policy_validate_het_job(List submit_job_list)
3303 {
3304 assoc_mgr_lock_t locks =
3305 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3306 List het_job_limit_list;
3307 ListIterator iter1, iter2;
3308 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3309 job_record_t *job_ptr1, *job_ptr2;
3310 het_job_limits_t *job_limit1, *job_limit2;
3311 bool rc = true;
3312 job_desc_msg_t job_desc;
3313 bool build_job_desc = true;
3314 acct_policy_limit_set_t acct_policy_limit_set;
3315 int i, job_cnt;
3316 uint32_t reason = 0;
3317 int tres_req_size = sizeof(uint64_t) * g_tres_count;
3318
3319 memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set_t));
3320 acct_policy_limit_set.tres =
3321 xmalloc(sizeof(uint16_t) * slurmctld_tres_cnt);
3322
3323 /* Build list of QOS, association, and job pointers */
3324 het_job_limit_list = list_create(xfree_ptr);
3325 iter1 = list_iterator_create(submit_job_list);
3326 assoc_mgr_lock(&locks);
3327 while ((job_ptr1 = list_next(iter1))) {
3328 qos_ptr_1 = NULL;
3329 qos_ptr_2 = NULL;
3330 acct_policy_set_qos_order(job_ptr1, &qos_ptr_1, &qos_ptr_2);
3331 job_limit1 = xmalloc(sizeof(het_job_limits_t));
3332 job_limit1->assoc_ptr = job_ptr1->assoc_ptr;
3333 job_limit1->job_ptr = job_ptr1;
3334 job_limit1->qos_ptr_1 = qos_ptr_1;
3335 job_limit1->qos_ptr_2 = qos_ptr_2;
3336 list_append(het_job_limit_list, job_limit1);
3337 }
3338 assoc_mgr_unlock(&locks);
3339 list_iterator_destroy(iter1);
3340
3341 iter1 = list_iterator_create(het_job_limit_list);
3342 while ((job_limit1 = list_next(iter1))) {
3343 job_ptr1 = job_limit1->job_ptr;
3344 if (build_job_desc) {
3345 build_job_desc = false;
3346 job_desc.time_limit = job_ptr1->time_limit;
3347 job_desc.tres_req_cnt = xmalloc(tres_req_size);
3348 job_desc.user_id = job_ptr1->user_id;
3349 }
3350 if (job_limit1->assoc_ptr) {
3351 job_cnt = 1;
3352 memcpy(job_desc.tres_req_cnt, job_ptr1->tres_req_cnt,
3353 tres_req_size);
3354 iter2 = list_iterator_create(het_job_limit_list);
3355 while ((job_limit2 = list_next(iter2))) {
3356 if ((job_limit2 == job_limit1) ||
3357 (job_limit2->assoc_ptr !=
3358 job_limit1->assoc_ptr))
3359 continue;
3360 job_ptr2 = job_limit2->job_ptr;
3361 for (i = 0 ; i < g_tres_count; i++) {
3362 job_desc.tres_req_cnt[i] +=
3363 job_ptr2->tres_req_cnt[i];
3364 }
3365 job_cnt++;
3366 }
3367 list_iterator_destroy(iter2);
3368 if (job_cnt > 1) {
3369 job_desc.array_bitmap = bit_alloc(job_cnt);
3370 /*
3371 * SET NO BITS. Make this look like zero jobs
3372 * are being added. The job count was already
3373 * validated when each individual component of
3374 * the heterogeneous job was created.
3375 */
3376 rc = _acct_policy_validate(&job_desc,
3377 job_ptr1->part_ptr,
3378 job_limit1->assoc_ptr,
3379 job_limit1->qos_ptr_1,
3380 job_limit1->qos_ptr_2,
3381 &reason,
3382 &acct_policy_limit_set,
3383 false);
3384 bit_free(job_desc.array_bitmap);
3385 if (!rc)
3386 break;
3387 }
3388 }
3389 }
3390 list_iterator_destroy(iter1);
3391
3392 xfree(job_desc.tres_req_cnt);
3393 list_destroy(het_job_limit_list);
3394 xfree(acct_policy_limit_set.tres);
3395
3396 return rc;
3397 }
3398
3399 /*
3400 * Determine if the specified job can execute right now or is currently
3401 * blocked by an association or QOS limit. Does not re-validate job state.
3402 */
3403 extern bool acct_policy_job_runnable_state(job_record_t *job_ptr)
3404 {
3405 	/* If any more limit-related wait reasons are added, this check will need to be extended */
3406 if ((job_ptr->state_reason >= WAIT_QOS_GRP_CPU
3407 && job_ptr->state_reason <= WAIT_ASSOC_MAX_SUB_JOB) ||
3408 (job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) ||
3409 (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) ||
3410 (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT) ||
3411 (job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) ||
3412 (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT)) {
3413 return false;
3414 }
3415
3416 return true;
3417 }
3418
3419 /*
3420 * acct_policy_job_runnable_pre_select - Determine if the specified
3421 * job can execute right now or not depending upon accounting
3422 * policy (e.g. running job limit for this association). If the
3423 * association limits prevent the job from ever running (lowered
3424 * limits since job submission), then cancel the job.
3425 */
3426 extern bool acct_policy_job_runnable_pre_select(job_record_t *job_ptr,
3427 bool assoc_mgr_locked)
3428 {
3429 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3430 slurmdb_qos_rec_t qos_rec;
3431 slurmdb_assoc_rec_t *assoc_ptr;
3432 uint32_t time_limit = NO_VAL;
3433 bool rc = true;
3434 uint32_t wall_mins;
3435 bool safe_limits = false;
3436 int parent = 0; /* flag to tell us if we are looking at the
3437 * parent or not
3438 */
3439 assoc_mgr_lock_t locks =
3440 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3441
3442 /* check to see if we are enforcing associations */
3443 if (!accounting_enforce)
3444 return true;
3445
3446 if (!_valid_job_assoc(job_ptr)) {
3447 xfree(job_ptr->state_desc);
3448 job_ptr->state_reason = FAIL_ACCOUNT;
3449 return false;
3450 }
3451
3452 /* now see if we are enforcing limits */
3453 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
3454 return true;
3455
3456 /* clear old state reason */
3457 if (!acct_policy_job_runnable_state(job_ptr)) {
3458 xfree(job_ptr->state_desc);
3459 job_ptr->state_reason = WAIT_NO_REASON;
3460 }
3461
3462 slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
3463
3464 if (!assoc_mgr_locked)
3465 assoc_mgr_lock(&locks);
3466
3467 assoc_mgr_set_qos_tres_cnt(&qos_rec);
3468
3469 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
3470
3471 /* check the first QOS setting it's values in the qos_rec */
3472 if (qos_ptr_1 &&
3473 !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_1, &qos_rec)))
3474 goto end_it;
3475
3476 /* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
3477 if (qos_ptr_2 &&
3478 !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_2, &qos_rec)))
3479 goto end_it;
3480
3481 /*
3482 * check to see if we should be using safe limits, if so we
3483 * will only start a job if there are sufficient remaining
3484 * cpu-minutes for it to run to completion
3485 */
3486 if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
3487 safe_limits = true;
3488
3489 assoc_ptr = job_ptr->assoc_ptr;
3490 while (assoc_ptr) {
3491 		/* This only trips when grp_used_wall is divisible
3492 		 * by 60, i.e. if the limit is 1 minute and you have only
3493 		 * accumulated 59 seconds you will still be able to
3494 		 * get another job in, since 59/60 = 0 with integer division.
3495 */
3496 wall_mins = assoc_ptr->usage->grp_used_wall / 60;
3497
3498 #if _DEBUG
3499 info("acct_job_limits: %u of %u",
3500 assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
3501 #endif
3502 /* we don't need to check grp_cpu_mins here */
3503
3504 /* we don't need to check grp_cpus here */
3505
3506 /* we don't need to check grp_mem here */
3507
3508 if ((qos_rec.grp_jobs == INFINITE) &&
3509 (assoc_ptr->grp_jobs != INFINITE) &&
3510 (assoc_ptr->usage->used_jobs >= assoc_ptr->grp_jobs)) {
3511 xfree(job_ptr->state_desc);
3512 job_ptr->state_reason = WAIT_ASSOC_GRP_JOB;
3513 debug2("%pJ being held, assoc %u is at or exceeds group max jobs limit %u with %u for account %s",
3514 job_ptr, assoc_ptr->id, assoc_ptr->grp_jobs,
3515 assoc_ptr->usage->used_jobs, assoc_ptr->acct);
3516
3517 rc = false;
3518 goto end_it;
3519 }
3520
3521 /* we don't need to check grp_cpu_run_mins here */
3522
3523 /* we don't need to check grp_nodes here */
3524
3525 /* we don't need to check submit_jobs here */
3526
3527 if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
3528 && (qos_rec.grp_wall == INFINITE)
3529 && (assoc_ptr->grp_wall != INFINITE)) {
3530 if (time_limit == NO_VAL) {
3531 time_limit = job_ptr->time_limit;
3532 _set_time_limit(&time_limit,
3533 job_ptr->part_ptr->max_time,
3534 MIN(assoc_ptr->grp_wall,
3535 assoc_ptr->max_wall_pj),
3536 &job_ptr->limit_set.time);
3537
3538 /* Account for usage factor, if necessary */
3539 if ((job_ptr->qos_ptr &&
3540 (job_ptr->qos_ptr->flags &
3541 QOS_FLAG_USAGE_FACTOR_SAFE) &&
3542 (job_ptr->qos_ptr->usage_factor >= 0)) &&
3543 ((time_limit != INFINITE) ||
3544 (job_ptr->qos_ptr->usage_factor < 1.0))) {
3545 time_limit *=
3546 job_ptr->qos_ptr->usage_factor;
3547 }
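				/*
				 * Illustrative (hypothetical numbers): with a
				 * 60-minute request and a QOS usage_factor of
				 * 2.0 under QOS_FLAG_USAGE_FACTOR_SAFE, the
				 * effective time_limit checked against
				 * grp_wall below becomes 120 minutes.
				 */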
3548 }
3549
3550 if (wall_mins >= assoc_ptr->grp_wall) {
3551 xfree(job_ptr->state_desc);
3552 job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
3553 debug2("%pJ being held, assoc %u is at or exceeds group wall limit %u with %u for account %s",
3554 job_ptr, assoc_ptr->id,
3555 assoc_ptr->grp_wall,
3556 wall_mins, assoc_ptr->acct);
3557 rc = false;
3558 goto end_it;
3559 } else if (safe_limits &&
3560 ((wall_mins + time_limit) >
3561 assoc_ptr->grp_wall)) {
3562 xfree(job_ptr->state_desc);
3563 job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
3564 debug2("%pJ being held, the job request with assoc %u will exceed group wall limit %u if ran with %u for account %s",
3565 job_ptr, assoc_ptr->id,
3566 assoc_ptr->grp_wall,
3567 wall_mins + time_limit, assoc_ptr->acct);
3568 rc = false;
3569 goto end_it;
3570 }
3571 }
3572
3573 /*
3574 * We don't need to look at the regular limits for parents
3575 		 * since we have pre-propagated them, so just continue with
3576 * the next parent.
3577 */
3578 if (parent) {
3579 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3580 continue;
3581 }
3582
3583 /* we don't need to check max_cpu_mins_pj here */
3584
3585 /* we don't need to check max_cpus_pj here */
3586
3587 if ((qos_rec.max_jobs_pa == INFINITE) &&
3588 (qos_rec.max_jobs_pu == INFINITE) &&
3589 (assoc_ptr->max_jobs != INFINITE) &&
3590 (assoc_ptr->usage->used_jobs >= assoc_ptr->max_jobs)) {
3591 xfree(job_ptr->state_desc);
3592 job_ptr->state_reason = WAIT_ASSOC_MAX_JOBS;
3593 debug2("%pJ being held, assoc %u is at or exceeds max jobs limit %u with %u for account %s",
3594 job_ptr, assoc_ptr->id,
3595 assoc_ptr->max_jobs,
3596 assoc_ptr->usage->used_jobs, assoc_ptr->acct);
3597 rc = false;
3598 goto end_it;
3599 }
3600
3601 /* we don't need to check submit_jobs here */
3602
3603 /*
3604 * if the association limits have changed since job
3605 * submission and job can not run, then kill it
3606 */
3607 if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
3608 && (qos_rec.max_wall_pj == INFINITE)
3609 && (assoc_ptr->max_wall_pj != INFINITE)) {
3610 if (time_limit == NO_VAL) {
3611 time_limit = job_ptr->time_limit;
3612 _set_time_limit(&time_limit,
3613 job_ptr->part_ptr->max_time,
3614 assoc_ptr->max_wall_pj,
3615 &job_ptr->limit_set.time);
3616
3617 /* Account for usage factor, if necessary */
3618 if ((job_ptr->qos_ptr &&
3619 (job_ptr->qos_ptr->flags &
3620 QOS_FLAG_USAGE_FACTOR_SAFE) &&
3621 (job_ptr->qos_ptr->usage_factor >= 0)) &&
3622 ((time_limit != INFINITE) ||
3623 (job_ptr->qos_ptr->usage_factor < 1.0))) {
3624 time_limit *=
3625 job_ptr->qos_ptr->usage_factor;
3626 }
3627 }
3628
3629 if (time_limit > assoc_ptr->max_wall_pj) {
3630 xfree(job_ptr->state_desc);
3631 job_ptr->state_reason =
3632 WAIT_ASSOC_MAX_WALL_PER_JOB;
3633 debug2("%pJ being held, time limit %u exceeds account max %u",
3634 job_ptr, job_ptr->time_limit,
3635 time_limit);
3636 rc = false;
3637 goto end_it;
3638 }
3639 }
3640
3641 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3642 parent = 1;
3643 }
3644 end_it:
3645 if (!assoc_mgr_locked)
3646 assoc_mgr_unlock(&locks);
3647 slurmdb_free_qos_rec_members(&qos_rec);
3648
3649 return rc;
3650 }
3651
3652 /*
3653 * acct_policy_job_runnable_post_select - After nodes have been
3654 * selected for the job verify the counts don't exceed aggregated limits.
3655 */
3656 extern bool acct_policy_job_runnable_post_select(job_record_t *job_ptr,
3657 uint64_t *tres_req_cnt,
3658 bool assoc_mgr_locked)
3659 {
3660 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3661 slurmdb_qos_rec_t qos_rec;
3662 slurmdb_assoc_rec_t *assoc_ptr;
3663 uint64_t tres_usage_mins[slurmctld_tres_cnt];
3664 uint64_t tres_run_mins[slurmctld_tres_cnt];
3665 uint64_t job_tres_time_limit[slurmctld_tres_cnt];
3666 uint64_t orig_node_cnt;
3667 uint32_t time_limit;
3668 bool rc = true;
3669 bool safe_limits = false;
3670 int i, tres_pos = 0;
3671 acct_policy_tres_usage_t tres_usage;
3672 double usage_factor = 1.0;
3673 int parent = 0; /* flag to tell us if we are looking at the
3674 * parent or not
3675 */
3676 assoc_mgr_lock_t locks =
3677 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3678
3679 xassert(job_ptr);
3680 xassert(job_ptr->part_ptr);
3681 xassert(tres_req_cnt);
3682
3683 /* check to see if we are enforcing associations */
3684 if (!accounting_enforce)
3685 return true;
3686
3687 /* probably don't need to check this here */
3688 /* if (!_valid_job_assoc(job_ptr)) { */
3689 /* job_ptr->state_reason = FAIL_ACCOUNT; */
3690 /* return false; */
3691 /* } */
3692
3693 /* now see if we are enforcing limits */
3694 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
3695 return true;
3696
3697 /* check to see if we should be using safe limits, if so we
3698 * will only start a job if there are sufficient remaining
3699 * cpu-minutes for it to run to completion */
3700 if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
3701 safe_limits = true;
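	/*
	 * Illustrative (hypothetical numbers): with safe limits and a
	 * GrpTRESMins of 1000 cpu-minutes of which 900 are already used, a
	 * job whose full run would consume 200 cpu-minutes is held, even
	 * though current usage is still below the limit.
	 */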
3702
3703 /* clear old state reason */
3704 if (!acct_policy_job_runnable_state(job_ptr)) {
3705 xfree(job_ptr->state_desc);
3706 job_ptr->state_reason = WAIT_NO_REASON;
3707 }
3708
3709 job_ptr->qos_blocking_ptr = NULL;
3710
3711 /* clang needs this memset to avoid a warning */
3712 memset(tres_run_mins, 0, sizeof(tres_run_mins));
3713 memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
3714 memset(job_tres_time_limit, 0, sizeof(job_tres_time_limit));
3715
3716 time_limit = job_ptr->time_limit;
3717 _set_time_limit(&time_limit, job_ptr->part_ptr->max_time,
3718 job_ptr->part_ptr->default_time, NULL);
3719
3720 if (job_ptr->qos_ptr) {
3721 usage_factor = job_ptr->qos_ptr->usage_factor;
3722
3723 if ((usage_factor >= 0) &&
3724 (job_ptr->qos_ptr->flags & QOS_FLAG_USAGE_FACTOR_SAFE) &&
3725 ((time_limit != INFINITE) || (usage_factor < 1.0))) {
3726 time_limit *= usage_factor;
3727 }
3728 }
3729
3730 for (i=0; i<slurmctld_tres_cnt; i++)
3731 job_tres_time_limit[i] = (uint64_t)time_limit * tres_req_cnt[i];
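	/*
	 * e.g. (hypothetical numbers): a 30-minute effective time limit with
	 * 4 requested CPUs yields job_tres_time_limit[cpu] = 120 cpu-minutes,
	 * which is what the GrpTRESMins/MaxTRESMins checks below consume.
	 */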
3732
3733 slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
3734
3735 if (!assoc_mgr_locked)
3736 assoc_mgr_lock(&locks);
3737
3738 assoc_mgr_set_qos_tres_cnt(&qos_rec);
3739
3740 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
3741
3742 /* check the first QOS setting it's values in the qos_rec */
3743 if (qos_ptr_1 &&
3744 !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_1,
3745 &qos_rec, tres_req_cnt,
3746 job_tres_time_limit)))
3747 goto end_it;
3748
3749 /* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
3750 if (qos_ptr_2 &&
3751 !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_2,
3752 &qos_rec, tres_req_cnt,
3753 job_tres_time_limit)))
3754 goto end_it;
3755
3756 assoc_ptr = job_ptr->assoc_ptr;
3757 while (assoc_ptr) {
3758 for (i = 0; i < slurmctld_tres_cnt; i++) {
3759 tres_usage_mins[i] =
3760 (uint64_t)(assoc_ptr->usage->usage_tres_raw[i]
3761 / 60);
3762 tres_run_mins[i] =
3763 assoc_ptr->usage->grp_used_tres_run_secs[i] /
3764 60;
3765
3766 /*
3767 * Clear usage if factor is 0 so that jobs can run.
3768 * Otherwise multiplying can cause more jobs to be run
3769 * than the limit allows (e.g. usagefactor=.5).
3770 */
3771 if (usage_factor == 0.0) {
3772 tres_usage_mins[i] *= usage_factor;
3773 tres_run_mins[i] *= usage_factor;
3774 }
3775 }
3776
3777 #if _DEBUG
3778 info("acct_job_limits: %u of %u",
3779 assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
3780 #endif
3781 /*
3782 * If the association has a GrpCPUMins limit set (and there
3783 * is no QOS with GrpCPUMins set) we may hold the job
3784 */
3785 tres_usage = _validate_tres_usage_limits_for_assoc(
3786 &tres_pos, assoc_ptr->grp_tres_mins_ctld,
3787 qos_rec.grp_tres_mins_ctld,
3788 job_tres_time_limit, tres_run_mins,
3789 tres_usage_mins, job_ptr->limit_set.tres,
3790 safe_limits);
3791 switch (tres_usage) {
3792 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3793 xfree(job_ptr->state_desc);
3794 job_ptr->state_reason = _get_tres_state_reason(
3795 tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3796 debug2("%pJ being held, assoc %u(%s/%s/%s) group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
3797 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3798 assoc_ptr->user, assoc_ptr->partition,
3799 assoc_mgr_tres_name_array[tres_pos],
3800 assoc_ptr->grp_tres_mins_ctld[tres_pos],
3801 tres_usage_mins[tres_pos]);
3802 rc = false;
3803 goto end_it;
3804 break;
3805 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3806 xfree(job_ptr->state_desc);
3807 job_ptr->state_reason = _get_tres_state_reason(
3808 tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3809 debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
3810 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3811 assoc_ptr->user, assoc_ptr->partition,
3812 assoc_mgr_tres_name_array[tres_pos],
3813 assoc_ptr->grp_tres_mins_ctld[tres_pos],
3814 job_tres_time_limit[tres_pos]);
3815 rc = false;
3816 goto end_it;
3817 break;
3818 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3819 /*
3820 * If we're using safe limits start
3821 * the job only if there are
3822 * sufficient cpu-mins left such that
3823 * it will run to completion without
3824 * being killed
3825 */
3826 xfree(job_ptr->state_desc);
3827 job_ptr->state_reason = _get_tres_state_reason(
3828 tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3829 debug2("%pJ being held, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
3830 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3831 assoc_ptr->user, assoc_ptr->partition,
3832 assoc_mgr_tres_name_array[tres_pos],
3833 assoc_ptr->grp_tres_mins_ctld[tres_pos],
3834 assoc_ptr->grp_tres_mins_ctld[tres_pos] -
3835 tres_usage_mins[tres_pos],
3836 job_tres_time_limit[tres_pos],
3837 tres_run_mins[tres_pos],
3838 tres_req_cnt[tres_pos]);
3839 rc = false;
3840 goto end_it;
3841 break;
3842 case TRES_USAGE_OKAY:
3843 /* all good */
3844 break;
3845 }
3846
3847 orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
3848 _get_unique_job_node_cnt(job_ptr,
3849 assoc_ptr->usage->grp_node_bitmap,
3850 &tres_req_cnt[TRES_ARRAY_NODE]);
3851 tres_usage = _validate_tres_usage_limits_for_assoc(
3852 &tres_pos,
3853 assoc_ptr->grp_tres_ctld, qos_rec.grp_tres_ctld,
3854 tres_req_cnt, assoc_ptr->usage->grp_used_tres,
3855 NULL, job_ptr->limit_set.tres, true);
3856 tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
3857 switch (tres_usage) {
3858 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3859 			/* not possible because the curr_usage sent in is NULL */
3860 break;
3861 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3862 xfree(job_ptr->state_desc);
3863 job_ptr->state_reason = _get_tres_state_reason(
3864 tres_pos, WAIT_ASSOC_GRP_UNK);
3865 debug2("%pJ is being held, assoc %u(%s/%s/%s) min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
3866 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3867 assoc_ptr->user, assoc_ptr->partition,
3868 assoc_mgr_tres_name_array[tres_pos],
3869 tres_req_cnt[tres_pos],
3870 assoc_ptr->grp_tres_ctld[tres_pos]);
3871 rc = false;
3872 goto end_it;
3873 break;
3874 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3875 xfree(job_ptr->state_desc);
3876 job_ptr->state_reason = _get_tres_state_reason(
3877 tres_pos, WAIT_ASSOC_GRP_UNK);
3878 debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
3879 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3880 assoc_ptr->user, assoc_ptr->partition,
3881 assoc_mgr_tres_name_array[tres_pos],
3882 assoc_ptr->grp_tres_ctld[tres_pos],
3883 assoc_ptr->usage->grp_used_tres[tres_pos],
3884 tres_req_cnt[tres_pos]);
3885 rc = false;
3886 goto end_it;
3887 case TRES_USAGE_OKAY:
3888 /* all good */
3889 break;
3890 }
3891
3892 /* we don't need to check grp_jobs here */
3893
3894 tres_usage = _validate_tres_usage_limits_for_assoc(
3895 &tres_pos,
3896 assoc_ptr->grp_tres_run_mins_ctld,
3897 qos_rec.grp_tres_run_mins_ctld,
3898 job_tres_time_limit, tres_run_mins, NULL, NULL, true);
3899 switch (tres_usage) {
3900 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3901 			/* not possible because the curr_usage sent in is NULL */
3902 break;
3903 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3904 xfree(job_ptr->state_desc);
3905 job_ptr->state_reason = _get_tres_state_reason(
3906 tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
3907 debug2("%pJ is being held, assoc %u(%s/%s/%s) group max running tres(%s) minutes request limit %"PRIu64" exceeds limit %"PRIu64,
3908 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3909 assoc_ptr->user, assoc_ptr->partition,
3910 assoc_mgr_tres_name_array[tres_pos],
3911 tres_run_mins[tres_pos],
3912 assoc_ptr->grp_tres_run_mins_ctld[tres_pos]);
3913 rc = false;
3914 goto end_it;
3915 break;
3916 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3917 xfree(job_ptr->state_desc);
3918 job_ptr->state_reason = _get_tres_state_reason(
3919 tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
3920 debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
3921 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3922 assoc_ptr->user, assoc_ptr->partition,
3923 assoc_mgr_tres_name_array[tres_pos],
3924 assoc_ptr->grp_tres_run_mins_ctld[tres_pos],
3925 tres_run_mins[tres_pos],
3926 job_tres_time_limit[tres_pos]);
3927 rc = false;
3928 goto end_it;
3929 break;
3930 case TRES_USAGE_OKAY:
3931 /* all good */
3932 break;
3933 }
3934
3935 /* we don't need to check submit_jobs here */
3936
3937 /* we don't need to check grp_wall here */
3938
3939
3940 /* We don't need to look at the regular limits for
3941 		 * parents since we have pre-propagated them, so just
3942 * continue with the next parent
3943 */
3944 if (parent) {
3945 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3946 continue;
3947 }
3948
3949 if (!_validate_tres_limits_for_assoc(
3950 &tres_pos, job_tres_time_limit, 0,
3951 assoc_ptr->max_tres_mins_ctld,
3952 qos_rec.max_tres_mins_pj_ctld,
3953 job_ptr->limit_set.tres,
3954 1, 0, 1)) {
3955 xfree(job_ptr->state_desc);
3956 job_ptr->state_reason = _get_tres_state_reason(
3957 tres_pos, WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
3958 debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
3959 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3960 assoc_ptr->user, assoc_ptr->partition,
3961 assoc_mgr_tres_name_array[tres_pos],
3962 assoc_ptr->max_tres_mins_ctld[tres_pos],
3963 job_tres_time_limit[tres_pos]);
3964 rc = false;
3965 goto end_it;
3966 }
3967
3968 if (!_validate_tres_limits_for_assoc(
3969 &tres_pos, tres_req_cnt, 0,
3970 assoc_ptr->max_tres_ctld,
3971 qos_rec.max_tres_pj_ctld,
3972 job_ptr->limit_set.tres,
3973 1, 0, 1)) {
3974 xfree(job_ptr->state_desc);
3975 job_ptr->state_reason = _get_tres_state_reason(
3976 tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
3977 debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) limit of %"PRIu64" with %"PRIu64,
3978 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3979 assoc_ptr->user, assoc_ptr->partition,
3980 assoc_mgr_tres_name_array[tres_pos],
3981 assoc_ptr->max_tres_ctld[tres_pos],
3982 tres_req_cnt[tres_pos]);
3983 rc = false;
3984 break;
3985 }
3986
3987 if (!_validate_tres_limits_for_assoc(
3988 &tres_pos, tres_req_cnt,
3989 tres_req_cnt[TRES_ARRAY_NODE],
3990 assoc_ptr->max_tres_pn_ctld,
3991 qos_rec.max_tres_pn_ctld,
3992 job_ptr->limit_set.tres,
3993 1, 0, 1)) {
3994 xfree(job_ptr->state_desc);
3995 job_ptr->state_reason = _get_tres_state_reason(
3996 tres_pos, WAIT_ASSOC_MAX_UNK_PER_NODE);
3997 debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) per node limit of %"PRIu64" with %"PRIu64,
3998 job_ptr, assoc_ptr->id, assoc_ptr->acct,
3999 assoc_ptr->user, assoc_ptr->partition,
4000 assoc_mgr_tres_name_array[tres_pos],
4001 assoc_ptr->max_tres_pn_ctld[tres_pos],
4002 tres_req_cnt[tres_pos]);
4003 rc = false;
4004 break;
4005 }
4006
4007 /* we do not need to check max_jobs here */
4008
4009 /* we don't need to check submit_jobs here */
4010
4011 /* we don't need to check max_wall_pj here */
4012
4013 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4014 parent = 1;
4015 }
4016 end_it:
4017 if (!assoc_mgr_locked)
4018 assoc_mgr_unlock(&locks);
4019 slurmdb_free_qos_rec_members(&qos_rec);
4020
4021 return rc;
4022 }
4023
4024 extern uint32_t acct_policy_get_max_nodes(job_record_t *job_ptr,
4025 uint32_t *wait_reason)
4026 {
4027 uint64_t max_nodes_limit = INFINITE64, qos_max_p_limit = INFINITE64,
4028 grp_nodes = INFINITE64;
4029 assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .qos = READ_LOCK };
4030 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4031 slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
4032 bool parent = 0; /* flag to tell us if we are looking at the
4033 * parent or not
4034 */
4035 bool grp_set = 0;
4036
4037 /* check to see if we are enforcing associations */
4038 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4039 return max_nodes_limit;
4040
4041 xassert(wait_reason);
4042
4043 assoc_mgr_lock(&locks);
4044
4045 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4046
4047 if (qos_ptr_1) {
4048 uint64_t max_nodes_pj =
4049 qos_ptr_1->max_tres_pj_ctld[TRES_ARRAY_NODE];
4050 uint64_t max_nodes_pu =
4051 qos_ptr_1->max_tres_pu_ctld[TRES_ARRAY_NODE];
4052 uint64_t max_nodes_pa =
4053 qos_ptr_1->max_tres_pa_ctld[TRES_ARRAY_NODE];
4054
4055 grp_nodes = qos_ptr_1->grp_tres_ctld[TRES_ARRAY_NODE];
4056
4057 if (qos_ptr_2) {
4058 if (max_nodes_pa == INFINITE64)
4059 max_nodes_pa = qos_ptr_2->max_tres_pa_ctld[
4060 TRES_ARRAY_NODE];
4061 if (max_nodes_pj == INFINITE64)
4062 max_nodes_pj = qos_ptr_2->max_tres_pj_ctld[
4063 TRES_ARRAY_NODE];
4064 if (max_nodes_pu == INFINITE64)
4065 max_nodes_pu = qos_ptr_2->max_tres_pu_ctld[
4066 TRES_ARRAY_NODE];
4067 if (grp_nodes == INFINITE64)
4068 grp_nodes = qos_ptr_2->grp_tres_ctld[
4069 TRES_ARRAY_NODE];
4070 }
4071
4072 if (max_nodes_pa < max_nodes_limit) {
4073 max_nodes_limit = max_nodes_pa;
4074 *wait_reason = WAIT_QOS_MAX_NODE_PER_ACCT;
4075 }
4076
4077 if (max_nodes_pj < max_nodes_limit) {
4078 max_nodes_limit = max_nodes_pj;
4079 *wait_reason = WAIT_QOS_MAX_NODE_PER_JOB;
4080 }
4081
4082 if (max_nodes_pu < max_nodes_limit) {
4083 max_nodes_limit = max_nodes_pu;
4084 *wait_reason = WAIT_QOS_MAX_NODE_PER_USER;
4085 }
4086
4087 qos_max_p_limit = max_nodes_limit;
4088
4089 if (grp_nodes < max_nodes_limit) {
4090 max_nodes_limit = grp_nodes;
4091 *wait_reason = WAIT_QOS_GRP_NODE;
4092 }
4093 }
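	/*
	 * Illustrative (hypothetical numbers): if the QOS sets a per-job
	 * limit of 16 nodes and a group limit of 10 nodes, the value
	 * returned ends up as 10 with *wait_reason = WAIT_QOS_GRP_NODE,
	 * since the group limit is applied after the per-acct/per-job/
	 * per-user caps above.
	 */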
4094
4095 /* We have to traverse all the associations because QOS might
4096 not override a particular limit.
4097 */
4098 while (assoc_ptr) {
4099 if ((!qos_ptr_1 || (grp_nodes == INFINITE64))
4100 && (assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)
4101 && (assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] <
4102 max_nodes_limit)) {
4103 max_nodes_limit =
4104 assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE];
4105 *wait_reason = WAIT_ASSOC_GRP_NODE;
4106 grp_set = 1;
4107 }
4108
4109 if (!parent
4110 && (qos_max_p_limit == INFINITE64)
4111 && (assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)
4112 && (assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE] <
4113 max_nodes_limit)) {
4114 max_nodes_limit =
4115 assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE];
4116 *wait_reason = WAIT_ASSOC_MAX_NODE_PER_JOB;
4117 }
4118
4119 /* only check the first grp set */
4120 if (grp_set)
4121 break;
4122
4123 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4124 parent = 1;
4125 continue;
4126 }
4127
4128 assoc_mgr_unlock(&locks);
4129 return max_nodes_limit;
4130 }
4131
4132 /*
4133 * acct_policy_update_pending_job - Make sure the limits imposed on a job on
4134 * submission are correct after an update to a qos or association. If
4135 * the association/qos limits prevent the job from running (lowered
4136 * limits since job submission), then reset its reason field.
4137 */
4138 extern int acct_policy_update_pending_job(job_record_t *job_ptr)
4139 {
4140 job_desc_msg_t job_desc;
4141 acct_policy_limit_set_t acct_policy_limit_set;
4142 bool update_accounting = false;
4143 struct job_details *details_ptr;
4144 int rc = SLURM_SUCCESS;
4145 uint64_t tres_req_cnt[slurmctld_tres_cnt];
4146
4147 /* check to see if we are enforcing associations and the job
4148 * is pending or if we are even enforcing limits. */
4149 if (!accounting_enforce || !IS_JOB_PENDING(job_ptr)
4150 || !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4151 return SLURM_SUCCESS;
4152
4153 details_ptr = job_ptr->details;
4154
4155 if (!details_ptr) {
4156 error("acct_policy_update_pending_job: no details");
4157 return SLURM_ERROR;
4158 }
4159
4160 /* set up the job desc to make sure things are the way we
4161 * need.
4162 */
4163 slurm_init_job_desc_msg(&job_desc);
4164
4165 /* copy the limits set from the job the only one that
4166 * acct_policy_validate changes is the time limit so we
4167 * should be ok with the memcpy here */
4168 memcpy(&acct_policy_limit_set, &job_ptr->limit_set,
4169 sizeof(acct_policy_limit_set_t));
4170 job_desc.tres_req_cnt = tres_req_cnt;
4171 /* copy all the tres requests over */
4172 memcpy(job_desc.tres_req_cnt, job_ptr->tres_req_cnt,
4173 sizeof(uint64_t) * slurmctld_tres_cnt);
4174
4175 /* Only set this value if not set from a limit */
4176 if (job_ptr->limit_set.time == ADMIN_SET_LIMIT)
4177 acct_policy_limit_set.time = job_ptr->limit_set.time;
4178 else if ((job_ptr->time_limit != NO_VAL) && !job_ptr->limit_set.time)
4179 job_desc.time_limit = job_ptr->time_limit;
4180
4181 if (!acct_policy_validate(&job_desc, job_ptr->part_ptr,
4182 job_ptr->assoc_ptr, job_ptr->qos_ptr,
4183 &job_ptr->state_reason,
4184 &acct_policy_limit_set, 0)) {
4185 info("%s: exceeded association/qos's cpu, node, memory or time limit for %pJ",
4186 __func__, job_ptr);
4187 return SLURM_ERROR;
4188 }
4189
4190 /* The only variable in acct_policy_limit_set that is changed
4191 * in acct_policy_validate is the time limit so only worry
4192 * about that one.
4193 */
4194
4195 /* If it isn't an admin set limit replace it. */
4196 if (!acct_policy_limit_set.time && (job_ptr->limit_set.time == 1)) {
4197 job_ptr->time_limit = NO_VAL;
4198 job_ptr->limit_set.time = 0;
4199 update_accounting = true;
4200 } else if (acct_policy_limit_set.time != ADMIN_SET_LIMIT) {
4201 if (job_ptr->time_limit != job_desc.time_limit) {
4202 job_ptr->time_limit = job_desc.time_limit;
4203 update_accounting = true;
4204 }
4205 job_ptr->limit_set.time = acct_policy_limit_set.time;
4206 }
4207
4208 if (update_accounting) {
4209 last_job_update = time(NULL);
4210 debug("limits changed for %pJ: updating accounting", job_ptr);
4211 /* Update job record in accounting to reflect changes */
4212 jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
4213 }
4214
4215 return rc;
4216 }
4217
4218 /*
4219  * acct_policy_job_time_out - Determine if the specified job has timed
4220  * out based on its QOS or association.
4221 */
4222 extern bool acct_policy_job_time_out(job_record_t *job_ptr)
4223 {
4224 uint64_t job_tres_usage_mins[slurmctld_tres_cnt];
4225 uint64_t time_delta;
4226 uint64_t tres_usage_mins[slurmctld_tres_cnt];
4227 uint32_t wall_mins, orig_node_cnt;
4228 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4229 slurmdb_qos_rec_t qos_rec;
4230 slurmdb_assoc_rec_t *assoc = NULL;
4231 assoc_mgr_lock_t locks =
4232 { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
4233 time_t now;
4234 int i, tres_pos = 0;
4235 acct_policy_tres_usage_t tres_usage;
4236
4237 /*
4238 * Now see if we are enforcing limits. If Safe is set then
4239 * return false as well since we are being safe if the limit
4240 * was changed after the job was already deemed safe to start.
4241 */
4242 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
4243 || (accounting_enforce & ACCOUNTING_ENFORCE_SAFE))
4244 return false;
4245
4246 slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
4247 assoc_mgr_lock(&locks);
4248
4249 assoc_mgr_set_qos_tres_cnt(&qos_rec);
4250
4251 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4252
4253 assoc = job_ptr->assoc_ptr;
4254
4255 now = time(NULL);
4256
4257 time_delta = (uint64_t)(((now - job_ptr->start_time) -
4258 job_ptr->tot_sus_time) / 60);
4259
4260 /* clang needs this memset to avoid a warning */
4261 	memset(job_tres_usage_mins, 0, sizeof(job_tres_usage_mins));
4262 memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
4263
4264 /*
4265 * find out how many CPU minutes this job has been running for.
4266 * We add 1 here to make it so we can check for just > instead of
4267 * >= in our checks.
4268 */
4269 for (i = 0; i < slurmctld_tres_cnt; i++) {
4270 if (i == TRES_ARRAY_ENERGY)
4271 continue;
4272 if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
4273 continue;
4274
4275 if (job_ptr->tres_alloc_cnt[i]) {
4276 job_tres_usage_mins[i] =
4277 (time_delta * job_ptr->tres_alloc_cnt[i]) + 1;
4278 }
4279 }
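	/*
	 * e.g. (hypothetical numbers): a job that has run 90 minutes
	 * (excluding suspended time) on 8 allocated CPUs yields
	 * job_tres_usage_mins[cpu] = 90 * 8 + 1 = 721; the +1 lets the
	 * checks below use ">" rather than ">=".
	 */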
4280
4281 /* check the first QOS setting it's values in the qos_rec */
4282 if (qos_ptr_1 && !_qos_job_time_out(job_ptr, qos_ptr_1,
4283 &qos_rec, job_tres_usage_mins))
4284 goto job_failed;
4285
4286 /* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
4287 if (qos_ptr_2 && !_qos_job_time_out(job_ptr, qos_ptr_2,
4288 &qos_rec, job_tres_usage_mins))
4289 goto job_failed;
4290
4291 /* handle any association stuff here */
4292 while (assoc) {
4293 for (i = 0; i < slurmctld_tres_cnt; i++)
4294 tres_usage_mins[i] =
4295 (uint64_t)(assoc->usage->usage_tres_raw[i]
4296 / 60.0);
4297 wall_mins = assoc->usage->grp_used_wall / 60;
4298
4299 tres_usage = _validate_tres_usage_limits_for_assoc(
4300 &tres_pos, assoc->grp_tres_mins_ctld,
4301 qos_rec.grp_tres_mins_ctld, NULL,
4302 NULL, tres_usage_mins, NULL, false);
4303 switch (tres_usage) {
4304 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
4305 last_job_update = now;
4306 info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
4307 job_ptr, assoc->id, assoc->acct,
4308 assoc->user, assoc->partition,
4309 assoc_mgr_tres_name_array[tres_pos],
4310 assoc->grp_tres_mins_ctld[tres_pos],
4311 tres_usage_mins[tres_pos]);
4312 job_ptr->state_reason = FAIL_TIMEOUT;
4313 goto job_failed;
4314 break;
4315 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
4316 			/* not possible because safe_limits is 0 */
4317 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
4318 			/* not possible because safe_limits is 0 */
4319 case TRES_USAGE_OKAY:
4320 /* all good */
4321 break;
4322 }
4323
4324 if ((qos_rec.grp_wall == INFINITE)
4325 && (assoc->grp_wall != INFINITE)
4326 && (wall_mins >= assoc->grp_wall)) {
4327 info("%pJ timed out, assoc %u is at or exceeds group wall limit %u with %u for account %s",
4328 job_ptr, assoc->id, assoc->grp_wall,
4329 wall_mins, assoc->acct);
4330 job_ptr->state_reason = FAIL_TIMEOUT;
4331 break;
4332 }
4333
4334 orig_node_cnt = job_tres_usage_mins[TRES_ARRAY_NODE];
4335 job_tres_usage_mins[TRES_ARRAY_NODE] = 0;
4336 tres_usage = _validate_tres_usage_limits_for_assoc(
4337 &tres_pos, assoc->max_tres_mins_ctld,
4338 qos_rec.max_tres_mins_pj_ctld, job_tres_usage_mins,
4339 NULL, NULL, NULL, true);
4340 job_tres_usage_mins[TRES_ARRAY_NODE] = orig_node_cnt;
4341 switch (tres_usage) {
4342 case TRES_USAGE_CUR_EXCEEDS_LIMIT:
4343 /* not possible curr_usage is NULL */
4344 break;
4345 case TRES_USAGE_REQ_EXCEEDS_LIMIT:
4346 last_job_update = now;
4347 info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
4348 job_ptr, assoc->id, assoc->acct,
4349 assoc->user, assoc->partition,
4350 assoc_mgr_tres_name_array[tres_pos],
4351 assoc->max_tres_mins_ctld[tres_pos],
4352 job_tres_usage_mins[tres_pos]);
4353 job_ptr->state_reason = FAIL_TIMEOUT;
4354 goto job_failed;
4355 break;
4356 case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
4357 /* not possible tres_usage is NULL */
4358 case TRES_USAGE_OKAY:
4359 /* all good */
4360 break;
4361 }
4362
4363 assoc = assoc->usage->parent_assoc_ptr;
4364 /* these limits don't apply to the root assoc */
4365 if (assoc == assoc_mgr_root_assoc)
4366 break;
4367 }
4368 job_failed:
4369 assoc_mgr_unlock(&locks);
4370 slurmdb_free_qos_rec_members(&qos_rec);
4371
4372 if (job_ptr->state_reason == FAIL_TIMEOUT)
4373 return true;
4374
4375 return false;
4376 }
4377
4378 extern int acct_policy_handle_accrue_time(job_record_t *job_ptr,
4379 bool assoc_mgr_locked)
4380 {
4381 job_record_t *old_job_ptr;
4382 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4383 slurmdb_assoc_rec_t *assoc_ptr;
4384 struct job_details *details_ptr;
4385 slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4386 slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4387
4388 uint32_t max_jobs_accrue = INFINITE;
4389 int create_cnt = 0, i, rc = SLURM_SUCCESS;
4390 time_t now = time(NULL);
4391 bool parent = false;
4392 static time_t sched_update = 0;
4393 static uint16_t priority_flags = 0;
4394 assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK,
4395 NO_LOCK, NO_LOCK, NO_LOCK };
4396
4397 details_ptr = job_ptr->details;
4398 if (!details_ptr) {
4399 error("%s: no details", __func__);
4400 return SLURM_ERROR;
4401 }
4402
4403 if (sched_update != slurmctld_conf.last_update)
4404 priority_flags = slurm_get_priority_flags();
4405
4406 /*
4407 * ACCRUE_ALWAYS flag will always force the accrue_time to be the
4408 * submit_time (Not begin). Accrue limits don't work with this flag.
4409 */
4410 if (priority_flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) {
4411 if (!details_ptr->accrue_time)
4412 details_ptr->accrue_time = details_ptr->submit_time;
4413 return SLURM_SUCCESS;
4414 }
4415
4416 /* Always set accrue_time to begin time when not enforcing limits. */
4417 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
4418 if (!details_ptr->accrue_time)
4419 details_ptr->accrue_time = details_ptr->begin_time;
4420 return SLURM_SUCCESS;
4421 }
4422
4423 /*
4424 * If the job is not eligible because it is either held, dependent or
4425 * because its begin time is in the future don't accrue time.
4426 */
4427 if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT) ||
4428 (details_ptr->begin_time && (details_ptr->begin_time > now)))
4429 return SLURM_SUCCESS;
4430
4431 /* No accrue_time and the job isn't pending, bail */
4432 if (!details_ptr->accrue_time && !IS_JOB_PENDING(job_ptr))
4433 return SLURM_SUCCESS;
4434
4435 assoc_ptr = job_ptr->assoc_ptr;
4436 if (!assoc_ptr) {
4437 debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4438 __func__, job_ptr);
4439 return SLURM_ERROR;
4440 }
4441
4442 if (!assoc_mgr_locked)
4443 assoc_mgr_lock(&locks);
4444
4445 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4446
4447 if (qos_ptr_1) {
4448 used_limits_a1 = acct_policy_get_acct_used_limits(
4449 &qos_ptr_1->usage->acct_limit_list,
4450 assoc_ptr->acct);
4451 used_limits_u1 = acct_policy_get_user_used_limits(
4452 &qos_ptr_1->usage->user_limit_list,
4453 job_ptr->user_id);
4454 }
4455
4456 if (qos_ptr_2) {
4457 used_limits_a2 = acct_policy_get_acct_used_limits(
4458 &qos_ptr_2->usage->acct_limit_list,
4459 assoc_ptr->acct);
4460 used_limits_u2 = acct_policy_get_user_used_limits(
4461 &qos_ptr_2->usage->user_limit_list,
4462 job_ptr->user_id);
4463 }
4464
4465 /* We have started running, let's clear us out of the mix. */
4466 if (details_ptr->accrue_time) {
4467 if (!(job_ptr->bit_flags & JOB_ACCRUE_OVER) &&
4468 !IS_JOB_PENDING(job_ptr)) {
4469 int job_cnt;
4470 /*
4471 * Normally only single jobs come in here, but if we
4472 * don't have any limits and an array is cancelled the
4473 * array itself comes in so we need to remove all of it.
4474 */
4475
4476 if (job_ptr->array_recs &&
4477 job_ptr->array_recs->task_cnt)
4478 job_cnt = job_ptr->array_recs->task_cnt;
4479 else
4480 job_cnt = 1;
4481
4482 /* We only want to handle this once */
4483 job_ptr->bit_flags |= JOB_ACCRUE_OVER;
4484
4485 _remove_accrue_time_internal(job_ptr->assoc_ptr,
4486 qos_ptr_1,
4487 used_limits_a1,
4488 used_limits_u1,
4489 qos_ptr_2,
4490 used_limits_a2,
4491 used_limits_u2,
4492 job_cnt);
4493 }
4494
4495 /* We already have our time and we aren't an array, endit */
4496 if (!IS_JOB_PENDING(job_ptr) ||
4497 !job_ptr->array_recs || !job_ptr->array_recs->task_cnt)
4498 goto endit;
4499 } else if (!IS_JOB_PENDING(job_ptr))
4500 goto endit;
4501
4502 if (qos_ptr_1)
4503 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4504 qos_ptr_1->grp_jobs_accrue,
4505 qos_ptr_1->usage->accrue_cnt);
4506 if (used_limits_a1)
4507 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4508 qos_ptr_1->max_jobs_accrue_pa,
4509 used_limits_a1->accrue_cnt);
4510
4511 if (used_limits_u1)
4512 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4513 qos_ptr_1->max_jobs_accrue_pu,
4514 used_limits_u1->accrue_cnt);
4515 if (qos_ptr_2)
4516 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4517 qos_ptr_2->grp_jobs_accrue,
4518 qos_ptr_2->usage->accrue_cnt);
4519 if (used_limits_a2)
4520 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4521 qos_ptr_2->max_jobs_accrue_pa,
4522 used_limits_a2->accrue_cnt);
4523
4524 if (used_limits_u2)
4525 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4526 qos_ptr_2->max_jobs_accrue_pu,
4527 used_limits_u2->accrue_cnt);
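	/*
	 * At this point max_jobs_accrue holds the tightest accrue limit
	 * found so far and create_cnt the remaining headroom under it
	 * (0 meaning that limit is already reached); the association loop
	 * and the logic below use these to decide how many jobs or array
	 * tasks may start accruing age priority.
	 */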
4528
4529 assoc_ptr = job_ptr->assoc_ptr;
4530 while (assoc_ptr) {
4531 if (max_jobs_accrue != INFINITE)
4532 break;
4533
4534 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4535 assoc_ptr->grp_jobs_accrue,
4536 assoc_ptr->usage->accrue_cnt);
4537 /* We don't need to look at the regular limits for
4538 		 * parents since we have pre-propagated them, so just
4539 * continue with the next parent
4540 */
4541 if (!parent)
4542 _get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4543 assoc_ptr->max_jobs_accrue,
4544 assoc_ptr->usage->accrue_cnt);
4545
4546 /* now go up the hierarchy */
4547 assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4548 parent = true;
4549 }
4550
4551 /* No limit (or there is space to accrue) */
4552 if ((max_jobs_accrue == INFINITE) ||
4553 (create_cnt && (!job_ptr->array_recs ||
4554 !job_ptr->array_recs->task_cnt))) {
4555 if (!details_ptr->accrue_time &&
4556 job_ptr->details->begin_time) {
4557 /*
4558 			 * If there is no limit and begin_time is already set,
4559 			 * use begin_time as the accrue_time; otherwise start accruing now.
4560 */
4561 details_ptr->accrue_time =
4562 ((max_jobs_accrue == INFINITE) &&
4563 details_ptr->begin_time) ?
4564 details_ptr->begin_time : time(NULL);
4565
4566 /*
4567 * If we have an array here and no limit we want to add
4568 * all the tasks in the array.
4569 */
4570 if (job_ptr->array_recs &&
4571 job_ptr->array_recs->task_cnt)
4572 create_cnt = job_ptr->array_recs->task_cnt;
4573 else
4574 create_cnt = 1;
4575
4576 _add_accrue_time_internal(job_ptr->assoc_ptr,
4577 qos_ptr_1,
4578 used_limits_a1,
4579 used_limits_u1,
4580 qos_ptr_2,
4581 used_limits_a2,
4582 used_limits_u2,
4583 create_cnt);
4584 }
4585
4586 goto endit;
4587 }
4588
4589 /* Looks like we are at the limit */
4590 if (!create_cnt) {
4591 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE)
4592 info("%s: %pJ can't accrue, we are over a limit",
4593 __func__, job_ptr);
4594 goto endit;
4595 }
4596
4597 create_cnt = MIN(create_cnt, job_ptr->array_recs->task_cnt);
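	/*
	 * Illustrative (hypothetical numbers): if 3 more jobs may accrue
	 * under the tightest limit and the array still has 10 pending
	 * tasks, the loop below splits off 3 tasks and each starts
	 * accruing age priority now.
	 */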
4598
4599 /* How many can we spin off? */
4600 for (i = 0; i < create_cnt; i++) {
4601 /*
4602 		 * After we split off, old_job_ptr is what we want to alter,
4603 * as the job_ptr returned from job_array_post_sched will be the
4604 * master job_ptr for the array and we will use that to split
4605 * more off if needed.
4606 */
4607 old_job_ptr = job_ptr;
4608
4609 job_array_pre_sched(job_ptr);
4610 job_ptr = job_array_post_sched(job_ptr);
4611
4612 details_ptr = old_job_ptr->details;
4613 if (!details_ptr) {
4614 fatal_abort("%s: no details after split", __func__);
4615 rc = SLURM_ERROR;
4616 _add_accrue_time_internal(job_ptr->assoc_ptr,
4617 qos_ptr_1,
4618 used_limits_a1,
4619 used_limits_u1,
4620 qos_ptr_2,
4621 used_limits_a2,
4622 used_limits_u2,
4623 i - 1);
4624 goto endit;
4625 }
4626 details_ptr->accrue_time = now;
4627 if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE)
4628 info("%pJ is now accruing time %ld", old_job_ptr, now);
4629 }
4630
4631 /*
4632 * Here we are ok to use all the same pointers from the main job_ptr as
4633 * an array will always have the same pointers. If this ever changes in
4634 	 * the future somehow we will need to address it.
4635 */
4636 _add_accrue_time_internal(job_ptr->assoc_ptr,
4637 qos_ptr_1,
4638 used_limits_a1,
4639 used_limits_u1,
4640 qos_ptr_2,
4641 used_limits_a2,
4642 used_limits_u2,
4643 create_cnt);
4644
4645 endit:
4646
4647 if (!assoc_mgr_locked)
4648 assoc_mgr_unlock(&locks);
4649
4650 return rc;
4651 }
4652
4653 extern void acct_policy_add_accrue_time(job_record_t *job_ptr,
4654 bool assoc_mgr_locked)
4655 {
4656 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4657 slurmdb_assoc_rec_t *assoc_ptr;
4658 slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4659 slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4660 assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK,
4661 NO_LOCK, NO_LOCK, NO_LOCK };
4662 int job_cnt;
4663
4664 /* check to see if we are enforcing limits */
4665 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4666 return;
4667
4668 /* If Job is held or dependent don't accrue time */
4669 if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT))
4670 return;
4671
4672 /* Job has to be pending to accrue time. */
4673 if (!IS_JOB_PENDING(job_ptr))
4674 return;
4675
4676 assoc_ptr = job_ptr->assoc_ptr;
4677 if (!assoc_ptr) {
4678 debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4679 __func__, job_ptr);
4680 return;
4681 }
4682
4683 if (!assoc_mgr_locked)
4684 assoc_mgr_lock(&locks);
4685
4686 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4687
4688 if (qos_ptr_1) {
4689 used_limits_a1 = acct_policy_get_acct_used_limits(
4690 &qos_ptr_1->usage->acct_limit_list,
4691 assoc_ptr->acct);
4692 used_limits_u1 = acct_policy_get_user_used_limits(
4693 &qos_ptr_1->usage->user_limit_list,
4694 job_ptr->user_id);
4695 }
4696
4697 if (qos_ptr_2) {
4698 used_limits_a2 = acct_policy_get_acct_used_limits(
4699 &qos_ptr_2->usage->acct_limit_list,
4700 assoc_ptr->acct);
4701 used_limits_u2 = acct_policy_get_user_used_limits(
4702 &qos_ptr_2->usage->user_limit_list,
4703 job_ptr->user_id);
4704 }
4705
4706 /*
4707 * Normally only single jobs come in here, but if we don't have any
4708 * limits the array itself comes in so we need to add it all.
4709 */
4710 if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
4711 job_cnt = job_ptr->array_recs->task_cnt;
4712 else
4713 job_cnt = 1;
4714
4715 _add_accrue_time_internal(assoc_ptr,
4716 qos_ptr_1,
4717 used_limits_a1,
4718 used_limits_u1,
4719 qos_ptr_2,
4720 used_limits_a2,
4721 used_limits_u2,
4722 job_cnt);
4723 if (!assoc_mgr_locked)
4724 assoc_mgr_unlock(&locks);
4725 }
4726
4727 extern void acct_policy_remove_accrue_time(job_record_t *job_ptr,
4728 bool assoc_mgr_locked)
4729 {
4730 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4731 slurmdb_assoc_rec_t *assoc_ptr;
4732 slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4733 slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4734 assoc_mgr_lock_t locks = { .assoc = WRITE_LOCK, .qos = WRITE_LOCK };
4735 int job_cnt;
4736
4737 /* check to see if we are enforcing limits */
4738 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4739 return;
4740
4741 if (!job_ptr->details || !job_ptr->details->accrue_time)
4742 return;
4743
4744 /* Job has to be pending to accrue time. */
4745 if (!IS_JOB_PENDING(job_ptr))
4746 return;
4747
4748 if (!assoc_mgr_locked)
4749 assoc_mgr_lock(&locks);
4750
4751 assoc_ptr = job_ptr->assoc_ptr;
4752 if (!assoc_ptr) {
4753 debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4754 __func__, job_ptr);
4755 goto end_it;
4756 }
4757
4758 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4759
4760 if (qos_ptr_1) {
4761 used_limits_a1 = acct_policy_get_acct_used_limits(
4762 &qos_ptr_1->usage->acct_limit_list,
4763 assoc_ptr->acct);
4764 used_limits_u1 = acct_policy_get_user_used_limits(
4765 &qos_ptr_1->usage->user_limit_list,
4766 job_ptr->user_id);
4767 }
4768
4769 if (qos_ptr_2) {
4770 used_limits_a2 = acct_policy_get_acct_used_limits(
4771 &qos_ptr_2->usage->acct_limit_list,
4772 assoc_ptr->acct);
4773 used_limits_u2 = acct_policy_get_user_used_limits(
4774 &qos_ptr_2->usage->user_limit_list,
4775 job_ptr->user_id);
4776 }
4777
4778 /*
4779 * Normally only single jobs come in here, but if we don't have any
4780 	 * limits the array itself comes in so we need to remove it all.
4781 */
4782 if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
4783 job_cnt = job_ptr->array_recs->task_cnt;
4784 else
4785 job_cnt = 1;
4786
4787 _remove_accrue_time_internal(assoc_ptr,
4788 qos_ptr_1,
4789 used_limits_a1,
4790 used_limits_u1,
4791 qos_ptr_2,
4792 used_limits_a2,
4793 used_limits_u2,
4794 job_cnt);
4795
4796 /* reset the job */
4797 job_ptr->details->accrue_time = 0;
4798 job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
4799
4800 end_it:
4801 if (!assoc_mgr_locked)
4802 assoc_mgr_unlock(&locks);
4803 }
4804
4805 extern uint32_t acct_policy_get_prio_thresh(job_record_t *job_ptr,
4806 bool assoc_mgr_locked)
4807 {
4808 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4809 slurmdb_assoc_rec_t *assoc_ptr;
4810 uint32_t prio_thresh = 0;
4811 assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
4812 NO_LOCK, NO_LOCK, NO_LOCK };
4813
4814 /* check to see if we are enforcing limits */
4815 if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4816 return 0;
4817
4818 assoc_ptr = job_ptr->assoc_ptr;
4819 if (!assoc_ptr) {
4820 debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4821 __func__, job_ptr);
4822 return 0;
4823 }
4824
4825 if (!assoc_mgr_locked)
4826 assoc_mgr_lock(&locks);
4827
4828 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4829
4830 if (qos_ptr_1)
4831 _get_prio_thresh(&prio_thresh, qos_ptr_1->min_prio_thresh);
4832
4833 if (qos_ptr_2)
4834 _get_prio_thresh(&prio_thresh, qos_ptr_2->min_prio_thresh);
4835
4836 _get_prio_thresh(&prio_thresh, assoc_ptr->min_prio_thresh);
4837
4838 if (!assoc_mgr_locked)
4839 assoc_mgr_unlock(&locks);
4840
4841 return prio_thresh;
4842 }
4843
4844 extern time_t acct_policy_get_preemptable_time(job_record_t *job_ptr)
4845 {
4846 slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4847 uint32_t min1, min2, conf_min;
4848 time_t start = job_ptr->start_time;
4849 xassert(verify_lock(CONF_LOCK, READ_LOCK));
4850 xassert(verify_lock(JOB_LOCK, READ_LOCK));
4851 xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));
4852
4853 acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4854 min1 = (qos_ptr_1) ? qos_ptr_1->preempt_exempt_time : INFINITE;
4855 min2 = (qos_ptr_2) ? qos_ptr_2->preempt_exempt_time : INFINITE;
4856 conf_min = slurmctld_conf.preempt_exempt_time;
4857
4858 /* priority: min1 > min2 > conf_min. INFINITE means none. */
4859 if (min1 != INFINITE)
4860 return start + min1;
4861 else if (min2 != INFINITE)
4862 return start + min2;
4863 else if (conf_min != INFINITE)
4864 return start + conf_min;
4865 else
4866 return start;
4867 }
4868
4869 extern bool acct_policy_is_job_preempt_exempt(job_record_t *job_ptr)
4870 {
4871 time_t now = time(0);
4872
4873 assoc_mgr_lock_t locks = { .qos = READ_LOCK };
4874 assoc_mgr_lock(&locks);
4875 time_t preempt_time = acct_policy_get_preemptable_time(job_ptr);
4876 assoc_mgr_unlock(&locks);
4877
4878 return now < preempt_time;
4879 }
4880
4881 extern void acct_policy_set_qos_order(job_record_t *job_ptr,
4882 slurmdb_qos_rec_t **qos_ptr_1,
4883 slurmdb_qos_rec_t **qos_ptr_2)
4884 {
4885 xassert(job_ptr);
4886 xassert(qos_ptr_1);
4887 xassert(qos_ptr_2);
4888
4889 /* Initialize incoming pointers */
4890 *qos_ptr_1 = NULL;
4891 *qos_ptr_2 = NULL;
4892
4893 if (job_ptr->qos_ptr) {
4894 if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr) {
4895 /*
4896 			 * If the job's QOS has the flag to override the
4897 			 * partition QOS then use that, otherwise the
4898 			 * partition's QOS takes precedence.
4899 */
4900 if (job_ptr->qos_ptr->flags & QOS_FLAG_OVER_PART_QOS) {
4901 *qos_ptr_1 = job_ptr->qos_ptr;
4902 *qos_ptr_2 = job_ptr->part_ptr->qos_ptr;
4903 } else {
4904 *qos_ptr_1 = job_ptr->part_ptr->qos_ptr;
4905 *qos_ptr_2 = job_ptr->qos_ptr;
4906 }
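			/*
			 * e.g. a job QOS without the OverPartQOS flag
			 * submitted to a partition that has its own QOS
			 * yields qos_ptr_1 = partition QOS and
			 * qos_ptr_2 = job QOS, so callers consult the
			 * partition QOS limits first.
			 */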
4907
4908 /*
4909 * No reason to look at the same QOS twice, actually
4910 * we never want to do that ;).
4911 */
4912 if (*qos_ptr_1 == *qos_ptr_2)
4913 *qos_ptr_2 = NULL;
4914 } else
4915 *qos_ptr_1 = job_ptr->qos_ptr;
4916 } else if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr)
4917 *qos_ptr_1 = job_ptr->part_ptr->qos_ptr;
4918
4919 return;
4920 }
4921
4922 /*
4923  * Checks *acct_limit_list for a record matching acct. If
4924  * *acct_limit_list doesn't exist it will be created; if the acct
4925  * record doesn't exist it will be added to the list.
4926  * In all cases the account record is returned.
4927 */
4928 extern slurmdb_used_limits_t *acct_policy_get_acct_used_limits(
4929 List *acct_limit_list, char *acct)
4930 {
4931 slurmdb_used_limits_t *used_limits;
4932
4933 xassert(acct_limit_list);
4934
4935 if (!*acct_limit_list)
4936 *acct_limit_list = list_create(slurmdb_destroy_used_limits);
4937
4938 if (!(used_limits = list_find_first(*acct_limit_list,
4939 _find_used_limits_for_acct,
4940 acct))) {
4941 int i = sizeof(uint64_t) * slurmctld_tres_cnt;
4942
4943 used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
4944 used_limits->acct = xstrdup(acct);
4945
4946 used_limits->tres = xmalloc(i);
4947 used_limits->tres_run_mins = xmalloc(i);
4948
4949 list_append(*acct_limit_list, used_limits);
4950 }
4951
4952 return used_limits;
4953 }
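/*
 * Illustrative usage (a sketch mirroring the callers above, e.g. in
 * acct_policy_handle_accrue_time): look up the per-account record hanging
 * off a QOS and read its accrue count.
 *
 *	slurmdb_used_limits_t *ul = acct_policy_get_acct_used_limits(
 *		&qos_ptr->usage->acct_limit_list, assoc_ptr->acct);
 *	... ul->accrue_cnt ...
 *
 * Here qos_ptr and assoc_ptr stand for whatever QOS/association records the
 * caller already holds.
 */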
4954
4955 /*
4956  * Checks *user_limit_list for a record matching user_id. If
4957  * *user_limit_list doesn't exist it will be created; if the user_id
4958  * record doesn't exist it will be added to the list.
4959 * In all cases the user record is returned.
4960 */
4961 extern slurmdb_used_limits_t *acct_policy_get_user_used_limits(
4962 List *user_limit_list, uint32_t user_id)
4963 {
4964 slurmdb_used_limits_t *used_limits;
4965
4966 xassert(user_limit_list);
4967
4968 if (!*user_limit_list)
4969 *user_limit_list = list_create(slurmdb_destroy_used_limits);
4970
4971 if (!(used_limits = list_find_first(*user_limit_list,
4972 _find_used_limits_for_user,
4973 &user_id))) {
4974 int i = sizeof(uint64_t) * slurmctld_tres_cnt;
4975
4976 used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
4977 used_limits->uid = user_id;
4978
4979 used_limits->tres = xmalloc(i);
4980 used_limits->tres_run_mins = xmalloc(i);
4981
4982 list_append(*user_limit_list, used_limits);
4983 }
4984
4985 return used_limits;
4986 }
4987