1 /*****************************************************************************\
2  *  acct_policy.c - Enforce accounting policy
3  *****************************************************************************
4  *  Copyright (C) 2008 Lawrence Livermore National Security.
5  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6  *  Written by Morris Jette <jette1@llnl.gov>
7  *  CODE-OCEC-09-009. All rights reserved.
8  *
9  *  This file is part of Slurm, a resource management program.
10  *  For details, see <https://slurm.schedmd.com/>.
11  *  Please also read the included file: DISCLAIMER.
12  *
13  *  Slurm is free software; you can redistribute it and/or modify it under
14  *  the terms of the GNU General Public License as published by the Free
15  *  Software Foundation; either version 2 of the License, or (at your option)
16  *  any later version.
17  *
18  *  In addition, as a special exception, the copyright holders give permission
19  *  to link the code of portions of this program with the OpenSSL library under
20  *  certain conditions as described in each individual source file, and
21  *  distribute linked combinations including the two. You must obey the GNU
22  *  General Public License in all respects for all of the code used other than
23  *  OpenSSL. If you modify file(s) with this exception, you may extend this
24  *  exception to your version of the file(s), but you are not obligated to do
25  *  so. If you do not wish to do so, delete this exception statement from your
26  *  version.  If you delete this exception statement from all source files in
27  *  the program, then also delete it here.
28  *
29  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
32  *  details.
33  *
34  *  You should have received a copy of the GNU General Public License along
35  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
36  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
37 \*****************************************************************************/
38 
39 #include "slurm/slurm_errno.h"
40 
41 #include "src/common/assoc_mgr.h"
42 #include "src/common/slurm_accounting_storage.h"
43 
44 #include "src/slurmctld/slurmctld.h"
45 #include "src/slurmctld/acct_policy.h"
46 #include "src/common/node_select.h"
47 #include "src/common/slurm_priority.h"
48 
49 #define _DEBUG 0
50 
51 enum {
52 	ACCT_POLICY_ADD_SUBMIT,
53 	ACCT_POLICY_REM_SUBMIT,
54 	ACCT_POLICY_JOB_BEGIN,
55 	ACCT_POLICY_JOB_FINI
56 };
57 
58 typedef enum {
59 	TRES_USAGE_OKAY,
60 	TRES_USAGE_CUR_EXCEEDS_LIMIT,
61 	TRES_USAGE_REQ_EXCEEDS_LIMIT,
62 	TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
63 } acct_policy_tres_usage_t;
64 
65 typedef struct het_job_limits {
66 	slurmdb_assoc_rec_t *assoc_ptr;
67 	job_record_t *job_ptr;
68 	slurmdb_qos_rec_t *qos_ptr_1;
69 	slurmdb_qos_rec_t *qos_ptr_2;
70 } het_job_limits_t;
71 
72 /*
73  * Update a job's allocated node count to reflect only nodes that are not
74  * already allocated to this association.  Needed to enforce GrpNode limit.
75  */
_get_unique_job_node_cnt(job_record_t * job_ptr,bitstr_t * grp_node_bitmap,uint64_t * node_cnt)76 static void _get_unique_job_node_cnt(job_record_t *job_ptr,
77 				     bitstr_t *grp_node_bitmap,
78 				     uint64_t *node_cnt)
79 {
80 	xassert(node_cnt);
81 #if _DEBUG
82 	char node_bitstr[64];
83 	if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap) {
84 		bit_fmt(node_bitstr, sizeof(node_bitstr),
85 			job_ptr->job_resrcs->node_bitmap);
86 		info("%s: %pJ job_resrcs->node_bitmap:%s",  __func__, job_ptr,
87 		     node_bitstr);
88 	} else {
89 		info("%s: %pJ job_resrcs->node_bitmap:NULL",  __func__,
90 		     job_ptr);
91 	}
92 
93 	if (grp_node_bitmap) {
94 		bit_fmt(node_bitstr, sizeof(node_bitstr), grp_node_bitmap);
95 		info("%s: object grp_node_bitmap:%s", __func__,
96 		     node_bitstr);
97 	} else {
98 		info("%s: object grp_node_bitmap:NULL", __func__);
99 	}
100 #endif
101 
102 	if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap &&
103 	    grp_node_bitmap) {
104 		uint64_t overlap_cnt = bit_overlap(
105 			job_ptr->job_resrcs->node_bitmap, grp_node_bitmap);
106 		if (overlap_cnt) {
107 			uint64_t init_cnt = bit_set_count(
108 				job_ptr->job_resrcs->node_bitmap);
109 			*node_cnt = init_cnt - overlap_cnt;
110 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRES_NODE)
111 				info("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
112 				     __func__, job_ptr, init_cnt, *node_cnt);
113 		}
114 	} else if (job_ptr->details && job_ptr->details->req_node_bitmap &&
115 		   grp_node_bitmap) {
116 		uint64_t overlap_cnt = bit_overlap(
117 			job_ptr->details->req_node_bitmap, grp_node_bitmap);
118 		if (overlap_cnt <= *node_cnt) {
119 			*node_cnt -=  overlap_cnt;
120 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRES_NODE)
121 				info("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
122 				     __func__, job_ptr, *node_cnt + overlap_cnt, *node_cnt);
123 		}
124 	}
125 }
126 
127 /*
128  * Update node allocation information for a job being started.
129  * This includes grp_node_bitmap, grp_node_job_cnt and
130  * grp_used_tres[TRES_ARRAY_NODE] of an object (qos, assoc, etc).
131  */
_add_usage_node_bitmap(job_record_t * job_ptr,bitstr_t ** grp_node_bitmap,uint16_t ** grp_node_job_cnt,uint64_t * grp_used_tres)132 static void _add_usage_node_bitmap(job_record_t *job_ptr,
133 				   bitstr_t **grp_node_bitmap,
134 				   uint16_t **grp_node_job_cnt,
135 				   uint64_t *grp_used_tres)
136 {
137 	static int node_cnt = -1;
138 	int i, i_first, i_last;
139 
140 	xassert(grp_node_bitmap);
141 	xassert(grp_node_job_cnt);
142 	xassert(grp_used_tres);
143 
144 	if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) {
145 		if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
146 			/*
147 			 * Hetjobs reach here as part of testing before any
148 			 * resource allocation. See _het_job_limit_check()
149 			 * in src/plugins/sched/backfill/backfill.c
150 			 */
151 		} else if (job_ptr->node_cnt == 0) {
152 			/* Zero size jobs OK to create/destroy burst buffers */
153 		} else {
154 			error("%s: %pJ lacks allocated node bitmap", __func__,
155 			      job_ptr);
156 		}
157 		return;
158 	}
159 	if (*grp_node_bitmap)
160 		bit_or(*grp_node_bitmap, job_ptr->job_resrcs->node_bitmap);
161 	else
162 		*grp_node_bitmap = bit_copy(job_ptr->job_resrcs->node_bitmap);
163 
164 	if (!*grp_node_job_cnt) {
165 		if (node_cnt == -1)
166 			node_cnt = bit_size(*grp_node_bitmap);
167 		*grp_node_job_cnt = xcalloc(node_cnt, sizeof(uint16_t));
168 	}
169 
170 	i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
171 	if (i_first == -1)
172 		i_last = -2;
173 	else
174 		i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
175 	for (i = i_first; i <= i_last; i++) {
176 		if (bit_test(job_ptr->job_resrcs->node_bitmap, i))
177 			(*grp_node_job_cnt)[i]++;
178 	}
179 	*grp_used_tres = bit_set_count(*grp_node_bitmap);
180 }
181 
182 /*
183  * Update node allocation information for a job being completed.
184  * This includes grp_node_bitmap, grp_node_job_cnt and
185  * grp_used_tres[TRES_ARRAY_NODE] of an object (qos, assoc, etc).
186  */
_rm_usage_node_bitmap(job_record_t * job_ptr,bitstr_t * grp_node_bitmap,uint16_t * grp_node_job_cnt,uint64_t * grp_used_tres)187 static void _rm_usage_node_bitmap(job_record_t *job_ptr,
188 				  bitstr_t *grp_node_bitmap,
189 				  uint16_t *grp_node_job_cnt,
190 				  uint64_t *grp_used_tres)
191 {
192 	int i, i_first, i_last;
193 
194 	xassert(grp_used_tres);
195 
196 	if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) {
197 		if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
198 			/*
199 			 * Hetjobs reach here as part of testing before any
200 			 * resource allocation. See _het_job_limit_check()
201 			 * in src/plugins/sched/backfill/backfill.c
202 			 */
203 		} else if (job_ptr->node_cnt == 0) {
204 			/* Zero size jobs OK to create/destroy burst buffers */
205 		} else {
206 			error("%s: %pJ lacks allocated node bitmap", __func__,
207 			      job_ptr);
208 		}
209 		return;
210 	}
211 	if (!grp_node_bitmap) {
212 		error("%s: grp_node_bitmap is NULL", __func__);
213 		return;
214 	}
215 	if (!grp_node_job_cnt) {
216 		error("%s: grp_node_job_cnt is NULL", __func__);
217 		return;
218 	}
219 	i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
220 	if (i_first == -1)
221 		i_last = -2;
222 	else
223 		i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
224 	for (i = i_first; i <= i_last; i++) {
225 		if (!bit_test(job_ptr->job_resrcs->node_bitmap, i))
226 			continue;
227 		if (--grp_node_job_cnt[i] == 0)
228 			bit_clear(grp_node_bitmap, i);
229 	}
230 	*grp_used_tres = bit_set_count(grp_node_bitmap);
231 }
232 
_get_tres_state_reason(int tres_pos,int unk_reason)233 static int _get_tres_state_reason(int tres_pos, int unk_reason)
234 {
235 	switch (tres_pos) {
236 	case TRES_ARRAY_CPU:
237 		switch (unk_reason) {
238 		case WAIT_ASSOC_GRP_UNK:
239 			return WAIT_ASSOC_GRP_CPU;
240 		case WAIT_ASSOC_GRP_UNK_MIN:
241 			return WAIT_ASSOC_GRP_CPU_MIN;
242 		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
243 			return WAIT_ASSOC_GRP_CPU_RUN_MIN;
244 		case WAIT_ASSOC_MAX_UNK_PER_JOB:
245 			return WAIT_ASSOC_MAX_CPU_PER_JOB;
246 		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
247 			return WAIT_ASSOC_MAX_CPU_MINS_PER_JOB;
248 		case WAIT_ASSOC_MAX_UNK_PER_NODE:
249 			return WAIT_ASSOC_MAX_CPU_PER_NODE;
250 		case WAIT_QOS_GRP_UNK:
251 			return WAIT_QOS_GRP_CPU;
252 		case WAIT_QOS_GRP_UNK_MIN:
253 			return WAIT_QOS_GRP_CPU_MIN;
254 		case WAIT_QOS_GRP_UNK_RUN_MIN:
255 			return WAIT_QOS_GRP_CPU_RUN_MIN;
256 		case WAIT_QOS_MAX_UNK_PER_JOB:
257 			return WAIT_QOS_MAX_CPU_PER_JOB;
258 		case WAIT_QOS_MAX_UNK_PER_NODE:
259 			return WAIT_QOS_MAX_CPU_PER_NODE;
260 		case WAIT_QOS_MAX_UNK_PER_ACCT:
261 			return WAIT_QOS_MAX_CPU_PER_ACCT;
262 		case WAIT_QOS_MAX_UNK_PER_USER:
263 			return WAIT_QOS_MAX_CPU_PER_USER;
264 		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
265 			return WAIT_QOS_MAX_CPU_MINS_PER_JOB;
266 		case WAIT_QOS_MIN_UNK:
267 			return WAIT_QOS_MIN_CPU;
268 		default:
269 			return unk_reason;
270 			break;
271 		}
272 		break;
273 	case TRES_ARRAY_MEM:
274 		switch (unk_reason) {
275 		case WAIT_ASSOC_GRP_UNK:
276 			return WAIT_ASSOC_GRP_MEM;
277 		case WAIT_ASSOC_GRP_UNK_MIN:
278 			return WAIT_ASSOC_GRP_MEM_MIN;
279 		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
280 			return WAIT_ASSOC_GRP_MEM_RUN_MIN;
281 		case WAIT_ASSOC_MAX_UNK_PER_JOB:
282 			return WAIT_ASSOC_MAX_MEM_PER_JOB;
283 		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
284 			return WAIT_ASSOC_MAX_MEM_MINS_PER_JOB;
285 		case WAIT_ASSOC_MAX_UNK_PER_NODE:
286 			return WAIT_ASSOC_MAX_MEM_PER_NODE;
287 		case WAIT_QOS_GRP_UNK:
288 			return WAIT_QOS_GRP_MEM;
289 		case WAIT_QOS_GRP_UNK_MIN:
290 			return WAIT_QOS_GRP_MEM_MIN;
291 		case WAIT_QOS_GRP_UNK_RUN_MIN:
292 			return WAIT_QOS_GRP_MEM_RUN_MIN;
293 		case WAIT_QOS_MAX_UNK_PER_JOB:
294 			return WAIT_QOS_MAX_MEM_PER_JOB;
295 		case WAIT_QOS_MAX_UNK_PER_NODE:
296 			return WAIT_QOS_MAX_MEM_PER_NODE;
297 		case WAIT_QOS_MAX_UNK_PER_ACCT:
298 			return WAIT_QOS_MAX_MEM_PER_ACCT;
299 		case WAIT_QOS_MAX_UNK_PER_USER:
300 			return WAIT_QOS_MAX_MEM_PER_USER;
301 		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
302 			return WAIT_QOS_MAX_MEM_MINS_PER_JOB;
303 		case WAIT_QOS_MIN_UNK:
304 			return WAIT_QOS_MIN_MEM;
305 		default:
306 			return unk_reason;
307 			break;
308 		}
309 		break;
310 	case TRES_ARRAY_ENERGY:
311 		switch (unk_reason) {
312 		case WAIT_ASSOC_GRP_UNK:
313 			return WAIT_ASSOC_GRP_ENERGY;
314 		case WAIT_ASSOC_GRP_UNK_MIN:
315 			return WAIT_ASSOC_GRP_ENERGY_MIN;
316 		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
317 			return WAIT_ASSOC_GRP_ENERGY_RUN_MIN;
318 		case WAIT_ASSOC_MAX_UNK_PER_JOB:
319 			return WAIT_ASSOC_MAX_ENERGY_PER_JOB;
320 		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
321 			return WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB;
322 		case WAIT_ASSOC_MAX_UNK_PER_NODE:
323 			return WAIT_ASSOC_MAX_ENERGY_PER_NODE;
324 		case WAIT_QOS_GRP_UNK:
325 			return WAIT_QOS_GRP_ENERGY;
326 		case WAIT_QOS_GRP_UNK_MIN:
327 			return WAIT_QOS_GRP_ENERGY_MIN;
328 		case WAIT_QOS_GRP_UNK_RUN_MIN:
329 			return WAIT_QOS_GRP_ENERGY_RUN_MIN;
330 		case WAIT_QOS_MAX_UNK_PER_JOB:
331 			return WAIT_QOS_MAX_ENERGY_PER_JOB;
332 		case WAIT_QOS_MAX_UNK_PER_NODE:
333 			return WAIT_QOS_MAX_ENERGY_PER_NODE;
334 		case WAIT_QOS_MAX_UNK_PER_ACCT:
335 			return WAIT_QOS_MAX_ENERGY_PER_ACCT;
336 		case WAIT_QOS_MAX_UNK_PER_USER:
337 			return WAIT_QOS_MAX_ENERGY_PER_USER;
338 		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
339 			return WAIT_QOS_MAX_ENERGY_MINS_PER_JOB;
340 		case WAIT_QOS_MIN_UNK:
341 			return WAIT_QOS_MIN_ENERGY;
342 		default:
343 			return unk_reason;
344 			break;
345 		}
346 		break;
347 	case TRES_ARRAY_NODE:
348 		switch (unk_reason) {
349 		case WAIT_ASSOC_GRP_UNK:
350 			return WAIT_ASSOC_GRP_NODE;
351 		case WAIT_ASSOC_GRP_UNK_MIN:
352 			return WAIT_ASSOC_GRP_NODE_MIN;
353 		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
354 			return WAIT_ASSOC_GRP_NODE_RUN_MIN;
355 		case WAIT_ASSOC_MAX_UNK_PER_JOB:
356 			return WAIT_ASSOC_MAX_NODE_PER_JOB;
357 		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
358 			return WAIT_ASSOC_MAX_NODE_MINS_PER_JOB;
359 		case WAIT_QOS_GRP_UNK:
360 			return WAIT_QOS_GRP_NODE;
361 		case WAIT_QOS_GRP_UNK_MIN:
362 			return WAIT_QOS_GRP_NODE_MIN;
363 		case WAIT_QOS_GRP_UNK_RUN_MIN:
364 			return WAIT_QOS_GRP_NODE_RUN_MIN;
365 		case WAIT_QOS_MAX_UNK_PER_JOB:
366 			return WAIT_QOS_MAX_NODE_PER_JOB;
367 		case WAIT_QOS_MAX_UNK_PER_ACCT:
368 			return WAIT_QOS_MAX_NODE_PER_ACCT;
369 		case WAIT_QOS_MAX_UNK_PER_USER:
370 			return WAIT_QOS_MAX_NODE_PER_USER;
371 		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
372 			return WAIT_QOS_MAX_NODE_MINS_PER_JOB;
373 		case WAIT_QOS_MIN_UNK:
374 			return WAIT_QOS_MIN_NODE;
375 		default:
376 			return unk_reason;
377 			break;
378 		}
379 		break;
380 	case TRES_ARRAY_BILLING:
381 		switch (unk_reason) {
382 		case WAIT_ASSOC_GRP_UNK:
383 			return WAIT_ASSOC_GRP_BILLING;
384 		case WAIT_ASSOC_GRP_UNK_MIN:
385 			return WAIT_ASSOC_GRP_BILLING_MIN;
386 		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
387 			return WAIT_ASSOC_GRP_BILLING_RUN_MIN;
388 		case WAIT_ASSOC_MAX_UNK_PER_JOB:
389 			return WAIT_ASSOC_MAX_BILLING_PER_JOB;
390 		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
391 			return WAIT_ASSOC_MAX_BILLING_MINS_PER_JOB;
392 		case WAIT_ASSOC_MAX_UNK_PER_NODE:
393 			return WAIT_ASSOC_MAX_BILLING_PER_NODE;
394 		case WAIT_QOS_GRP_UNK:
395 			return WAIT_QOS_GRP_BILLING;
396 		case WAIT_QOS_GRP_UNK_MIN:
397 			return WAIT_QOS_GRP_BILLING_MIN;
398 		case WAIT_QOS_GRP_UNK_RUN_MIN:
399 			return WAIT_QOS_GRP_BILLING_RUN_MIN;
400 		case WAIT_QOS_MAX_UNK_PER_JOB:
401 			return WAIT_QOS_MAX_BILLING_PER_JOB;
402 		case WAIT_QOS_MAX_UNK_PER_NODE:
403 			return WAIT_QOS_MAX_BILLING_PER_NODE;
404 		case WAIT_QOS_MAX_UNK_PER_ACCT:
405 			return WAIT_QOS_MAX_BILLING_PER_ACCT;
406 		case WAIT_QOS_MAX_UNK_PER_USER:
407 			return WAIT_QOS_MAX_BILLING_PER_USER;
408 		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
409 			return WAIT_QOS_MAX_BILLING_MINS_PER_JOB;
410 		case WAIT_QOS_MIN_UNK:
411 			return WAIT_QOS_MIN_BILLING;
412 		default:
413 			return unk_reason;
414 			break;
415 		}
416 		break;
417 	default:
418 		if (!xstrcmp("gres", assoc_mgr_tres_array[tres_pos]->type))
419 			switch (unk_reason) {
420 			case WAIT_ASSOC_GRP_UNK:
421 				return WAIT_ASSOC_GRP_GRES;
422 			case WAIT_ASSOC_GRP_UNK_MIN:
423 				return WAIT_ASSOC_GRP_GRES_MIN;
424 			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
425 				return WAIT_ASSOC_GRP_GRES_RUN_MIN;
426 			case WAIT_ASSOC_MAX_UNK_PER_JOB:
427 				return WAIT_ASSOC_MAX_GRES_PER_JOB;
428 			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
429 				return WAIT_ASSOC_MAX_GRES_MINS_PER_JOB;
430 			case WAIT_ASSOC_MAX_UNK_PER_NODE:
431 				return WAIT_ASSOC_MAX_GRES_PER_NODE;
432 			case WAIT_QOS_GRP_UNK:
433 				return WAIT_QOS_GRP_GRES;
434 			case WAIT_QOS_GRP_UNK_MIN:
435 				return WAIT_QOS_GRP_GRES_MIN;
436 			case WAIT_QOS_GRP_UNK_RUN_MIN:
437 				return WAIT_QOS_GRP_GRES_RUN_MIN;
438 			case WAIT_QOS_MAX_UNK_PER_JOB:
439 				return WAIT_QOS_MAX_GRES_PER_JOB;
440 			case WAIT_QOS_MAX_UNK_PER_NODE:
441 				return WAIT_QOS_MAX_GRES_PER_NODE;
442 			case WAIT_QOS_MAX_UNK_PER_ACCT:
443 				return WAIT_QOS_MAX_GRES_PER_ACCT;
444 			case WAIT_QOS_MAX_UNK_PER_USER:
445 				return WAIT_QOS_MAX_GRES_PER_USER;
446 			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
447 				return WAIT_QOS_MAX_GRES_MINS_PER_JOB;
448 			case WAIT_QOS_MIN_UNK:
449 				return WAIT_QOS_MIN_GRES;
450 			default:
451 				return unk_reason;
452 				break;
453 			}
454 		else if (!xstrcmp("license",
455 				  assoc_mgr_tres_array[tres_pos]->type))
456 			switch (unk_reason) {
457 			case WAIT_ASSOC_GRP_UNK:
458 				return WAIT_ASSOC_GRP_LIC;
459 			case WAIT_ASSOC_GRP_UNK_MIN:
460 				return WAIT_ASSOC_GRP_LIC_MIN;
461 			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
462 				return WAIT_ASSOC_GRP_LIC_RUN_MIN;
463 			case WAIT_ASSOC_MAX_UNK_PER_JOB:
464 				return WAIT_ASSOC_MAX_LIC_PER_JOB;
465 			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
466 				return WAIT_ASSOC_MAX_LIC_MINS_PER_JOB;
467 			case WAIT_QOS_GRP_UNK:
468 				return WAIT_QOS_GRP_LIC;
469 			case WAIT_QOS_GRP_UNK_MIN:
470 				return WAIT_QOS_GRP_LIC_MIN;
471 			case WAIT_QOS_GRP_UNK_RUN_MIN:
472 				return WAIT_QOS_GRP_LIC_RUN_MIN;
473 			case WAIT_QOS_MAX_UNK_PER_JOB:
474 				return WAIT_QOS_MAX_LIC_PER_JOB;
475 			case WAIT_QOS_MAX_UNK_PER_ACCT:
476 				return WAIT_QOS_MAX_LIC_PER_ACCT;
477 			case WAIT_QOS_MAX_UNK_PER_USER:
478 				return WAIT_QOS_MAX_LIC_PER_USER;
479 			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
480 				return WAIT_QOS_MAX_LIC_MINS_PER_JOB;
481 			case WAIT_QOS_MIN_UNK:
482 				return WAIT_QOS_MIN_LIC;
483 			default:
484 				return unk_reason;
485 				break;
486 			}
487 		else if (!xstrcmp("bb", assoc_mgr_tres_array[tres_pos]->type))
488 			switch (unk_reason) {
489 			case WAIT_ASSOC_GRP_UNK:
490 				return WAIT_ASSOC_GRP_BB;
491 			case WAIT_ASSOC_GRP_UNK_MIN:
492 				return WAIT_ASSOC_GRP_BB_MIN;
493 			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
494 				return WAIT_ASSOC_GRP_BB_RUN_MIN;
495 			case WAIT_ASSOC_MAX_UNK_PER_JOB:
496 				return WAIT_ASSOC_MAX_BB_PER_JOB;
497 			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
498 				return WAIT_ASSOC_MAX_BB_MINS_PER_JOB;
499 			case WAIT_ASSOC_MAX_UNK_PER_NODE:
500 				return WAIT_ASSOC_MAX_BB_PER_NODE;
501 			case WAIT_QOS_GRP_UNK:
502 				return WAIT_QOS_GRP_BB;
503 			case WAIT_QOS_GRP_UNK_MIN:
504 				return WAIT_QOS_GRP_BB_MIN;
505 			case WAIT_QOS_GRP_UNK_RUN_MIN:
506 				return WAIT_QOS_GRP_BB_RUN_MIN;
507 			case WAIT_QOS_MAX_UNK_PER_JOB:
508 				return WAIT_QOS_MAX_BB_PER_JOB;
509 			case WAIT_QOS_MAX_UNK_PER_NODE:
510 				return WAIT_QOS_MAX_BB_PER_NODE;
511 			case WAIT_QOS_MAX_UNK_PER_ACCT:
512 				return WAIT_QOS_MAX_BB_PER_ACCT;
513 			case WAIT_QOS_MAX_UNK_PER_USER:
514 				return WAIT_QOS_MAX_BB_PER_USER;
515 			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
516 				return WAIT_QOS_MAX_BB_MINS_PER_JOB;
517 			case WAIT_QOS_MIN_UNK:
518 				return WAIT_QOS_MIN_BB;
519 			default:
520 				return unk_reason;
521 				break;
522 			}
523 		break;
524 	}
525 
526 	return unk_reason;
527 }
528 
_find_used_limits_for_acct(void * x,void * key)529 static int _find_used_limits_for_acct(void *x, void *key)
530 {
531 	slurmdb_used_limits_t *used_limits = (slurmdb_used_limits_t *)x;
532 	char *account = (char *)key;
533 
534 	if (!xstrcmp(account, used_limits->acct))
535 		return 1;
536 
537 	return 0;
538 }
539 
_find_used_limits_for_user(void * x,void * key)540 static int _find_used_limits_for_user(void *x, void *key)
541 {
542 	slurmdb_used_limits_t *used_limits = (slurmdb_used_limits_t *)x;
543 	uint32_t user_id = *(uint32_t *)key;
544 
545 	if (used_limits->uid == user_id)
546 		return 1;
547 
548 	return 0;
549 }
550 
_valid_job_assoc(job_record_t * job_ptr)551 static bool _valid_job_assoc(job_record_t *job_ptr)
552 {
553 	slurmdb_assoc_rec_t assoc_rec;
554 
555 	if ((job_ptr->assoc_ptr == NULL) ||
556 	    (job_ptr->assoc_ptr->id  != job_ptr->assoc_id) ||
557 	    (job_ptr->assoc_ptr->uid != job_ptr->user_id)) {
558 		error("Invalid assoc_ptr for %pJ", job_ptr);
559 		memset(&assoc_rec, 0, sizeof(slurmdb_assoc_rec_t));
560 
561 		assoc_rec.acct      = job_ptr->account;
562 		if (job_ptr->part_ptr)
563 			assoc_rec.partition = job_ptr->part_ptr->name;
564 		assoc_rec.uid       = job_ptr->user_id;
565 
566 		if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
567 					    accounting_enforce,
568 					    &job_ptr->assoc_ptr, false)) {
569 			info("%s: invalid account or partition for uid=%u %pJ",
570 			     __func__, job_ptr->user_id, job_ptr);
571 			return false;
572 		}
573 		job_ptr->assoc_id = assoc_rec.id;
574 	}
575 	return true;
576 }
577 
_qos_adjust_limit_usage(int type,job_record_t * job_ptr,slurmdb_qos_rec_t * qos_ptr,uint64_t * used_tres_run_secs,uint32_t job_cnt)578 static void _qos_adjust_limit_usage(int type, job_record_t *job_ptr,
579 				    slurmdb_qos_rec_t *qos_ptr,
580 				    uint64_t *used_tres_run_secs,
581 				    uint32_t job_cnt)
582 {
583 	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
584 	int i;
585 
586 	if (!qos_ptr || !job_ptr->assoc_ptr)
587 		return;
588 
589 	used_limits_a =	acct_policy_get_acct_used_limits(
590 		&qos_ptr->usage->acct_limit_list,
591 		job_ptr->assoc_ptr->acct);
592 
593 	used_limits = acct_policy_get_user_used_limits(
594 		&qos_ptr->usage->user_limit_list,
595 		job_ptr->user_id);
596 
597 	switch (type) {
598 	case ACCT_POLICY_ADD_SUBMIT:
599 		qos_ptr->usage->grp_used_submit_jobs += job_cnt;
600 		used_limits->submit_jobs += job_cnt;
601 		used_limits_a->submit_jobs += job_cnt;
602 		break;
603 	case ACCT_POLICY_REM_SUBMIT:
604 		if (qos_ptr->usage->grp_used_submit_jobs >= job_cnt)
605 			qos_ptr->usage->grp_used_submit_jobs -= job_cnt;
606 		else {
607 			qos_ptr->usage->grp_used_submit_jobs = 0;
608 			debug2("acct_policy_remove_job_submit: "
609 			       "grp_submit_jobs underflow for qos %s",
610 			       qos_ptr->name);
611 		}
612 
613 		if (used_limits->submit_jobs >= job_cnt)
614 			used_limits->submit_jobs -= job_cnt;
615 		else {
616 			used_limits->submit_jobs = 0;
617 			debug2("acct_policy_remove_job_submit: "
618 			       "used_submit_jobs underflow for "
619 			       "qos %s user %d",
620 			       qos_ptr->name, used_limits->uid);
621 		}
622 
623 		if (used_limits_a->submit_jobs >= job_cnt)
624 			used_limits_a->submit_jobs -= job_cnt;
625 		else {
626 			used_limits_a->submit_jobs = 0;
627 			debug2("acct_policy_remove_job_submit: "
628 			       "used_submit_jobs underflow for "
629 			       "qos %s account %s",
630 			       qos_ptr->name, used_limits_a->acct);
631 		}
632 
633 		break;
634 	case ACCT_POLICY_JOB_BEGIN:
635 		qos_ptr->usage->grp_used_jobs++;
636 		for (i=0; i<slurmctld_tres_cnt; i++) {
637 			/* tres_alloc_cnt for ENERGY is currently after the
638 			 * fact, so don't add it here or you will get underflows
639 			 * when you remove it.  If this ever changes this will
640 			 * have to be moved to a new TRES ARRAY probably.
641 			 */
642 			if (i == TRES_ARRAY_ENERGY)
643 				continue;
644 			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
645 				continue;
646 
647 			used_limits->tres[i] += job_ptr->tres_alloc_cnt[i];
648 			used_limits_a->tres[i] += job_ptr->tres_alloc_cnt[i];
649 
650 			qos_ptr->usage->grp_used_tres[i] +=
651 				job_ptr->tres_alloc_cnt[i];
652 			qos_ptr->usage->grp_used_tres_run_secs[i] +=
653 				used_tres_run_secs[i];
654 			debug2("acct_policy_job_begin: after adding %pJ, qos %s grp_used_tres_run_secs(%s) is %"PRIu64,
655 			       job_ptr, qos_ptr->name,
656 			       assoc_mgr_tres_name_array[i],
657 			       qos_ptr->usage->grp_used_tres_run_secs[i]);
658 		}
659 
660 		used_limits->jobs++;
661 		used_limits_a->jobs++;
662 
663 		_add_usage_node_bitmap(
664 			job_ptr,
665 			&qos_ptr->usage->grp_node_bitmap,
666 			&qos_ptr->usage->grp_node_job_cnt,
667 			&qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
668 
669 		_add_usage_node_bitmap(
670 			job_ptr,
671 			&used_limits->node_bitmap,
672 			&used_limits->node_job_cnt,
673 			&used_limits->tres[TRES_ARRAY_NODE]);
674 
675 		_add_usage_node_bitmap(
676 			job_ptr,
677 			&used_limits_a->node_bitmap,
678 			&used_limits_a->node_job_cnt,
679 			&used_limits_a->tres[TRES_ARRAY_NODE]);
680 		break;
681 	case ACCT_POLICY_JOB_FINI:
682 		/*
683 		 * If tres_alloc_cnt doesn't exist means ACCT_POLICY_JOB_BEGIN
684 		 * was never called so no need to clean up that which was never
685 		 * set up.
686 		 */
687 		if (!job_ptr->tres_alloc_cnt)
688 			break;
689 		qos_ptr->usage->grp_used_jobs--;
690 		if ((int32_t)qos_ptr->usage->grp_used_jobs < 0) {
691 			qos_ptr->usage->grp_used_jobs = 0;
692 			debug2("acct_policy_job_fini: used_jobs "
693 			       "underflow for qos %s", qos_ptr->name);
694 		}
695 
696 		for (i=0; i<slurmctld_tres_cnt; i++) {
697 			if (i == TRES_ARRAY_ENERGY)
698 				continue;
699 
700 			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
701 				continue;
702 
703 			if (job_ptr->tres_alloc_cnt[i] >
704 			    qos_ptr->usage->grp_used_tres[i]) {
705 				qos_ptr->usage->grp_used_tres[i] = 0;
706 				debug2("acct_policy_job_fini: "
707 				       "grp_used_tres(%s) "
708 				       "underflow for QOS %s",
709 				       assoc_mgr_tres_name_array[i],
710 				       qos_ptr->name);
711 			} else
712 				qos_ptr->usage->grp_used_tres[i] -=
713 					job_ptr->tres_alloc_cnt[i];
714 
715 			if (job_ptr->tres_alloc_cnt[i] > used_limits->tres[i]) {
716 				used_limits->tres[i] = 0;
717 				debug2("acct_policy_job_fini: "
718 				       "used_limits->tres(%s) "
719 				       "underflow for qos %s user %u",
720 				       assoc_mgr_tres_name_array[i],
721 				       qos_ptr->name, used_limits->uid);
722 			} else
723 				used_limits->tres[i] -=
724 					job_ptr->tres_alloc_cnt[i];
725 
726 			if (job_ptr->tres_alloc_cnt[i] >
727 			    used_limits_a->tres[i]) {
728 				used_limits_a->tres[i] = 0;
729 				debug2("acct_policy_job_fini: "
730 				       "used_limits->tres(%s) "
731 				       "underflow for qos %s account %s",
732 				       assoc_mgr_tres_name_array[i],
733 				       qos_ptr->name, used_limits_a->acct);
734 			} else
735 				used_limits_a->tres[i] -=
736 					job_ptr->tres_alloc_cnt[i];
737 		}
738 
739 		if (used_limits->jobs)
740 			used_limits->jobs--;
741 		else
742 			debug2("acct_policy_job_fini: used_jobs "
743 			       "underflow for qos %s user %d",
744 			       qos_ptr->name, used_limits->uid);
745 
746 		if (used_limits_a->jobs)
747 			used_limits_a->jobs--;
748 		else
749 			debug2("acct_policy_job_fini: used_jobs "
750 			       "underflow for qos %s account %s",
751 			       qos_ptr->name, used_limits_a->acct);
752 
753 		_rm_usage_node_bitmap(
754 			job_ptr,
755 			qos_ptr->usage->grp_node_bitmap,
756 			qos_ptr->usage->grp_node_job_cnt,
757 			&qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
758 
759 		_rm_usage_node_bitmap(
760 			job_ptr,
761 			used_limits->node_bitmap,
762 			used_limits->node_job_cnt,
763 			&used_limits->tres[TRES_ARRAY_NODE]);
764 
765 		_rm_usage_node_bitmap(
766 			job_ptr,
767 			used_limits_a->node_bitmap,
768 			used_limits_a->node_job_cnt,
769 			&used_limits_a->tres[TRES_ARRAY_NODE]);
770 		break;
771 	default:
772 		error("acct_policy: qos unknown type %d", type);
773 		break;
774 	}
775 
776 }
777 
_find_qos_part(void * x,void * key)778 static int _find_qos_part(void *x, void *key)
779 {
780 	if ((slurmdb_qos_rec_t *) x == (slurmdb_qos_rec_t *) key)
781 		return 1;	/* match */
782 
783 	return 0;
784 }
785 
_adjust_limit_usage(int type,job_record_t * job_ptr)786 static void _adjust_limit_usage(int type, job_record_t *job_ptr)
787 {
788 	slurmdb_assoc_rec_t *assoc_ptr = NULL;
789 	assoc_mgr_lock_t locks =
790 		{ .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
791 	uint64_t used_tres_run_secs[slurmctld_tres_cnt];
792 	int i;
793 	uint32_t job_cnt = 1;
794 
795 	memset(used_tres_run_secs, 0, sizeof(uint64_t) * slurmctld_tres_cnt);
796 
797 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
798 	    || !_valid_job_assoc(job_ptr))
799 		return;
800 
801 	if (type == ACCT_POLICY_JOB_FINI)
802 		priority_g_job_end(job_ptr);
803 	else if (type == ACCT_POLICY_JOB_BEGIN) {
804 		uint64_t time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
805 		/* take into account usage factor */
806 		if (job_ptr->qos_ptr &&
807 		    (job_ptr->qos_ptr->usage_factor >= 0))
808 			time_limit_secs *= job_ptr->qos_ptr->usage_factor;
809 		for (i = 0; i < slurmctld_tres_cnt; i++) {
810 			if (i == TRES_ARRAY_ENERGY)
811 				continue;
812 			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
813 				continue;
814 
815 			used_tres_run_secs[i] =
816 				job_ptr->tres_alloc_cnt[i] * time_limit_secs;
817 		}
818 	} else if (((type == ACCT_POLICY_ADD_SUBMIT) ||
819 		    (type == ACCT_POLICY_REM_SUBMIT)) &&
820 		   job_ptr->array_recs && job_ptr->array_recs->task_cnt)
821 		job_cnt = job_ptr->array_recs->task_cnt;
822 
823 	assoc_mgr_lock(&locks);
824 
825 	/*
826 	 * This handles removal of the accrual_cnt pending on
827 	 * state.  We do not want to call this on add submit as it could push
828 	 * other jobs pending waiting in line for the limit.  The main call to
829 	 * this that handles the initial call happens in build_job_queue().
830 	 */
831 	if (type != ACCT_POLICY_ADD_SUBMIT)
832 		acct_policy_handle_accrue_time(job_ptr, true);
833 
834 	/*
835 	 * If we have submitted to multiple partitions we need to handle all of
836 	 * them on submit and remove if the job was cancelled before it ran
837 	 * (!job_ptr->tres_alloc_str).
838 	 */
839 	if (((type == ACCT_POLICY_ADD_SUBMIT) ||
840 	     (type == ACCT_POLICY_REM_SUBMIT)) &&
841 	    job_ptr->part_ptr_list &&
842 	    (IS_JOB_PENDING(job_ptr) || !job_ptr->tres_alloc_str)) {
843 		bool job_first = false;
844 		ListIterator part_itr;
845 		part_record_t *part_ptr;
846 		List part_qos_list = NULL;
847 
848 		if (job_ptr->qos_ptr &&
849 		    (((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->flags
850 		     & QOS_FLAG_OVER_PART_QOS))
851 			job_first = true;
852 
853 		if (job_first) {
854 			_qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
855 						used_tres_run_secs, job_cnt);
856 			part_qos_list = list_create(NULL);
857 			list_push(part_qos_list, job_ptr->qos_ptr);
858 		}
859 
860 		part_itr = list_iterator_create(job_ptr->part_ptr_list);
861 		while ((part_ptr = list_next(part_itr))) {
862 			if (!part_ptr->qos_ptr)
863 				continue;
864 			if (!part_qos_list)
865 				part_qos_list = list_create(NULL);
866 			/*
867 			 * Don't adjust usage to this partition's qos if
868 			 * it's the same as the qos of another partition
869 			 * that we already handled.
870 			 */
871 			if (list_find_first(part_qos_list, _find_qos_part,
872 					    part_ptr->qos_ptr))
873 				continue;
874 			list_push(part_qos_list, part_ptr->qos_ptr);
875 			_qos_adjust_limit_usage(type, job_ptr,
876 						part_ptr->qos_ptr,
877 						used_tres_run_secs, job_cnt);
878 		}
879 		list_iterator_destroy(part_itr);
880 
881 		/*
882 		 * Don't adjust usage to this job's qos if
883 		 * it's the same as the qos of a partition
884 		 * that we already handled.
885 		 */
886 		if (!job_first && job_ptr->qos_ptr &&
887 		    (!part_qos_list ||
888 		     !list_find_first(part_qos_list, _find_qos_part,
889 				      job_ptr->qos_ptr)))
890 			_qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
891 						used_tres_run_secs, job_cnt);
892 
893 		FREE_NULL_LIST(part_qos_list);
894 	} else {
895 		slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
896 
897 		/*
898 		 * Here if the job is starting and we had a part_ptr_list before
899 		 * hand we need to remove the submit from all partition qos
900 		 * outside of the one we actually are going to run on.
901 		 */
902 		if ((type == ACCT_POLICY_JOB_BEGIN) &&
903 		    job_ptr->part_ptr_list) {
904 			ListIterator part_itr;
905 			part_record_t *part_ptr;
906 			List part_qos_list = list_create(NULL);
907 
908 			if (job_ptr->qos_ptr)
909 				list_push(part_qos_list, job_ptr->qos_ptr);
910 			if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr &&
911 			    job_ptr->qos_ptr != job_ptr->part_ptr->qos_ptr)
912 				list_push(part_qos_list,
913 					  job_ptr->part_ptr->qos_ptr);
914 
915 			part_itr = list_iterator_create(job_ptr->part_ptr_list);
916 			while ((part_ptr = list_next(part_itr))) {
917 				if (!part_ptr->qos_ptr)
918 					continue;
919 
920 				/*
921 				 * Don't adjust usage to this partition's qos if
922 				 * it's the same as the qos of another partition
923 				 * that we already handled.
924 				 */
925 				if (list_find_first(part_qos_list,
926 						    _find_qos_part,
927 						    part_ptr->qos_ptr))
928 					continue;
929 				_qos_adjust_limit_usage(ACCT_POLICY_REM_SUBMIT,
930 							job_ptr,
931 							part_ptr->qos_ptr,
932 							used_tres_run_secs,
933 							job_cnt);
934 			}
935 			list_iterator_destroy(part_itr);
936 			FREE_NULL_LIST(part_qos_list);
937 		}
938 
939 		acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
940 
941 		_qos_adjust_limit_usage(type, job_ptr, qos_ptr_1,
942 					used_tres_run_secs, job_cnt);
943 		_qos_adjust_limit_usage(type, job_ptr, qos_ptr_2,
944 					used_tres_run_secs, job_cnt);
945 	}
946 
947 	assoc_ptr = job_ptr->assoc_ptr;
948 	while (assoc_ptr) {
949 		switch (type) {
950 		case ACCT_POLICY_ADD_SUBMIT:
951 			assoc_ptr->usage->used_submit_jobs += job_cnt;
952 			break;
953 		case ACCT_POLICY_REM_SUBMIT:
954 			if (assoc_ptr->usage->used_submit_jobs)
955 				assoc_ptr->usage->used_submit_jobs -= job_cnt;
956 			else
957 				debug2("acct_policy_remove_job_submit: "
958 				       "used_submit_jobs underflow for "
959 				       "account %s",
960 				       assoc_ptr->acct);
961 			break;
962 		case ACCT_POLICY_JOB_BEGIN:
963 			assoc_ptr->usage->used_jobs++;
964 			_add_usage_node_bitmap(
965 				job_ptr,
966 				&assoc_ptr->usage->grp_node_bitmap,
967 				&assoc_ptr->usage->grp_node_job_cnt,
968 				&assoc_ptr->usage->
969 				grp_used_tres[TRES_ARRAY_NODE]);
970 
971 			for (i = 0; i < slurmctld_tres_cnt; i++) {
972 				if (i == TRES_ARRAY_ENERGY)
973 					continue;
974 				if (job_ptr->tres_alloc_cnt[i] ==
975 				    NO_CONSUME_VAL64)
976 					continue;
977 
978 				if (i != TRES_ARRAY_NODE) {
979 					assoc_ptr->usage->grp_used_tres[i] +=
980 						job_ptr->tres_alloc_cnt[i];
981 				}
982 				assoc_ptr->usage->grp_used_tres_run_secs[i] +=
983 					used_tres_run_secs[i];
984 				debug2("acct_policy_job_begin: after adding %pJ, assoc %u(%s/%s/%s) grp_used_tres_run_secs(%s) is %"PRIu64,
985 				       job_ptr, assoc_ptr->id, assoc_ptr->acct,
986 				       assoc_ptr->user, assoc_ptr->partition,
987 				       assoc_mgr_tres_name_array[i],
988 				       assoc_ptr->usage->
989 				       grp_used_tres_run_secs[i]);
990 			}
991 			break;
992 		case ACCT_POLICY_JOB_FINI:
993 			if (assoc_ptr->usage->used_jobs)
994 				assoc_ptr->usage->used_jobs--;
995 			else
996 				debug2("acct_policy_job_fini: used_jobs "
997 				       "underflow for account %s",
998 				       assoc_ptr->acct);
999 			_rm_usage_node_bitmap(
1000 				job_ptr,
1001 				assoc_ptr->usage->grp_node_bitmap,
1002 				assoc_ptr->usage->grp_node_job_cnt,
1003 				&assoc_ptr->usage->
1004 				grp_used_tres[TRES_ARRAY_NODE]);
1005 			for (i = 0; i < slurmctld_tres_cnt; i++) {
1006 				if ((i == TRES_ARRAY_ENERGY) ||
1007 				    (i == TRES_ARRAY_NODE))
1008 					continue;
1009 				if (job_ptr->tres_alloc_cnt[i] ==
1010 				    NO_CONSUME_VAL64)
1011 					continue;
1012 
1013 				if (job_ptr->tres_alloc_cnt[i] >
1014 				    assoc_ptr->usage->grp_used_tres[i]) {
1015 					assoc_ptr->usage->grp_used_tres[i] = 0;
1016 					debug2("acct_policy_job_fini: "
1017 					       "grp_used_tres(%s) "
1018 					       "underflow for assoc "
1019 					       "%u(%s/%s/%s)",
1020 					       assoc_mgr_tres_name_array[i],
1021 					       assoc_ptr->id, assoc_ptr->acct,
1022 					       assoc_ptr->user,
1023 					       assoc_ptr->partition);
1024 				} else {
1025 					assoc_ptr->usage->grp_used_tres[i] -=
1026 						job_ptr->tres_alloc_cnt[i];
1027 				}
1028 			}
1029 
1030 			break;
1031 		default:
1032 			error("acct_policy: association unknown type %d", type);
1033 			break;
1034 		}
1035 		/* now handle all the group limits of the parents */
1036 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
1037 	}
1038 	assoc_mgr_unlock(&locks);
1039 }
1040 
_set_time_limit(uint32_t * time_limit,uint32_t part_max_time,uint32_t limit_max_time,uint16_t * limit_set_time)1041 static void _set_time_limit(uint32_t *time_limit, uint32_t part_max_time,
1042 			    uint32_t limit_max_time, uint16_t *limit_set_time)
1043 {
1044 	if ((*time_limit) == NO_VAL) {
1045 		if (limit_max_time)
1046 			(*time_limit) = limit_max_time;
1047 		else if (part_max_time != INFINITE)
1048 			(*time_limit) = part_max_time;
1049 		else
1050 			(*time_limit) = INFINITE;
1051 
1052 		if (limit_set_time)
1053 			(*limit_set_time) = 1;
1054 	} else if (limit_set_time && (*limit_set_time) &&
1055 		   ((*time_limit) > limit_max_time))
1056 		(*time_limit) = limit_max_time;
1057 }
1058 
_qos_alter_job(job_record_t * job_ptr,slurmdb_qos_rec_t * qos_ptr,uint64_t * used_tres_run_secs,uint64_t * new_used_tres_run_secs)1059 static void _qos_alter_job(job_record_t *job_ptr,
1060 			   slurmdb_qos_rec_t *qos_ptr,
1061 			   uint64_t *used_tres_run_secs,
1062 			   uint64_t *new_used_tres_run_secs)
1063 {
1064 	int i;
1065 
1066 	if (!qos_ptr || !job_ptr)
1067 		return;
1068 
1069 	for (i=0; i<slurmctld_tres_cnt; i++) {
1070 		if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
1071 			continue;
1072 		qos_ptr->usage->grp_used_tres_run_secs[i] -=
1073 			used_tres_run_secs[i];
1074 		qos_ptr->usage->grp_used_tres_run_secs[i] +=
1075 			new_used_tres_run_secs[i];
1076 		debug2("altering %pJ QOS %s got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
1077 		       job_ptr, qos_ptr->name,
1078 		       qos_ptr->usage->grp_used_tres_run_secs[i],
1079 		       used_tres_run_secs[i],
1080 		       new_used_tres_run_secs[i]);
1081 	}
1082 }
1083 
1084 /*
1085  * _validate_tres_limits_for_assoc - validate the tres requested against limits
1086  * of an association as well as qos skipping any limit an admin set
1087  *
1088  * OUT - tres_pos - if false is returned position in array of failed limit
1089  * IN - job_tres_array - count of various TRES requested by the job
1090  * IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
1091  * IN - assoc_tres_array - TRES limits from an association (Grp, Max, Min)
1092  * IN - qos_tres_array - TRES limits QOS has imposed already
1093  * IN - acct_policy_limit_set_array - limits that have been overridden
1094  *                                    by an admin
1095  * IN strict_checking - If a limit needs to be enforced now or not.
1096  * IN update_call - If this is an update or a create call
1097  * IN max_limit - Limits are for MAX else, the limits are MIN.
1098  *
1099  * RET - True if no limit is violated, false otherwise with tres_pos
1100  * being set to the position of the failed limit.
1101  */
_validate_tres_limits_for_assoc(int * tres_pos,uint64_t * job_tres_array,uint64_t divisor,uint64_t * assoc_tres_array,uint64_t * qos_tres_array,uint16_t * admin_set_limit_tres_array,bool strict_checking,bool update_call,bool max_limit)1102 static bool _validate_tres_limits_for_assoc(
1103 	int *tres_pos,
1104 	uint64_t *job_tres_array,
1105 	uint64_t divisor,
1106 	uint64_t *assoc_tres_array,
1107 	uint64_t *qos_tres_array,
1108 	uint16_t *admin_set_limit_tres_array,
1109 	bool strict_checking,
1110 	bool update_call, bool max_limit)
1111 {
1112 	int i;
1113 	uint64_t job_tres;
1114 
1115 	if (!strict_checking)
1116 		return true;
1117 
1118 	for (i = 0; i < g_tres_count; i++) {
1119 		(*tres_pos) = i;
1120 
1121 		if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT)
1122 		    || (qos_tres_array[i] != INFINITE64)
1123 		    || (assoc_tres_array[i] == INFINITE64)
1124 		    || (!job_tres_array[i] && !update_call))
1125 			continue;
1126 
1127 		job_tres = job_tres_array[i];
1128 
1129 		if (divisor)
1130 			job_tres /= divisor;
1131 
1132 		if (max_limit) {
1133 			if (job_tres > assoc_tres_array[i])
1134 				return false;
1135 		} else if (job_tres < assoc_tres_array[i])
1136 				return false;
1137 	}
1138 
1139 	return true;
1140 }
1141 
1142 
1143 /*
1144  * _validate_tres_limits_for_qos - validate the tres requested against limits
1145  * of a QOS as well as qos skipping any limit an admin set
1146  *
1147  * OUT - tres_pos - if false is returned position in array of failed limit
1148  * IN - job_tres_array - count of various TRES requested by the job
1149  * IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
1150  * IN - grp_tres_array - Grp TRES limits from QOS
1151  * IN - max_tres_array - Max/Min TRES limits from QOS
1152  * IN/OUT - out_grp_tres_array - Grp TRES limits QOS has imposed already,
1153  *                               if a new limit is found the limit is filled in.
1154  * IN/OUT - out_max_tres_array - Max/Min TRES limits QOS has imposed already,
1155  *                               if a new limit is found the limit is filled in.
1156  * IN - acct_policy_limit_set_array - limits that have been overridden
1157  *                                    by an admin
1158  * IN strict_checking - If a limit needs to be enforced now or not.
1159  * IN max_limit - Limits are for MAX else, the limits are MIN.
1160  *
1161  * RET - True if no limit is violated, false otherwise with tres_pos
1162  * being set to the position of the failed limit.
1163  */
_validate_tres_limits_for_qos(int * tres_pos,uint64_t * job_tres_array,uint64_t divisor,uint64_t * grp_tres_array,uint64_t * max_tres_array,uint64_t * out_grp_tres_array,uint64_t * out_max_tres_array,uint16_t * admin_set_limit_tres_array,bool strict_checking,bool max_limit)1164 static bool _validate_tres_limits_for_qos(
1165 	int *tres_pos,
1166 	uint64_t *job_tres_array,
1167 	uint64_t divisor,
1168 	uint64_t *grp_tres_array,
1169 	uint64_t *max_tres_array,
1170 	uint64_t *out_grp_tres_array,
1171 	uint64_t *out_max_tres_array,
1172 	uint16_t *admin_set_limit_tres_array,
1173 	bool strict_checking, bool max_limit)
1174 {
1175 	uint64_t max_tres_limit, out_max_tres_limit;
1176 	int i;
1177 	uint64_t job_tres;
1178 
1179 	if (!strict_checking)
1180 		return true;
1181 
1182 	for (i = 0; i < g_tres_count; i++) {
1183 		(*tres_pos) = i;
1184 		if (grp_tres_array) {
1185 			max_tres_limit = MIN(grp_tres_array[i],
1186 					     max_tres_array[i]);
1187 			out_max_tres_limit = MIN(out_grp_tres_array[i],
1188 						 out_max_tres_array[i]);
1189 		} else {
1190 			max_tres_limit = max_tres_array[i];
1191 			out_max_tres_limit = out_max_tres_array[i];
1192 		}
1193 
1194 		/* we don't need to look at this limit */
1195 		if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT)
1196 		    || (out_max_tres_limit != INFINITE64)
1197 		    || (max_tres_limit == INFINITE64)
1198 		    || (job_tres_array[i] && (job_tres_array[i] == NO_VAL64)))
1199 			continue;
1200 
1201 		out_max_tres_array[i] = max_tres_array[i];
1202 
1203 		job_tres = job_tres_array[i];
1204 
1205 		if (divisor)
1206 			job_tres /= divisor;
1207 
1208 		if (out_grp_tres_array && grp_tres_array) {
1209 			if (out_grp_tres_array[i] == INFINITE64)
1210 				out_grp_tres_array[i] = grp_tres_array[i];
1211 
1212 			if (max_limit) {
1213 				if (job_tres > grp_tres_array[i])
1214 					return false;
1215 			}  else if (job_tres < grp_tres_array[i])
1216 				return false;
1217 		}
1218 
1219 		if (max_limit) {
1220 			if (job_tres > max_tres_array[i])
1221 				return false;
1222 		} else if (job_tres < max_tres_array[i])
1223 			return false;
1224 	}
1225 
1226 	return true;
1227 }
1228 
1229 /* Only check the time_limits if the admin didn't set
1230  * the timelimit.
1231  * It is important we look at these even if strict_checking
1232  * isn't set so we get the correct time_limit from the job.
1233  */
_validate_time_limit(uint32_t * time_limit_in,uint32_t part_max_time,uint64_t tres_req_cnt,uint64_t max_limit,void * out_max_limit,uint16_t * limit_set_time,bool strict_checking,bool is64)1234 static bool _validate_time_limit(uint32_t *time_limit_in,
1235 				 uint32_t part_max_time,
1236 				 uint64_t tres_req_cnt,
1237 				 uint64_t max_limit,
1238 				 void *out_max_limit,
1239 				 uint16_t *limit_set_time,
1240 				 bool strict_checking,
1241 				 bool is64)
1242 {
1243 	uint32_t max_time_limit;
1244 	uint64_t out_max_64 = *(uint64_t *)out_max_limit;
1245 	uint32_t out_max_32 = *(uint32_t *)out_max_limit;
1246 
1247        if (!tres_req_cnt || (((*time_limit_in) != NO_VAL) &&
1248            (!strict_checking ||
1249             (*limit_set_time) == ADMIN_SET_LIMIT)))
1250 		return true;
1251 
1252 	if (is64) {
1253 		if ((out_max_64 != INFINITE64) ||
1254 		    (max_limit == INFINITE64) ||
1255 		    (tres_req_cnt == NO_VAL64))
1256 			return true;
1257 	} else {
1258 		if ((out_max_32 != INFINITE) ||
1259 		    ((uint32_t)max_limit == INFINITE) ||
1260 		    ((uint32_t)tres_req_cnt == NO_VAL))
1261 			return true;
1262 	}
1263 
1264 	max_time_limit = (uint32_t)(max_limit / tres_req_cnt);
1265 
1266 	_set_time_limit(time_limit_in, part_max_time, max_time_limit,
1267 			limit_set_time);
1268 
1269 	if (is64)
1270 		(*(uint64_t *)out_max_limit) = max_limit;
1271 	else
1272 		(*(uint32_t *)out_max_limit) = (uint32_t)max_limit;
1273 
1274 	if ((*time_limit_in) > max_time_limit)
1275 		return false;
1276 
1277 	return true;
1278 }
1279 
1280 /*
1281  * _validate_tres_time_limits - validate the tres requested
1282  * against limits of an association as well as qos skipping any limit
1283  * an admin set
1284  *
1285  * OUT - tres_pos - if false is returned position in array of failed limit
1286  * IN/OUT - time_limit_in - Job's time limit, set and returned based off limits
1287  *                          if none is given.
1288  * IN - part_max_time - Job's partition max time limit
1289  * IN - job_tres_array - count of various TRES requested by the job
1290  * IN - max_tres_array - Max TRES limits of association/QOS
1291  * OUT - out_max_tres_array - Max TRES limits as set by the various TRES
1292  * OUT - limit_set_time - set if the time_limit was set by a limit QOS/Assoc or
1293  *                        otherwise.
1294  * IN strict_checking - If a limit needs to be enforced now or not.
1295  *
1296  * RET - True if no limit is violated, false otherwise with tres_pos
1297  * being set to the position of the failed limit.
1298  */
_validate_tres_time_limits(int * tres_pos,uint32_t * time_limit_in,uint32_t part_max_time,uint64_t * job_tres_array,uint64_t * max_tres_array,uint64_t * out_max_tres_array,uint16_t * limit_set_time,bool strict_checking)1299 static bool _validate_tres_time_limits(
1300 	int *tres_pos,
1301 	uint32_t *time_limit_in,
1302 	uint32_t part_max_time,
1303 	uint64_t *job_tres_array,
1304 	uint64_t *max_tres_array,
1305 	uint64_t *out_max_tres_array,
1306 	uint16_t *limit_set_time,
1307 	bool strict_checking)
1308 {
1309 	int i;
1310 //	uint32_t max_time_limit;
1311 
1312 	if (!strict_checking || (*limit_set_time) == ADMIN_SET_LIMIT)
1313 		return true;
1314 
1315 	for (i = 0; i < g_tres_count; i++) {
1316 		(*tres_pos) = i;
1317 
1318 		if (!_validate_time_limit(time_limit_in, part_max_time,
1319 					  job_tres_array[i],
1320 					  max_tres_array[i],
1321 					  &out_max_tres_array[i],
1322 					  limit_set_time,
1323 					  strict_checking, true))
1324 			return false;
1325 		/* if ((out_max_tres_array[i] != INFINITE64) || */
1326 		/*     (max_tres_array[i] == INFINITE64) || */
1327 		/*     (job_tres_array[i] == NO_VAL64) || */
1328 		/*     (job_tres_array[i] == 0)) */
1329 		/* 	continue; */
1330 
1331 		/* max_time_limit = (uint32_t)(max_tres_array[i] / */
1332 		/* 			    job_tres_array[i]); */
1333 
1334 		/* _set_time_limit(time_limit_in, */
1335 		/* 		part_max_time, max_time_limit, */
1336 		/* 		limit_set_time); */
1337 
1338 		/* out_max_tres_array[i] = max_tres_array[i]; */
1339 
1340 		/* if ((*time_limit_in) > max_time_limit) */
1341 		/* 	return false; */
1342 	}
1343 
1344 	return true;
1345 }
1346 
1347 /*
1348  * _validate_tres_usage_limits - validate the TRES requested against
1349  * specified limits; when checking for safe limits, also take into
1350  * consideration already used and currently running TRES resources
1351  *
1352  * OUT - tres_pos - if function returns other than TRES_USAGE_OKAY,
1353  *                  position in TRES array of failed limit
1354  * IN - tres_limit_array - count of various TRES limits to check against
1355  * OUT - out_tres_limit_array - optional; assigned values from tres_limit_array
1356  *                              when out_tres_limit_set is true,
1357  *                              skipped when any of:
1358  *                              1) admin_limit_set is set and is an admin
1359  *                                 limit
1360  *                              2) out_tres_limit_array is set and its value
1361  *                                 has been changed since initially being set
1362  *                                 to INFINITE64
1363  *                              3) tres_limit_array is INFINITE64
1364  * IN - tres_req_cnt - must be set when safe_limits is true; the following
1365  *                     is checked with tres_req_cnt:
1366  *                     1) tres_req_cnt > tres_limit_array,
1367  *                        return TRES_USAGE_REQ_EXCEEDS_LIMIT
1368  *                     2) when tres_usage is set:
1369  *                        (tres_req_cnt + tres_usage) >
1370  *                        (tres_limit_array - curr_usage),
1371  *                        return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
1372  *                        curr_usage will be 0 when not passed
1373  * IN - tres_usage - TRES (currently running if curr_usage is set, already used
1374  *                   otherwise) optional; This value is used primarily only if
1375  *                   safe_limits is true.  It will be added to tres_req_cnt to
1376  *                   count as extra time to observe, see tres_req_cnt section
1377  *                   above for tres_usage interaction
1378  * IN - curr_usage - TRES (already used) optional; when set, check if:
1379  *                   1) curr_usage > tres_limit_array
1380  *                      return TRES_USAGE_CUR_EXCEEDS_LIMIT
1381  *                   2) when safe_limits is true, see tres_req_cnt section
1382  *                      above for curr_usage interaction
1383  * IN - admin_limit_set - limits that have been overridden by an admin, see
1384  *                        out_tres_limit_array section above for interaction
1385  * IN - safe_limits - requires tres_req_cnt when true; see tres_req_cnt
1386  *                    section above for interaction
1387  * IN - out_tres_limit_set - out_tres_limit_array is set as described above
1388  *      when true; out_tres_limit_array is not modified when false
1389  * RET - TRES_USAGE_OKAY if no limit is violated, otherwise one of the other
1390  *       acct_policy_tres_usage_t enumerations with tres_pos being set to the
1391  *       position of the failed limit.
1392  */
_validate_tres_usage_limits(int * tres_pos,uint64_t * tres_limit_array,uint64_t * out_tres_limit_array,uint64_t * tres_req_cnt,uint64_t * tres_usage,uint64_t * curr_usage,uint16_t * admin_limit_set,bool safe_limits,bool out_tres_limit_set)1393 static acct_policy_tres_usage_t _validate_tres_usage_limits(
1394 	int *tres_pos,
1395 	uint64_t *tres_limit_array,
1396 	uint64_t *out_tres_limit_array,
1397 	uint64_t *tres_req_cnt,
1398 	uint64_t *tres_usage,
1399 	uint64_t *curr_usage,
1400 	uint16_t *admin_limit_set,
1401 	bool safe_limits,
1402 	bool out_tres_limit_set)
1403 {
1404 	int i;
1405 	uint64_t usage = 0;
1406 
1407 	xassert(tres_limit_array);
1408 
1409 	for (i = 0; i < g_tres_count; i++) {
1410 		(*tres_pos) = i;
1411 
1412 		if ((admin_limit_set &&
1413 		     admin_limit_set[i] == ADMIN_SET_LIMIT) ||
1414 		    (out_tres_limit_array &&
1415 		     out_tres_limit_array[i] != INFINITE64) ||
1416 		    (tres_limit_array[i] == INFINITE64))
1417 			continue;
1418 
1419 		if (out_tres_limit_set && out_tres_limit_array)
1420 			out_tres_limit_array[i] = tres_limit_array[i];
1421 
1422 		if (curr_usage && (curr_usage[i] >= tres_limit_array[i]))
1423 			return TRES_USAGE_CUR_EXCEEDS_LIMIT;
1424 
1425 		if (safe_limits) {
1426 			xassert(tres_req_cnt);
1427 			if (tres_req_cnt[i] > tres_limit_array[i])
1428 				return TRES_USAGE_REQ_EXCEEDS_LIMIT;
1429 
1430 			if (curr_usage)
1431 				usage = curr_usage[i];
1432 			if (tres_usage &&
1433 			    ((tres_req_cnt[i] + tres_usage[i]) >
1434 			     (tres_limit_array[i] - usage)))
1435 				return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE;
1436 		}
1437 	}
1438 
1439 	return TRES_USAGE_OKAY;
1440 }
1441 
1442 /*
1443  * _validate_tres_usage_limits_for_qos - validate the tres requested
1444  * against limits of an qos skipping any limit an admin set
1445  *
1446  * OUT - tres_pos - if false is returned position in array of failed limit
1447  * IN - tres_limit_array - TRES limits from an association
1448  * IN/OUT - out_tres_limit_array - TRES limits QOS has imposed already, if a new
1449  *                                 limit is found the limit is filled in.
1450  * IN - tres_req_cnt - TRES requested from the job
1451  * IN - tres_usage - TRES usage from the QOS (in minutes)
1452  * IN - curr_usage - TRES usage in use right now by the QOS (running jobs)
1453  * IN - admin_limit_set - TRES limits that have been overridden by an admin
1454  * IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
1455  *
1456  * RET - True if no limit is violated, false otherwise with tres_pos
1457  * being set to the position of the failed limit.
1458  */
_validate_tres_usage_limits_for_qos(int * tres_pos,uint64_t * tres_limit_array,uint64_t * out_tres_limit_array,uint64_t * tres_req_cnt,uint64_t * tres_usage,uint64_t * curr_usage,uint16_t * admin_limit_set,bool safe_limits)1459 static acct_policy_tres_usage_t _validate_tres_usage_limits_for_qos(
1460 	int *tres_pos,
1461 	uint64_t *tres_limit_array,
1462 	uint64_t *out_tres_limit_array,
1463 	uint64_t *tres_req_cnt,
1464 	uint64_t *tres_usage,
1465 	uint64_t *curr_usage,
1466 	uint16_t *admin_limit_set,
1467 	bool safe_limits)
1468 {
1469 	return _validate_tres_usage_limits(tres_pos,
1470 					   tres_limit_array,
1471 					   out_tres_limit_array,
1472 					   tres_req_cnt,
1473 					   tres_usage,
1474 					   curr_usage,
1475 					   admin_limit_set,
1476 					   safe_limits,
1477 					   true);
1478 }
1479 
1480 /*
1481  * _validate_tres_usage_limits_for_assoc - validate the tres requested
1482  * against limits of an association as well as qos skipping any limit
1483  * an admin set
1484  *
1485  * OUT - tres_pos - if false is returned position in array of failed limit
1486  * IN - tres_limit_array - TRES limits from an association
1487  * IN - qos_tres_limit_array - TRES limits QOS has imposed already
1488  * IN - tres_req_cnt - TRES requested from the job
1489  * IN - tres_usage - TRES usage from the association (in minutes)
1490  * IN - curr_usage - TRES usage in use right now by the assoc (running jobs)
1491  * IN - admin_limit_set - TRES limits that have been overridden by an admin
1492  * IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
1493  *
1494  * RET - True if no limit is violated, false otherwise with tres_pos
1495  * being set to the position of the failed limit.
1496  */
_validate_tres_usage_limits_for_assoc(int * tres_pos,uint64_t * tres_limit_array,uint64_t * qos_tres_limit_array,uint64_t * tres_req_cnt,uint64_t * tres_usage,uint64_t * curr_usage,uint16_t * admin_limit_set,bool safe_limits)1497 static acct_policy_tres_usage_t _validate_tres_usage_limits_for_assoc(
1498 	int *tres_pos,
1499 	uint64_t *tres_limit_array,
1500 	uint64_t *qos_tres_limit_array,
1501 	uint64_t *tres_req_cnt,
1502 	uint64_t *tres_usage,
1503 	uint64_t *curr_usage,
1504 	uint16_t *admin_limit_set,
1505 	bool safe_limits)
1506 {
1507 	return _validate_tres_usage_limits(tres_pos,
1508 					   tres_limit_array,
1509 					   qos_tres_limit_array,
1510 					   tres_req_cnt,
1511 					   tres_usage,
1512 					   curr_usage,
1513 					   admin_limit_set,
1514 					   safe_limits,
1515 					   false);
1516 }
1517 
_qos_policy_validate(job_desc_msg_t * job_desc,slurmdb_assoc_rec_t * assoc_ptr,part_record_t * part_ptr,slurmdb_qos_rec_t * qos_ptr,slurmdb_qos_rec_t * qos_out_ptr,uint32_t * reason,acct_policy_limit_set_t * acct_policy_limit_set,bool update_call,char * user_name,int job_cnt,bool strict_checking)1518 static int _qos_policy_validate(job_desc_msg_t *job_desc,
1519 				slurmdb_assoc_rec_t *assoc_ptr,
1520 				part_record_t *part_ptr,
1521 				slurmdb_qos_rec_t *qos_ptr,
1522 				slurmdb_qos_rec_t *qos_out_ptr,
1523 				uint32_t *reason,
1524 				acct_policy_limit_set_t *acct_policy_limit_set,
1525 				bool update_call,
1526 				char *user_name,
1527 				int job_cnt,
1528 				bool strict_checking)
1529 {
1530 	int rc = true;
1531 	int tres_pos = 0;
1532 
1533 	if (!qos_ptr || !qos_out_ptr)
1534 		return rc;
1535 
1536 	if (!_validate_tres_limits_for_qos(&tres_pos,
1537 					   job_desc->tres_req_cnt, 0,
1538 					   NULL,
1539 					   qos_ptr->max_tres_pa_ctld,
1540 					   NULL,
1541 					   qos_out_ptr->max_tres_pa_ctld,
1542 					   acct_policy_limit_set->tres,
1543 					   strict_checking, 1)) {
1544 		if (job_desc->tres_req_cnt[tres_pos] >
1545 		    qos_ptr->max_tres_pa_ctld[tres_pos]) {
1546 			if (reason)
1547 				*reason = _get_tres_state_reason(
1548 					tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
1549 
1550 			debug2("job submit for user %s(%u): "
1551 			       "min tres(%s) request %"PRIu64" exceeds "
1552 			       "per-acct max tres limit %"PRIu64" for qos '%s'",
1553 			       user_name,
1554 			       job_desc->user_id,
1555 			       assoc_mgr_tres_name_array[tres_pos],
1556 			       job_desc->tres_req_cnt[tres_pos],
1557 			       qos_ptr->max_tres_pa_ctld[tres_pos],
1558 			       qos_ptr->name);
1559 			rc = false;
1560 			goto end_it;
1561 		}
1562 	}
1563 
1564 	if (!_validate_tres_limits_for_qos(&tres_pos,
1565 					   job_desc->tres_req_cnt, 0,
1566 					   qos_ptr->grp_tres_ctld,
1567 					   qos_ptr->max_tres_pu_ctld,
1568 					   qos_out_ptr->grp_tres_ctld,
1569 					   qos_out_ptr->max_tres_pu_ctld,
1570 					   acct_policy_limit_set->tres,
1571 					   strict_checking, 1)) {
1572 		if (job_desc->tres_req_cnt[tres_pos] >
1573 		    qos_ptr->max_tres_pu_ctld[tres_pos]) {
1574 			if (reason)
1575 				*reason = _get_tres_state_reason(
1576 					tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
1577 
1578 			debug2("job submit for user %s(%u): "
1579 			       "min tres(%s) request %"PRIu64" exceeds "
1580 			       "per-user max tres limit %"PRIu64" for qos '%s'",
1581 			       user_name,
1582 			       job_desc->user_id,
1583 			       assoc_mgr_tres_name_array[tres_pos],
1584 			       job_desc->tres_req_cnt[tres_pos],
1585 			       qos_ptr->max_tres_pu_ctld[tres_pos],
1586 			       qos_ptr->name);
1587 			rc = false;
1588 			goto end_it;
1589 		} else if (job_desc->tres_req_cnt[tres_pos] >
1590 			   qos_ptr->grp_tres_ctld[tres_pos]) {
1591 			if (reason)
1592 				*reason = _get_tres_state_reason(
1593 					tres_pos, WAIT_QOS_GRP_UNK);
1594 
1595 			debug2("job submit for user %s(%u): "
1596 			       "min tres(%s) request %"PRIu64" exceeds "
1597 			       "group max tres limit %"PRIu64" for qos '%s'",
1598 			       user_name,
1599 			       job_desc->user_id,
1600 			       assoc_mgr_tres_name_array[tres_pos],
1601 			       job_desc->tres_req_cnt[tres_pos],
1602 			       qos_ptr->grp_tres_ctld[tres_pos],
1603 			       qos_ptr->name);
1604 			rc = false;
1605 			goto end_it;
1606 		}
1607 	}
1608 
1609 	/* for validation we don't need to look at
1610 	 * qos_ptr->grp_jobs.
1611 	 */
1612 
1613 	if ((qos_out_ptr->grp_submit_jobs == INFINITE) &&
1614 	    (qos_ptr->grp_submit_jobs != INFINITE)) {
1615 
1616 		qos_out_ptr->grp_submit_jobs = qos_ptr->grp_submit_jobs;
1617 
1618 		if ((qos_ptr->usage->grp_used_submit_jobs + job_cnt)
1619 		    > qos_ptr->grp_submit_jobs) {
1620 			if (reason)
1621 				*reason = WAIT_QOS_GRP_SUB_JOB;
1622 			debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1623 			       user_name,
1624 			       job_desc->user_id,
1625 			       qos_ptr->grp_submit_jobs,
1626 			       qos_ptr->usage->grp_used_submit_jobs, job_cnt,
1627 			       qos_ptr->name);
1628 			rc = false;
1629 			goto end_it;
1630 		}
1631 	}
1632 
1633 	/* Only check the time_limits if the admin didn't set the timelimit.
1634 	 * It is important we look at these even if strict_checking
1635 	 * isn't set so we get the correct time_limit from the job.
1636 	 */
1637 	if (acct_policy_limit_set->time != ADMIN_SET_LIMIT) {
1638 		if (!_validate_tres_time_limits(
1639 			    &tres_pos,
1640 			    &job_desc->time_limit,
1641 			    part_ptr->max_time,
1642 			    job_desc->tres_req_cnt,
1643 			    qos_ptr->max_tres_mins_pj_ctld,
1644 			    qos_out_ptr->max_tres_mins_pj_ctld,
1645 			    &acct_policy_limit_set->time,
1646 			    strict_checking)) {
1647 			if (reason)
1648 				*reason = _get_tres_state_reason(
1649 					tres_pos,
1650 					WAIT_QOS_MAX_UNK_MINS_PER_JOB);
1651 			debug2("job submit for user %s(%u): "
1652 			       "tres(%s) time limit request %"PRIu64" "
1653 			       "exceeds max per-job limit %"PRIu64" "
1654 			       "for qos '%s'",
1655 			       user_name,
1656 			       job_desc->user_id,
1657 			       assoc_mgr_tres_name_array[tres_pos],
1658 			       ((uint64_t)job_desc->time_limit *
1659 				job_desc->tres_req_cnt[tres_pos]),
1660 			       qos_ptr->max_tres_mins_pj_ctld[tres_pos],
1661 			       qos_ptr->name);
1662 			rc = false;
1663 			goto end_it;
1664 		}
1665 
1666 		if (!_validate_tres_time_limits(
1667 			    &tres_pos,
1668 			    &job_desc->time_limit,
1669 			    part_ptr->max_time,
1670 			    job_desc->tres_req_cnt,
1671 			    qos_ptr->grp_tres_mins_ctld,
1672 			    qos_out_ptr->grp_tres_mins_ctld,
1673 			    &acct_policy_limit_set->time,
1674 			    strict_checking)) {
1675 			if (reason)
1676 				*reason = _get_tres_state_reason(
1677 					tres_pos, WAIT_QOS_GRP_UNK_MIN);
1678 			debug2("job submit for user %s(%u): "
1679 			       "tres(%s) time limit request %"PRIu64" "
1680 			       "exceeds group max limit %"PRIu64" "
1681 			       "for qos '%s'",
1682 			       user_name,
1683 			       job_desc->user_id,
1684 			       assoc_mgr_tres_name_array[tres_pos],
1685 			       ((uint64_t)job_desc->time_limit *
1686 				job_desc->tres_req_cnt[tres_pos]),
1687 			       qos_ptr->grp_tres_mins_ctld[tres_pos],
1688 			       qos_ptr->name);
1689 			rc = false;
1690 			goto end_it;
1691 		}
1692 
1693 		if (!_validate_tres_time_limits(
1694 			    &tres_pos,
1695 			    &job_desc->time_limit,
1696 			    part_ptr->max_time,
1697 			    job_desc->tres_req_cnt,
1698 			    qos_ptr->grp_tres_run_mins_ctld,
1699 			    qos_out_ptr->grp_tres_run_mins_ctld,
1700 			    &acct_policy_limit_set->time,
1701 			    strict_checking)) {
1702 			if (reason)
1703 				*reason = _get_tres_state_reason(
1704 					tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
1705 			debug2("job submit for user %s(%u): "
1706 			       "tres(%s) time limit request %"PRIu64" "
1707 			       "exceeds group max running limit %"PRIu64" "
1708 			       "for qos '%s'",
1709 			       user_name,
1710 			       job_desc->user_id,
1711 			       assoc_mgr_tres_name_array[tres_pos],
1712 			       ((uint64_t)job_desc->time_limit *
1713 				job_desc->tres_req_cnt[tres_pos]),
1714 			       qos_ptr->grp_tres_run_mins_ctld[tres_pos],
1715 			       qos_ptr->name);
1716 			rc = false;
1717 			goto end_it;
1718 		}
1719 
1720 		if ((qos_out_ptr->max_wall_pj == INFINITE) &&
1721 		    (qos_ptr->max_wall_pj != INFINITE) &&
1722 		    (!update_call || (job_desc->time_limit != NO_VAL))) {
1723 			_set_time_limit(&job_desc->time_limit,
1724 					part_ptr->max_time,
1725 					qos_ptr->max_wall_pj,
1726 					&acct_policy_limit_set->time);
1727 			qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;
1728 
1729 			if (strict_checking
1730 			    && job_desc->time_limit > qos_ptr->max_wall_pj) {
1731 				if (reason)
1732 					*reason = WAIT_QOS_MAX_WALL_PER_JOB;
1733 				debug2("job submit for user %s(%u): "
1734 				       "time limit %u exceeds qos max %u",
1735 				       user_name,
1736 				       job_desc->user_id,
1737 				       job_desc->time_limit,
1738 				       qos_ptr->max_wall_pj);
1739 				rc = false;
1740 				goto end_it;
1741 			}
1742 		}
1743 
1744 		if ((qos_out_ptr->grp_wall == INFINITE) &&
1745 		    (qos_ptr->grp_wall != INFINITE) &&
1746 		    (!update_call || (job_desc->time_limit != NO_VAL))) {
1747 			_set_time_limit(&job_desc->time_limit,
1748 					part_ptr->max_time,
1749 					qos_ptr->grp_wall,
1750 					&acct_policy_limit_set->time);
1751 
1752 			qos_out_ptr->grp_wall = qos_ptr->grp_wall;
1753 
1754 			if (strict_checking
1755 			    && job_desc->time_limit > qos_ptr->grp_wall) {
1756 				if (reason)
1757 					*reason = WAIT_QOS_GRP_WALL;
1758 				debug2("job submit for user %s(%u): "
1759 				       "time limit %u exceeds qos grp max %u",
1760 				       user_name,
1761 				       job_desc->user_id,
1762 				       job_desc->time_limit,
1763 				       qos_ptr->grp_wall);
1764 				rc = false;
1765 				goto end_it;
1766 			}
1767 		}
1768 	}
1769 
1770 	if (!_validate_tres_limits_for_qos(&tres_pos,
1771 					   job_desc->tres_req_cnt, 0,
1772 					   NULL,
1773 					   qos_ptr->max_tres_pj_ctld,
1774 					   NULL,
1775 					   qos_out_ptr->max_tres_pj_ctld,
1776 					   acct_policy_limit_set->tres,
1777 					   strict_checking, 1)) {
1778 		if (reason)
1779 			*reason = _get_tres_state_reason(
1780 				tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
1781 
1782 		debug2("job submit for user %s(%u): "
1783 		       "min tres(%s) request %"PRIu64" exceeds "
1784 		       "per-job max tres limit %"PRIu64" for qos '%s'",
1785 		       user_name,
1786 		       job_desc->user_id,
1787 		       assoc_mgr_tres_name_array[tres_pos],
1788 		       job_desc->tres_req_cnt[tres_pos],
1789 		       qos_ptr->max_tres_pj_ctld[tres_pos],
1790 		       qos_ptr->name);
1791 		rc = false;
1792 		goto end_it;
1793 	}
1794 
1795 	if (!_validate_tres_limits_for_qos(&tres_pos,
1796 					   job_desc->tres_req_cnt,
1797 					   job_desc->tres_req_cnt[
1798 						   TRES_ARRAY_NODE],
1799 					   NULL,
1800 					   qos_ptr->max_tres_pn_ctld,
1801 					   NULL,
1802 					   qos_out_ptr->max_tres_pn_ctld,
1803 					   acct_policy_limit_set->tres,
1804 					   strict_checking, 1)) {
1805 		if (reason)
1806 			*reason = _get_tres_state_reason(
1807 				tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);
1808 
1809 		debug2("job submit for user %s(%u): "
1810 		       "min tres(%s) request %"PRIu64" exceeds "
1811 		       "per-node max tres limit %"PRIu64" for qos '%s'",
1812 		       user_name,
1813 		       job_desc->user_id,
1814 		       assoc_mgr_tres_name_array[tres_pos],
1815 		       job_desc->tres_req_cnt[tres_pos] /
1816 		       job_desc->tres_req_cnt[TRES_ARRAY_NODE],
1817 		       qos_ptr->max_tres_pn_ctld[tres_pos],
1818 		       qos_ptr->name);
1819 		rc = false;
1820 		goto end_it;
1821 	}
1822 
1823 	/* for validation we don't need to look at
1824 	 * qos_ptr->max_jobs.
1825 	 */
1826 
1827 	if ((qos_out_ptr->max_submit_jobs_pa == INFINITE) &&
1828 	    (qos_ptr->max_submit_jobs_pa != INFINITE)) {
1829 		slurmdb_used_limits_t *used_limits =
1830 			acct_policy_get_acct_used_limits(
1831 				&qos_ptr->usage->acct_limit_list,
1832 				assoc_ptr->acct);
1833 
1834 		qos_out_ptr->max_submit_jobs_pa = qos_ptr->max_submit_jobs_pa;
1835 
1836 		if ((used_limits->submit_jobs + job_cnt) >
1837 		    qos_ptr->max_submit_jobs_pa) {
1838 			if (reason)
1839 				*reason = WAIT_QOS_MAX_SUB_JOB_PER_ACCT;
1840 			debug2("job submit for account %s: qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1841 			       assoc_ptr->acct,
1842 			       qos_ptr->max_submit_jobs_pa,
1843 			       used_limits->submit_jobs, job_cnt,
1844 			       qos_ptr->name);
1845 			rc = false;
1846 			goto end_it;
1847 		}
1848 	}
1849 
1850 	if ((qos_out_ptr->max_submit_jobs_pu == INFINITE) &&
1851 	    (qos_ptr->max_submit_jobs_pu != INFINITE)) {
1852 		slurmdb_used_limits_t *used_limits =
1853 			acct_policy_get_user_used_limits(
1854 				&qos_ptr->usage->user_limit_list,
1855 				job_desc->user_id);
1856 
1857 		qos_out_ptr->max_submit_jobs_pu = qos_ptr->max_submit_jobs_pu;
1858 
1859 		if ((used_limits->submit_jobs + job_cnt) >
1860 		     qos_ptr->max_submit_jobs_pu) {
1861 			if (reason)
1862 				*reason = WAIT_QOS_MAX_SUB_JOB;
1863 			debug2("job submit for user %s(%u): qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
1864 			       user_name,
1865 			       job_desc->user_id,
1866 			       qos_ptr->max_submit_jobs_pu,
1867 			       used_limits->submit_jobs, job_cnt,
1868 			       qos_ptr->name);
1869 			rc = false;
1870 			goto end_it;
1871 		}
1872 	}
1873 
1874 	if (!_validate_tres_limits_for_qos(&tres_pos,
1875 					   job_desc->tres_req_cnt, 0,
1876 					   NULL,
1877 					   qos_ptr->min_tres_pj_ctld,
1878 					   NULL,
1879 					   qos_out_ptr->min_tres_pj_ctld,
1880 					   acct_policy_limit_set->tres,
1881 					   strict_checking, 0)) {
1882 		if (reason)
1883 			*reason = _get_tres_state_reason(
1884 				tres_pos, WAIT_QOS_MIN_UNK);
1885 
1886 		debug2("job submit for user %s(%u): "
1887 		       "min tres(%s) request %"PRIu64" exceeds "
1888 		       "per-job max tres limit %"PRIu64" for qos '%s'",
1889 		       user_name,
1890 		       job_desc->user_id,
1891 		       assoc_mgr_tres_name_array[tres_pos],
1892 		       job_desc->tres_req_cnt[tres_pos],
1893 		       qos_ptr->min_tres_pj_ctld[tres_pos],
1894 		       qos_ptr->name);
1895 		rc = false;
1896 		goto end_it;
1897 	}
1898 
1899 end_it:
1900 	return rc;
1901 }
1902 
_qos_job_runnable_pre_select(job_record_t * job_ptr,slurmdb_qos_rec_t * qos_ptr,slurmdb_qos_rec_t * qos_out_ptr)1903 static int _qos_job_runnable_pre_select(job_record_t *job_ptr,
1904 					slurmdb_qos_rec_t *qos_ptr,
1905 					slurmdb_qos_rec_t *qos_out_ptr)
1906 {
1907 	uint32_t wall_mins;
1908 	uint32_t time_limit = NO_VAL;
1909 	int rc = true;
1910 	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
1911 	bool safe_limits = false;
1912 	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
1913 
1914 	if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
1915 		return rc;
1916 
1917 	/*
1918 	 * check to see if we should be using safe limits, if so we
1919 	 * will only start a job if there are sufficient remaining
1920 	 * cpu-minutes for it to run to completion
1921 	 */
1922 	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
1923 		safe_limits = true;
1924 
1925 	wall_mins = qos_ptr->usage->grp_used_wall / 60;
1926 
1927 	used_limits_a =	acct_policy_get_acct_used_limits(
1928 		&qos_ptr->usage->acct_limit_list,
1929 		assoc_ptr->acct);
1930 
1931 	used_limits = acct_policy_get_user_used_limits(
1932 		&qos_ptr->usage->user_limit_list,
1933 		job_ptr->user_id);
1934 
1935 
1936 	/* we don't need to check grp_tres_mins here */
1937 
1938 	/* we don't need to check grp_tres here */
1939 
1940 	/* we don't need to check grp_mem here */
1941 	if ((qos_out_ptr->grp_jobs == INFINITE) &&
1942 	    (qos_ptr->grp_jobs != INFINITE)) {
1943 
1944 		qos_out_ptr->grp_jobs = qos_ptr->grp_jobs;
1945 
1946 		if (qos_ptr->usage->grp_used_jobs >= qos_ptr->grp_jobs) {
1947 			xfree(job_ptr->state_desc);
1948 			job_ptr->state_reason = WAIT_QOS_GRP_JOB;
1949 			debug2("%pJ being held, the job is at or exceeds group max jobs limit %u with %u for QOS %s",
1950 			       job_ptr, qos_ptr->grp_jobs,
1951 			       qos_ptr->usage->grp_used_jobs, qos_ptr->name);
1952 
1953 			rc = false;
1954 			goto end_it;
1955 		}
1956 	}
1957 
1958 	/* we don't need to check grp_tres_run_mins here */
1959 
1960 	/* we don't need to check grp_nodes here */
1961 
1962 	/* we don't need to check submit_jobs here */
1963 
1964 	if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
1965 	    && (qos_out_ptr->grp_wall == INFINITE)
1966 	    && (qos_ptr->grp_wall != INFINITE)) {
1967 		if (time_limit == NO_VAL) {
1968 			time_limit = job_ptr->time_limit;
1969 			_set_time_limit(&time_limit,
1970 					job_ptr->part_ptr->max_time,
1971 					MIN(qos_ptr->grp_wall,
1972 					    qos_ptr->max_wall_pj),
1973 					&job_ptr->limit_set.time);
1974 
1975 			/* Account for usage factor, if necessary */
1976 			if ((job_ptr->qos_ptr &&
1977 			     (job_ptr->qos_ptr->flags &
1978 			      QOS_FLAG_USAGE_FACTOR_SAFE) &&
1979 			     (job_ptr->qos_ptr->usage_factor >= 0)) &&
1980 			    ((time_limit != INFINITE) ||
1981 			     (job_ptr->qos_ptr->usage_factor < 1.0))) {
1982 				time_limit *= job_ptr->qos_ptr->usage_factor;
1983 			}
1984 		}
1985 
1986 		qos_out_ptr->grp_wall = qos_ptr->grp_wall;
1987 
1988 		if (wall_mins >= qos_ptr->grp_wall) {
1989 			xfree(job_ptr->state_desc);
1990 			job_ptr->state_reason = WAIT_QOS_GRP_WALL;
1991 			debug2("%pJ being held, the job is at or exceeds group wall limit %u with %u for QOS %s",
1992 			       job_ptr, qos_ptr->grp_wall,
1993 			       wall_mins, qos_ptr->name);
1994 			rc = false;
1995 			goto end_it;
1996 		} else if (safe_limits &&
1997 			   ((wall_mins + time_limit) > qos_ptr->grp_wall)) {
1998 			xfree(job_ptr->state_desc);
1999 			job_ptr->state_reason = WAIT_QOS_GRP_WALL;
2000 			debug2("%pJ being held, the job request will exceed group wall limit %u if ran with %u for QOS %s",
2001 			       job_ptr, qos_ptr->grp_wall,
2002 			       wall_mins + time_limit, qos_ptr->name);
2003 			rc = false;
2004 			goto end_it;
2005 		}
2006 	}
2007 
2008 	/* we don't need to check max_tres_mins_pj here */
2009 
2010 	/* we don't need to check max_tres_pj here */
2011 
2012 	/* we don't need to check max_tres_pn here */
2013 
2014 	/* we don't need to check min_tres_pj here */
2015 
2016 	/* we don't need to check max_tres_pa here */
2017 
2018 	/* we don't need to check max_tres_pu here */
2019 
2020 	if ((qos_out_ptr->max_jobs_pa == INFINITE)
2021 	    && (qos_ptr->max_jobs_pa != INFINITE)) {
2022 
2023 		qos_out_ptr->max_jobs_pa = qos_ptr->max_jobs_pa;
2024 
2025 		if (used_limits_a->jobs >= qos_ptr->max_jobs_pa) {
2026 			xfree(job_ptr->state_desc);
2027 			job_ptr->state_reason =
2028 				WAIT_QOS_MAX_JOB_PER_ACCT;
2029 			debug2("%pJ being held, the job is at or exceeds max jobs per-acct (%s) limit %u with %u for QOS %s",
2030 			       job_ptr, used_limits_a->acct,
2031 			       qos_ptr->max_jobs_pa,
2032 			       used_limits_a->jobs, qos_ptr->name);
2033 			rc = false;
2034 			goto end_it;
2035 		}
2036 	}
2037 
2038 	if ((qos_out_ptr->max_jobs_pu == INFINITE)
2039 	    && (qos_ptr->max_jobs_pu != INFINITE)) {
2040 
2041 		qos_out_ptr->max_jobs_pu = qos_ptr->max_jobs_pu;
2042 
2043 		if (used_limits->jobs >= qos_ptr->max_jobs_pu) {
2044 			xfree(job_ptr->state_desc);
2045 			job_ptr->state_reason =
2046 				WAIT_QOS_MAX_JOB_PER_USER;
2047 			debug2("%pJ being held, the job is at or exceeds max jobs per-user limit %u with %u for QOS %s",
2048 			       job_ptr, qos_ptr->max_jobs_pu,
2049 			       used_limits->jobs, qos_ptr->name);
2050 			rc = false;
2051 			goto end_it;
2052 		}
2053 	}
2054 
2055 	/* we don't need to check submit_jobs_pa here */
2056 
2057 	/* we don't need to check submit_jobs_pu here */
2058 
2059 	/*
2060 	 * if the QOS limits have changed since job
2061 	 * submission and job can not run, then kill it
2062 	 */
2063 	if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
2064 	    && (qos_out_ptr->max_wall_pj == INFINITE)
2065 	    && (qos_ptr->max_wall_pj != INFINITE)) {
2066 		if (time_limit == NO_VAL) {
2067 			time_limit = job_ptr->time_limit;
2068 			_set_time_limit(&time_limit,
2069 					job_ptr->part_ptr->max_time,
2070 					qos_ptr->max_wall_pj,
2071 					&job_ptr->limit_set.time);
2072 		}
2073 
2074 		/* Account for usage factor, if necessary */
2075 		if ((job_ptr->qos_ptr &&
2076 		     (job_ptr->qos_ptr->flags &
2077 		      QOS_FLAG_USAGE_FACTOR_SAFE) &&
2078 		     (job_ptr->qos_ptr->usage_factor >= 0)) &&
2079 		    ((time_limit != INFINITE) ||
2080 		     (job_ptr->qos_ptr->usage_factor < 1.0))) {
2081 			time_limit *= job_ptr->qos_ptr->usage_factor;
2082 		}
2083 
2084 		qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;
2085 
2086 		if (time_limit > qos_out_ptr->max_wall_pj) {
2087 			xfree(job_ptr->state_desc);
2088 			job_ptr->state_reason =
2089 				WAIT_QOS_MAX_WALL_PER_JOB;
2090 			debug2("%pJ being held, time limit %u exceeds QOS max wall pj %u",
2091 			       job_ptr, time_limit, qos_out_ptr->max_wall_pj);
2092 			rc = false;
2093 			goto end_it;
2094 		}
2095 	}
2096 end_it:
2097 
2098 	return rc;
2099 }
2100 
_qos_job_runnable_post_select(job_record_t * job_ptr,slurmdb_qos_rec_t * qos_ptr,slurmdb_qos_rec_t * qos_out_ptr,uint64_t * tres_req_cnt,uint64_t * job_tres_time_limit)2101 static int _qos_job_runnable_post_select(job_record_t *job_ptr,
2102 					 slurmdb_qos_rec_t *qos_ptr,
2103 					 slurmdb_qos_rec_t *qos_out_ptr,
2104 					 uint64_t *tres_req_cnt,
2105 					 uint64_t *job_tres_time_limit)
2106 {
2107 	uint64_t tres_usage_mins[slurmctld_tres_cnt];
2108 	uint64_t tres_run_mins[slurmctld_tres_cnt];
2109 	uint64_t orig_node_cnt;
2110 	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
2111 	bool safe_limits = false;
2112 	int rc = true, i, tres_pos = 0;
2113 	acct_policy_tres_usage_t tres_usage;
2114 	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
2115 	double usage_factor = 1.0;
2116 
2117 	if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
2118 		return rc;
2119 
2120 	/*
2121 	 * check to see if we should be using safe limits, if so we will only
2122 	 * will only start a job if there are sufficient remaining cpu-minutes
2123 	 * for it to run to completion
2124 	 */
2125 	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
2126 		safe_limits = true;
2127 
2128 	/* clang needs this memset to avoid a warning */
2129 	memset(tres_run_mins, 0, sizeof(tres_run_mins));
2130 	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
2131 	if (job_ptr->qos_ptr &&
2132 	    (job_ptr->qos_ptr->usage_factor >= 0))
2133 		usage_factor = job_ptr->qos_ptr->usage_factor;
2134 	for (i=0; i<slurmctld_tres_cnt; i++) {
2135 		tres_run_mins[i] =
2136 			qos_ptr->usage->grp_used_tres_run_secs[i] / 60;
2137 		tres_usage_mins[i] =
2138 			(uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);
2139 
2140 		/*
2141 		 * Clear usage if factor is 0 so that jobs can run. Otherwise
2142 		 * multiplying can cause more jobs to be run than the limit
2143 		 * allows (e.g. usagefactor=.5).
2144 		 */
2145 		if (usage_factor == 0.0) {
2146 			tres_run_mins[i] *= usage_factor;
2147 			tres_usage_mins[i] *= usage_factor;
2148 		}
2149 	}
2150 
2151 	used_limits_a =	acct_policy_get_acct_used_limits(
2152 		&qos_ptr->usage->acct_limit_list,
2153 		assoc_ptr->acct);
2154 
2155 	used_limits = acct_policy_get_user_used_limits(
2156 		&qos_ptr->usage->user_limit_list,
2157 		job_ptr->user_id);
2158 
2159 	tres_usage = _validate_tres_usage_limits_for_qos(
2160 		&tres_pos, qos_ptr->grp_tres_mins_ctld,
2161 		qos_out_ptr->grp_tres_mins_ctld, job_tres_time_limit,
2162 		tres_run_mins, tres_usage_mins, job_ptr->limit_set.tres,
2163 		safe_limits);
2164 	switch (tres_usage) {
2165 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2166 		xfree(job_ptr->state_desc);
2167 		job_ptr->state_reason = _get_tres_state_reason(
2168 			tres_pos, WAIT_QOS_GRP_UNK_MIN);
2169 		debug2("%pJ being held, QOS %s group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
2170 		       job_ptr, qos_ptr->name,
2171 		       assoc_mgr_tres_name_array[tres_pos],
2172 		       qos_ptr->grp_tres_mins_ctld[tres_pos],
2173 		       tres_usage_mins[tres_pos]);
2174 		rc = false;
2175 		goto end_it;
2176 		break;
2177 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2178 		xfree(job_ptr->state_desc);
2179 		job_ptr->state_reason = _get_tres_state_reason(
2180 			tres_pos, WAIT_QOS_GRP_UNK_MIN);
2181 		debug2("%pJ being held, the job is requesting more than allowed with QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2182 		       job_ptr, qos_ptr->name,
2183 		       assoc_mgr_tres_name_array[tres_pos],
2184 		       qos_ptr->grp_tres_mins_ctld[tres_pos],
2185 		       job_tres_time_limit[tres_pos]);
2186 		rc = false;
2187 		goto end_it;
2188 		break;
2189 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2190 		/*
2191 		 * If we're using safe limits start
2192 		 * the job only if there are
2193 		 * sufficient cpu-mins left such that
2194 		 * it will run to completion without
2195 		 * being killed
2196 		 */
2197 		xfree(job_ptr->state_desc);
2198 		job_ptr->state_reason = _get_tres_state_reason(
2199 			tres_pos, WAIT_QOS_GRP_UNK_MIN);
2200 		debug2("%pJ being held, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
2201 		       job_ptr, qos_ptr->name,
2202 		       assoc_mgr_tres_name_array[tres_pos],
2203 		       qos_ptr->grp_tres_mins_ctld[tres_pos],
2204 		       qos_ptr->grp_tres_mins_ctld[tres_pos] -
2205 		       tres_usage_mins[tres_pos],
2206 		       job_tres_time_limit[tres_pos],
2207 		       tres_run_mins[tres_pos],
2208 		       tres_req_cnt[tres_pos]);
2209 		rc = false;
2210 		goto end_it;
2211 		break;
2212 	case TRES_USAGE_OKAY:
2213 		/* all good */
2214 		break;
2215 	}
2216 
2217 	/*
2218 	 * If the job's CPU limit wasn't administratively set and the QOS
2219 	 * has a GrpCPU limit, cancel the job if its minimum CPU requirement
2220 	 * has exceeded the limit for all CPUs usable by the QOS
2221 	 */
2222 	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2223 	_get_unique_job_node_cnt(job_ptr, qos_ptr->usage->grp_node_bitmap,
2224 				 &tres_req_cnt[TRES_ARRAY_NODE]);
2225 	tres_usage = _validate_tres_usage_limits_for_qos(
2226 		&tres_pos,
2227 		qos_ptr->grp_tres_ctld,	qos_out_ptr->grp_tres_ctld,
2228 		tres_req_cnt, qos_ptr->usage->grp_used_tres,
2229 		NULL, job_ptr->limit_set.tres, true);
2230 	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2231 	switch (tres_usage) {
2232 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2233 		/* not possible because the curr_usage sent in is NULL */
2234 		break;
2235 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2236 		xfree(job_ptr->state_desc);
2237 		job_ptr->state_reason = _get_tres_state_reason(
2238 			tres_pos, WAIT_QOS_GRP_UNK);
2239 		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
2240 		       job_ptr, qos_ptr->name,
2241 		       assoc_mgr_tres_name_array[tres_pos],
2242 		       tres_req_cnt[tres_pos],
2243 		       qos_ptr->grp_tres_ctld[tres_pos]);
2244 		rc = false;
2245 		goto end_it;
2246 		break;
2247 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2248 		xfree(job_ptr->state_desc);
2249 		job_ptr->state_reason = _get_tres_state_reason(
2250 			tres_pos, WAIT_QOS_GRP_UNK);
2251 		debug2("%pJ being held, if allowed the job request will exceed QOS %s group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2252 		       job_ptr, qos_ptr->name,
2253 		       assoc_mgr_tres_name_array[tres_pos],
2254 		       qos_ptr->grp_tres_ctld[tres_pos],
2255 		       qos_ptr->usage->grp_used_tres[tres_pos],
2256 		       tres_req_cnt[tres_pos]);
2257 		rc = false;
2258 		goto end_it;
2259 	case TRES_USAGE_OKAY:
2260 		/* all good */
2261 		break;
2262 	}
2263 
2264 	/* we don't need to check grp_jobs here */
2265 
2266 	tres_usage = _validate_tres_usage_limits_for_qos(
2267 		&tres_pos,
2268 		qos_ptr->grp_tres_run_mins_ctld,
2269 		qos_out_ptr->grp_tres_run_mins_ctld,
2270 		job_tres_time_limit, tres_run_mins, NULL, NULL, true);
2271 	switch (tres_usage) {
2272 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2273 		/* not possible because the curr_usage sent in is NULL */
2274 		break;
2275 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2276 		xfree(job_ptr->state_desc);
2277 		job_ptr->state_reason = _get_tres_state_reason(
2278 			tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
2279 		debug2("%pJ is being held, QOS %s group max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
2280 		       job_ptr, qos_ptr->name,
2281 		       assoc_mgr_tres_name_array[tres_pos],
2282 		       job_tres_time_limit[tres_pos],
2283 		       qos_ptr->grp_tres_run_mins_ctld[tres_pos]);
2284 		rc = false;
2285 		goto end_it;
2286 		break;
2287 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2288 		xfree(job_ptr->state_desc);
2289 		job_ptr->state_reason = _get_tres_state_reason(
2290 			tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
2291 		debug2("%pJ being held, if allowed the job request will exceed QOS %s group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2292 		       job_ptr, qos_ptr->name,
2293 		       assoc_mgr_tres_name_array[tres_pos],
2294 		       qos_ptr->grp_tres_run_mins_ctld[tres_pos],
2295 		       tres_run_mins[tres_pos],
2296 		       job_tres_time_limit[tres_pos]);
2297 		rc = false;
2298 		goto end_it;
2299 		break;
2300 	case TRES_USAGE_OKAY:
2301 		/* all good */
2302 		break;
2303 	}
2304 
2305 	/* we don't need to check submit_jobs here */
2306 
2307 	/* we don't need to check grp_wall here */
2308 
2309 	if (!_validate_tres_limits_for_qos(&tres_pos,
2310 					   job_tres_time_limit, 0,
2311 					   NULL,
2312 					   qos_ptr->max_tres_mins_pj_ctld,
2313 					   NULL,
2314 					   qos_out_ptr->max_tres_mins_pj_ctld,
2315 					   job_ptr->limit_set.tres,
2316 					   1, 1)) {
2317 		xfree(job_ptr->state_desc);
2318 		job_ptr->state_reason = _get_tres_state_reason(
2319 			tres_pos, WAIT_QOS_MAX_UNK_MINS_PER_JOB);
2320 		debug2("%pJ being held, the job is requesting more than allowed with QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2321 		       job_ptr, qos_ptr->name,
2322 		       assoc_mgr_tres_name_array[tres_pos],
2323 		       qos_ptr->max_tres_mins_pj_ctld[tres_pos],
2324 		       job_tres_time_limit[tres_pos]);
2325 		rc = false;
2326 		goto end_it;
2327 	}
2328 
2329 	if (!_validate_tres_limits_for_qos(&tres_pos,
2330 					   tres_req_cnt, 0,
2331 					   NULL,
2332 					   qos_ptr->max_tres_pj_ctld,
2333 					   NULL,
2334 					   qos_out_ptr->max_tres_pj_ctld,
2335 					   job_ptr->limit_set.tres,
2336 					   1, 1)) {
2337 		xfree(job_ptr->state_desc);
2338 		job_ptr->state_reason = _get_tres_state_reason(
2339 			tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
2340 		debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" exceeds max tres limit %"PRIu64,
2341 		       job_ptr, qos_ptr->name,
2342 		       assoc_mgr_tres_name_array[tres_pos],
2343 		       tres_req_cnt[tres_pos],
2344 		       qos_ptr->max_tres_pj_ctld[tres_pos]);
2345 		rc = false;
2346 		goto end_it;
2347 	}
2348 
2349 	if (!_validate_tres_limits_for_qos(&tres_pos,
2350 					   tres_req_cnt,
2351 					   tres_req_cnt[TRES_ARRAY_NODE],
2352 					   NULL,
2353 					   qos_ptr->max_tres_pn_ctld,
2354 					   NULL,
2355 					   qos_out_ptr->max_tres_pn_ctld,
2356 					   job_ptr->limit_set.tres,
2357 					   1, 1)) {
2358 		uint64_t req_per_node;
2359 		xfree(job_ptr->state_desc);
2360 		job_ptr->state_reason = _get_tres_state_reason(
2361 			tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);
2362 		req_per_node = tres_req_cnt[tres_pos];
2363 		if (tres_req_cnt[TRES_ARRAY_NODE] > 1)
2364 			req_per_node /= tres_req_cnt[TRES_ARRAY_NODE];
2365 		debug2("%pJ is being held, QOS %s min tres(%s) per node request %"PRIu64" exceeds max tres limit %"PRIu64,
2366 		       job_ptr, qos_ptr->name,
2367 		       assoc_mgr_tres_name_array[tres_pos],
2368 		       req_per_node,
2369 		       qos_ptr->max_tres_pn_ctld[tres_pos]);
2370 		rc = false;
2371 		goto end_it;
2372 	}
2373 
2374 	if (!_validate_tres_limits_for_qos(&tres_pos,
2375 					   tres_req_cnt, 0,
2376 					   NULL,
2377 					   qos_ptr->min_tres_pj_ctld,
2378 					   NULL,
2379 					   qos_out_ptr->min_tres_pj_ctld,
2380 					   job_ptr->limit_set.tres,
2381 					   1, 0)) {
2382 		xfree(job_ptr->state_desc);
2383 		job_ptr->state_reason = _get_tres_state_reason(
2384 			tres_pos, WAIT_QOS_MIN_UNK);
2385 		debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" exceeds min tres limit %"PRIu64,
2386 		       job_ptr, qos_ptr->name,
2387 		       assoc_mgr_tres_name_array[tres_pos],
2388 		       tres_req_cnt[tres_pos],
2389 		       qos_ptr->min_tres_pj_ctld[tres_pos]);
2390 		rc = false;
2391 		goto end_it;
2392 	}
2393 
2394 	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2395 	_get_unique_job_node_cnt(job_ptr, used_limits_a->node_bitmap,
2396 				 &tres_req_cnt[TRES_ARRAY_NODE]);
2397 	tres_usage = _validate_tres_usage_limits_for_qos(
2398 		&tres_pos,
2399 		qos_ptr->max_tres_pa_ctld, qos_out_ptr->max_tres_pa_ctld,
2400 		tres_req_cnt, used_limits_a->tres,
2401 		NULL, job_ptr->limit_set.tres, true);
2402 	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2403 	switch (tres_usage) {
2404 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2405 		/* not possible because the curr_usage sent in is NULL */
2406 		break;
2407 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2408 		/*
2409 		 * Hold the job if it exceeds the per-acct
2410 		 * TRES limit for the given QOS
2411 		 */
2412 		xfree(job_ptr->state_desc);
2413 		job_ptr->state_reason = _get_tres_state_reason(
2414 			tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
2415 		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per account (%s) limit %"PRIu64,
2416 		       job_ptr, qos_ptr->name,
2417 		       assoc_mgr_tres_name_array[tres_pos],
2418 		       tres_req_cnt[tres_pos],
2419 		       used_limits_a->acct,
2420 		       qos_ptr->max_tres_pa_ctld[tres_pos]);
2421 		rc = false;
2422 		goto end_it;
2423 		break;
2424 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2425 		/*
2426 		 * Hold the job if the user has exceeded the QOS per-user
2427 		 * TRES limit with their current usage
2428 		 */
2429 		xfree(job_ptr->state_desc);
2430 		job_ptr->state_reason = _get_tres_state_reason(
2431 			tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
2432 		debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per account (%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2433 		       job_ptr, qos_ptr->name,
2434 		       assoc_mgr_tres_name_array[tres_pos],
2435 		       used_limits_a->acct,
2436 		       qos_ptr->max_tres_pa_ctld[tres_pos],
2437 		       used_limits_a->tres[tres_pos],
2438 		       tres_req_cnt[tres_pos]);
2439 		rc = false;
2440 		goto end_it;
2441 	case TRES_USAGE_OKAY:
2442 		/* all good */
2443 		break;
2444 	}
2445 
2446 	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
2447 	_get_unique_job_node_cnt(job_ptr, used_limits->node_bitmap,
2448 				 &tres_req_cnt[TRES_ARRAY_NODE]);
2449 	tres_usage = _validate_tres_usage_limits_for_qos(
2450 		&tres_pos,
2451 		qos_ptr->max_tres_pu_ctld, qos_out_ptr->max_tres_pu_ctld,
2452 		tres_req_cnt, used_limits->tres,
2453 		NULL, job_ptr->limit_set.tres, true);
2454 	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
2455 	switch (tres_usage) {
2456 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2457 		/* not possible because the curr_usage sent in is NULL */
2458 		break;
2459 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2460 		/*
2461 		 * Hold the job if it exceeds the per-user
2462 		 * TRES limit for the given QOS
2463 		 */
2464 		xfree(job_ptr->state_desc);
2465 		job_ptr->state_reason = _get_tres_state_reason(
2466 			tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
2467 		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per user limit %"PRIu64,
2468 		       job_ptr, qos_ptr->name,
2469 		       assoc_mgr_tres_name_array[tres_pos],
2470 		       tres_req_cnt[tres_pos],
2471 		       qos_ptr->max_tres_pu_ctld[tres_pos]);
2472 		rc = false;
2473 		goto end_it;
2474 		break;
2475 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2476 		/*
2477 		 * Hold the job if the user has exceeded the QOS
2478 		 * per-user TRES limit with their current usage
2479 		 */
2480 		xfree(job_ptr->state_desc);
2481 		job_ptr->state_reason = _get_tres_state_reason(
2482 			tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
2483 		debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per user limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
2484 		       job_ptr, qos_ptr->name,
2485 		       assoc_mgr_tres_name_array[tres_pos],
2486 		       qos_ptr->max_tres_pu_ctld[tres_pos],
2487 		       used_limits->tres[tres_pos],
2488 		       tres_req_cnt[tres_pos]);
2489 		rc = false;
2490 		goto end_it;
2491 	case TRES_USAGE_OKAY:
2492 		/* all good */
2493 		break;
2494 	}
2495 
2496 	/* We do not need to check max_jobs_pa here */
2497 
2498 	/* We do not need to check max_jobs_pu here */
2499 
2500 	/* we don't need to check submit_jobs_pa here */
2501 
2502 	/* we don't need to check submit_jobs_pu here */
2503 
2504 	/* we don't need to check max_wall_pj here */
2505 
2506 end_it:
2507 	if (!rc)
2508 		job_ptr->qos_blocking_ptr = qos_ptr;
2509 
2510 	return rc;
2511 }
2512 
_qos_job_time_out(job_record_t * job_ptr,slurmdb_qos_rec_t * qos_ptr,slurmdb_qos_rec_t * qos_out_ptr,uint64_t * job_tres_usage_mins)2513 static int _qos_job_time_out(job_record_t *job_ptr,
2514 			     slurmdb_qos_rec_t *qos_ptr,
2515 			     slurmdb_qos_rec_t *qos_out_ptr,
2516 			     uint64_t *job_tres_usage_mins)
2517 {
2518 	uint64_t tres_usage_mins[slurmctld_tres_cnt];
2519 	uint32_t wall_mins;
2520 	int rc = true, tres_pos = 0, i;
2521 	acct_policy_tres_usage_t tres_usage;
2522 	time_t now = time(NULL);
2523 
2524 	if (!qos_ptr || !qos_out_ptr)
2525 		return rc;
2526 
2527 	/*
2528 	 * The idea here is for QOS to trump what an association has set for
2529 	 * a limit, so if an association set of wall 10 mins and the QOS has
2530 	 * 20 mins set and the job has been running for 11 minutes it continues
2531 	 * until 20.
2532 	 */
2533 	/* clang needs this memset to avoid a warning */
2534 	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
2535 	for (i = 0; i < slurmctld_tres_cnt; i++)
2536 		tres_usage_mins[i] =
2537 			(uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);
2538 	wall_mins = qos_ptr->usage->grp_used_wall / 60;
2539 
2540 	tres_usage = _validate_tres_usage_limits_for_qos(
2541 		&tres_pos, qos_ptr->grp_tres_mins_ctld,
2542 		qos_out_ptr->grp_tres_mins_ctld, NULL,
2543 		NULL, tres_usage_mins, NULL, false);
2544 	switch (tres_usage) {
2545 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2546 		last_job_update = now;
2547 		info("%pJ timed out, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64"",
2548 		     job_ptr, qos_ptr->name,
2549 		     assoc_mgr_tres_name_array[tres_pos],
2550 		     qos_ptr->grp_tres_mins_ctld[tres_pos],
2551 		     tres_usage_mins[tres_pos]);
2552 		job_ptr->state_reason = FAIL_TIMEOUT;
2553 		rc = false;
2554 		goto end_it;
2555 		break;
2556 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2557 		/* not possible safe_limits is 0 */
2558 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2559 		/* not possible safe_limits is 0 */
2560 	case TRES_USAGE_OKAY:
2561 		/* all good */
2562 		break;
2563 	}
2564 
2565 	if ((qos_out_ptr->grp_wall == INFINITE)
2566 	    && (qos_ptr->grp_wall != INFINITE)) {
2567 
2568 		qos_out_ptr->grp_wall = qos_ptr->grp_wall;
2569 
2570 		if (wall_mins >= qos_ptr->grp_wall) {
2571 			last_job_update = now;
2572 			info("%pJ timed out, the job is at or exceeds QOS %s's group wall limit of %u with %u",
2573 			     job_ptr, qos_ptr->name,
2574 			     qos_ptr->grp_wall, wall_mins);
2575 			job_ptr->state_reason = FAIL_TIMEOUT;
2576 			rc = false;
2577 			goto end_it;
2578 		}
2579 	}
2580 
2581 	tres_usage = _validate_tres_usage_limits_for_qos(
2582 		&tres_pos, qos_ptr->max_tres_mins_pj_ctld,
2583 		qos_out_ptr->max_tres_mins_pj_ctld, job_tres_usage_mins,
2584 		NULL, NULL, NULL, true);
2585 	switch (tres_usage) {
2586 	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
2587 		/* not possible curr_usage is NULL */
2588 		break;
2589 	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
2590 		last_job_update = now;
2591 		info("%pJ timed out, the job is at or exceeds QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
2592 		     job_ptr, qos_ptr->name,
2593 		     assoc_mgr_tres_name_array[tres_pos],
2594 		     qos_ptr->max_tres_mins_pj_ctld[tres_pos],
2595 		     job_tres_usage_mins[tres_pos]);
2596 		job_ptr->state_reason = FAIL_TIMEOUT;
2597 		rc = false;
2598 		goto end_it;
2599 		break;
2600 	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
2601 		/* not possible tres_usage is NULL */
2602 	case TRES_USAGE_OKAY:
2603 		/* all good */
2604 		break;
2605 	}
2606 
2607 end_it:
2608 	return rc;
2609 }
2610 
2611 /*
2612  * acct_policy_add_job_submit - Note that a job has been submitted for
2613  *	accounting policy purposes.
2614  */
acct_policy_add_job_submit(job_record_t * job_ptr)2615 extern void acct_policy_add_job_submit(job_record_t *job_ptr)
2616 {
2617 	_adjust_limit_usage(ACCT_POLICY_ADD_SUBMIT, job_ptr);
2618 }
2619 
2620 /*
2621  * acct_policy_remove_job_submit - Note that a job has finished (might
2622  *      not had started or been allocated resources) for accounting
2623  *      policy purposes.
2624  */
acct_policy_remove_job_submit(job_record_t * job_ptr)2625 extern void acct_policy_remove_job_submit(job_record_t *job_ptr)
2626 {
2627 	_adjust_limit_usage(ACCT_POLICY_REM_SUBMIT, job_ptr);
2628 }
2629 
2630 /*
2631  * acct_policy_job_begin - Note that a job is starting for accounting
2632  *	policy purposes.
2633  */
acct_policy_job_begin(job_record_t * job_ptr)2634 extern void acct_policy_job_begin(job_record_t *job_ptr)
2635 {
2636 	_adjust_limit_usage(ACCT_POLICY_JOB_BEGIN, job_ptr);
2637 }
2638 
2639 /*
2640  * acct_policy_job_fini - Note that a job is completing for accounting
2641  *	policy purposes.
2642  */
acct_policy_job_fini(job_record_t * job_ptr)2643 extern void acct_policy_job_fini(job_record_t *job_ptr)
2644 {
2645 	/* if end_time_exp == NO_VAL this has already happened */
2646 	if (job_ptr->end_time_exp != (time_t)NO_VAL)
2647 		_adjust_limit_usage(ACCT_POLICY_JOB_FINI, job_ptr);
2648 	else
2649 		debug2("We have already ran the job_fini for %pJ", job_ptr);
2650 }
2651 
acct_policy_alter_job(job_record_t * job_ptr,uint32_t new_time_limit)2652 extern void acct_policy_alter_job(job_record_t *job_ptr,
2653 				  uint32_t new_time_limit)
2654 {
2655 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
2656 	slurmdb_assoc_rec_t *assoc_ptr = NULL;
2657 	assoc_mgr_lock_t locks =
2658 		{ .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
2659 	uint64_t used_tres_run_secs[slurmctld_tres_cnt];
2660 	uint64_t new_used_tres_run_secs[slurmctld_tres_cnt];
2661 	uint64_t time_limit_secs, new_time_limit_secs;
2662 	int i;
2663 
2664 	if (!IS_JOB_RUNNING(job_ptr) || (job_ptr->time_limit == new_time_limit))
2665 		return;
2666 
2667 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
2668 	    || !_valid_job_assoc(job_ptr))
2669 		return;
2670 
2671 	time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
2672 	new_time_limit_secs = (uint64_t)new_time_limit * 60;
2673 
2674 	/* clang needs these memset to avoid a warning */
2675 	memset(used_tres_run_secs, 0, sizeof(used_tres_run_secs));
2676 	memset(new_used_tres_run_secs, 0, sizeof(new_used_tres_run_secs));
2677 	for (i=0; i<slurmctld_tres_cnt; i++) {
2678 		if (i == TRES_ARRAY_ENERGY)
2679 			continue;
2680 		if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
2681 			continue;
2682 
2683 		used_tres_run_secs[i] =
2684 			job_ptr->tres_alloc_cnt[i] * time_limit_secs;
2685 		new_used_tres_run_secs[i] =
2686 			job_ptr->tres_alloc_cnt[i] * new_time_limit_secs;
2687 	}
2688 
2689 	assoc_mgr_lock(&locks);
2690 
2691 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
2692 
2693 	_qos_alter_job(job_ptr, qos_ptr_1,
2694 		       used_tres_run_secs, new_used_tres_run_secs);
2695 	_qos_alter_job(job_ptr, qos_ptr_2,
2696 		       used_tres_run_secs, new_used_tres_run_secs);
2697 
2698 	assoc_ptr = job_ptr->assoc_ptr;
2699 	while (assoc_ptr) {
2700 		for (i=0; i<slurmctld_tres_cnt; i++) {
2701 			if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
2702 				continue;
2703 			assoc_ptr->usage->grp_used_tres_run_secs[i] -=
2704 				used_tres_run_secs[i];
2705 			assoc_ptr->usage->grp_used_tres_run_secs[i] +=
2706 				new_used_tres_run_secs[i];
2707 			debug2("altering %pJ assoc %u(%s/%s/%s) got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
2708 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
2709 			       assoc_ptr->user, assoc_ptr->partition,
2710 			       assoc_ptr->usage->grp_used_tres_run_secs[i],
2711 			       used_tres_run_secs[i],
2712 			       new_used_tres_run_secs[i]);
2713 		}
2714 
2715 		/* now handle all the group limits of the parents */
2716 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2717 	}
2718 	assoc_mgr_unlock(&locks);
2719 }
2720 
_get_prio_thresh(uint32_t * prio_thresh,uint32_t in_thresh)2721 static void _get_prio_thresh(uint32_t *prio_thresh, uint32_t in_thresh)
2722 {
2723 	/*
2724 	 * If we already set prio_thresh then call it good.
2725 	 * If in_thresh is INFINITE we don't have a limit
2726 	 */
2727 	if ((*prio_thresh) || (in_thresh == INFINITE))
2728 		return;
2729 
2730 	*prio_thresh = in_thresh;
2731 }
2732 
_get_accrue_create_cnt(uint32_t * max_jobs_accrue,int * create_cnt,uint32_t in_accrue,uint32_t in_used)2733 static void _get_accrue_create_cnt(uint32_t *max_jobs_accrue, int *create_cnt,
2734 				   uint32_t in_accrue, uint32_t in_used)
2735 {
2736 	/*
2737 	 * If we already set max_jobs_accrue then call it good.
2738 	 * If in_accrue is INFINITE we don't have a limit
2739 	 */
2740 	if ((*max_jobs_accrue != INFINITE) || (in_accrue == INFINITE))
2741 		return;
2742 
2743 	*max_jobs_accrue = in_accrue;
2744 	if (*max_jobs_accrue > in_used)
2745 		*create_cnt = *max_jobs_accrue - in_used;
2746 	else
2747 		*create_cnt = 0;
2748 
2749 	return;
2750 }
2751 
_add_accrue_time_internal(slurmdb_assoc_rec_t * assoc_ptr,slurmdb_qos_rec_t * qos_ptr_1,slurmdb_used_limits_t * used_limits_a1,slurmdb_used_limits_t * used_limits_u1,slurmdb_qos_rec_t * qos_ptr_2,slurmdb_used_limits_t * used_limits_a2,slurmdb_used_limits_t * used_limits_u2,int cnt)2752 static void _add_accrue_time_internal(slurmdb_assoc_rec_t *assoc_ptr,
2753 				      slurmdb_qos_rec_t *qos_ptr_1,
2754 				      slurmdb_used_limits_t *used_limits_a1,
2755 				      slurmdb_used_limits_t *used_limits_u1,
2756 				      slurmdb_qos_rec_t *qos_ptr_2,
2757 				      slurmdb_used_limits_t *used_limits_a2,
2758 				      slurmdb_used_limits_t *used_limits_u2,
2759 				      int cnt)
2760 {
2761 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2762 		info("%s: Adding %d to assoc_ptr %p (%p %p %p %p %p %p)",
2763 		     __func__, cnt, assoc_ptr, qos_ptr_1, used_limits_a1,
2764 		     used_limits_u1, qos_ptr_2, used_limits_a2,
2765 		     used_limits_u2);
2766 	}
2767 
2768 	if (qos_ptr_1)
2769 		qos_ptr_1->usage->accrue_cnt += cnt;
2770 	if (used_limits_a1)
2771 		used_limits_a1->accrue_cnt += cnt;
2772 	if (used_limits_u1)
2773 		used_limits_u1->accrue_cnt += cnt;
2774 
2775 	if (qos_ptr_2)
2776 		qos_ptr_2->usage->accrue_cnt += cnt;
2777 	if (used_limits_a2)
2778 		used_limits_a2->accrue_cnt += cnt;
2779 	if (used_limits_u2)
2780 		used_limits_u2->accrue_cnt += cnt;
2781 
2782 	while (assoc_ptr) {
2783 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2784 			info("assoc_id %u(%s/%s/%s/%p) added %d count %d",
2785 			     assoc_ptr->id, assoc_ptr->acct,
2786 			     assoc_ptr->user, assoc_ptr->partition,
2787 			     assoc_ptr->usage, cnt,
2788 			     assoc_ptr->usage->accrue_cnt);
2789 		}
2790 		assoc_ptr->usage->accrue_cnt += cnt;
2791 		/* now go up the hierarchy */
2792 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2793 	}
2794 }
2795 
_remove_accrue_time_internal(slurmdb_assoc_rec_t * assoc_ptr,slurmdb_qos_rec_t * qos_ptr_1,slurmdb_used_limits_t * used_limits_a1,slurmdb_used_limits_t * used_limits_u1,slurmdb_qos_rec_t * qos_ptr_2,slurmdb_used_limits_t * used_limits_a2,slurmdb_used_limits_t * used_limits_u2,int cnt)2796 static void _remove_accrue_time_internal(slurmdb_assoc_rec_t *assoc_ptr,
2797 					 slurmdb_qos_rec_t *qos_ptr_1,
2798 					 slurmdb_used_limits_t *used_limits_a1,
2799 					 slurmdb_used_limits_t *used_limits_u1,
2800 					 slurmdb_qos_rec_t *qos_ptr_2,
2801 					 slurmdb_used_limits_t *used_limits_a2,
2802 					 slurmdb_used_limits_t *used_limits_u2,
2803 					 int cnt)
2804 {
2805 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2806 		info("%s: Removing %d from assoc_ptr %p (%p %p %p %p %p %p)",
2807 		     __func__, cnt, assoc_ptr, qos_ptr_1, used_limits_a1,
2808 		     used_limits_u1, qos_ptr_2, used_limits_a2,
2809 		     used_limits_u2);
2810 	}
2811 
2812 	if (qos_ptr_1) {
2813 		if (qos_ptr_1->usage->accrue_cnt >= cnt)
2814 			qos_ptr_1->usage->accrue_cnt -= cnt;
2815 		else {
2816 			error("%s: QOS %s accrue_cnt underflow",
2817 			      __func__, qos_ptr_1->name);
2818 			qos_ptr_1->usage->accrue_cnt = 0;
2819 		}
2820 	}
2821 
2822 	if (used_limits_a1) {
2823 		if (used_limits_a1->accrue_cnt >= cnt)
2824 			used_limits_a1->accrue_cnt -= cnt;
2825 		else {
2826 			if (qos_ptr_1) {
2827 				error("%s: QOS %s acct %s accrue_cnt underflow",
2828 				      __func__, qos_ptr_1->name,
2829 				      used_limits_a1->acct);
2830 			}
2831 			used_limits_a1->accrue_cnt = 0;
2832 		}
2833 	}
2834 
2835 	if (used_limits_u1) {
2836 		if (used_limits_u1->accrue_cnt >= cnt)
2837 			used_limits_u1->accrue_cnt -= cnt;
2838 		else {
2839 			if (qos_ptr_1) {
2840 				error("%s: QOS %s user %u accrue_cnt underflow",
2841 				      __func__, qos_ptr_1->name,
2842 				      used_limits_u1->uid);
2843 			}
2844 			used_limits_u1->accrue_cnt = 0;
2845 		}
2846 	}
2847 
2848 	if (qos_ptr_2) {
2849 		if (qos_ptr_2->usage->accrue_cnt)
2850 			qos_ptr_2->usage->accrue_cnt -= cnt;
2851 		else {
2852 			error("%s: QOS %s accrue_cnt underflow",
2853 			      __func__, qos_ptr_2->name);
2854 			qos_ptr_2->usage->accrue_cnt = 0;
2855 		}
2856 	}
2857 
2858 	if (used_limits_a2) {
2859 		if (used_limits_a2->accrue_cnt >= cnt)
2860 			used_limits_a2->accrue_cnt -= cnt;
2861 		else {
2862 			if (qos_ptr_2) {
2863 				error("%s: QOS %s acct %s accrue_cnt underflow",
2864 				      __func__, qos_ptr_2->name,
2865 				      used_limits_a2->acct);
2866 			}
2867 			used_limits_a2->accrue_cnt = 0;
2868 		}
2869 	}
2870 
2871 	if (used_limits_u2) {
2872 		if (used_limits_u2->accrue_cnt >= cnt)
2873 			used_limits_u2->accrue_cnt -= cnt;
2874 		else {
2875 			if (qos_ptr_2 && used_limits_a2) {
2876 				error("%s: QOS %s user %u accrue_cnt underflow",
2877 				      __func__, qos_ptr_2->name,
2878 				      used_limits_a2->uid);
2879 			}
2880 			used_limits_u2->accrue_cnt = 0;
2881 		}
2882 	}
2883 
2884 	while (assoc_ptr) {
2885 		if (assoc_ptr->usage->accrue_cnt >= cnt) {
2886 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE) {
2887 				info("assoc_id %u(%s/%s/%s/%p) removed %d count %d",
2888 				     assoc_ptr->id, assoc_ptr->acct,
2889 				     assoc_ptr->user, assoc_ptr->partition,
2890 				     assoc_ptr->usage, cnt,
2891 				     assoc_ptr->usage->accrue_cnt);
2892 			}
2893 			assoc_ptr->usage->accrue_cnt -= cnt;
2894 		} else {
2895 			error("%s: assoc_id %u(%s/%s/%s) accrue_cnt underflow",
2896 			      __func__, assoc_ptr->id,
2897 			      assoc_ptr->acct,
2898 			      assoc_ptr->user,
2899 			      assoc_ptr->partition);
2900 			assoc_ptr->usage->accrue_cnt = 0;
2901 		}
2902 		/* now go up the hierarchy */
2903 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
2904 	}
2905 }
2906 
_acct_policy_validate(job_desc_msg_t * job_desc,part_record_t * part_ptr,slurmdb_assoc_rec_t * assoc_in,slurmdb_qos_rec_t * qos_ptr_1,slurmdb_qos_rec_t * qos_ptr_2,uint32_t * reason,acct_policy_limit_set_t * acct_policy_limit_set,bool update_call)2907 static bool _acct_policy_validate(job_desc_msg_t *job_desc,
2908 				  part_record_t *part_ptr,
2909 				  slurmdb_assoc_rec_t *assoc_in,
2910 				  slurmdb_qos_rec_t *qos_ptr_1,
2911 				  slurmdb_qos_rec_t *qos_ptr_2,
2912 				  uint32_t *reason,
2913 				  acct_policy_limit_set_t *
2914 					acct_policy_limit_set,
2915 				  bool update_call)
2916 {
2917 	slurmdb_qos_rec_t qos_rec;
2918 	slurmdb_assoc_rec_t *assoc_ptr = assoc_in;
2919 	int parent = 0, job_cnt = 1;
2920 	char *user_name = NULL;
2921 	bool rc = true;
2922 	assoc_mgr_lock_t locks =
2923 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
2924 	bool strict_checking;
2925 
2926 	xassert(acct_policy_limit_set);
2927 
2928 	if (!assoc_ptr) {
2929 		error("acct_policy_validate: no assoc_ptr given for job.");
2930 		return false;
2931 	}
2932 	user_name = assoc_ptr->user;
2933 
2934 	if (job_desc->array_bitmap)
2935 		job_cnt = bit_set_count(job_desc->array_bitmap);
2936 
2937 	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
2938 
2939 	assoc_mgr_lock(&locks);
2940 
2941 	assoc_mgr_set_qos_tres_cnt(&qos_rec);
2942 
2943 	if (qos_ptr_1) {
2944 		strict_checking = (qos_ptr_1->flags & QOS_FLAG_DENY_LIMIT);
2945 		if (qos_ptr_2 && !strict_checking)
2946 			strict_checking =
2947 				qos_ptr_2->flags & QOS_FLAG_DENY_LIMIT;
2948 
2949 		if (!(rc = _qos_policy_validate(
2950 			      job_desc, assoc_ptr, part_ptr,
2951 			      qos_ptr_1, &qos_rec,
2952 			      reason, acct_policy_limit_set, update_call,
2953 			      user_name, job_cnt, strict_checking)))
2954 			goto end_it;
2955 		if (!(rc = _qos_policy_validate(
2956 			      job_desc, assoc_ptr,
2957 			      part_ptr, qos_ptr_2, &qos_rec,
2958 			      reason, acct_policy_limit_set, update_call,
2959 			      user_name, job_cnt, strict_checking)))
2960 			goto end_it;
2961 
2962 	} else /*
2963 		* We don't have a QOS to determine if we should fail or not, so
2964 		* we will go with strict_checking by default.
2965 		*/
2966 		strict_checking = true;
2967 
2968 	while (assoc_ptr) {
2969 		int tres_pos = 0;
2970 
2971 		if (!_validate_tres_limits_for_assoc(
2972 			    &tres_pos, job_desc->tres_req_cnt, 0,
2973 			    assoc_ptr->grp_tres_ctld,
2974 			    qos_rec.grp_tres_ctld,
2975 			    acct_policy_limit_set->tres,
2976 			    strict_checking, update_call, 1)) {
2977 			if (reason)
2978 				*reason = _get_tres_state_reason(
2979 					tres_pos, WAIT_ASSOC_GRP_UNK);
2980 
2981 			debug2("job submit for user %s(%u): "
2982 			       "min tres(%s) request %"PRIu64" exceeds "
2983 			       "group max tres limit %"PRIu64" for account %s",
2984 			       user_name,
2985 			       job_desc->user_id,
2986 			       assoc_mgr_tres_name_array[tres_pos],
2987 			       job_desc->tres_req_cnt[tres_pos],
2988 			       assoc_ptr->grp_tres_ctld[tres_pos],
2989 			       assoc_ptr->acct);
2990 			rc = false;
2991 			break;
2992 		}
2993 
2994 		/* for validation we don't need to look at
2995 		 * assoc_ptr->grp_jobs.
2996 		 */
2997 
2998 		if ((qos_rec.grp_submit_jobs == INFINITE) &&
2999 		    (assoc_ptr->grp_submit_jobs != INFINITE) &&
3000 		    ((assoc_ptr->usage->used_submit_jobs + job_cnt)
3001 		     > assoc_ptr->grp_submit_jobs)) {
3002 			if (reason)
3003 				*reason = WAIT_ASSOC_GRP_SUB_JOB;
3004 			debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
3005 			       user_name,
3006 			       job_desc->user_id,
3007 			       assoc_ptr->grp_submit_jobs,
3008 			       assoc_ptr->usage->used_submit_jobs, job_cnt,
3009 			       assoc_ptr->acct);
3010 			rc = false;
3011 			break;
3012 		}
3013 
3014 		tres_pos = 0;
3015 		if (!update_call && !_validate_tres_time_limits(
3016 			    &tres_pos,
3017 			    &job_desc->time_limit,
3018 			    part_ptr->max_time,
3019 			    job_desc->tres_req_cnt,
3020 			    assoc_ptr->grp_tres_mins_ctld,
3021 			    qos_rec.grp_tres_mins_ctld,
3022 			    &acct_policy_limit_set->time,
3023 			    strict_checking)) {
3024 			if (reason)
3025 				*reason = _get_tres_state_reason(
3026 					tres_pos,
3027 					WAIT_ASSOC_GRP_UNK_MIN);
3028 			debug2("job submit for user %s(%u): "
3029 			       "tres(%s) time limit request %"PRIu64" "
3030 			       "exceeds group max limit %"PRIu64" "
3031 			       "for account '%s'",
3032 			       user_name,
3033 			       job_desc->user_id,
3034 			       assoc_mgr_tres_name_array[tres_pos],
3035 			       ((uint64_t)job_desc->time_limit *
3036 				job_desc->tres_req_cnt[tres_pos]),
3037 			       assoc_ptr->
3038 			       grp_tres_mins_ctld[tres_pos],
3039 			       assoc_ptr->acct);
3040 			rc = false;
3041 			goto end_it;
3042 		}
3043 
3044 		tres_pos = 0;
3045 		if (!update_call && !_validate_tres_time_limits(
3046 			    &tres_pos,
3047 			    &job_desc->time_limit,
3048 			    part_ptr->max_time,
3049 			    job_desc->tres_req_cnt,
3050 			    assoc_ptr->grp_tres_run_mins_ctld,
3051 			    qos_rec.grp_tres_run_mins_ctld,
3052 			    &acct_policy_limit_set->time,
3053 			    strict_checking)) {
3054 			if (reason)
3055 				*reason = _get_tres_state_reason(
3056 					tres_pos,
3057 					WAIT_ASSOC_GRP_UNK_RUN_MIN);
3058 			debug2("job submit for user %s(%u): "
3059 			       "tres(%s) time limit request %"PRIu64" "
3060 			       "exceeds group max running "
3061 			       "limit %"PRIu64" for account '%s'",
3062 			       user_name,
3063 			       job_desc->user_id,
3064 			       assoc_mgr_tres_name_array[tres_pos],
3065 			       ((uint64_t)job_desc->time_limit *
3066 				job_desc->tres_req_cnt[tres_pos]),
3067 			       assoc_ptr->
3068 			       grp_tres_run_mins_ctld[tres_pos],
3069 			       assoc_ptr->acct);
3070 			rc = false;
3071 			goto end_it;
3072 		}
3073 
3074 		if (!update_call && !_validate_time_limit(
3075 			    &job_desc->time_limit,
3076 			    part_ptr->max_time,
3077 			    1,
3078 			    assoc_ptr->grp_wall,
3079 			    &qos_rec.grp_wall,
3080 			    &acct_policy_limit_set->time,
3081 			    strict_checking, false)) {
3082 			if (reason)
3083 				*reason = WAIT_ASSOC_GRP_WALL;
3084 			debug2("job submit for user %s(%u): "
3085 			       "time limit %u exceeds max group %u for "
3086 			       "account '%s'",
3087 			       user_name,
3088 			       job_desc->user_id,
3089 			       job_desc->time_limit,
3090 			       assoc_ptr->grp_wall,
3091 			       assoc_ptr->acct);
3092 			rc = false;
3093 			break;
3094 		}
3095 
3096 		/* We don't need to look at the regular limits for
3097 		 * parents since we have pre-propogated them, so just
3098 		 * continue with the next parent
3099 		 */
3100 		if (parent) {
3101 			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3102 			continue;
3103 		}
3104 
3105 		/* for validation we don't need to look at
3106 		 * assoc_ptr->max_cpu_mins_pj.
3107 		 */
3108 
3109 		tres_pos = 0;
3110 		if (!_validate_tres_limits_for_assoc(
3111 			    &tres_pos, job_desc->tres_req_cnt, 0,
3112 			    assoc_ptr->max_tres_ctld,
3113 			    qos_rec.max_tres_pj_ctld,
3114 			    acct_policy_limit_set->tres,
3115 			    strict_checking, update_call, 1)) {
3116 			if (reason)
3117 				*reason = _get_tres_state_reason(
3118 					tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
3119 
3120 			debug2("job submit for user %s(%u): "
3121 			       "min tres(%s) request %"PRIu64" exceeds "
3122 			       "max tres limit %"PRIu64" for account %s",
3123 			       user_name,
3124 			       job_desc->user_id,
3125 			       assoc_mgr_tres_name_array[tres_pos],
3126 			       job_desc->tres_req_cnt[tres_pos],
3127 			       assoc_ptr->max_tres_ctld[tres_pos],
3128 			       assoc_ptr->acct);
3129 			rc = false;
3130 			break;
3131 		}
3132 
3133 		tres_pos = 0;
3134 		if (!_validate_tres_limits_for_assoc(
3135 			    &tres_pos, job_desc->tres_req_cnt,
3136 			    job_desc->tres_req_cnt[TRES_ARRAY_NODE],
3137 			    assoc_ptr->max_tres_pn_ctld,
3138 			    qos_rec.max_tres_pn_ctld,
3139 			    acct_policy_limit_set->tres,
3140 			    strict_checking, update_call, 1)) {
3141 			if (reason)
3142 				*reason = _get_tres_state_reason(
3143 					tres_pos,
3144 					WAIT_ASSOC_MAX_UNK_PER_NODE);
3145 
3146 			debug2("job submit for user %s(%u): "
3147 			       "min tres(%s) request %"PRIu64" exceeds "
3148 			       "max tres limit %"PRIu64" per node "
3149 			       "for account %s",
3150 			       user_name,
3151 			       job_desc->user_id,
3152 			       assoc_mgr_tres_name_array[tres_pos],
3153 			       job_desc->tres_req_cnt[tres_pos] /
3154 			       job_desc->tres_req_cnt[TRES_ARRAY_NODE],
3155 			       assoc_ptr->max_tres_pn_ctld[tres_pos],
3156 			       assoc_ptr->acct);
3157 			rc = false;
3158 			break;
3159 		}
3160 
3161 		/* for validation we don't need to look at
3162 		 * assoc_ptr->max_jobs.
3163 		 */
3164 
3165 		if ((qos_rec.max_submit_jobs_pa == INFINITE) &&
3166 		    (qos_rec.max_submit_jobs_pu == INFINITE) &&
3167 		    (assoc_ptr->max_submit_jobs != INFINITE) &&
3168 		    ((assoc_ptr->usage->used_submit_jobs + job_cnt)
3169 		     > assoc_ptr->max_submit_jobs)) {
3170 			if (reason)
3171 				*reason = WAIT_ASSOC_MAX_SUB_JOB;
3172 			debug2("job submit for user %s(%u): account max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
3173 			       user_name,
3174 			       job_desc->user_id,
3175 			       assoc_ptr->max_submit_jobs,
3176 			       assoc_ptr->usage->used_submit_jobs, job_cnt,
3177 			       assoc_ptr->acct);
3178 			rc = false;
3179 			break;
3180 		}
3181 
3182 		if (!update_call && !_validate_tres_time_limits(
3183 			    &tres_pos,
3184 			    &job_desc->time_limit,
3185 			    part_ptr->max_time,
3186 			    job_desc->tres_req_cnt,
3187 			    assoc_ptr->max_tres_mins_ctld,
3188 			    qos_rec.max_tres_mins_pj_ctld,
3189 			    &acct_policy_limit_set->time,
3190 			    strict_checking)) {
3191 			if (reason)
3192 				*reason = _get_tres_state_reason(
3193 					tres_pos,
3194 					WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
3195 			debug2("job submit for user %s(%u): "
3196 			       "tres(%s) time limit request %"PRIu64" "
3197 			       "exceeds max per-job limit %"PRIu64" "
3198 			       "for account '%s'",
3199 			       user_name,
3200 			       job_desc->user_id,
3201 			       assoc_mgr_tres_name_array[tres_pos],
3202 			       ((uint64_t)job_desc->time_limit *
3203 				job_desc->tres_req_cnt[tres_pos]),
3204 			       assoc_ptr->max_tres_mins_ctld[tres_pos],
3205 			       assoc_ptr->acct);
3206 			rc = false;
3207 			break;
3208 		}
3209 
3210 		if (!update_call && !_validate_time_limit(
3211 			    &job_desc->time_limit,
3212 			    part_ptr->max_time,
3213 			    1,
3214 			    assoc_ptr->max_wall_pj,
3215 			    &qos_rec.max_wall_pj,
3216 			    &acct_policy_limit_set->time,
3217 			    strict_checking, false)) {
3218 			if (reason)
3219 				*reason = WAIT_ASSOC_MAX_WALL_PER_JOB;
3220 			debug2("job submit for user %s(%u): "
3221 			       "time limit %u exceeds max %u for "
3222 			       "account '%s'",
3223 			       user_name,
3224 			       job_desc->user_id,
3225 			       job_desc->time_limit,
3226 			       assoc_ptr->max_wall_pj,
3227 			       assoc_ptr->acct);
3228 			rc = false;
3229 			break;
3230 		}
3231 
3232 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3233 		parent = 1;
3234 	}
3235 end_it:
3236 	assoc_mgr_unlock(&locks);
3237 	slurmdb_free_qos_rec_members(&qos_rec);
3238 
3239 	return rc;
3240 }
3241 
3242 /*
3243  * acct_policy_validate - validate that a job request can be satisfied without
3244  * exceeding any association or QOS limit.
3245  * job_desc IN - job descriptor being submitted
3246  * part_ptr IN - pointer to (one) partition to which the job is being submitted
3247  * assoc_in IN - pointer to association to which the job is being submitted
3248  * qos_ptr IN - pointer to QOS to which the job is being submitted
3249  * state_reason OUT - if non-NULL, set to reason for rejecting the job
3250  * acct_policy_limit_set IN/OUT - limits set for the job, pre-allocated storage
3251  *		is filled in by acct_policy_validate
3252  * update_call IN - true if request to update existing job request
3253  * RET true if valid
3254  */
acct_policy_validate(job_desc_msg_t * job_desc,part_record_t * part_ptr,slurmdb_assoc_rec_t * assoc_in,slurmdb_qos_rec_t * qos_ptr,uint32_t * reason,acct_policy_limit_set_t * acct_policy_limit_set,bool update_call)3255 extern bool acct_policy_validate(job_desc_msg_t *job_desc,
3256 				 part_record_t *part_ptr,
3257 				 slurmdb_assoc_rec_t *assoc_in,
3258 				 slurmdb_qos_rec_t *qos_ptr,
3259 				 uint32_t *reason,
3260 				 acct_policy_limit_set_t *acct_policy_limit_set,
3261 				 bool update_call)
3262 {
3263 	slurmdb_qos_rec_t *qos_ptr_1 = NULL, *qos_ptr_2 = NULL;
3264 	job_record_t job_rec;
3265 	bool rc;
3266 	assoc_mgr_lock_t locks =
3267 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3268 
3269 	assoc_mgr_lock(&locks);
3270 	job_rec.qos_ptr = qos_ptr;
3271 	job_rec.part_ptr = part_ptr;
3272 	acct_policy_set_qos_order(&job_rec, &qos_ptr_1, &qos_ptr_2);
3273 	assoc_mgr_unlock(&locks);
3274 	rc = _acct_policy_validate(job_desc, part_ptr, assoc_in,
3275 				   qos_ptr_1, qos_ptr_2, reason,
3276 				   acct_policy_limit_set, update_call);
3277 	return rc;
3278 }
3279 
3280 /*
3281  * acct_policy_validate_het_job - validate that a hetjob as a whole (all
3282  * components at once) can be satisfied without exceeding any association
3283  * limit. Build a list of every job's association and QOS information then combine
3284  * usage information for every job sharing an association and test that against
3285  * the appropriate limit.
3286  *
3287  * NOTE: This test is imperfect. Each job actually has up to 3 sets of limits
3288  * to test (association, job QOS and partition QOS). Ideally each would be tested
3289  * independently, but that is complicated due to QOS limits overriding the
3290  * association limits and the ability to have 3 sets of limits for each job.
3291  * This only tests the association limit for each hetjob component based
3292  * upon that component's job and partition QOS.
3293  *
3294  * NOTE: That a hetjob passes this test does not mean that it will be able
3295  * to run. For example, this test assumues resource allocation at the CPU level.
3296  * If each task is allocated one core, with 2 CPUs, then the CPU limit test
3297  * would not be accurate.
3298  *
3299  * submit_job_list IN - list of job_record_t entries (already created)
3300  * RET true if valid
3301  */
acct_policy_validate_het_job(List submit_job_list)3302 extern bool acct_policy_validate_het_job(List submit_job_list)
3303 {
3304 	assoc_mgr_lock_t locks =
3305 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3306 	List het_job_limit_list;
3307 	ListIterator iter1, iter2;
3308 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3309 	job_record_t *job_ptr1, *job_ptr2;
3310 	het_job_limits_t *job_limit1, *job_limit2;
3311 	bool rc = true;
3312 	job_desc_msg_t job_desc;
3313 	bool build_job_desc = true;
3314 	acct_policy_limit_set_t acct_policy_limit_set;
3315 	int i, job_cnt;
3316 	uint32_t reason = 0;
3317 	int tres_req_size = sizeof(uint64_t) * g_tres_count;
3318 
3319 	memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set_t));
3320 	acct_policy_limit_set.tres =
3321 		xmalloc(sizeof(uint16_t) * slurmctld_tres_cnt);
3322 
3323 	/* Build list of QOS, association, and job pointers */
3324 	het_job_limit_list = list_create(xfree_ptr);
3325 	iter1 = list_iterator_create(submit_job_list);
3326 	assoc_mgr_lock(&locks);
3327 	while ((job_ptr1 = list_next(iter1))) {
3328 		qos_ptr_1 = NULL;
3329 		qos_ptr_2 = NULL;
3330 		acct_policy_set_qos_order(job_ptr1, &qos_ptr_1, &qos_ptr_2);
3331 		job_limit1 = xmalloc(sizeof(het_job_limits_t));
3332 		job_limit1->assoc_ptr = job_ptr1->assoc_ptr;
3333 		job_limit1->job_ptr   = job_ptr1;
3334 		job_limit1->qos_ptr_1 = qos_ptr_1;
3335 		job_limit1->qos_ptr_2 = qos_ptr_2;
3336 		list_append(het_job_limit_list, job_limit1);
3337 	}
3338 	assoc_mgr_unlock(&locks);
3339 	list_iterator_destroy(iter1);
3340 
3341 	iter1 = list_iterator_create(het_job_limit_list);
3342 	while ((job_limit1 = list_next(iter1))) {
3343 		job_ptr1 = job_limit1->job_ptr;
3344 		if (build_job_desc) {
3345 			build_job_desc = false;
3346 			job_desc.time_limit = job_ptr1->time_limit;
3347 			job_desc.tres_req_cnt = xmalloc(tres_req_size);
3348 			job_desc.user_id = job_ptr1->user_id;
3349 		}
3350 		if (job_limit1->assoc_ptr) {
3351 			job_cnt = 1;
3352 			memcpy(job_desc.tres_req_cnt, job_ptr1->tres_req_cnt,
3353 			       tres_req_size);
3354 			iter2 = list_iterator_create(het_job_limit_list);
3355 			while ((job_limit2 = list_next(iter2))) {
3356 				if ((job_limit2 == job_limit1) ||
3357 				    (job_limit2->assoc_ptr !=
3358 				     job_limit1->assoc_ptr))
3359 					continue;
3360 				job_ptr2 = job_limit2->job_ptr;
3361 				for (i = 0 ; i < g_tres_count; i++) {
3362 					job_desc.tres_req_cnt[i] +=
3363 						job_ptr2->tres_req_cnt[i];
3364 				}
3365 				job_cnt++;
3366 			}
3367 			list_iterator_destroy(iter2);
3368 			if (job_cnt > 1) {
3369 				job_desc.array_bitmap = bit_alloc(job_cnt);
3370 				/*
3371 				 * SET NO BITS. Make this look like zero jobs
3372 				 * are being added. The job count was already
3373 				 * validated when each individual component of
3374 				 * the heterogeneous job was created.
3375 				*/
3376 				rc = _acct_policy_validate(&job_desc,
3377 						job_ptr1->part_ptr,
3378 						job_limit1->assoc_ptr,
3379 						job_limit1->qos_ptr_1,
3380 						job_limit1->qos_ptr_2,
3381 						&reason,
3382 						&acct_policy_limit_set,
3383 						false);
3384 				bit_free(job_desc.array_bitmap);
3385 				if (!rc)
3386 					break;
3387 			}
3388 		}
3389 	}
3390 	list_iterator_destroy(iter1);
3391 
3392 	xfree(job_desc.tres_req_cnt);
3393 	list_destroy(het_job_limit_list);
3394 	xfree(acct_policy_limit_set.tres);
3395 
3396 	return rc;
3397 }
3398 
3399 /*
3400  * Determine if the specified job can execute right now or is currently
3401  * blocked by an association or QOS limit. Does not re-validate job state.
3402  */
acct_policy_job_runnable_state(job_record_t * job_ptr)3403 extern bool acct_policy_job_runnable_state(job_record_t *job_ptr)
3404 {
3405 	/* If any more limits are added this will need to be added to */
3406 	if ((job_ptr->state_reason >= WAIT_QOS_GRP_CPU
3407 	     && job_ptr->state_reason <= WAIT_ASSOC_MAX_SUB_JOB) ||
3408 	    (job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) ||
3409 	    (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) ||
3410 	    (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT) ||
3411 	    (job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) ||
3412 	    (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT)) {
3413 		return false;
3414 	}
3415 
3416 	return true;
3417 }
3418 
3419 /*
3420  * acct_policy_job_runnable_pre_select - Determine if the specified
3421  *	job can execute right now or not depending upon accounting
3422  *	policy (e.g. running job limit for this association). If the
3423  *	association limits prevent the job from ever running (lowered
3424  *	limits since job submission), then cancel the job.
3425  */
acct_policy_job_runnable_pre_select(job_record_t * job_ptr,bool assoc_mgr_locked)3426 extern bool acct_policy_job_runnable_pre_select(job_record_t *job_ptr,
3427 						bool assoc_mgr_locked)
3428 {
3429 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3430 	slurmdb_qos_rec_t qos_rec;
3431 	slurmdb_assoc_rec_t *assoc_ptr;
3432 	uint32_t time_limit = NO_VAL;
3433 	bool rc = true;
3434 	uint32_t wall_mins;
3435 	bool safe_limits = false;
3436 	int parent = 0; /* flag to tell us if we are looking at the
3437 			 * parent or not
3438 			 */
3439 	assoc_mgr_lock_t locks =
3440 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3441 
3442 	/* check to see if we are enforcing associations */
3443 	if (!accounting_enforce)
3444 		return true;
3445 
3446 	if (!_valid_job_assoc(job_ptr)) {
3447 		xfree(job_ptr->state_desc);
3448 		job_ptr->state_reason = FAIL_ACCOUNT;
3449 		return false;
3450 	}
3451 
3452 	/* now see if we are enforcing limits */
3453 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
3454 		return true;
3455 
3456 	/* clear old state reason */
3457 	if (!acct_policy_job_runnable_state(job_ptr)) {
3458 		xfree(job_ptr->state_desc);
3459 		job_ptr->state_reason = WAIT_NO_REASON;
3460 	}
3461 
3462 	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
3463 
3464 	if (!assoc_mgr_locked)
3465 		assoc_mgr_lock(&locks);
3466 
3467 	assoc_mgr_set_qos_tres_cnt(&qos_rec);
3468 
3469 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
3470 
3471 	/* check the first QOS setting it's values in the qos_rec */
3472 	if (qos_ptr_1 &&
3473 	    !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_1, &qos_rec)))
3474 		goto end_it;
3475 
3476 	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
3477 	if (qos_ptr_2 &&
3478 	    !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_2, &qos_rec)))
3479 		goto end_it;
3480 
3481 	/*
3482 	 * check to see if we should be using safe limits, if so we
3483 	 * will only start a job if there are sufficient remaining
3484 	 * cpu-minutes for it to run to completion
3485 	 */
3486 	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
3487 		safe_limits = true;
3488 
3489 	assoc_ptr = job_ptr->assoc_ptr;
3490 	while (assoc_ptr) {
3491 		/* This only trips when the grp_used_wall is divisible
3492 		 * by 60, i.e if a limit is 1 min and you have only
3493 		 * accumulated 59 seconds you will still be able to
3494 		 * get another job in as 59/60 = 0 int wise.
3495 		 */
3496 		wall_mins = assoc_ptr->usage->grp_used_wall / 60;
3497 
3498 #if _DEBUG
3499 		info("acct_job_limits: %u of %u",
3500 		     assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
3501 #endif
3502 		/* we don't need to check grp_cpu_mins here */
3503 
3504 		/* we don't need to check grp_cpus here */
3505 
3506 		/* we don't need to check grp_mem here */
3507 
3508 		if ((qos_rec.grp_jobs == INFINITE) &&
3509 		    (assoc_ptr->grp_jobs != INFINITE) &&
3510 		    (assoc_ptr->usage->used_jobs >= assoc_ptr->grp_jobs)) {
3511 			xfree(job_ptr->state_desc);
3512 			job_ptr->state_reason = WAIT_ASSOC_GRP_JOB;
3513 			debug2("%pJ being held, assoc %u is at or exceeds group max jobs limit %u with %u for account %s",
3514 			       job_ptr, assoc_ptr->id, assoc_ptr->grp_jobs,
3515 			       assoc_ptr->usage->used_jobs, assoc_ptr->acct);
3516 
3517 			rc = false;
3518 			goto end_it;
3519 		}
3520 
3521 		/* we don't need to check grp_cpu_run_mins here */
3522 
3523 		/* we don't need to check grp_nodes here */
3524 
3525 		/* we don't need to check submit_jobs here */
3526 
3527 		if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
3528 		    && (qos_rec.grp_wall == INFINITE)
3529 		    && (assoc_ptr->grp_wall != INFINITE)) {
3530 			if (time_limit == NO_VAL) {
3531 				time_limit = job_ptr->time_limit;
3532 				_set_time_limit(&time_limit,
3533 						job_ptr->part_ptr->max_time,
3534 						MIN(assoc_ptr->grp_wall,
3535 						    assoc_ptr->max_wall_pj),
3536 						&job_ptr->limit_set.time);
3537 
3538 				/* Account for usage factor, if necessary */
3539 				if ((job_ptr->qos_ptr &&
3540 				     (job_ptr->qos_ptr->flags &
3541 				      QOS_FLAG_USAGE_FACTOR_SAFE) &&
3542 				     (job_ptr->qos_ptr->usage_factor >= 0)) &&
3543 				    ((time_limit != INFINITE) ||
3544 				     (job_ptr->qos_ptr->usage_factor < 1.0))) {
3545 					time_limit *=
3546 						job_ptr->qos_ptr->usage_factor;
3547 				}
3548 			}
3549 
3550 			if (wall_mins >= assoc_ptr->grp_wall) {
3551 				xfree(job_ptr->state_desc);
3552 				job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
3553 				debug2("%pJ being held, assoc %u is at or exceeds group wall limit %u with %u for account %s",
3554 				       job_ptr, assoc_ptr->id,
3555 				       assoc_ptr->grp_wall,
3556 				       wall_mins, assoc_ptr->acct);
3557 				rc = false;
3558 				goto end_it;
3559 			} else if (safe_limits &&
3560 				   ((wall_mins + time_limit) >
3561 				    assoc_ptr->grp_wall)) {
3562 				xfree(job_ptr->state_desc);
3563 				job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
3564 				debug2("%pJ being held, the job request with assoc %u will exceed group wall limit %u if ran with %u for account %s",
3565 				       job_ptr, assoc_ptr->id,
3566 				       assoc_ptr->grp_wall,
3567 				       wall_mins + time_limit, assoc_ptr->acct);
3568 				rc = false;
3569 				goto end_it;
3570 			}
3571 		}
3572 
3573 		/*
3574 		 * We don't need to look at the regular limits for parents
3575 		 * since we have pre-propogated them, so just continue with
3576 		 * the next parent.
3577 		 */
3578 		if (parent) {
3579 			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3580 			continue;
3581 		}
3582 
3583 		/* we don't need to check max_cpu_mins_pj here */
3584 
3585 		/* we don't need to check max_cpus_pj here */
3586 
3587 		if ((qos_rec.max_jobs_pa == INFINITE) &&
3588 		    (qos_rec.max_jobs_pu == INFINITE) &&
3589 		    (assoc_ptr->max_jobs != INFINITE) &&
3590 		    (assoc_ptr->usage->used_jobs >= assoc_ptr->max_jobs)) {
3591 			xfree(job_ptr->state_desc);
3592 			job_ptr->state_reason = WAIT_ASSOC_MAX_JOBS;
3593 			debug2("%pJ being held, assoc %u is at or exceeds max jobs limit %u with %u for account %s",
3594 			       job_ptr, assoc_ptr->id,
3595 			       assoc_ptr->max_jobs,
3596 			       assoc_ptr->usage->used_jobs, assoc_ptr->acct);
3597 			rc = false;
3598 			goto end_it;
3599 		}
3600 
3601 		/* we don't need to check submit_jobs here */
3602 
3603 		/*
3604 		 * if the association limits have changed since job
3605 		 * submission and job can not run, then kill it
3606 		 */
3607 		if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
3608 		    && (qos_rec.max_wall_pj == INFINITE)
3609 		    && (assoc_ptr->max_wall_pj != INFINITE)) {
3610 			if (time_limit == NO_VAL) {
3611 				time_limit = job_ptr->time_limit;
3612 				_set_time_limit(&time_limit,
3613 						job_ptr->part_ptr->max_time,
3614 						assoc_ptr->max_wall_pj,
3615 						&job_ptr->limit_set.time);
3616 
3617 				/* Account for usage factor, if necessary */
3618 				if ((job_ptr->qos_ptr &&
3619 				     (job_ptr->qos_ptr->flags &
3620 				      QOS_FLAG_USAGE_FACTOR_SAFE) &&
3621 				     (job_ptr->qos_ptr->usage_factor >= 0)) &&
3622 				    ((time_limit != INFINITE) ||
3623 				     (job_ptr->qos_ptr->usage_factor < 1.0))) {
3624 					time_limit *=
3625 						job_ptr->qos_ptr->usage_factor;
3626 				}
3627 			}
3628 
3629 			if (time_limit > assoc_ptr->max_wall_pj) {
3630 				xfree(job_ptr->state_desc);
3631 				job_ptr->state_reason =
3632 					WAIT_ASSOC_MAX_WALL_PER_JOB;
3633 				debug2("%pJ being held, time limit %u exceeds account max %u",
3634 				       job_ptr, job_ptr->time_limit,
3635 				       time_limit);
3636 				rc = false;
3637 				goto end_it;
3638 			}
3639 		}
3640 
3641 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3642 		parent = 1;
3643 	}
3644 end_it:
3645 	if (!assoc_mgr_locked)
3646 		assoc_mgr_unlock(&locks);
3647 	slurmdb_free_qos_rec_members(&qos_rec);
3648 
3649 	return rc;
3650 }
3651 
3652 /*
3653  * acct_policy_job_runnable_post_select - After nodes have been
3654  *	selected for the job verify the counts don't exceed aggregated limits.
3655  */
acct_policy_job_runnable_post_select(job_record_t * job_ptr,uint64_t * tres_req_cnt,bool assoc_mgr_locked)3656 extern bool acct_policy_job_runnable_post_select(job_record_t *job_ptr,
3657 						 uint64_t *tres_req_cnt,
3658 						 bool assoc_mgr_locked)
3659 {
3660 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
3661 	slurmdb_qos_rec_t qos_rec;
3662 	slurmdb_assoc_rec_t *assoc_ptr;
3663 	uint64_t tres_usage_mins[slurmctld_tres_cnt];
3664 	uint64_t tres_run_mins[slurmctld_tres_cnt];
3665 	uint64_t job_tres_time_limit[slurmctld_tres_cnt];
3666 	uint64_t orig_node_cnt;
3667 	uint32_t time_limit;
3668 	bool rc = true;
3669 	bool safe_limits = false;
3670 	int i, tres_pos = 0;
3671 	acct_policy_tres_usage_t tres_usage;
3672 	double usage_factor = 1.0;
3673 	int parent = 0; /* flag to tell us if we are looking at the
3674 			 * parent or not
3675 			 */
3676 	assoc_mgr_lock_t locks =
3677 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
3678 
3679 	xassert(job_ptr);
3680 	xassert(job_ptr->part_ptr);
3681 	xassert(tres_req_cnt);
3682 
3683 	/* check to see if we are enforcing associations */
3684 	if (!accounting_enforce)
3685 		return true;
3686 
3687 	/* probably don't need to check this here */
3688 	/* if (!_valid_job_assoc(job_ptr)) { */
3689 	/* 	job_ptr->state_reason = FAIL_ACCOUNT; */
3690 	/* 	return false; */
3691 	/* } */
3692 
3693 	/* now see if we are enforcing limits */
3694 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
3695 		return true;
3696 
3697 	/* check to see if we should be using safe limits, if so we
3698 	 * will only start a job if there are sufficient remaining
3699 	 * cpu-minutes for it to run to completion */
3700 	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
3701 		safe_limits = true;
3702 
3703 	/* clear old state reason */
3704 	if (!acct_policy_job_runnable_state(job_ptr)) {
3705 		xfree(job_ptr->state_desc);
3706 		job_ptr->state_reason = WAIT_NO_REASON;
3707 	}
3708 
3709 	job_ptr->qos_blocking_ptr = NULL;
3710 
3711 	/* clang needs this memset to avoid a warning */
3712 	memset(tres_run_mins, 0, sizeof(tres_run_mins));
3713 	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
3714 	memset(job_tres_time_limit, 0, sizeof(job_tres_time_limit));
3715 
3716 	time_limit = job_ptr->time_limit;
3717 	_set_time_limit(&time_limit, job_ptr->part_ptr->max_time,
3718 			job_ptr->part_ptr->default_time, NULL);
3719 
3720 	if (job_ptr->qos_ptr) {
3721 		usage_factor = job_ptr->qos_ptr->usage_factor;
3722 
3723 		if ((usage_factor >= 0) &&
3724 		    (job_ptr->qos_ptr->flags & QOS_FLAG_USAGE_FACTOR_SAFE) &&
3725 		    ((time_limit != INFINITE) || (usage_factor < 1.0))) {
3726 			time_limit *= usage_factor;
3727 		}
3728 	}
3729 
3730 	for (i=0; i<slurmctld_tres_cnt; i++)
3731 		job_tres_time_limit[i] = (uint64_t)time_limit * tres_req_cnt[i];
3732 
3733 	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
3734 
3735 	if (!assoc_mgr_locked)
3736 		assoc_mgr_lock(&locks);
3737 
3738 	assoc_mgr_set_qos_tres_cnt(&qos_rec);
3739 
3740 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
3741 
3742 	/* check the first QOS setting it's values in the qos_rec */
3743 	if (qos_ptr_1 &&
3744 	    !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_1,
3745 						 &qos_rec, tres_req_cnt,
3746 						 job_tres_time_limit)))
3747 		goto end_it;
3748 
3749 	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
3750 	if (qos_ptr_2 &&
3751 	    !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_2,
3752 						 &qos_rec, tres_req_cnt,
3753 						 job_tres_time_limit)))
3754 		goto end_it;
3755 
3756 	assoc_ptr = job_ptr->assoc_ptr;
3757 	while (assoc_ptr) {
3758 		for (i = 0; i < slurmctld_tres_cnt; i++) {
3759 			tres_usage_mins[i] =
3760 				(uint64_t)(assoc_ptr->usage->usage_tres_raw[i]
3761 					   / 60);
3762 			tres_run_mins[i] =
3763 				assoc_ptr->usage->grp_used_tres_run_secs[i] /
3764 				60;
3765 
3766 			/*
3767 			 * Clear usage if factor is 0 so that jobs can run.
3768 			 * Otherwise multiplying can cause more jobs to be run
3769 			 * than the limit allows (e.g. usagefactor=.5).
3770 			 */
3771 			if (usage_factor == 0.0) {
3772 				tres_usage_mins[i] *= usage_factor;
3773 				tres_run_mins[i] *= usage_factor;
3774 			}
3775 		}
3776 
3777 #if _DEBUG
3778 		info("acct_job_limits: %u of %u",
3779 		     assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
3780 #endif
3781 		/*
3782 		 * If the association has a GrpCPUMins limit set (and there
3783 		 * is no QOS with GrpCPUMins set) we may hold the job
3784 		 */
3785 		tres_usage = _validate_tres_usage_limits_for_assoc(
3786 			&tres_pos, assoc_ptr->grp_tres_mins_ctld,
3787 			qos_rec.grp_tres_mins_ctld,
3788 			job_tres_time_limit, tres_run_mins,
3789 			tres_usage_mins, job_ptr->limit_set.tres,
3790 			safe_limits);
3791 		switch (tres_usage) {
3792 		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3793 			xfree(job_ptr->state_desc);
3794 			job_ptr->state_reason = _get_tres_state_reason(
3795 				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3796 			debug2("%pJ being held, assoc %u(%s/%s/%s) group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
3797 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3798 			       assoc_ptr->user, assoc_ptr->partition,
3799 			       assoc_mgr_tres_name_array[tres_pos],
3800 			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
3801 			       tres_usage_mins[tres_pos]);
3802 			rc = false;
3803 			goto end_it;
3804 			break;
3805 		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3806 			xfree(job_ptr->state_desc);
3807 			job_ptr->state_reason = _get_tres_state_reason(
3808 				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3809 			debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
3810 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3811 			       assoc_ptr->user, assoc_ptr->partition,
3812 			       assoc_mgr_tres_name_array[tres_pos],
3813 			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
3814 			       job_tres_time_limit[tres_pos]);
3815 			rc = false;
3816 			goto end_it;
3817 			break;
3818 		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3819 			/*
3820 			 * If we're using safe limits start
3821 			 * the job only if there are
3822 			 * sufficient cpu-mins left such that
3823 			 * it will run to completion without
3824 			 * being killed
3825 			 */
3826 			xfree(job_ptr->state_desc);
3827 			job_ptr->state_reason = _get_tres_state_reason(
3828 				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
3829 			debug2("%pJ being held, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
3830 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3831 			       assoc_ptr->user, assoc_ptr->partition,
3832 			       assoc_mgr_tres_name_array[tres_pos],
3833 			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
3834 			       assoc_ptr->grp_tres_mins_ctld[tres_pos] -
3835 			       tres_usage_mins[tres_pos],
3836 			       job_tres_time_limit[tres_pos],
3837 			       tres_run_mins[tres_pos],
3838 			       tres_req_cnt[tres_pos]);
3839 			rc = false;
3840 			goto end_it;
3841 			break;
3842 		case TRES_USAGE_OKAY:
3843 			/* all good */
3844 			break;
3845 		}
3846 
3847 		orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
3848 		_get_unique_job_node_cnt(job_ptr,
3849 					 assoc_ptr->usage->grp_node_bitmap,
3850 					 &tres_req_cnt[TRES_ARRAY_NODE]);
3851 		tres_usage = _validate_tres_usage_limits_for_assoc(
3852 			&tres_pos,
3853 			assoc_ptr->grp_tres_ctld, qos_rec.grp_tres_ctld,
3854 			tres_req_cnt, assoc_ptr->usage->grp_used_tres,
3855 			NULL, job_ptr->limit_set.tres, true);
3856 		tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
3857 		switch (tres_usage) {
3858 		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3859 			/* not possible because the curr_usage sent in is NULL*/
3860 			break;
3861 		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3862 			xfree(job_ptr->state_desc);
3863 			job_ptr->state_reason = _get_tres_state_reason(
3864 				tres_pos, WAIT_ASSOC_GRP_UNK);
3865 			debug2("%pJ is being held, assoc %u(%s/%s/%s) min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
3866 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3867 			       assoc_ptr->user, assoc_ptr->partition,
3868 			       assoc_mgr_tres_name_array[tres_pos],
3869 			       tres_req_cnt[tres_pos],
3870 			       assoc_ptr->grp_tres_ctld[tres_pos]);
3871 			rc = false;
3872 			goto end_it;
3873 			break;
3874 		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3875 			xfree(job_ptr->state_desc);
3876 			job_ptr->state_reason = _get_tres_state_reason(
3877 				tres_pos, WAIT_ASSOC_GRP_UNK);
3878 			debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
3879 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3880 			       assoc_ptr->user, assoc_ptr->partition,
3881 			       assoc_mgr_tres_name_array[tres_pos],
3882 			       assoc_ptr->grp_tres_ctld[tres_pos],
3883 			       assoc_ptr->usage->grp_used_tres[tres_pos],
3884 			       tres_req_cnt[tres_pos]);
3885 			rc = false;
3886 			goto end_it;
3887 		case TRES_USAGE_OKAY:
3888 			/* all good */
3889 			break;
3890 		}
3891 
3892 		/* we don't need to check grp_jobs here */
3893 
3894 		tres_usage = _validate_tres_usage_limits_for_assoc(
3895 			&tres_pos,
3896 			assoc_ptr->grp_tres_run_mins_ctld,
3897 			qos_rec.grp_tres_run_mins_ctld,
3898 			job_tres_time_limit, tres_run_mins, NULL, NULL, true);
3899 		switch (tres_usage) {
3900 		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
3901 			/* not possible because the curr_usage sent in is NULL*/
3902 			break;
3903 		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
3904 			xfree(job_ptr->state_desc);
3905 			job_ptr->state_reason = _get_tres_state_reason(
3906 				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
3907 			debug2("%pJ is being held, assoc %u(%s/%s/%s) group max running tres(%s) minutes request limit %"PRIu64" exceeds limit %"PRIu64,
3908 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3909 			       assoc_ptr->user, assoc_ptr->partition,
3910 			       assoc_mgr_tres_name_array[tres_pos],
3911 			       tres_run_mins[tres_pos],
3912 			       assoc_ptr->grp_tres_run_mins_ctld[tres_pos]);
3913 			rc = false;
3914 			goto end_it;
3915 			break;
3916 		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
3917 			xfree(job_ptr->state_desc);
3918 			job_ptr->state_reason = _get_tres_state_reason(
3919 				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
3920 			debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
3921 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3922 			       assoc_ptr->user, assoc_ptr->partition,
3923 			       assoc_mgr_tres_name_array[tres_pos],
3924 			       assoc_ptr->grp_tres_run_mins_ctld[tres_pos],
3925 			       tres_run_mins[tres_pos],
3926 			       job_tres_time_limit[tres_pos]);
3927 			rc = false;
3928 			goto end_it;
3929 			break;
3930 		case TRES_USAGE_OKAY:
3931 			/* all good */
3932 			break;
3933 		}
3934 
3935 		/* we don't need to check submit_jobs here */
3936 
3937 		/* we don't need to check grp_wall here */
3938 
3939 
3940 		/* We don't need to look at the regular limits for
3941 		 * parents since we have pre-propogated them, so just
3942 		 * continue with the next parent
3943 		 */
3944 		if (parent) {
3945 			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
3946 			continue;
3947 		}
3948 
3949 		if (!_validate_tres_limits_for_assoc(
3950 			    &tres_pos, job_tres_time_limit, 0,
3951 			    assoc_ptr->max_tres_mins_ctld,
3952 			    qos_rec.max_tres_mins_pj_ctld,
3953 			    job_ptr->limit_set.tres,
3954 			    1, 0, 1)) {
3955 			xfree(job_ptr->state_desc);
3956 			job_ptr->state_reason = _get_tres_state_reason(
3957 				tres_pos, WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
3958 			debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
3959 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3960 			       assoc_ptr->user, assoc_ptr->partition,
3961 			       assoc_mgr_tres_name_array[tres_pos],
3962 			       assoc_ptr->max_tres_mins_ctld[tres_pos],
3963 			       job_tres_time_limit[tres_pos]);
3964 			rc = false;
3965 			goto end_it;
3966 		}
3967 
3968 		if (!_validate_tres_limits_for_assoc(
3969 			    &tres_pos, tres_req_cnt, 0,
3970 			    assoc_ptr->max_tres_ctld,
3971 			    qos_rec.max_tres_pj_ctld,
3972 			    job_ptr->limit_set.tres,
3973 			    1, 0, 1)) {
3974 			xfree(job_ptr->state_desc);
3975 			job_ptr->state_reason = _get_tres_state_reason(
3976 				tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
3977 			debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) limit of %"PRIu64" with %"PRIu64,
3978 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3979 			       assoc_ptr->user, assoc_ptr->partition,
3980 			       assoc_mgr_tres_name_array[tres_pos],
3981 			       assoc_ptr->max_tres_ctld[tres_pos],
3982 			       tres_req_cnt[tres_pos]);
3983 			rc = false;
3984 			break;
3985 		}
3986 
3987 		if (!_validate_tres_limits_for_assoc(
3988 			    &tres_pos, tres_req_cnt,
3989 			    tres_req_cnt[TRES_ARRAY_NODE],
3990 			    assoc_ptr->max_tres_pn_ctld,
3991 			    qos_rec.max_tres_pn_ctld,
3992 			    job_ptr->limit_set.tres,
3993 			    1, 0, 1)) {
3994 			xfree(job_ptr->state_desc);
3995 			job_ptr->state_reason = _get_tres_state_reason(
3996 				tres_pos, WAIT_ASSOC_MAX_UNK_PER_NODE);
3997 			debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) per node limit of %"PRIu64" with %"PRIu64,
3998 			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
3999 			       assoc_ptr->user, assoc_ptr->partition,
4000 			       assoc_mgr_tres_name_array[tres_pos],
4001 			       assoc_ptr->max_tres_pn_ctld[tres_pos],
4002 			       tres_req_cnt[tres_pos]);
4003 			rc = false;
4004 			break;
4005 		}
4006 
4007 		/* we do not need to check max_jobs here */
4008 
4009 		/* we don't need to check submit_jobs here */
4010 
4011 		/* we don't need to check max_wall_pj here */
4012 
4013 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4014 		parent = 1;
4015 	}
4016 end_it:
4017 	if (!assoc_mgr_locked)
4018 		assoc_mgr_unlock(&locks);
4019 	slurmdb_free_qos_rec_members(&qos_rec);
4020 
4021 	return rc;
4022 }
4023 
acct_policy_get_max_nodes(job_record_t * job_ptr,uint32_t * wait_reason)4024 extern uint32_t acct_policy_get_max_nodes(job_record_t *job_ptr,
4025 					  uint32_t *wait_reason)
4026 {
4027 	uint64_t max_nodes_limit = INFINITE64, qos_max_p_limit = INFINITE64,
4028 		grp_nodes = INFINITE64;
4029 	assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .qos = READ_LOCK };
4030 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4031 	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
4032 	bool parent = 0; /* flag to tell us if we are looking at the
4033 			  * parent or not
4034 			  */
4035 	bool grp_set = 0;
4036 
4037 	/* check to see if we are enforcing associations */
4038 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4039 		return max_nodes_limit;
4040 
4041 	xassert(wait_reason);
4042 
4043 	assoc_mgr_lock(&locks);
4044 
4045 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4046 
4047 	if (qos_ptr_1) {
4048 		uint64_t max_nodes_pj =
4049 			qos_ptr_1->max_tres_pj_ctld[TRES_ARRAY_NODE];
4050 		uint64_t max_nodes_pu =
4051 			qos_ptr_1->max_tres_pu_ctld[TRES_ARRAY_NODE];
4052 		uint64_t max_nodes_pa =
4053 			qos_ptr_1->max_tres_pa_ctld[TRES_ARRAY_NODE];
4054 
4055 		grp_nodes = qos_ptr_1->grp_tres_ctld[TRES_ARRAY_NODE];
4056 
4057 		if (qos_ptr_2) {
4058 			if (max_nodes_pa == INFINITE64)
4059 				max_nodes_pa = qos_ptr_2->max_tres_pa_ctld[
4060 					TRES_ARRAY_NODE];
4061 			if (max_nodes_pj == INFINITE64)
4062 				max_nodes_pj = qos_ptr_2->max_tres_pj_ctld[
4063 					TRES_ARRAY_NODE];
4064 			if (max_nodes_pu == INFINITE64)
4065 				max_nodes_pu = qos_ptr_2->max_tres_pu_ctld[
4066 					TRES_ARRAY_NODE];
4067 			if (grp_nodes == INFINITE64)
4068 				grp_nodes = qos_ptr_2->grp_tres_ctld[
4069 					TRES_ARRAY_NODE];
4070 		}
4071 
4072 		if (max_nodes_pa < max_nodes_limit) {
4073 			max_nodes_limit = max_nodes_pa;
4074 			*wait_reason = WAIT_QOS_MAX_NODE_PER_ACCT;
4075 		}
4076 
4077 		if (max_nodes_pj < max_nodes_limit) {
4078 			max_nodes_limit = max_nodes_pj;
4079 			*wait_reason = WAIT_QOS_MAX_NODE_PER_JOB;
4080 		}
4081 
4082 		if (max_nodes_pu < max_nodes_limit) {
4083 			max_nodes_limit = max_nodes_pu;
4084 			*wait_reason = WAIT_QOS_MAX_NODE_PER_USER;
4085 		}
4086 
4087 		qos_max_p_limit = max_nodes_limit;
4088 
4089 		if (grp_nodes < max_nodes_limit) {
4090 			max_nodes_limit = grp_nodes;
4091 			*wait_reason = WAIT_QOS_GRP_NODE;
4092 		}
4093 	}
4094 
4095 	/* We have to traverse all the associations because QOS might
4096 	   not override a particular limit.
4097 	*/
4098 	while (assoc_ptr) {
4099 		if ((!qos_ptr_1 || (grp_nodes == INFINITE64))
4100 		    && (assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)
4101 		    && (assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] <
4102 			max_nodes_limit)) {
4103 			max_nodes_limit =
4104 				assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE];
4105 			*wait_reason = WAIT_ASSOC_GRP_NODE;
4106 			grp_set = 1;
4107 		}
4108 
4109 		if (!parent
4110 		    && (qos_max_p_limit == INFINITE64)
4111 		    && (assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)
4112 		    && (assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE] <
4113 			max_nodes_limit)) {
4114 			max_nodes_limit =
4115 				assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE];
4116 			*wait_reason = WAIT_ASSOC_MAX_NODE_PER_JOB;
4117 		}
4118 
4119 		/* only check the first grp set */
4120 		if (grp_set)
4121 			break;
4122 
4123 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4124 		parent = 1;
4125 		continue;
4126 	}
4127 
4128 	assoc_mgr_unlock(&locks);
4129 	return max_nodes_limit;
4130 }
4131 
4132 /*
4133  * acct_policy_update_pending_job - Make sure the limits imposed on a job on
4134  *	submission are correct after an update to a qos or association.  If
4135  *	the association/qos limits prevent the job from running (lowered
4136  *	limits since job submission), then reset its reason field.
4137  */
acct_policy_update_pending_job(job_record_t * job_ptr)4138 extern int acct_policy_update_pending_job(job_record_t *job_ptr)
4139 {
4140 	job_desc_msg_t job_desc;
4141 	acct_policy_limit_set_t acct_policy_limit_set;
4142 	bool update_accounting = false;
4143 	struct job_details *details_ptr;
4144 	int rc = SLURM_SUCCESS;
4145 	uint64_t tres_req_cnt[slurmctld_tres_cnt];
4146 
4147 	/* check to see if we are enforcing associations and the job
4148 	 * is pending or if we are even enforcing limits. */
4149 	if (!accounting_enforce || !IS_JOB_PENDING(job_ptr)
4150 	    || !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4151 		return SLURM_SUCCESS;
4152 
4153 	details_ptr = job_ptr->details;
4154 
4155 	if (!details_ptr) {
4156 		error("acct_policy_update_pending_job: no details");
4157 		return SLURM_ERROR;
4158 	}
4159 
4160 	/* set up the job desc to make sure things are the way we
4161 	 * need.
4162 	 */
4163 	slurm_init_job_desc_msg(&job_desc);
4164 
4165 	/* copy the limits set from the job the only one that
4166 	 * acct_policy_validate changes is the time limit so we
4167 	 * should be ok with the memcpy here */
4168 	memcpy(&acct_policy_limit_set, &job_ptr->limit_set,
4169 	       sizeof(acct_policy_limit_set_t));
4170 	job_desc.tres_req_cnt = tres_req_cnt;
4171 	/* copy all the tres requests over */
4172 	memcpy(job_desc.tres_req_cnt, job_ptr->tres_req_cnt,
4173 	       sizeof(uint64_t) * slurmctld_tres_cnt);
4174 
4175 	/* Only set this value if not set from a limit */
4176 	if (job_ptr->limit_set.time == ADMIN_SET_LIMIT)
4177 		acct_policy_limit_set.time = job_ptr->limit_set.time;
4178 	else if ((job_ptr->time_limit != NO_VAL) && !job_ptr->limit_set.time)
4179 		job_desc.time_limit = job_ptr->time_limit;
4180 
4181 	if (!acct_policy_validate(&job_desc, job_ptr->part_ptr,
4182 				  job_ptr->assoc_ptr, job_ptr->qos_ptr,
4183 				  &job_ptr->state_reason,
4184 				  &acct_policy_limit_set, 0)) {
4185 		info("%s: exceeded association/qos's cpu, node, memory or time limit for %pJ",
4186 		     __func__, job_ptr);
4187 		return SLURM_ERROR;
4188 	}
4189 
4190 	/* The only variable in acct_policy_limit_set that is changed
4191 	 * in acct_policy_validate is the time limit so only worry
4192 	 * about that one.
4193 	 */
4194 
4195 	/* If it isn't an admin set limit replace it. */
4196 	if (!acct_policy_limit_set.time && (job_ptr->limit_set.time == 1)) {
4197 		job_ptr->time_limit = NO_VAL;
4198 		job_ptr->limit_set.time = 0;
4199 		update_accounting = true;
4200 	} else if (acct_policy_limit_set.time != ADMIN_SET_LIMIT) {
4201 		if (job_ptr->time_limit != job_desc.time_limit) {
4202 			job_ptr->time_limit = job_desc.time_limit;
4203 			update_accounting = true;
4204 		}
4205 		job_ptr->limit_set.time = acct_policy_limit_set.time;
4206 	}
4207 
4208 	if (update_accounting) {
4209 		last_job_update = time(NULL);
4210 		debug("limits changed for %pJ: updating accounting", job_ptr);
4211 		/* Update job record in accounting to reflect changes */
4212 		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
4213 	}
4214 
4215 	return rc;
4216 }
4217 
4218 /*
4219  * acct_policy_job_runnable - Determine if the specified job has timed
4220  *	out based on it's QOS or association.
4221  */
acct_policy_job_time_out(job_record_t * job_ptr)4222 extern bool acct_policy_job_time_out(job_record_t *job_ptr)
4223 {
4224 	uint64_t job_tres_usage_mins[slurmctld_tres_cnt];
4225 	uint64_t time_delta;
4226 	uint64_t tres_usage_mins[slurmctld_tres_cnt];
4227 	uint32_t wall_mins, orig_node_cnt;
4228 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4229 	slurmdb_qos_rec_t qos_rec;
4230 	slurmdb_assoc_rec_t *assoc = NULL;
4231 	assoc_mgr_lock_t locks =
4232 		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
4233 	time_t now;
4234 	int i, tres_pos = 0;
4235 	acct_policy_tres_usage_t tres_usage;
4236 
4237 	/*
4238 	 * Now see if we are enforcing limits.  If Safe is set then
4239 	 * return false as well since we are being safe if the limit
4240 	 * was changed after the job was already deemed safe to start.
4241 	 */
4242 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
4243 	    || (accounting_enforce & ACCOUNTING_ENFORCE_SAFE))
4244 		return false;
4245 
4246 	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);
4247 	assoc_mgr_lock(&locks);
4248 
4249 	assoc_mgr_set_qos_tres_cnt(&qos_rec);
4250 
4251 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4252 
4253 	assoc =	job_ptr->assoc_ptr;
4254 
4255 	now = time(NULL);
4256 
4257 	time_delta = (uint64_t)(((now - job_ptr->start_time) -
4258 				 job_ptr->tot_sus_time) / 60);
4259 
4260 	/* clang needs this memset to avoid a warning */
4261 	memset(job_tres_usage_mins, 0, sizeof(tres_usage_mins));
4262 	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
4263 
4264 	/*
4265 	 * find out how many CPU minutes this job has been running for.
4266 	 * We add 1 here to make it so we can check for just > instead of
4267 	 * >= in our checks.
4268 	 */
4269 	for (i = 0; i < slurmctld_tres_cnt; i++) {
4270 		if (i == TRES_ARRAY_ENERGY)
4271 			continue;
4272 		if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
4273 			continue;
4274 
4275 		if (job_ptr->tres_alloc_cnt[i]) {
4276 			job_tres_usage_mins[i] =
4277 				(time_delta * job_ptr->tres_alloc_cnt[i]) + 1;
4278 		}
4279 	}
4280 
4281 	/* check the first QOS setting it's values in the qos_rec */
4282 	if (qos_ptr_1 && !_qos_job_time_out(job_ptr, qos_ptr_1,
4283 					    &qos_rec, job_tres_usage_mins))
4284 		goto job_failed;
4285 
4286 	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
4287 	if (qos_ptr_2 && !_qos_job_time_out(job_ptr, qos_ptr_2,
4288 					    &qos_rec, job_tres_usage_mins))
4289 		goto job_failed;
4290 
4291 	/* handle any association stuff here */
4292 	while (assoc) {
4293 		for (i = 0; i < slurmctld_tres_cnt; i++)
4294 			tres_usage_mins[i] =
4295 				(uint64_t)(assoc->usage->usage_tres_raw[i]
4296 					   / 60.0);
4297 		wall_mins = assoc->usage->grp_used_wall / 60;
4298 
4299 		tres_usage = _validate_tres_usage_limits_for_assoc(
4300 			&tres_pos, assoc->grp_tres_mins_ctld,
4301 			qos_rec.grp_tres_mins_ctld, NULL,
4302 			NULL, tres_usage_mins, NULL, false);
4303 		switch (tres_usage) {
4304 		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
4305 			last_job_update = now;
4306 			info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
4307 			     job_ptr, assoc->id, assoc->acct,
4308 			     assoc->user, assoc->partition,
4309 			     assoc_mgr_tres_name_array[tres_pos],
4310 			     assoc->grp_tres_mins_ctld[tres_pos],
4311 			     tres_usage_mins[tres_pos]);
4312 			job_ptr->state_reason = FAIL_TIMEOUT;
4313 			goto job_failed;
4314 			break;
4315 		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
4316 			/* not possible safe_limits is 0 */
4317 		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
4318 			/* not possible safe_limits is 0 */
4319 		case TRES_USAGE_OKAY:
4320 			/* all good */
4321 			break;
4322 		}
4323 
4324 		if ((qos_rec.grp_wall == INFINITE)
4325 		    && (assoc->grp_wall != INFINITE)
4326 		    && (wall_mins >= assoc->grp_wall)) {
4327 			info("%pJ timed out, assoc %u is at or exceeds group wall limit %u with %u for account %s",
4328 			     job_ptr, assoc->id, assoc->grp_wall,
4329 			     wall_mins, assoc->acct);
4330 			job_ptr->state_reason = FAIL_TIMEOUT;
4331 			break;
4332 		}
4333 
4334 		orig_node_cnt = job_tres_usage_mins[TRES_ARRAY_NODE];
4335 		job_tres_usage_mins[TRES_ARRAY_NODE] = 0;
4336 		tres_usage = _validate_tres_usage_limits_for_assoc(
4337 			&tres_pos, assoc->max_tres_mins_ctld,
4338 			qos_rec.max_tres_mins_pj_ctld, job_tres_usage_mins,
4339 			NULL, NULL, NULL, true);
4340 		job_tres_usage_mins[TRES_ARRAY_NODE] = orig_node_cnt;
4341 		switch (tres_usage) {
4342 		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
4343 			/* not possible curr_usage is NULL */
4344 			break;
4345 		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
4346 			last_job_update = now;
4347 			info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
4348 			     job_ptr, assoc->id, assoc->acct,
4349 			     assoc->user, assoc->partition,
4350 			     assoc_mgr_tres_name_array[tres_pos],
4351 			     assoc->max_tres_mins_ctld[tres_pos],
4352 			     job_tres_usage_mins[tres_pos]);
4353 			job_ptr->state_reason = FAIL_TIMEOUT;
4354 			goto job_failed;
4355 			break;
4356 		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
4357 			/* not possible tres_usage is NULL */
4358 		case TRES_USAGE_OKAY:
4359 			/* all good */
4360 			break;
4361 		}
4362 
4363 		assoc = assoc->usage->parent_assoc_ptr;
4364 		/* these limits don't apply to the root assoc */
4365 		if (assoc == assoc_mgr_root_assoc)
4366 			break;
4367 	}
4368 job_failed:
4369 	assoc_mgr_unlock(&locks);
4370 	slurmdb_free_qos_rec_members(&qos_rec);
4371 
4372 	if (job_ptr->state_reason == FAIL_TIMEOUT)
4373 		return true;
4374 
4375 	return false;
4376 }
4377 
acct_policy_handle_accrue_time(job_record_t * job_ptr,bool assoc_mgr_locked)4378 extern int acct_policy_handle_accrue_time(job_record_t *job_ptr,
4379 					  bool assoc_mgr_locked)
4380 {
4381 	job_record_t *old_job_ptr;
4382 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4383 	slurmdb_assoc_rec_t *assoc_ptr;
4384 	struct job_details *details_ptr;
4385 	slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4386 	slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4387 
4388 	uint32_t max_jobs_accrue = INFINITE;
4389 	int create_cnt = 0, i, rc = SLURM_SUCCESS;
4390 	time_t now = time(NULL);
4391 	bool parent = false;
4392 	static time_t sched_update = 0;
4393 	static uint16_t priority_flags = 0;
4394 	assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK,
4395 				   NO_LOCK, NO_LOCK, NO_LOCK };
4396 
4397 	details_ptr = job_ptr->details;
4398 	if (!details_ptr) {
4399 		error("%s: no details", __func__);
4400 		return SLURM_ERROR;
4401 	}
4402 
4403 	if (sched_update != slurmctld_conf.last_update)
4404 		priority_flags = slurm_get_priority_flags();
4405 
4406 	/*
4407 	 * ACCRUE_ALWAYS flag will always force the accrue_time to be the
4408 	 * submit_time (Not begin).  Accrue limits don't work with this flag.
4409 	 */
4410 	if (priority_flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) {
4411 		if (!details_ptr->accrue_time)
4412 			details_ptr->accrue_time = details_ptr->submit_time;
4413 		return SLURM_SUCCESS;
4414 	}
4415 
4416 	/* Always set accrue_time to begin time when not enforcing limits. */
4417 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
4418 		if (!details_ptr->accrue_time)
4419 			details_ptr->accrue_time = details_ptr->begin_time;
4420 		return SLURM_SUCCESS;
4421 	}
4422 
4423 	/*
4424 	 * If the job is not eligible because it is either held, dependent or
4425 	 * because its begin time is in the future don't accrue time.
4426 	 */
4427 	if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT) ||
4428 	    (details_ptr->begin_time && (details_ptr->begin_time > now)))
4429 		return SLURM_SUCCESS;
4430 
4431 	/* No accrue_time and the job isn't pending, bail */
4432 	if (!details_ptr->accrue_time && !IS_JOB_PENDING(job_ptr))
4433 		return SLURM_SUCCESS;
4434 
4435 	assoc_ptr = job_ptr->assoc_ptr;
4436 	if (!assoc_ptr) {
4437 		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4438 		      __func__, job_ptr);
4439 		return SLURM_ERROR;
4440 	}
4441 
4442 	if (!assoc_mgr_locked)
4443 		assoc_mgr_lock(&locks);
4444 
4445 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4446 
4447 	if (qos_ptr_1) {
4448 		used_limits_a1 = acct_policy_get_acct_used_limits(
4449 			&qos_ptr_1->usage->acct_limit_list,
4450 			assoc_ptr->acct);
4451 		used_limits_u1 = acct_policy_get_user_used_limits(
4452 				&qos_ptr_1->usage->user_limit_list,
4453 				job_ptr->user_id);
4454 	}
4455 
4456 	if (qos_ptr_2) {
4457 		used_limits_a2 = acct_policy_get_acct_used_limits(
4458 			&qos_ptr_2->usage->acct_limit_list,
4459 			assoc_ptr->acct);
4460 		used_limits_u2 = acct_policy_get_user_used_limits(
4461 				&qos_ptr_2->usage->user_limit_list,
4462 				job_ptr->user_id);
4463 	}
4464 
4465 	/* We have started running, let's clear us out of the mix. */
4466 	if (details_ptr->accrue_time) {
4467 		if (!(job_ptr->bit_flags & JOB_ACCRUE_OVER) &&
4468 		    !IS_JOB_PENDING(job_ptr)) {
4469 			int job_cnt;
4470 			/*
4471 			 * Normally only single jobs come in here, but if we
4472 			 * don't have any limits and an array is cancelled the
4473 			 * array itself comes in so we need to remove all of it.
4474 			 */
4475 
4476 			if (job_ptr->array_recs &&
4477 			    job_ptr->array_recs->task_cnt)
4478 				job_cnt = job_ptr->array_recs->task_cnt;
4479 			else
4480 				job_cnt = 1;
4481 
4482 			/* We only want to handle this once */
4483 			job_ptr->bit_flags |= JOB_ACCRUE_OVER;
4484 
4485 			_remove_accrue_time_internal(job_ptr->assoc_ptr,
4486 						     qos_ptr_1,
4487 						     used_limits_a1,
4488 						     used_limits_u1,
4489 						     qos_ptr_2,
4490 						     used_limits_a2,
4491 						     used_limits_u2,
4492 						     job_cnt);
4493 		}
4494 
4495 		/* We already have our time and we aren't an array, endit */
4496 		if (!IS_JOB_PENDING(job_ptr) ||
4497 		    !job_ptr->array_recs || !job_ptr->array_recs->task_cnt)
4498 			goto endit;
4499 	} else if (!IS_JOB_PENDING(job_ptr))
4500 		goto endit;
4501 
4502 	if (qos_ptr_1)
4503 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4504 				       qos_ptr_1->grp_jobs_accrue,
4505 				       qos_ptr_1->usage->accrue_cnt);
4506 	if (used_limits_a1)
4507 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4508 				       qos_ptr_1->max_jobs_accrue_pa,
4509 				       used_limits_a1->accrue_cnt);
4510 
4511 	if (used_limits_u1)
4512 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4513 				       qos_ptr_1->max_jobs_accrue_pu,
4514 				       used_limits_u1->accrue_cnt);
4515 	if (qos_ptr_2)
4516 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4517 				       qos_ptr_2->grp_jobs_accrue,
4518 				       qos_ptr_2->usage->accrue_cnt);
4519 	if (used_limits_a2)
4520 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4521 				       qos_ptr_2->max_jobs_accrue_pa,
4522 				       used_limits_a2->accrue_cnt);
4523 
4524 	if (used_limits_u2)
4525 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4526 				       qos_ptr_2->max_jobs_accrue_pu,
4527 				       used_limits_u2->accrue_cnt);
4528 
4529 	assoc_ptr = job_ptr->assoc_ptr;
4530 	while (assoc_ptr) {
4531 		if (max_jobs_accrue != INFINITE)
4532 			break;
4533 
4534 		_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4535 				       assoc_ptr->grp_jobs_accrue,
4536 				       assoc_ptr->usage->accrue_cnt);
4537 		/* We don't need to look at the regular limits for
4538 		 * parents since we have pre-propogated them, so just
4539 		 * continue with the next parent
4540 		 */
4541 		if (!parent)
4542 			_get_accrue_create_cnt(&max_jobs_accrue, &create_cnt,
4543 					       assoc_ptr->max_jobs_accrue,
4544 					       assoc_ptr->usage->accrue_cnt);
4545 
4546 		/* now go up the hierarchy */
4547 		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
4548 		parent = true;
4549 	}
4550 
4551 	/* No limit (or there is space to accrue) */
4552 	if ((max_jobs_accrue == INFINITE) ||
4553 	    (create_cnt && (!job_ptr->array_recs ||
4554 			    !job_ptr->array_recs->task_cnt))) {
4555 		if (!details_ptr->accrue_time &&
4556 		    job_ptr->details->begin_time) {
4557 			/*
4558 			 * If no limit and begin_time hasn't happened yet
4559 			 * then set accrue_time to now.
4560 			 */
4561 			details_ptr->accrue_time =
4562 				((max_jobs_accrue == INFINITE) &&
4563 				 details_ptr->begin_time) ?
4564 				details_ptr->begin_time : time(NULL);
4565 
4566 			/*
4567 			 * If we have an array here and no limit we want to add
4568 			 * all the tasks in the array.
4569 			 */
4570 			if (job_ptr->array_recs &&
4571 			    job_ptr->array_recs->task_cnt)
4572 				create_cnt = job_ptr->array_recs->task_cnt;
4573 			else
4574 				create_cnt = 1;
4575 
4576 			_add_accrue_time_internal(job_ptr->assoc_ptr,
4577 						  qos_ptr_1,
4578 						  used_limits_a1,
4579 						  used_limits_u1,
4580 						  qos_ptr_2,
4581 						  used_limits_a2,
4582 						  used_limits_u2,
4583 						  create_cnt);
4584 		}
4585 
4586 		goto endit;
4587 	}
4588 
4589 	/* Looks like we are at the limit */
4590 	if (!create_cnt) {
4591 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE)
4592 			info("%s: %pJ can't accrue, we are over a limit",
4593 			     __func__, job_ptr);
4594 		goto endit;
4595 	}
4596 
4597 	create_cnt = MIN(create_cnt, job_ptr->array_recs->task_cnt);
4598 
4599 	/* How many can we spin off? */
4600 	for (i = 0; i < create_cnt; i++) {
4601 		/*
4602 		 * After we split off the old_job_ptr is what we want to alter
4603 		 * as the job_ptr returned from job_array_post_sched will be the
4604 		 * master job_ptr for the array and we will use that to split
4605 		 * more off if needed.
4606 		 */
4607 		old_job_ptr = job_ptr;
4608 
4609 		job_array_pre_sched(job_ptr);
4610 		job_ptr = job_array_post_sched(job_ptr);
4611 
4612 		details_ptr = old_job_ptr->details;
4613 		if (!details_ptr) {
4614 			fatal_abort("%s: no details after split", __func__);
4615 			rc = SLURM_ERROR;
4616 			_add_accrue_time_internal(job_ptr->assoc_ptr,
4617 						  qos_ptr_1,
4618 						  used_limits_a1,
4619 						  used_limits_u1,
4620 						  qos_ptr_2,
4621 						  used_limits_a2,
4622 						  used_limits_u2,
4623 						  i - 1);
4624 			goto endit;
4625 		}
4626 		details_ptr->accrue_time = now;
4627 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ACCRUE)
4628 			info("%pJ is now accruing time %ld", old_job_ptr, now);
4629 	}
4630 
4631 	/*
4632 	 * Here we are ok to use all the same pointers from the main job_ptr as
4633 	 * an array will always have the same pointers.  If this ever changes in
4634 	 * the future some how we will need to address it.
4635 	 */
4636 	_add_accrue_time_internal(job_ptr->assoc_ptr,
4637 				  qos_ptr_1,
4638 				  used_limits_a1,
4639 				  used_limits_u1,
4640 				  qos_ptr_2,
4641 				  used_limits_a2,
4642 				  used_limits_u2,
4643 				  create_cnt);
4644 
4645 endit:
4646 
4647 	if (!assoc_mgr_locked)
4648 		assoc_mgr_unlock(&locks);
4649 
4650 	return rc;
4651 }
4652 
acct_policy_add_accrue_time(job_record_t * job_ptr,bool assoc_mgr_locked)4653 extern void acct_policy_add_accrue_time(job_record_t *job_ptr,
4654 					bool assoc_mgr_locked)
4655 {
4656 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4657 	slurmdb_assoc_rec_t *assoc_ptr;
4658 	slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4659 	slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4660 	assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK,
4661 				   NO_LOCK, NO_LOCK, NO_LOCK };
4662 	int job_cnt;
4663 
4664 	/* check to see if we are enforcing limits */
4665 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4666 		return;
4667 
4668 	/* If Job is held or dependent don't accrue time */
4669 	if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT))
4670 		return;
4671 
4672 	/* Job has to be pending to accrue time. */
4673 	if (!IS_JOB_PENDING(job_ptr))
4674 		return;
4675 
4676 	assoc_ptr = job_ptr->assoc_ptr;
4677 	if (!assoc_ptr) {
4678 		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4679 		      __func__, job_ptr);
4680 		return;
4681 	}
4682 
4683 	if (!assoc_mgr_locked)
4684 		assoc_mgr_lock(&locks);
4685 
4686 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4687 
4688 	if (qos_ptr_1) {
4689 		used_limits_a1 = acct_policy_get_acct_used_limits(
4690 			&qos_ptr_1->usage->acct_limit_list,
4691 			assoc_ptr->acct);
4692 		used_limits_u1 = acct_policy_get_user_used_limits(
4693 				&qos_ptr_1->usage->user_limit_list,
4694 				job_ptr->user_id);
4695 	}
4696 
4697 	if (qos_ptr_2) {
4698 		used_limits_a2 = acct_policy_get_acct_used_limits(
4699 			&qos_ptr_2->usage->acct_limit_list,
4700 			assoc_ptr->acct);
4701 		used_limits_u2 = acct_policy_get_user_used_limits(
4702 				&qos_ptr_2->usage->user_limit_list,
4703 				job_ptr->user_id);
4704 	}
4705 
4706 	/*
4707 	 * Normally only single jobs come in here, but if we don't have any
4708 	 * limits the array itself comes in so we need to add it all.
4709 	 */
4710 	if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
4711 		job_cnt = job_ptr->array_recs->task_cnt;
4712 	else
4713 		job_cnt = 1;
4714 
4715 	_add_accrue_time_internal(assoc_ptr,
4716 				  qos_ptr_1,
4717 				  used_limits_a1,
4718 				  used_limits_u1,
4719 				  qos_ptr_2,
4720 				  used_limits_a2,
4721 				  used_limits_u2,
4722 				  job_cnt);
4723 	if (!assoc_mgr_locked)
4724 		assoc_mgr_unlock(&locks);
4725 }
4726 
acct_policy_remove_accrue_time(job_record_t * job_ptr,bool assoc_mgr_locked)4727 extern void acct_policy_remove_accrue_time(job_record_t *job_ptr,
4728 					   bool assoc_mgr_locked)
4729 {
4730 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4731 	slurmdb_assoc_rec_t *assoc_ptr;
4732 	slurmdb_used_limits_t *used_limits_a1 = NULL, *used_limits_u1 = NULL;
4733 	slurmdb_used_limits_t *used_limits_a2 = NULL, *used_limits_u2 = NULL;
4734 	assoc_mgr_lock_t locks = { .assoc = WRITE_LOCK, .qos = WRITE_LOCK };
4735 	int job_cnt;
4736 
4737 	/* check to see if we are enforcing limits */
4738 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4739 		return;
4740 
4741 	if (!job_ptr->details || !job_ptr->details->accrue_time)
4742 		return;
4743 
4744 	/* Job has to be pending to accrue time. */
4745 	if (!IS_JOB_PENDING(job_ptr))
4746 		return;
4747 
4748 	if (!assoc_mgr_locked)
4749 		assoc_mgr_lock(&locks);
4750 
4751 	assoc_ptr = job_ptr->assoc_ptr;
4752 	if (!assoc_ptr) {
4753 		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4754 		      __func__, job_ptr);
4755 		goto end_it;
4756 	}
4757 
4758 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4759 
4760 	if (qos_ptr_1) {
4761 		used_limits_a1 = acct_policy_get_acct_used_limits(
4762 			&qos_ptr_1->usage->acct_limit_list,
4763 			assoc_ptr->acct);
4764 		used_limits_u1 = acct_policy_get_user_used_limits(
4765 				&qos_ptr_1->usage->user_limit_list,
4766 				job_ptr->user_id);
4767 	}
4768 
4769 	if (qos_ptr_2) {
4770 		used_limits_a2 = acct_policy_get_acct_used_limits(
4771 			&qos_ptr_2->usage->acct_limit_list,
4772 			assoc_ptr->acct);
4773 		used_limits_u2 = acct_policy_get_user_used_limits(
4774 				&qos_ptr_2->usage->user_limit_list,
4775 				job_ptr->user_id);
4776 	}
4777 
4778 	/*
4779 	 * Normally only single jobs come in here, but if we don't have any
4780 	 * limits the array itself comes in so we need to add it all.
4781 	 */
4782 	if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
4783 		job_cnt = job_ptr->array_recs->task_cnt;
4784 	else
4785 		job_cnt = 1;
4786 
4787 	_remove_accrue_time_internal(assoc_ptr,
4788 				     qos_ptr_1,
4789 				     used_limits_a1,
4790 				     used_limits_u1,
4791 				     qos_ptr_2,
4792 				     used_limits_a2,
4793 				     used_limits_u2,
4794 				     job_cnt);
4795 
4796 	/* reset the job */
4797 	job_ptr->details->accrue_time = 0;
4798 	job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
4799 
4800 end_it:
4801 	if (!assoc_mgr_locked)
4802 		assoc_mgr_unlock(&locks);
4803 }
4804 
acct_policy_get_prio_thresh(job_record_t * job_ptr,bool assoc_mgr_locked)4805 extern uint32_t acct_policy_get_prio_thresh(job_record_t *job_ptr,
4806 					    bool assoc_mgr_locked)
4807 {
4808 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4809 	slurmdb_assoc_rec_t *assoc_ptr;
4810 	uint32_t prio_thresh = 0;
4811 	assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
4812 				   NO_LOCK, NO_LOCK, NO_LOCK };
4813 
4814 	/* check to see if we are enforcing limits */
4815 	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
4816 		return 0;
4817 
4818 	assoc_ptr = job_ptr->assoc_ptr;
4819 	if (!assoc_ptr) {
4820 		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
4821 		      __func__, job_ptr);
4822 		return 0;
4823 	}
4824 
4825 	if (!assoc_mgr_locked)
4826 		assoc_mgr_lock(&locks);
4827 
4828 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4829 
4830 	if (qos_ptr_1)
4831 		_get_prio_thresh(&prio_thresh, qos_ptr_1->min_prio_thresh);
4832 
4833 	if (qos_ptr_2)
4834 		_get_prio_thresh(&prio_thresh, qos_ptr_2->min_prio_thresh);
4835 
4836 	_get_prio_thresh(&prio_thresh, assoc_ptr->min_prio_thresh);
4837 
4838 	if (!assoc_mgr_locked)
4839 		assoc_mgr_unlock(&locks);
4840 
4841 	return prio_thresh;
4842 }
4843 
acct_policy_get_preemptable_time(job_record_t * job_ptr)4844 extern time_t acct_policy_get_preemptable_time(job_record_t *job_ptr)
4845 {
4846 	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
4847 	uint32_t min1, min2, conf_min;
4848 	time_t start = job_ptr->start_time;
4849 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
4850 	xassert(verify_lock(JOB_LOCK, READ_LOCK));
4851 	xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));
4852 
4853 	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
4854 	min1 = (qos_ptr_1) ? qos_ptr_1->preempt_exempt_time : INFINITE;
4855 	min2 = (qos_ptr_2) ? qos_ptr_2->preempt_exempt_time : INFINITE;
4856 	conf_min = slurmctld_conf.preempt_exempt_time;
4857 
4858 	/* priority: min1 > min2 > conf_min. INFINITE means none. */
4859 	if (min1 != INFINITE)
4860 		return start + min1;
4861 	else if (min2 != INFINITE)
4862 		return start + min2;
4863 	else if (conf_min != INFINITE)
4864 		return start + conf_min;
4865 	else
4866 		return start;
4867 }
4868 
acct_policy_is_job_preempt_exempt(job_record_t * job_ptr)4869 extern bool acct_policy_is_job_preempt_exempt(job_record_t *job_ptr)
4870 {
4871 	time_t now = time(0);
4872 
4873 	assoc_mgr_lock_t locks = { .qos = READ_LOCK };
4874 	assoc_mgr_lock(&locks);
4875 	time_t preempt_time = acct_policy_get_preemptable_time(job_ptr);
4876 	assoc_mgr_unlock(&locks);
4877 
4878 	return now < preempt_time;
4879 }
4880 
acct_policy_set_qos_order(job_record_t * job_ptr,slurmdb_qos_rec_t ** qos_ptr_1,slurmdb_qos_rec_t ** qos_ptr_2)4881 extern void acct_policy_set_qos_order(job_record_t *job_ptr,
4882 				      slurmdb_qos_rec_t **qos_ptr_1,
4883 				      slurmdb_qos_rec_t **qos_ptr_2)
4884 {
4885 	xassert(job_ptr);
4886 	xassert(qos_ptr_1);
4887 	xassert(qos_ptr_2);
4888 
4889 	/* Initialize incoming pointers */
4890 	*qos_ptr_1 = NULL;
4891 	*qos_ptr_2 = NULL;
4892 
4893 	if (job_ptr->qos_ptr) {
4894 		if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr) {
4895 			/*
4896 			 * If the job's QOS has the flag to over ride the
4897 			 * partition then use that otherwise use the
4898 			 * partition's QOS as the king.
4899 			 */
4900 			if (job_ptr->qos_ptr->flags & QOS_FLAG_OVER_PART_QOS) {
4901 				*qos_ptr_1 = job_ptr->qos_ptr;
4902 				*qos_ptr_2 = job_ptr->part_ptr->qos_ptr;
4903 			} else {
4904 				*qos_ptr_1 = job_ptr->part_ptr->qos_ptr;
4905 				*qos_ptr_2 = job_ptr->qos_ptr;
4906 			}
4907 
4908 			/*
4909 			 * No reason to look at the same QOS twice, actually
4910 			 * we never want to do that ;).
4911 			 */
4912 			if (*qos_ptr_1 == *qos_ptr_2)
4913 				*qos_ptr_2 = NULL;
4914 		} else
4915 			*qos_ptr_1 = job_ptr->qos_ptr;
4916 	} else if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr)
4917 		*qos_ptr_1 = job_ptr->part_ptr->qos_ptr;
4918 
4919 	return;
4920 }
4921 
4922 /*
4923  * Checks for record in *user_limit_list of user_id if
4924  * *user_limit_list doesn't exist it will create it, if the user_id
4925  * record doesn't exist it will add it to the list.
4926  * In all cases the user record is returned.
4927  */
acct_policy_get_acct_used_limits(List * acct_limit_list,char * acct)4928 extern slurmdb_used_limits_t *acct_policy_get_acct_used_limits(
4929 	List *acct_limit_list, char *acct)
4930 {
4931 	slurmdb_used_limits_t *used_limits;
4932 
4933 	xassert(acct_limit_list);
4934 
4935 	if (!*acct_limit_list)
4936 		*acct_limit_list = list_create(slurmdb_destroy_used_limits);
4937 
4938 	if (!(used_limits = list_find_first(*acct_limit_list,
4939 					    _find_used_limits_for_acct,
4940 					    acct))) {
4941 		int i = sizeof(uint64_t) * slurmctld_tres_cnt;
4942 
4943 		used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
4944 		used_limits->acct = xstrdup(acct);
4945 
4946 		used_limits->tres = xmalloc(i);
4947 		used_limits->tres_run_mins = xmalloc(i);
4948 
4949 		list_append(*acct_limit_list, used_limits);
4950 	}
4951 
4952 	return used_limits;
4953 }
4954 
4955 /*
4956  * Checks for record in *user_limit_list of user_id if
4957  * *user_limit_list doesn't exist it will create it, if the user_id
4958  * record doesn't exist it will add it to the list.
4959  * In all cases the user record is returned.
4960  */
acct_policy_get_user_used_limits(List * user_limit_list,uint32_t user_id)4961 extern slurmdb_used_limits_t *acct_policy_get_user_used_limits(
4962 	List *user_limit_list, uint32_t user_id)
4963 {
4964 	slurmdb_used_limits_t *used_limits;
4965 
4966 	xassert(user_limit_list);
4967 
4968 	if (!*user_limit_list)
4969 		*user_limit_list = list_create(slurmdb_destroy_used_limits);
4970 
4971 	if (!(used_limits = list_find_first(*user_limit_list,
4972 					    _find_used_limits_for_user,
4973 					    &user_id))) {
4974 		int i = sizeof(uint64_t) * slurmctld_tres_cnt;
4975 
4976 		used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
4977 		used_limits->uid = user_id;
4978 
4979 		used_limits->tres = xmalloc(i);
4980 		used_limits->tres_run_mins = xmalloc(i);
4981 
4982 		list_append(*user_limit_list, used_limits);
4983 	}
4984 
4985 	return used_limits;
4986 }
4987