1#!/usr/bin/env expect
2############################################################################
3# Purpose: Establish global state information for Slurm test suite
4#
5# To define site-specific state information, set the values in a file
6# named 'globals.local'. Those values will override any specified here.
7# for example:
8#
9# $ cat globals.local
10# set slurm_dir  "/usr/local"
11# set build_dir  "/home/mine/SLURM/build_smd"
12# set src_dir    "/home/mine/SLURM/slurm.git"
13# set mpicc      "/usr/local/bin/mpicc"
14#
15# If you want to have more than one test going at the same time for multiple
16# installs you can have multiple globals.local files and set the
17# SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct
18# globals.local file for your various installs.  The file can be named anything,
19# not just globals.local.
20#
21############################################################################
22# Copyright (C) 2002-2007 The Regents of the University of California.
23# Copyright (C) 2008-2010 Lawrence Livermore National Security.
24# Portions Copyright (C) 2010-2018 SchedMD LLC.
25# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
26# Written by Morris Jette <jette1@llnl.gov>
27# Additions by Joseph Donaghy <donaghy1@llnl.gov>
28# CODE-OCEC-09-009. All rights reserved.
29#
30# This file is part of Slurm, a resource management program.
31# For details, see <https://slurm.schedmd.com/>.
32# Please also read the supplied file: DISCLAIMER.
33#
34# Slurm is free software; you can redistribute it and/or modify it under
35# the terms of the GNU General Public License as published by the Free
36# Software Foundation; either version 2 of the License, or (at your option)
37# any later version.
38#
39# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
40# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
41# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
42# details.
43#
44# You should have received a copy of the GNU General Public License along
45# with Slurm; if not, write to the Free Software Foundation, Inc.,
46# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
47############################################################################
48
49# Avoid sourcing this file multiple times
50if {[info procs exit] eq "exit"} {
51	return
52}
53
54global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo
55global smd squeue sreport srun sstat strigger
56
57################################################################
58#
59# NAME
60#	cset - conditional set
61#
62# SYNOPSIS
63#	cset name value
64#
65# DESCRIPTION
66#	Conditional set.  Only set variable if variable does not yet exist.
67#
68# Input: name  -- name of the variable to set
69#	 value -- value to set to 'name'
70#
71################################################################
72
73proc cset {name value} {
74	if {![uplevel 1 info exists $name]} {
75		upvar $name tmp
76		set tmp $value
77	}
78}
79
80
81#
82# Defining colors here to be able to use them in globals.local.
83# By default, these colors are bold
84#
85set COLOR_RED         "\033\[1;31m"
86set COLOR_RED_NORMAL  "\033\[31m"
87set COLOR_ORANGE      "\033\[1;38;5;208m"
88set COLOR_YELLOW      "\033\[1;33m"
89set COLOR_GREEN       "\033\[1;32m"
90set COLOR_BLUE        "\033\[1;34m"
91set COLOR_MAGENTA     "\033\[1;35m"
92set COLOR_CYAN        "\033\[1;36m"
93set COLOR_NONE        "\033\[0m"
94
95cset local_globals_file "./globals.local"
96
97# Log level "enum"
98# Define log levels here so they are available in globals.local
99set LOG_LEVEL_QUIET   0
100set LOG_LEVEL_FATAL   1
101set LOG_LEVEL_ERROR   2
102set LOG_LEVEL_WARNING 3
103set LOG_LEVEL_INFO    4
104set LOG_LEVEL_PASS    4
105set LOG_LEVEL_COMMAND 4
106set LOG_LEVEL_DEBUG   5
107set LOG_LEVEL_TRACE   6
108
109if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} {
110	set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE)
111}
112
113if [file exists $local_globals_file] {
114	source $local_globals_file
115}
116
117#
118# Specify the slurm install directory.
119# Used to locate binaries, libraries, and header files.
120#
121cset slurm_dir   "/usr"
122cset build_dir   "../../"
123cset src_dir     "../../"
124cset config_h    "${build_dir}/config.h"
125cset sacctmgr    "${slurm_dir}/bin/sacctmgr"
126cset sacct       "${slurm_dir}/bin/sacct"
127cset salloc      "${slurm_dir}/bin/salloc"
128cset sattach     "${slurm_dir}/bin/sattach"
129cset sbatch      "${slurm_dir}/bin/sbatch"
130cset sbcast      "${slurm_dir}/bin/sbcast"
131cset scancel     "${slurm_dir}/bin/scancel"
132cset scontrol    "${slurm_dir}/bin/scontrol"
133cset sdiag       "${slurm_dir}/bin/sdiag"
134cset sgather     "${slurm_dir}/bin/sgather"
135cset sh5util     "${slurm_dir}/bin/sh5util"
136cset sinfo       "${slurm_dir}/bin/sinfo"
137cset smd         "${slurm_dir}/bin/smd"
138cset sprio       "${slurm_dir}/bin/sprio"
139cset squeue      "${slurm_dir}/bin/squeue"
140cset srun        "${slurm_dir}/bin/srun"
141cset sreport     "${slurm_dir}/bin/sreport"
142cset sshare      "${slurm_dir}/bin/sshare"
143cset sstat       "${slurm_dir}/bin/sstat"
144cset strigger    "${slurm_dir}/bin/strigger"
145
146cset slurmd      "${slurm_dir}/sbin/slurmd"
147cset slurmrestd  "${slurm_dir}/sbin/slurmrestd"
148
149cset pbsnodes    "${slurm_dir}/bin/pbsnodes"
150cset qdel        "${slurm_dir}/bin/qdel"
151cset qstat       "${slurm_dir}/bin/qstat"
152cset qsub        "${slurm_dir}/bin/qsub"
153cset qalter      "${slurm_dir}/bin/qalter"
154cset qrerun      "${slurm_dir}/bin/qrerun"
155
156cset seff        "${slurm_dir}/bin/seff"
157
158cset lsid      	 "${slurm_dir}/bin/lsid"
159cset bjobs     	 "${slurm_dir}/bin/bjobs"
160cset bkill     	 "${slurm_dir}/bin/bkill"
161cset bsub     	 "${slurm_dir}/bin/bsub"
162
163# If using MPICH-2 or other version of MPI requiring pmi libary, use this
164#cset mpicc	"/home/jette/mpich2-install/bin/mpicc"
165#cset use_pmi	1
166# OR for other versions of MPICH, use this
167cset mpicc	"/usr/local/bin/mpicc"
168cset nvcc	"/usr/bin/nvcc"
169cset use_pmi	0
170#cset upcc       "/usr/local/bin/upcc"
171cset upcc       "/usr/bin/xlupc"
172cset oshcc      "/usr/local/bin/oshcc"
173
174cset mpirun	"mpirun"
175cset totalviewcli	"/usr/local/bin/totalviewcli"
176
177# Set if using "--enable-memory-leak-debug" configuration option
178cset enable_memory_leak_debug 0
179
180# test_prompt: to be used as prompt for interactive shells
181set test_prompt  "TEST_PROMPT: "
182# reset_bash_prompt: to be used as command on scripts or interactive jobs
183set reset_bash_prompt "unset PROMPT_COMMAND; unset PS0; export PS1=\"$test_prompt\""
184
185#
186# Specify locations of other executable files used
187# Only the shell names (e.g. bin_bash) must be full pathnames
188#
189cset bin_awk	"awk"
190cset bin_bash   [exec which bash | tail -n 1]
191cset bin_cat	"cat"
192cset bin_cc	"gcc"
193cset bin_chmod	"chmod"
194cset bin_cmp	"cmp"
195cset bin_cp	"cp"
196cset bin_date	"date"
197cset bin_diff	"diff"
198cset bin_echo	"echo"
199cset bin_env	"env"
200cset bin_file	"file"
201cset bin_id	"id"
202cset bin_grep   "grep"
203cset bin_head   "head"
204cset bin_ln     "ln"
205cset bin_perldoc "/usr/bin/perldoc"
206
207# Don't user $bin_hostname unless on a front-end system that
208# doesn't fully use the slurmd, use $bin_printenv SLURMD_NODENAME
209cset bin_hostname "hostname"
210
211cset bin_kill	"kill"
212cset bin_make	"make"
213cset bin_mv	"mv"
214cset bin_od	"od"
215cset bin_pkill	"pkill"
216cset bin_printenv "printenv"
217cset bin_ps	"ps"
218cset bin_pwd	"pwd"
219cset bin_rm	"rm"
220cset bin_sed	"sed"
221cset bin_sleep  "sleep"
222cset bin_sort	"sort"
223cset bin_sum	"sum"
224cset bin_sudo	"sudo"
225cset bin_touch	"touch"
226cset bin_true	"true"
227cset bin_uname	"uname"
228cset bin_uniq	"uniq"
229cset bin_wc	"wc"
230
231#
232# Let the commands complete without expect timing out waiting for a
233# response. Single node jobs submitted to the default partition should
234# be initiated within this number of seconds.
235# for interactive slurm jobs: cset timeout $max_job_delay
236#
237cset max_job_delay 120
238
239#
240# Specify the maximum number of tasks to use in the stress tests.
241#
242cset max_stress_tasks 4
243
244#
245# The error message that the "sleep" command prints when we run "sleep aaa".
246#
247cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)"
248
249# Force LANG, as the expect tests aren't localized
250set ::env(LANG)          "en_US.UTF-8"
251
252# Testsuite level variables
253cset testsuite_shared_dir "[$bin_pwd]"
254
255# Testsuite non-privileged user (set it in globals.local)
256cset testsuite_user ""
257
258# Testsuite log variables
259cset testsuite_log_level $LOG_LEVEL_DEBUG
260cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] %{loglevel}-7s %{message}s \(%{backtrace}s)"
261cset testsuite_time_format "%Y-%m-%d %H:%M:%S"
262
263# Default to using color if writing to a terminal and not if writing to a file
264cset testsuite_colorize        [dict exists [fconfigure stdout] -mode]
265cset testsuite_color_fatal     $COLOR_RED
266cset testsuite_color_error     $COLOR_RED_NORMAL
267cset testsuite_color_warn      $COLOR_ORANGE
268cset testsuite_color_info      $COLOR_YELLOW
269cset testsuite_color_pass      $COLOR_GREEN
270cset testsuite_color_command   $COLOR_CYAN
271cset testsuite_color_debug     $COLOR_BLUE
272cset testsuite_color_trace     $COLOR_MAGENTA
273cset testsuite_color_header    $COLOR_NONE
274cset testsuite_color_success   $COLOR_GREEN
275cset testsuite_color_failure   $COLOR_RED
276cset testsuite_color_skipped   $COLOR_ORANGE
277
278# Set to true to cause the first subtest failure to immediately end the test
279cset testsuite_subtest_fatal   false
280# Set to all, fail_skip, fail or none print datails of subtests and testprocs
281cset testsuite_subtest_details  fail
282cset testsuite_testproc_details fail
283
284# To automatically call cleanup or not when ending the test
285cset testsuite_cleanup_on_failure true
286if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} {
287	set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)
288}
289
290# To avoid potential infinite loops due calls to fail/pass/skip inside
291# custom cleanup procs (_test_fini should be called only once)
292set _test_fini_called false
293
294# Testproc internal variables
295set _testproc_included    [list]
296set _testproc_excluded    [list]
297set _testproc_pass_list   [list]
298set _testproc_skip_list   [list]
299set _testproc_fail_list   [list]
300set _testproc_messages    [dict create]
301set _testproc_skip_next   false
302set _testproc_skip_reason ""
303set _incomplete_reason    ""
304set _subtest_pass_count   0
305set _subtest_skip_count   0
306set _subtest_fail_count   0
307set _subtest_messages     [dict create]
308
309# Other common variables
310set re_word_str          "\\S+"
311set digit                "\\d"
312set eol                  "\r?\n"
313set float                "\\d+\\.?\\d*"
314set number               "\\d+"
315set format_time          "\\d+\\:\\d+\\:\\d+"
316set number_with_suffix   "\\d+\[KM\]*"
317set slash                "/"
318set whitespace		 "\\s+"
319set controlmachine_regex "\\S+"
320# Any characters except ( , : newline
321set no_delim             "\[^(,:\r\n\]"
322set no_delim_slash       "\[^(,:/\r\n\]"
323# The first group matches GRES name
324# The second **optional** group matches GRES type.
325# The third group matches GRES count.
326# Test out the regex here: https://regex101.com/r/FlNYKM/7
327set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)"
328
329#
330# Global variable used in multiple functions in "globals" file
331#
332set gpu_sock_list {}
333
334#
335# Procedure return values
336#
337set RETURN_SUCCESS 0
338set RETURN_ERROR   1
339set RETURN_TIMEOUT 110 ; # ETIMEDOUT
340
341
342################################################################
343#
344# NAME
345#	fail - fails a test
346#
347# SYNOPSIS
348#	fail message
349#
350# DESCRIPTION
351#	To be used when an error is fatal for the test. This routine
352#	prints the specified error message, optionally cleans up, prints
353#	a final test failure message, and exits the test with exit code 1.
354#
355# ENVIRONMENT
356#	Whether or not the cleanup procedure is called depends on the setting
357#	of the $testsuite_cleanup_on_failure set in the globals.local file or
358#	overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment
359#	variable.
360#
361# NOTE
362#	DO NOT call this within your local cleanup procedure.
363#
364################################################################
365
366proc fail { message } {
367	global _incomplete_reason
368
369	# Avoid recursive calls from within cleanup
370	if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
371		log_error "Local cleanup shouldn't call pass, fail or skip"
372		return
373	}
374
375	log_fatal $message
376	set _incomplete_reason $message
377
378	# _test_fini will handle cleanup and print the failure message.
379	_test_fini 1
380}
381
382
383################################################################
384#
385# NAME
386#	skip - skips a test
387#
388# SYNOPSIS
389#	skip message
390#
391# DESCRIPTION
392#	To be used when a precondition for the test fails and the test
393#	should be skipped. This routine prints the specified warning message,
394#	calls the cleanup procedure if defined, prints a final test skipped
395#	message, and exits the test with exit code -1 (aka 255).
396#
397# NOTE
398#	DO NOT call this within your local cleanup procedure.
399#
400################################################################
401
402proc skip { message } {
403	global _incomplete_reason
404
405	# Avoid recursive calls from within cleanup
406	if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
407		log_error "Local cleanup shouldn't call pass, fail or skip"
408		return
409	}
410
411	log_warn $message
412	set _incomplete_reason $message
413
414	# _test_fini will handle cleanup and print the skipped message.
415	_test_fini -1
416}
417
418
419################################################################
420#
421# NAME
422#	pass - passes a test
423#
424# SYNOPSIS
425#	pass
426#
427# DESCRIPTION
428#	To be used when a test passes and should complete with success.
429#	This routine calls the cleanup procedure if defined, prints a final
430#	test success message, and exits with exit code 0.
431#
432# NOTE
433#	DO NOT call this within your local cleanup procedure.
434#
435################################################################
436
437proc pass { } {
438
439	# Avoid recursive calls from within cleanup
440	if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} {
441		log_error "Local cleanup shouldn't call pass, fail or skip"
442		return
443	}
444
445	# _test_fini will handle cleanup and print the success message.
446	_test_fini 0
447}
448
449
450################################################################
451#
452# NAME
453#	subpass - registers a passing subtest result
454#
455# SYNOPSIS
456#	subpass ?description?
457#
458# DESCRIPTION
459#	Increments the subtest pass count and logs a passing subtest message
460#
461# ARGUMENTS
462#	description
463#		A single-line string describing the subtest being verified
464#
465################################################################
466
467proc subpass args {
468	global _subtest_fail_count _subtest_pass_count _subtest_skip_count
469	global _subtest_messages
470
471	set description ""
472	set argument_count [llength $args]
473	if {$argument_count == 1} { set args [lassign $args description] }
474	if {$argument_count > 1} {
475		fail "Too many arguments ($argument_count): $args"
476	}
477
478	set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
479	incr _subtest_pass_count
480	set message [format "Subtest %2d passed" $subtest_count]
481	if {$description ne ""} { append message "  : $description" }
482	log_pass $message
483	dict set _subtest_messages $subtest_count [list pass $message]
484}
485
486
487################################################################
488#
489# NAME
490#	subfail - registers a failing subtest result
491#
492# SYNOPSIS
493#	subfail ?options? ?description? ?diagnostics?
494#
495# DESCRIPTION
496#	Increments the subtest failure count and logs a failing subtest message
497#
498# OPTIONS
499#	-fatal
500#		Causes this subtest failure to be fatal, ending the test
501# ARGUMENTS
502#	description
503#		A single-line string describing the condition being verified
504#	diagnostics
505#		A string providing additional diagnostic information that will
506#		be included with the log message
507#
508# ENVIRONMENT
509#	testsuite_subtest_fatal
510#		Specifies whether first failing subtest aborts the test
511#
512################################################################
513
514proc subfail args {
515	global _subtest_fail_count _subtest_pass_count _subtest_skip_count
516	global testsuite_subtest_fatal
517	global _subtest_messages
518
519	set description ""
520	set fatal       false
521	while {[llength $args]} {
522		switch -glob -- [lindex $args 0] {
523			-fatal   {set fatal true; set args [lrange $args 1 end]}
524			-*       {fail "Unknown option: [lindex $args 0]"}
525			default  break
526		}
527	}
528	set argument_count [llength $args]
529	if {$argument_count >= 1} { set args [lassign $args description] }
530
531	set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
532	incr _subtest_fail_count
533	set message [format "Subtest %2d failed" $subtest_count]
534	if {$description ne ""} { append message "  : $description" }
535	if [llength $args] { append message " (" [join $args ", "] ")" }
536	if {$fatal || $testsuite_subtest_fatal} {
537		fail $message
538	} else {
539		log_error $message
540	}
541	dict set _subtest_messages $subtest_count [list fail $message]
542}
543
544
545################################################################
546#
547# NAME
548#	subskip - registers a skipped subtest result
549#
550# SYNOPSIS
551#	subskip ?options? ?description?
552#
553# DESCRIPTION
554#	Increments the subtest skip count and logs a skipped subtest message
555#
556# OPTIONS
557#	-count NUMBER
558#		When used with -skip, indicates the number of subtests that
559#		were skipped
560# ARGUMENTS
561#	description
562#		A single-line string describing the reason the subtest is
563#		being skipped
564#
565################################################################
566
567proc subskip args {
568	global _subtest_fail_count _subtest_pass_count _subtest_skip_count
569	global _subtest_messages
570
571	set description ""
572	set count       1
573	while {[llength $args]} {
574		switch -glob -- [lindex $args 0] {
575			-count   {set args [lassign $args - count]}
576			-*       {fail "Unknown option: [lindex $args 0]"}
577			default  break
578		}
579	}
580	set argument_count [llength $args]
581	if {$argument_count == 1} { set args [lassign $args description] }
582	if {$argument_count > 1} {
583		fail "Too many arguments ($argument_count): $args"
584	}
585
586	set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1]
587	incr _subtest_skip_count $count
588	if {$count > 1} {
589		set message "Subtest $subtest_count-[expr $subtest_count+$count-1] skipped"
590	} else {
591		set message [format "Subtest %2d skipped" $subtest_count]
592	}
593	if {$description ne ""} { append message " : $description" }
594	log_warn $message
595	dict set _subtest_messages [expr $subtest_count] [list skip $message]
596}
597
598
599################################################################
600#
601# NAME
602#	print_time - prints the current date and time
603#
604# SYNOPSIS
605#	print_time
606#
607################################################################
608
609proc print_time { } {
610	global bin_date
611
612	spawn $bin_date
613	expect {
614		eof {
615			wait
616		}
617	}
618
619	return
620}
621
622################################################################
623#
624# NAME
625#	dict_getdef - 'dict get' with ability to specify the default value
626#
627# SYNOPSIS
628#	dict_getdef dictionary_value key default_value
629#
630# DESCRIPTION
631#	Tcl < 8.7 lacks a built in 'dict get' with ability to specify the
632#	default value. Tcl 8.7 adds a dict getdef.
633#	This proc returns the value from the dictionary corresponding to the
634#	keys if it exists, or the default value otherwise.
635#
636# EXAMPLE
637#	dict_getdef $option_dict action "warn"
638#
639# SOURCE
640#	https://core.tcl-lang.org/tips/doc/trunk/tip/342.md
641#	https://core.tcl-lang.org/tcl/tktview/2370575
642#
643################################################################
644
645proc dict_getdef {D args} {
646	if {[dict exists $D {*}[lrange $args 0 end-1]]} then {
647		dict get $D {*}[lrange $args 0 end-1]
648	} else {
649		lindex $args end
650	}
651}
652
653
654################################################################
655#
656# NAME
657#	_line_trace - returns an abbreviated call stack trace with line numbers
658#
659# SYNOPSIS
660#	_line_trace
661#
662################################################################
663
664proc _line_trace {} {
665	set line_trace  ""
666	set first_entry true
667	for {set f [expr [info frame] - 3]} {$f >= 1} {incr f -1} {
668		set frame_dict [info frame $f]
669		if [dict exists $frame_dict file] {
670			if [regexp uplevel [dict get $frame_dict cmd]] {
671				continue
672			}
673			if {$first_entry} {
674				set first_entry false
675			} else {
676				append line_trace ","
677			}
678			if [dict exists $frame_dict proc] {
679				set proc [namespace tail [dict get $frame_dict proc]]
680				if {$proc ne ""} {
681					append line_trace "$proc\@"
682				}
683			}
684			append line_trace [file tail [dict get $frame_dict file]]
685			if [dict exists $frame_dict line] {
686				append line_trace ":[dict get $frame_dict line]"
687			}
688		}
689	}
690	return $line_trace
691}
692
693
694################################################################
695#
696# NAME
697#	tolerance - determines whether a value is within a specified tolerance
698#
699# SYNOPSIS
700#	tolerance expected observed tolerance_expression
701#
702# ARGUMENTS
703#	expected
704#		the expected (numeric) value
705#	observed
706#		the observed (numeric) value
707#	tolerance_expression
708#		a string of the form: [~][+|-]<tolerance>[%]
709#
710# DESCRIPTION
711#	tolerance
712#		A numeric tolerance
713#	symmetry
714#		By default the permitted range of values is symetric:
715#			[expected - tolerance, expected + tolerance]
716#		If the + sign is specified, the tolerance is limited to the
717#		the higher side only:
718#			[expected, expected + tolerance]
719#		If the - sign is specified, the tolerance is limited to the
720#		the lower side only:
721#			[expected - tolerance, expected]
722#	percent
723#		By default the permitted range is computed as absolute values:
724#			[expected - tolerance, expected + tolerance]
725#		If % is specified, the permitted range is computed as a
726#		percentage of the expected value:
727#			[expected*(1-tolerance/100), expected*(1+tolerance/100)]
728#	exclusivity
729#		By default the permitted range of values is inclusive, ie
730#		the min and max tolerated values are included in the range:
731#			[expected - tolerance, expected + tolerance]
732#		If ~ (exclusive) is specified, the tolerance limits are
733#		exclusive, ie the min and max tolerated values are excluded:
734#			(expected - tolerance, expected + tolerance)
735#	expression
736#		any combination of symetry, percent and exclusivity is allowed
737#
738# RETURN VALUE
739#	Returns true if the observed value is within the specified tolerance
740#	range of the expected value, otherwise false
741#
742# EXAMPLES
743#	The indicated tolerance_expression is true if:
744#	"5"     expected - 5  <= observed <= expected + 5
745#	"-5"    expected - 5  <= observed <= expected
746#	"+5"    expected      <= observed <= expected + 5
747#	"5%"    expected - 5% <= observed <= expected + 5%
748#	"~5"    expected - 5  <  observed <  expected + 5
749#	"~+5%"  expected      <= observed <  expected + 5%
750#
751################################################################
752
753proc tolerance { expected observed tolerance_expression } {
754	if {![regexp {^(~?)([-+]?)([0-9\.]+)(%?)$} $tolerance_expression {} exclusive sign tolerance percent]} {
755		fail "Invalid tolerance expression ($tolerance_expression)"
756	}
757
758	set lower_bound_expression $observed
759	if {$sign eq "+" || $exclusive ne "~"} {
760		append lower_bound_expression " >="
761	} else {
762		append lower_bound_expression " >"
763	}
764	append lower_bound_expression " $expected"
765	if {$sign eq "-" || $sign eq ""} {
766		if {$percent eq "%"} {
767			append lower_bound_expression " - $tolerance * $expected / 100"
768		} else {
769			append lower_bound_expression " - $tolerance"
770		}
771	}
772
773	set upper_bound_expression $observed
774	if {$sign eq "-" || $exclusive ne "~"} {
775		append upper_bound_expression " <="
776	} else {
777		append upper_bound_expression " <"
778	}
779	append upper_bound_expression " $expected"
780	if {$sign eq "+" || $sign eq ""} {
781		if {$percent eq "%"} {
782			append upper_bound_expression " + $tolerance * $expected / 100"
783		} else {
784			append upper_bound_expression " + $tolerance"
785		}
786	}
787
788	if {[expr $lower_bound_expression] && [expr $upper_bound_expression]} {
789		log_debug "$observed is within tolerance $tolerance_expression of $expected"
790		return true
791	} else {
792		log_warn "$observed is not within tolerance $tolerance_expression of $expected"
793		return false
794	}
795}
796
797
798################################################################
799#
800# NAME
801#	check_run_as_user - check if the caller may run_command as the supplied user
802#
803# SYNOPSIS
804#	check_run_as_user user
805#
806# DESCRIPTION
807#	Note that a proper sudo config needs to be set in orther to pass this
808#	check. Calling user should be permitted to run_commands as the
809#	supplied user using sudo without password.
810#	See the -user option of run_command.
811#	This proc also log_warn a message if user already exists in the DB
812#	because most probably this user is testsuite_user and that user is
813#	expected NOT to be in the DB and could potentially be removed from it by
814#	the test.
815#
816# RETURN VALUE
817#	Returns a boolean value indicating whether the calling user may
818#	run_command as user.
819#
820################################################################
821
822proc check_run_as_user user {
823	global bin_id
824
825	if {$user eq ""} {
826		return false
827	}
828	if {[run_command_status -none -user $user "$bin_id -un"]} {
829		return false
830	}
831
832	if {[get_admin_level $user] != ""} {
833		log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup"
834	}
835
836	return true
837}
838
839
840################################################################
841#
842# NAME
843#	run_command - executes a command and returns a dictionary result
844#
845# SYNOPSIS
846#	run_command ?options? command
847#
848# DESCRIPTION
849#	Executes a command and returns a dictionary that includes the output,
850#	exit code, etc. An action can be taken (fail, warn, subtest, none) if
851#	the command's exit code is unexpected. By default, the action
852#	will be applied if the command fails. If the -xfail option is
853#	specified, the behavior will be reversed to apply the action if the
854#	command ran successfully.
855#	A timeout is always treated as unexpected, so log_error will be shown
856#	by default, or fail/subfail will be called if -fail/-subtest are used.
857#
858# OPTIONS
859#	-fail
860#		If the exit code is unexpected, the action that will
861#		be taken is to fail the test
862#	-subtest
863#		If the exit code is unexpected, the action that will
864#		be taken is to subfail, otherwise subpass will be called
865#	-warn
866#		If the exit code is unexpected, the action that will
867#		be taken is to log a warning (this is the default)
868#	-none
869#		If the exit code is unexpected, no action will be taken
870#	-xfail
871#		If the command exits with zero the action will be applied.
872#		Without this option, the action will be applied if the
873#		command exits with a non-zero exit code.
874#	-timeout <float_number>
875#		Time in seconds to wait for the command to complete before
876#		timing out (default is 60.0)
877#	-nolog
878#		Logging for this command will occur at trace threshold only
879#	-stdin
880#		Provide standard in to be piped into command
881#	-user <user>
882#		Attempt to execute command as <user>. Note that sudo must be
883#		properly configured to permit the caller to execute as <user>.
884#		See check_run_as_user.
885#
886# ARGUMENTS
887#	command
888#		a string containing the command and arguments to execute
889#
890# RETURN VALUE
891#	A dictionary containing the following elements:
892#		command    - The command that was invoked
893#		exit_code  - Exit code
894#		output     - The combined standard output and standard error
895#		start_time - The time (with ms) the command was executed
896#		duration   - The duration (seconds and milliseconds) the
897#		             command took to run
898#
899################################################################
900proc run_command args {
901	global bin_bash bin_sudo
902
903	set alt_user ""
904	set exit_status 0
905	set timedout false
906	set output ""
907	set action "warn"
908	set timeout 60
909	set expect_failure false
910	set log_at_trace_level false
911	set stdin ""
912	while {[llength $args]} {
913		switch -glob -- [lindex $args 0] {
914			-fail    {set action "fail"; set args [lrange $args 1 end]}
915			-subtest {set action "subtest"; set args [lrange $args 1 end]}
916			-none    {set action "none"; set args [lrange $args 1 end]}
917			-timeout {set args [lassign $args - timeout]}
918			-warn    {set action "warn"; set args [lrange $args 1 end]}
919			-xfail   {set expect_failure true; set args [lrange $args 1 end]}
920			-nolog   {set log_at_trace_level true; set args [lrange $args 1 end]}
921			-stdin   {set stdin [lindex $args 1]; set args [lassign $args - stdin]}
922			-user    {set args [lassign $args - alt_user]}
923			-*       {fail "Unknown option: [lindex $args 0]"}
924			default  break
925		}
926	}
927	if {[llength $args] == 1} {
928		lassign $args command
929	} else {
930		fail "Invalid number of arguments [llength $args]: $args"
931	}
932
933	if {$action eq "subtest"} {
934		if {$expect_failure} {
935			set test_description "Command \"$command\" should fail"
936		} else {
937			set test_description "Command \"$command\" should succeed"
938		}
939	}
940
941	if {$log_at_trace_level} {
942		interp alias {} log_run     {} log_trace
943		interp alias {} log_details {} log_trace
944	} else {
945		interp alias {} log_run     {} log_command
946		interp alias {} log_details {} log_debug
947	}
948
949	set orig_log_user [log_user -info]
950	log_user 0
951
952	if {$alt_user ne ""} {
953		log_run "Run Command as user $alt_user: $command"
954	} else {
955		log_run "Run Command: $command"
956	}
957	set start_clock_ms [clock milliseconds]
958	set stty_init raw ; # Prevent the terminal from inserting \r
959	if {$alt_user ne ""} {
960		set expect_pid [spawn -noecho $bin_sudo -nu $alt_user $bin_bash -c "$command"]
961	} else {
962		set expect_pid [spawn -noecho $bin_bash -c "$command"]
963	}
964	if { $stdin != "" } {
965		exp_send "$stdin"
966		set command "$command <<< $stdin"
967	}
968	expect {
969		-re "(.+)" {
970			append output $expect_out(1,string)
971			exp_continue
972		}
973		timeout {
974			slow_kill $expect_pid
975			set exit_status $::RETURN_TIMEOUT
976			set timedout true
977		}
978		eof {
979			lassign [wait] pid spawnid os_error_flag errno
980			set exit_status [expr $errno > 128 ? $errno - 256 : $errno]
981		}
982	}
983	set start_time [format "%.3f" [expr $start_clock_ms / 1000.000]]
984	set end_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
985	set duration [format "%.3f" [expr $end_time - $start_time]]
986
987	log_details "Command Results:"
988	log_details "	Duration:   $duration"
989	log_details "	Exit Code:  $exit_status"
990	if {[info exists output]} {
991		log_details "	Output: $output"
992	}
993
994	if {$timedout} {
995		set message "Command \"$command\" timed out after $timeout seconds"
996		if {$action eq "fail"} {
997			fail $message
998		} elseif {$action eq "subtest"} {
999			subfail $test_description $message
1000		} else {
1001			log_error $message
1002		}
1003	} elseif {! $expect_failure && $exit_status != 0} {
1004		set message "Command \"$command\" failed with rc=$exit_status"
1005		if {[info exists output] && $output != ""} {
1006			append message ": [string trimright $output]"
1007		}
1008		if {$action eq "warn"} {
1009			log_warn $message
1010		} elseif {$action eq "subtest"} {
1011			subfail $test_description $message
1012		} elseif {$action eq "fail"} {
1013			fail $message
1014		}
1015	} elseif {$expect_failure && $exit_status == 0} {
1016		set message "Command \"$command\" was expected to fail but succeeeded"
1017		if {$action eq "warn"} {
1018			log_warn $message
1019		} elseif {$action eq "subtest"} {
1020			subfail $test_description $message
1021		} elseif {$action eq "fail"} {
1022			fail $message
1023		}
1024	} elseif {$action eq "subtest"} {
1025		subpass $test_description
1026	}
1027	log_user $orig_log_user
1028
1029	dict set result command    $command
1030	dict set result exit_code  $exit_status
1031	dict set result output     $output
1032	dict set result start_time $start_time
1033	dict set result duration   $duration
1034
1035	return $result
1036}
1037
1038
1039################################################################
1040#
1041# NAME
1042#	run_command_output - executes a command and returns the output
1043#
1044# SYNOPSIS
1045#	run_command_output ?options? command
1046#
1047# DESCRIPTION
1048#	Executes a command and returns a dictionary that includes the output,
1049#	exit code, etc. An action can be taken (fail, warn, none)  if the
1050#	command's exit code or timeout is unexpected. By default, the action
1051#	will be applied if the command fails. If the -xfail option is
1052#	specified, the behavior will be reversed to apply the action if the
1053#	command ran successfully.
1054#
1055# OPTIONS
1056#	-fail
1057#		if the exit code or timeout is unexpected, the action that will
1058#		be taken is to fail the test
1059#	-warn
1060#		if the exit code or timeout is unexpected, the action that will
1061#		be taken is to log a warning (this is the default)
1062#	-none
1063#		if the exit code or timeout is unexpected, no action will be
1064#		taken
1065#	-xfail
1066#		if the command exits with zero and does not time out, the
1067#		action will be applied. Without this option, the action will
1068#		be applied if the command exits with a non-zero exit code or
1069#		times out.
1070#	-timeout <float_number>
1071#		time in seconds to wait for the command to complete before
1072#		timing out (default is 60.0)
1073#	-stdin
1074#		Provide standard in to be piped into command
1075#
1076# ARGUMENTS
1077#	command
1078#		a string containing the command and arguments to execute
1079#
1080# RETURN VALUE
1081#	A string containing the combined standard output and standard error
1082#
1083################################################################
1084
1085proc run_command_output args {
1086
1087	set result [run_command {*}$args]
1088
1089	if [dict exists $result output] {
1090		return [dict get $result output]
1091	} else {
1092		return ""
1093	}
1094}
1095
1096
1097################################################################
1098#
1099# NAME
1100#	run_command_status - executes a command and returns the exit code
1101#
1102# SYNOPSIS
1103#	run_command_status ?options? command
1104#
1105# DESCRIPTION
1106#	Executes a command and returns a dictionary that includes the output,
1107#	exit code, etc. An action can be taken (fail, warn, none)  if the
1108#	command's exit code or timeout is unexpected. By default, the action
1109#	will be applied if the command fails. If the -xfail option is
1110#	specified, the behavior will be reversed to apply the action if the
1111#	command ran successfully.
1112#
1113# OPTIONS
1114#	-fail
1115#		if the exit code or timeout is unexpected, the action that will
1116#		be taken is to fail the test
1117#	-warn
1118#		if the exit code or timeout is unexpected, the action that will
1119#		be taken is to log a warning (this is the default)
1120#	-none
1121#		if the exit code or timeout is unexpected, no action will be
1122#		taken
1123#	-xfail
1124#		if the command exits with zero and does not time out, the
1125#		action will be applied. Without this option, the action will
1126#		be applied if the command exits with a non-zero exit code or
1127#		times out.
1128#	-timeout <float_number>
1129#		time in seconds to wait for the command to complete before
1130#		timing out (default is 60.0)
1131#	-stdin
1132#		Provide standard in to be piped into command
1133#
1134# ARGUMENTS
1135#	command
1136#		a string containing the command and arguments to execute
1137#
1138# RETURN VALUE
1139#	The exit code for the invoked command
1140#
1141################################################################
1142
1143proc run_command_status args {
1144
1145	set result [run_command {*}$args]
1146
1147	return [dict get $result exit_code]
1148}
1149
1150
1151################################################################
1152#
1153# NAME
1154#	cancel_job - cancels the specified job list
1155#
1156# SYNOPSIS
1157#	cancel_job job_id_list ?het_job?
1158#
1159# ARGUMENTS
1160#	job_id_list
1161#		The list of Slurm job ids that we want to cancel
1162#	het_job
1163#		1 if jobs are hetjobs and we want to confirm each
1164#		component has completed
1165#
1166# DESCRIPTION
1167#	Cancels one or more jobs. A job_id of 0 will be silently ignored.
1168#
1169# RETURN VALUE
1170#	RETURN_SUCCESS if jobs are cancelled, or non-zero value otherwise.
1171#
1172################################################################
1173
1174proc cancel_job { job_id_list {het_job 0}} {
1175	global scancel
1176
1177	set job_list_clean [list]
1178	foreach job_id $job_id_list {
1179		if {$job_id != 0} {
1180			lappend job_list_clean $job_id
1181		}
1182	}
1183
1184	log_debug "Cancelling $job_list_clean"
1185	if {[run_command_status "$scancel -Q $job_list_clean"]} {
1186		log_warn "scancel command returned error"
1187		return $::RETURN_ERROR
1188	}
1189	foreach job_id $job_list_clean {
1190		if {[wait_for_job $job_id "DONE" $het_job]} {
1191			log_warn "Job $job_id not ended"
1192			return $::RETURN_ERROR
1193		}
1194	}
1195	return $::RETURN_SUCCESS
1196}
1197
1198
1199################################################################
1200#
1201# NAME
1202#	get_line_cnt - returns the size of the specified file
1203#
1204# SYNOPSIS
1205#	get_line_cnt file_name
1206#
1207# RETURN VALUE
1208#	Number of lines in the specified file.
1209#
1210################################################################
1211
1212proc get_line_cnt { file_name } {
1213	global bin_wc number
1214	set lines 0
1215	spawn $bin_wc -l $file_name
1216	expect {
1217		-re "($number) " {
1218			set lines $expect_out(1,string)
1219			exp_continue
1220		}
1221		eof {
1222			wait
1223		}
1224	}
1225	return $lines
1226}
1227
1228
1229################################################################
1230#
1231# NAME
1232#	slow_kill - kills a process slowly
1233#
1234# SYNOPSIS
1235#	slow_kill pid
1236#
1237# DESCRIPTION
1238#	Kill a process slowly, first trying SIGINT, pausing for
1239#       a second, then sending SIGKILL.
1240#
1241# RETURN VALUE
1242#	A non-zero return code indicates a failure.
1243#
1244################################################################
1245
1246proc slow_kill { pid } {
1247	global bin_kill
1248
1249	catch {exec $bin_kill -INT $pid}
1250	catch {exec $bin_kill -INT $pid}
1251	sleep  1
1252	catch {exec $bin_kill -KILL $pid}
1253
1254	return 0
1255}
1256
1257
1258################################################################
1259#
1260# NAME
1261#	get_my_id - gets the id from the running user
1262#
1263# SYNOPSIS
1264#	get_my_id
1265#
1266# RETURN VALUE
1267#	output of id
1268#
1269################################################################
1270
1271proc get_my_id {} {
1272
1273	global bin_id number
1274	set login_info -1
1275
1276	log_user 0
1277
1278	spawn $bin_id
1279	expect {
1280		-re "(uid=.*\n)" {
1281			set login_info $expect_out(1,string)
1282			exp_continue
1283		}
1284		eof {
1285			wait
1286		}
1287	}
1288
1289	log_user 1
1290
1291	if {$login_info == -1} {
1292		fail "Unable to get user info"
1293	}
1294
1295	return $login_info
1296}
1297
1298
1299################################################################
1300#
1301# NAME
1302#	get_my_user_name - gets the name from the running user
1303#
1304# SYNOPSIS
1305#	get_my_user_name
1306#
1307# RETURN VALUE
1308#	A non-zero return code indicates a failure.
1309#
1310################################################################
1311
1312proc get_my_user_name {  } {
1313	global bin_id re_word_str
1314
1315	set user_name -1
1316
1317	log_user 0
1318	spawn $bin_id -nu
1319	expect {
1320		-re "($re_word_str)" {
1321			set user_name $expect_out(1,string)
1322			exp_continue
1323		}
1324		eof {
1325			wait
1326		}
1327	}
1328	log_user 1
1329
1330	if {$user_name == -1} {
1331		fail "Unable to get user name"
1332	}
1333
1334	return $user_name
1335}
1336
1337
1338################################################################
1339#
1340# NAME
1341#	get_my_uid - gets the uid from the running user
1342#
1343# SYNOPSIS
1344#	get_my_uid
1345#
1346# RETURN VALUE
1347#	The uid of the current user, or fails.
1348#
1349################################################################
1350
1351proc get_my_uid {  } {
1352	global bin_id number
1353
1354	set out [run_command_output -nolog -fail "$bin_id -u"]
1355	if {![regexp "($number)" $out - uid]} {
1356		fail "Unable to get UID with $bin_id ($out)"
1357	}
1358
1359	return $uid
1360}
1361
1362
1363################################################################
1364#
1365# NAME
1366#	get_my_gid - gets the gid from the running user
1367#
1368# SYNOPSIS
1369#	get_my_gid
1370#
1371# RETURN VALUE
1372#	A non-zero return code indicates a failure.
1373#
1374################################################################
1375
1376proc get_my_gid {  } {
1377	global bin_id number
1378
1379	set gid -1
1380
1381	log_user 0
1382	spawn $bin_id -g
1383	expect {
1384		-re "($number)" {
1385			set gid $expect_out(1,string)
1386			exp_continue
1387		}
1388		eof {
1389			wait
1390		}
1391	}
1392	log_user 1
1393
1394	return $gid
1395}
1396
1397
1398################################################################
1399#
1400# NAME
1401#	kill_salloc - kills all salloc commands associated with this user
1402#
1403# SYNOPSIS
1404#	kill_salloc
1405#
1406# DESCRIPTION
1407#	Kill all salloc commands associated with this user.
1408#	Issue two SIGINT, sleep 1 and a SIGKILL
1409#
1410# RETURN VALUE
1411#	A non-zero return code indicates a failure.
1412#
1413# NOTE
1414#	Use slow_kill instead of kill_salloc if you can capture
1415#	the process id
1416#
1417################################################################
1418
1419proc kill_salloc {  } {
1420	global bin_id bin_pkill bin_sleep number
1421
1422	set uid [get_my_uid]
1423	catch {exec $bin_pkill -INT -u $uid salloc}
1424	catch {exec $bin_pkill -INT -u $uid salloc}
1425	sleep  1
1426	catch {exec $bin_pkill -KILL -u $uid salloc}
1427
1428	return 0
1429}
1430
1431
1432################################################################
1433#
1434# NAME
1435#	kill_srun - kills all srun commands associated with this user
1436#
1437# SYNOPSIS
1438#	kill_srun
1439#
1440# DESCRIPTION
1441#	Kill all srun commands associated with this user.
1442#	Issue two SIGINT, sleep 1 and a SIGKILL
1443#
1444# RETURN VALUE
1445#	A non-zero return code indicates a failure.
1446#
1447# NOTE
1448#	Use slow_kill instead of kill_srun if you can capture
1449#	the process id
1450#
1451################################################################
1452
1453proc kill_srun {  } {
1454	global bin_id bin_pkill bin_sleep number
1455
1456	set uid [get_my_uid]
1457	catch {exec $bin_pkill -INT -u $uid srun}
1458	catch {exec $bin_pkill -INT -u $uid srun}
1459	sleep  1
1460	catch {exec $bin_pkill -KILL -u $uid srun}
1461
1462	return 0
1463}
1464
1465
1466################################################################
1467#
1468# NAME
1469#	wait_for - generic wait utility
1470#
1471# SYNOPSIS
1472#	wait_for ?options? condition body
1473#
1474# DESCRIPTION
1475#	Generic wait utility allowing you to repeatedly execute a generic block
1476#	of code until a specified boolean expression is met. The code block and
1477#	condition check occur every poll interval until a timeout is reached.
1478#
1479# OPTIONS
1480#	-fail
1481#		abort the test with failure if the condition is not met
1482#	-timeout <float_number>
1483#		time in seconds to wait for the condition to be met before
1484#		timing out (default is 60.0)
1485#	-pollinterval <float_number>
1486#		time in seconds between each loop execution and condition check
1487#		(default is 1.0)
1488#
1489# ARGUMENTS
1490#	condition
1491#		The boolean expression to test
1492#	body
1493#		A block of code to evaluate in the invoking stack frame
1494#
1495# RETURN VALUE
1496#	RETURN_SUCCESS if the condition is met before the timeout occurs,
1497#	RETURN_TIMEOUT if the timeout occurs before the condition is met
1498#
1499################################################################
1500
1501proc wait_for args {
1502	set fatal         false
1503	set timeout       60
1504	set poll_interval 1
1505	while {[llength $args]} {
1506		switch -glob -- [lindex $args 0] {
1507			-fail   {set fatal true; set args [lrange $args 1 end]}
1508			-time*  {set args [lassign $args - timeout]}
1509			-poll*  {set args [lassign $args - poll_interval]}
1510			-*      {fail "Unknown option: [lindex $args 0]"}
1511			default break
1512		}
1513	}
1514	if {[llength $args] == 2} {
1515		lassign $args condition body
1516	} else {
1517		fail "Invalid number of arguments [llength $args]: $args"
1518	}
1519
1520	set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1521
1522	log_debug "Waiting for $condition"
1523
1524	while {1} {
1525		# Evaluate code block
1526		log_trace "Evaluating code block ([string trim $body])"
1527		uplevel $body
1528
1529		# Check condition
1530		if {[uplevel expr [format "{%s}" $condition]]} {
1531			set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1532			log_debug "Condition ($condition) was met"
1533			return $::RETURN_SUCCESS
1534		} else {
1535			log_trace "Condition ($condition) was not met"
1536		}
1537
1538		# Sleep poll interval
1539		log_trace "Sleeping for $poll_interval seconds"
1540		after [expr {int($poll_interval * 1000)}]
1541
1542		# Check if we have surpassed our timeout
1543		set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1544		log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])"
1545		if {$now > $start_time + $timeout} {
1546			set message "Condition ($condition) did not occur before timeout ($timeout) seconds"
1547			if {$fatal} {
1548				fail $message
1549			} else {
1550				log_warn $message
1551				return $::RETURN_TIMEOUT
1552			}
1553		}
1554	}
1555}
1556
1557
1558################################################################
1559#
1560# NAME
1561#	wait_for_command - repeat a command until it is successful or meets a specified condition
1562#
1563# SYNOPSIS
1564#	wait_for_command ?options? command ?condition?
1565#
1566# DESCRIPTION
1567#	A command is repeated until it meets a condition or a timeout is reached.
1568#	If a condition is not specified, the command will be repeated until it
1569#	is successful (the exit code is zero).
1570#
1571# OPTIONS
1572#	-fail
1573#		abort the test with failure if the condition is not met by
1574#		the timeout
1575#	-timeout <float_number>
1576#		time in seconds to wait for the condition to be met before
1577#		timing out (default is 60.0)
1578#	-pollinterval <float_number>
1579#		time in seconds between each loop execution and condition
1580#		check (default is 1.0)
1581#
1582# ARGUMENTS
1583#	command
1584#		a string containing the command and arguments to execute
1585#	condition
1586#		The boolean expression to test. For each command invocation,
1587#		the result variable will be set to the dictionary returned
1588#		from run_command.
1589#		The condition expression will normally involve a comparison
1590#		with one or more values of this dictionary. If a condition is
1591#		not specified, this condition will be used:
1592#		{ [dict get $result exit_code] == 0 }
1593#
1594# RETURN VALUE
1595#	RETURN_SUCCESS if the condition is met before the timeout occurs,
1596#	RETURN_TIMEOUT if the timeout occurs before the condition is met
1597#
1598################################################################
1599
1600proc wait_for_command args {
1601	set fatal         false
1602	set timeout       60
1603	set poll_interval 1
1604	while {[llength $args]} {
1605		switch -glob -- [lindex $args 0] {
1606			-fail   {set fatal true; set args [lrange $args 1 end]}
1607			-time*  {set args [lassign $args - timeout]}
1608			-poll*  {set args [lassign $args - poll_interval]}
1609			-*      {fail "Unknown option: [lindex $args 0]"}
1610			default break
1611		}
1612	}
1613
1614	set argument_count [llength $args]
1615	if {$argument_count < 1} {
1616		fail "Too few arguments ($argument_count): $args"
1617	} elseif {$argument_count > 2} {
1618		fail "Too many arguments ($argument_count): $args"
1619	}
1620	lassign $args command
1621	if {$argument_count == 2} {
1622		set condition [lindex $args 1]
1623	} else {
1624		set condition { [dict get $result exit_code] == 0 }
1625	}
1626
1627	set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1628
1629	log_debug "Waiting for $condition"
1630
1631	while {1} {
1632		# Run command
1633		set result [run_command $command]
1634
1635		# Check condition
1636		if {[eval expr [format "{%s}" $condition]]} {
1637			set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1638			log_debug "Condition ($condition) was met"
1639			return $::RETURN_SUCCESS
1640		} else {
1641			log_trace "Condition ($condition) was not met"
1642		}
1643
1644		# Sleep poll interval
1645		log_trace "Sleeping for $poll_interval seconds"
1646		after [expr {int($poll_interval * 1000)}]
1647
1648		# Check if we have surpassed our timeout
1649		set now [format "%.3f" [expr [clock milliseconds] / 1000.000]]
1650		log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])"
1651		if {$now > $start_time + $timeout} {
1652			set message "Condition ($condition) did not occur before timeout ($timeout) seconds"
1653			if {$fatal} {
1654				fail $message
1655			} else {
1656				log_warn $message
1657				return $::RETURN_TIMEOUT
1658			}
1659		}
1660	}
1661}
1662
1663
1664################################################################
1665#
1666# NAME
1667#	wait_for_command_match - repeat a command until its output matches the specified pattern
1668#
1669# SYNOPSIS
1670#	wait_for_command_match ?options? command pattern
1671#
1672# DESCRIPTION
1673#	A command is repeated until its output matches the specified pattern
1674#
1675# OPTIONS
1676#	-fail
1677#		abort the test with failure if output does not match the pattern
1678#		by the timeout
1679#	-timeout <float_number>
1680#		time in seconds to wait for the pattern to be matched before
1681#		timing out (default is 60.0)
1682#	-pollinterval <float_number>
1683#		time in seconds between each loop execution and match check
1684#		(default is 1.0)
1685#
1686# ARGUMENTS
1687#	command
1688#		a string containing the command and arguments to execute
1689#	pattern
1690#		The regular expression to match against the command output
1691#
1692# RETURN VALUE
1693#	RETURN_SUCCESS if the pattern is matched before the timeout occurs,
1694#	RETURN_TIMEOUT if the timeout occurs before the pattern is matched
1695#
1696################################################################
1697
1698proc wait_for_command_match args {
1699
1700	set pattern [lindex $args end]
1701	set args [lrange $args 0 end-1]
1702
1703	return [wait_for_command {*}$args "\[regexp {$pattern} \[dict get \$result output\]\] == 1"]
1704}
1705
1706
1707################################################################
1708#
1709# NAME
1710#	wait_for_file - waits for a file to exist with non-zero size
1711#
1712# SYNOPSIS
1713#	wait_for_file ?options? file_name
1714#
1715# OPTIONS
1716#	-fail
1717#		If an error occurs or the file does not become present
1718#		by the timeout, fail the test rather than returning an error
1719#	-timeout <integer_number>
1720#		time in seconds to wait for the file to exist before
1721#		timing out (default is 90)
1722#	-pollinterval <integer_number>
1723#		time in seconds between each file existence test (default is 1)
1724#
1725# DESCRIPTION
1726#	Wait for the specified file to exist and have a non-zero size.
1727#	Note that if JobFileAppend=0 is configured, a file can exist and
1728#	be purged then be re-created.
1729#
1730# RETURN VALUE
1731#	RETURN_SUCCESS if the file becomes present within the timeout, or
1732#	non-zero value otherwise.
1733#
1734################################################################
1735
1736proc wait_for_file args {
1737	global bin_sleep
1738
1739	set fatal         false
1740	set timeout       90
1741	set poll_interval 1
1742	while {[llength $args]} {
1743		switch -glob -- [lindex $args 0] {
1744			-fatal  -
1745			-fail   {set fatal true; set args [lrange $args 1 end]}
1746			-time*  {set args [lassign $args - timeout]}
1747			-poll*  {set args [lassign $args - poll_interval]}
1748			-*      {fail "Unknown option: [lindex $args 0]"}
1749			default break
1750		}
1751	}
1752	set argument_count [llength $args]
1753	if {$argument_count != 1} {
1754		fail "Invalid number of arguments ($argument_count): $args"
1755	} else {
1756		lassign $args file_name
1757	}
1758
1759	for {set my_delay 0} {$my_delay <= $timeout} \
1760	                     {set my_delay [expr $my_delay + $poll_interval]} {
1761		if {[file exists $file_name]} {
1762			# Add small delay for I/O buffering
1763			exec $bin_sleep 1
1764			return $::RETURN_SUCCESS
1765		}
1766		exec $bin_sleep $poll_interval
1767
1768		# Expect may fail to load current NFS info.
1769		# Use the ls command to load current info.
1770		set slash_pos [string last $file_name "/"]
1771		if {$slash_pos < 1} {
1772			set dir_name "."
1773		} else {
1774			decr slash_pos
1775			set dir_name [string $file_name 0 $slash_pos]
1776		}
1777		exec /bin/ls $dir_name
1778	}
1779	set message "Timeout waiting for file $file_name"
1780	if {$fatal} {
1781		fail $message
1782	}
1783
1784	log_error $message
1785	return $::RETURN_TIMEOUT
1786}
1787
1788
1789################################################################
1790#
1791# NAME
1792#	_wait_for_single_job - waits for a job to reach the desired state
1793#
1794# SYNOPSIS
1795#	_wait_for_single_job ?options? job_id desired_state
1796#
1797# DESCRIPTION
1798#	Wait for a previously submitted Slurm job to reach the desired state.
1799#
1800# OPTIONS
1801#	-fail
1802#		If an error occurs or the job does not reach the desired state
1803#		by the timeout, fail the test rather than returning an error
1804#	-timeout <integer_number>
1805#		time in seconds to wait for the job to be in the desired state
1806#		before timing out (default is 360)
1807#	-pollinterval <integer_number>
1808#		time in seconds between each job state check (default is 1)
1809#
1810# ARGUMENTS
1811#	job_id
1812#		The Slurm job id of a job we want to wait for.
1813#	desired_state
1814#		The state you want the job to attain before
1815#		returning.  Currently supports:
1816#			DONE any terminated state
1817#			PENDING job is pending
1818#			RUNNING job is running
1819#			SPECIAL_EXIT
1820#			SUSPENDED job is suspended
1821#
1822# RETURN VALUE
1823#	RETURN_SUCCESS, or non-zero on error.
1824#
1825# NOTE: We sleep for two seconds before replying that a job is
1826# done to give time for I/O completion (stdout/stderr files)
1827#
1828################################################################
1829
1830proc _wait_for_single_job args {
1831	global scontrol
1832
1833	set fatal         false
1834	set timeout       360
1835	set poll_interval 1
1836	while {[llength $args]} {
1837		switch -glob -- [lindex $args 0] {
1838			-fatal  -
1839			-fail   {set fatal true; set args [lrange $args 1 end]}
1840			-time*  {set args [lassign $args - timeout]}
1841			-poll*  {set args [lassign $args - poll_interval]}
1842			-*      {fail "Unknown option: [lindex $args 0]"}
1843			default break
1844		}
1845	}
1846	set argument_count [llength $args]
1847	if {$argument_count != 2} {
1848		fail "Invalid number of arguments ($argument_count): $args"
1849	} else {
1850		lassign $args job_id desired_state
1851	}
1852
1853	# First verify that desired_state is supported
1854	switch $desired_state {
1855		"DONE" {}
1856		"PENDING" {}
1857		"RUNNING" {}
1858		"SPECIAL_EXIT" {}
1859		"SUSPENDED" {}
1860		default {
1861			set message "Invalid desired state: $desired_state"
1862			if {$fatal} {
1863				fail $message
1864			}
1865			log_warn $message
1866			return $::RETURN_ERROR
1867		}
1868	}
1869
1870	if {$job_id == 0} {
1871		set message "Invalid job ID: $job_id"
1872		if {$fatal} {
1873			fail $message
1874		}
1875		log_warn $message
1876		return $::RETURN_ERROR
1877	}
1878
1879	set my_delay    0
1880	while 1 {
1881		set fd [open "|$scontrol -o show job $job_id"]
1882		gets $fd line
1883		catch {close $fd}
1884		if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
1885			set state "NOT_FOUND"
1886		}
1887
1888		switch $state {
1889			"NOT_FOUND" -
1890			"BOOT_FAIL" -
1891			"CANCELLED" -
1892			"COMPLETED" -
1893			"DEADLINE" -
1894			"FAILED" -
1895			"NODE_FAIL" -
1896			"OUT_OF_MEMORY" -
1897			"PREEMPTED" -
1898			"TIMEOUT" {
1899				if {[string compare $desired_state "DONE"] == 0} {
1900					log_debug "Job $job_id is DONE ($state)"
1901					sleep 2
1902					return $::RETURN_SUCCESS
1903				}
1904				if {[string compare $desired_state "RUNNING"] == 0} {
1905					set message "Job $job_id is $state, but we wanted RUNNING"
1906				}
1907				if {[string compare $desired_state "SUSPENDED"] == 0} {
1908					set message "Job $job_id is $state, but we wanted SUSPENDED"
1909				}
1910				if {$fatal} {
1911					fail $message
1912				}
1913				log_debug $message
1914				return $::RETURN_ERROR
1915			}
1916			"PENDING" {
1917				if {[string compare $desired_state "PENDING"] == 0} {
1918					log_debug "Job $job_id is PENDING"
1919					return $::RETURN_SUCCESS
1920				}
1921				log_debug "Job $job_id is in state $state, desire $desired_state"
1922			}
1923			"RUNNING" {
1924				if {[string compare $desired_state "RUNNING"] == 0} {
1925					log_debug "Job $job_id is RUNNING"
1926					return $::RETURN_SUCCESS
1927				}
1928				log_debug "Job $job_id is in state $state, desire $desired_state"
1929			}
1930			"SPECIAL_EXIT" {
1931				if {[string compare $desired_state "SPECIAL_EXIT"] == 0} {
1932					log_debug "Job $job_id is SPECIAL_EXIT"
1933					return $::RETURN_SUCCESS
1934				}
1935				log_debug "Job $job_id is in state $state, desire $desired_state"
1936			}
1937			"SUSPENDED" {
1938				if {[string compare $desired_state "SUSPENDED"] == 0} {
1939					log_debug "Job $job_id is SUSPENDED"
1940					return $::RETURN_SUCCESS
1941				}
1942				log_debug "Job $job_id is in state $state, desire $desired_state"
1943			}
1944			default {
1945				log_debug "Job $job_id is in state $state, desire $desired_state"
1946			}
1947		}
1948
1949		if { $my_delay > $timeout } {
1950			set message "Timeout waiting for job state $desired_state"
1951			if {$fatal} {
1952				fail $message
1953			}
1954			log_warn "Timeout waiting for job state $desired_state"
1955			return $::RETURN_TIMEOUT
1956		}
1957
1958		exec sleep $poll_interval
1959		set my_delay [expr $my_delay + $poll_interval]
1960	}
1961}
1962
1963
1964################################################################
1965#
1966# NAME
1967#	wait_for_job - waits for job to be in desired state
1968#
1969# SYNOPSIS
1970#	wait_for_job ?options? job_id desired_state ?het_job?
1971#
1972# DESCRIPTION
1973#	Wait for job to be in desired state. Can handle het job components.
1974#
1975# OPTIONS
1976#	-fail
1977#		If an error occurs or the job does not reach the desired state
1978#		by the timeout, fail the test rather than returning an error
1979#	-timeout <integer_number>
1980#		time in seconds to wait for the job to be in the desired state
1981#		before timing out (default is 90)
1982#	-pollinterval <integer_number>
1983#		time in seconds between each job state check (default is 1)
1984#
1985# ARGUMENTS
1986#	job_id
1987#		The Slurm job id of a job we want to wait for.
1988#	desired_state
1989#		The state you want the job to attain before returning.
1990#		Currently supports:
1991#			DONE any terminated state
1992#			PENDING job is pending
1993#			RUNNING job is running
1994#			SPECIAL_EXIT
1995#			SUSPENDED job is suspended
1996#	het_job
1997#		If set, checks the state of each component job if the job
1998#		is a het one.
1999#
2000# RETURN VALUE
2001#	RETURN_SUCCESS if job reaches the desired state, or non-zero value
2002#	otherwise.
2003#
2004# SEE ALSO
2005#	_wait_for_single_job
2006#
2007################################################################
2008
2009proc wait_for_job args {
2010
2011	set options  [list]
2012	set het_job  0
2013
2014	while {[llength $args]} {
2015		switch -glob -- [lindex $args 0] {
2016			-fatal  -
2017			-fail   {
2018				lappend options [lindex $args 0]
2019				set args [lrange $args 1 end]
2020			}
2021			-time*  -
2022			-poll*  {
2023				lappend options {*}[lrange $args 0 1]
2024				set args [lrange $args 2 end]
2025			}
2026			default break
2027		}
2028	}
2029	set argument_count [llength $args]
2030	if {$argument_count < 2} {
2031		fail "Too few arguments ($argument_count): $args"
2032	} elseif {$argument_count > 3}  {
2033		fail "Too many arguments ($argument_count): $args"
2034	} else {
2035		lassign $args job_id desired_state
2036	}
2037	if {$argument_count == 3} { set hetjob [lindex $args 2] }
2038
2039	if { $het_job } {
2040		# get component job ids
2041		set jid_list [get_het_job_ids $job_id 1]
2042	}
2043
2044	set rc       0
2045	set jid_list ""
2046	if { $jid_list == "" } {
2047		# non-het job
2048		set jid_list $job_id
2049	}
2050
2051	foreach jid $jid_list {
2052		set rc [_wait_for_single_job {*}$options $jid $desired_state]
2053		if { $rc } {
2054			# bail out on first failure
2055			break
2056		}
2057	}
2058	return $rc
2059}
2060
2061
2062################################################################
2063#
2064# NAME
2065#	wait_for_account_done - cancels and waits on jobs in specified accounts
2066#
2067# SYNOPSIS
2068#	wait_for_account_done ?options? accounts
2069#
2070# DESCRIPTION
2071#	Cancel jobs on and wait for them to be finished in account(s) given.
2072#
2073# OPTIONS
2074#	-timeout <integer_number>
2075#		time in seconds to wait for the jobs to be finished before
2076#		timing out (default is 360)
2077#	-pollinterval <integer_number>
2078#		time in seconds between each job state check (default is 1)
2079#
2080# ARGUMENTS
2081#	accounts
2082#		Comma-delimited list of accounts
2083#
2084# RETURN VALUE
2085#	RETURN_SUCCESS if all jobs of the account are finished, or non-zero
2086#	otherwise.
2087#
2088# NOTE
2089#	We sleep for two seconds before replying that a job is
2090#	done to give time for I/O completion (stdout/stderr files)
2091#
2092################################################################
2093
2094proc wait_for_account_done args {
2095	global scancel squeue re_word_str
2096
2097	set timeout       360
2098	set poll_interval 1
2099	while {[llength $args]} {
2100		switch -glob -- [lindex $args 0] {
2101			-time*  {set args [lassign $args - timeout]}
2102			-poll*  {set args [lassign $args - poll_interval]}
2103			-*      {fail "Unknown option: [lindex $args 0]"}
2104			default break
2105		}
2106	}
2107	set argument_count [llength $args]
2108	if {$argument_count != 1} {
2109		fail "Invalid number of arguments ($argument_count): $args"
2110	} else {
2111		lassign $args accounts
2112	}
2113
2114	if { $accounts == "" } {
2115		log_error "No account given"
2116		return $::RETURN_ERROR
2117	}
2118
2119	log_user 0
2120	set account_list [split $accounts ","]
2121	foreach item $account_list {
2122		spawn $scancel -A $item
2123		expect {
2124			timeout {
2125				log_warn "No response from scancel"
2126			}
2127			eof {
2128				wait
2129			}
2130		}
2131	}
2132
2133	set my_delay    0
2134	while 1 {
2135		set found 0
2136		spawn $squeue -o Account=%a -h -A$accounts
2137		expect {
2138			-re "Account=($re_word_str)" {
2139				set found 1
2140				exp_continue
2141			}
2142			eof {
2143				wait
2144			}
2145		}
2146
2147		if { !$found } {
2148			log_debug "Account(s) $accounts is/are empty"
2149			break
2150		}
2151
2152		if { $my_delay > $timeout } {
2153			log_error "Timeout waiting for account(s) '$accounts' to be finished"
2154			log_user 1
2155			return $::RETURN_TIMEOUT
2156		}
2157
2158		exec sleep $poll_interval
2159		set my_delay [expr $my_delay + $poll_interval]
2160	}
2161	log_user 1
2162	return $::RETURN_SUCCESS
2163}
2164
2165
2166################################################################
2167#
2168# NAME
2169#	wait_for_part_done - cancels and waits on jobs in specified partition
2170#
2171# SYNOPSIS
2172#	wait_for_part_done ?options? partition
2173#
2174# DESCRIPTION
2175#	Cancel jobs on and wait for them to be finished in partition given.
2176#
2177# OPTIONS
2178#	-timeout <integer_number>
2179#		time in seconds to wait for the jobs to be finished before
2180#		timing out (default is 360)
2181#	-pollinterval <integer_number>
2182#		time in seconds between each job state check (default is 1)
2183#
2184# ARGUMENTS
2185#	partition
2186#		partition name
2187#
2188# RETURN VALUE
2189#	RETURN_SUCCESS if all jobs of the partition are finished, or non-zero
2190#	otherwise.
2191#
2192# NOTE
2193#	We sleep for two seconds before replying that a job is
2194#	done to give time for I/O completion (stdout/stderr files)
2195#
2196################################################################
2197
2198proc wait_for_part_done args {
2199	global scancel squeue re_word_str
2200
2201	set timeout       360
2202	set poll_interval 1
2203	while {[llength $args]} {
2204		switch -glob -- [lindex $args 0] {
2205			-time*  {set args [lassign $args - timeout]}
2206			-poll*  {set args [lassign $args - poll_interval]}
2207			-*      {fail "Unknown option: [lindex $args 0]"}
2208			default break
2209		}
2210	}
2211	set argument_count [llength $args]
2212	if {$argument_count != 1} {
2213		fail "Invalid number of arguments ($argument_count): $args"
2214	} else {
2215		lassign $args partition
2216	}
2217
2218	if { $partition == "" } {
2219		log_error "No partition given"
2220		return $::RETURN_ERROR
2221	}
2222
2223	log_user 0
2224	spawn $scancel -p $partition
2225	expect {
2226		timeout {
2227			log_error "No response from scancel"
2228		}
2229		eof {
2230			wait
2231		}
2232	}
2233
2234	set my_delay    0
2235	while 1 {
2236		set found 0
2237		spawn $squeue -o Part=%P -h -p$partition
2238		expect {
2239			-re "Part=($re_word_str)" {
2240				set found 1
2241				exp_continue
2242			}
2243			eof {
2244				wait
2245			}
2246		}
2247
2248		if { !$found } {
2249			log_debug "Partition $partition is empty"
2250			break
2251		}
2252
2253		if { $my_delay > $timeout } {
2254			log_error "Timeout waiting for partition '$partition' to be finished"
2255			log_user 1
2256			return $::RETURN_TIMEOUT
2257		}
2258
2259		exec sleep $poll_interval
2260		set my_delay [expr $my_delay + $poll_interval]
2261	}
2262	log_user 1
2263	return $::RETURN_SUCCESS
2264}
2265
2266
2267################################################################
2268#
2269# NAME
2270#	wait_for_step - waits for a job step to be found
2271#
2272# SYNOPSIS
2273#	wait_for_step ?options? step_id
2274#
2275# DESCRIPTION
2276#	Wait for a job step to be found.
2277#
2278# OPTIONS
2279#	-timeout <integer_number>
2280#		time in seconds to wait for the job step to be found before
2281#		timing out (default is 360)
2282#	-pollinterval <integer_number>
2283#		time in seconds between each step existence check (default is 1)
2284#
2285# ARGUMENTS
2286#	step_id
2287#		job step id
2288#
2289# RETURN VALUE
2290#	RETURN_SUCCESS if step_id is found, or non-zero otherwise.
2291#
2292################################################################
2293
2294proc wait_for_step args {
2295	global scontrol
2296
2297	set timeout       360
2298	set poll_interval 1
2299	while {[llength $args]} {
2300		switch -glob -- [lindex $args 0] {
2301			-time*  {set args [lassign $args - timeout]}
2302			-poll*  {set args [lassign $args - poll_interval]}
2303			-*      {fail "Unknown option: [lindex $args 0]"}
2304			default break
2305		}
2306	}
2307	set argument_count [llength $args]
2308	if {$argument_count != 1} {
2309		fail "Invalid number of arguments ($argument_count): $args"
2310	} else {
2311		lassign $args step_id
2312	}
2313
2314	set my_delay 0
2315	while 1 {
2316		set fd [open "|$scontrol -o show step $step_id"]
2317		gets $fd line
2318		catch {close $fd}
2319		if {[regexp {Nodes=} $line foo] == 1} {
2320			return $::RETURN_SUCCESS
2321		}
2322		if {[regexp {MidplaneList=} $line foo] == 1} {
2323			return $::RETURN_SUCCESS
2324		}
2325		if { $my_delay > $timeout } {
2326			log_error "Timeout waiting for job step"
2327			return $::RETURN_TIMEOUT
2328		}
2329
2330		log_debug "Step $step_id not done yet. Waiting for $poll_interval seconds"
2331		exec sleep $poll_interval
2332		set my_delay [expr $my_delay + $poll_interval]
2333	}
2334}
2335
2336
2337################################################################
2338#
2339# NAME
2340#	wait_for_all_jobs - waits for jobs to finish having a specified name
2341#
2342# SYNOPSIS
2343#	wait_for_all_jobs ?options? job_name
2344#
2345# DESCRIPTION
2346#	Wait for previously submitted Slurm jobs to finish of a certain name.
2347#
2348# OPTIONS
2349#	-timeout <integer_number>
2350#		time in seconds to wait for the jobs to finish before
2351#		timing out (default is 30)
2352#	-pollinterval <integer_number>
2353#		time in seconds between each job state check (default is 1)
2354#
2355# ARGUMENTS
2356#	job_name
2357#		The name of job to wait for
2358#
2359# RETURN VALUE
2360#	RETURN_SUCCESS if all jobs with the specified name are finished,
2361#	or non-zero otherwise.
2362#	If jobs are not completed after timeout, they are cancelled.
2363#
2364################################################################
2365
2366proc wait_for_all_jobs args {
2367	global scancel squeue
2368
2369	set timeout         30
2370	set poll_interval   1
2371	while {[llength $args]} {
2372		switch -glob -- [lindex $args 0] {
2373			-time*  {set args [lassign $args - timeout]}
2374			-poll*  {set args [lassign $args - poll_interval]}
2375			-*      {fail "Unknown option: [lindex $args 0]"}
2376			default break
2377		}
2378	}
2379	set argument_count [llength $args]
2380	if {$argument_count != 1} {
2381		fail "Invalid number of arguments ($argument_count): $args"
2382	} else {
2383		lassign $args job_name
2384	}
2385
2386	log_debug "Waiting for all jobs to terminate"
2387	set jobs_found   -1
2388	set jobs_desired  0
2389
2390	wait_for -timeout $timeout -pollinterval $poll_interval {$jobs_found == $jobs_desired} {
2391		set jobs_found 0
2392		log_user 0
2393		spawn $squeue -o %j -n $job_name
2394		expect {
2395			-re "$job_name" {
2396				incr jobs_found
2397				exp_continue
2398			}
2399			timeout {
2400				fail "No response from squeue"
2401			}
2402			eof {
2403				wait
2404			}
2405		}
2406		log_user 1
2407		if {$jobs_found != $jobs_desired} {
2408			log_debug "Still $jobs_found jobs remaining"
2409		}
2410	}
2411	if {$jobs_found == $jobs_desired} {
2412		log_debug "All jobs complete"
2413		return $::RETURN_SUCCESS
2414	} else {
2415		log_debug "Cancelling uncompleted jobs"
2416		run_command "$scancel -n $job_name"
2417		return $::RETURN_ERROR
2418	}
2419}
2420
2421
2422################################################################
2423#
2424# NAME
2425#	wait_job_reason - waits for a desired job state and reason
2426#
2427# SYNOPSIS
2428#	wait_job_reason ?options? job_id ?desired_state? ?desired_reason_list?
2429#
2430# DESCRIPTION
2431#	Wait until the job is in desired state and reason is one
2432#	of the desired ones or until the timeout.
2433#
2434# OPTIONS
2435#	-timeout <integer_number>
2436#		time in seconds to wait for the job state and reason before
2437#		timing out (default is 360)
2438#	-pollinterval <integer_number>
2439#		time in seconds between each job state check (default is 1)
2440#
2441# ARGUMENTS
2442#	job_id
2443#		The job to wait for
2444#	desired_state
2445#		Desired state.
2446#	desired_reason_list
2447#		List of desired reasons. Empty list means that any reason
2448#		is ok.
2449#
2450# RETURN VALUE
2451#	RETURN_SUCCESS when job is in the desired state and reason is one
2452#	of the desired ones, or non-zero otherwise.
2453#
2454################################################################
2455
2456proc wait_job_reason args {
2457	global scontrol re_word_str
2458
2459	set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE
2460	                 OUT_OF_MEMORY"
2461
2462	set timeout       360
2463	set poll_interval 1
2464	set desired_state "PENDING"
2465	set desired_reason_list ""
2466	while {[llength $args]} {
2467		switch -glob -- [lindex $args 0] {
2468			-time*  {set args [lassign $args - timeout]}
2469			-poll*  {set args [lassign $args - poll_interval]}
2470			-*      {fail "Unknown option: [lindex $args 0]"}
2471			default break
2472		}
2473	}
2474	set argument_count [llength $args]
2475	if {$argument_count < 1} {
2476		fail "Too few arguments ($argument_count): $args"
2477	} else {
2478		lassign $args job_id
2479	}
2480	if {$argument_count >= 2} { set desired_state [lindex $args 1] }
2481	if {$argument_count == 3} { set desired_reason_list [lindex $args 2] }
2482	if {$argument_count > 3} {
2483		fail "Too many arguments ($argument_count): $args"
2484	}
2485
2486	set log_user_prev [log_user -info]
2487	log_user 0
2488
2489	set my_delay 0
2490	set rc $::RETURN_ERROR
2491	while true {
2492		set pending 0
2493		set has_reason 1
2494		spawn $scontrol show job $job_id
2495		expect {
2496			-re "JobState=($re_word_str) Reason=(\\S+)" {
2497				set job_state $expect_out(1,string)
2498				set job_reason $expect_out(2,string)
2499			}
2500			timeout {
2501				log_error "No response from scontrol show job"
2502				set rc $::RETURN_TIMEOUT
2503				break
2504			}
2505		}
2506
2507		# Check if both state and reason are the desired ones
2508		if {$job_state == $desired_state} {
2509			set found 0
2510			if {$desired_reason_list == ""} {
2511				set found 1
2512			}
2513			foreach desired_reason $desired_reason_list {
2514				if {$job_reason == $desired_reason } {
2515					set found 1
2516				}
2517			}
2518			if {$found} {
2519				set rc $::RETURN_SUCCESS
2520				break
2521			}
2522		} elseif {[lsearch -exact final_state $job_state] >= 0} {
2523			# Job is in final step no need to wait longer
2524			log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
2525			                  $job_state $job_reason \
2526			                  $desired_state $desired_reason_list]
2527			set rc $::RETURN_ERROR
2528			break
2529		}
2530
2531		# Check if this was the last poll
2532		if {$my_delay > $timeout} {
2533			log_error "Timeout"
2534			set rc $::RETURN_TIMEOUT
2535			break
2536		}
2537		set remamining_sec [expr $timeout - $my_delay]
2538		log_debug [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
2539		                  $job_state $job_reason \
2540		                  $desired_state $desired_reason_list]
2541		log_debug [format "Polling again in %ss, %ss to timeout." \
2542		                  $poll_interval $remamining_sec]
2543
2544		sleep $poll_interval
2545		set my_delay [expr $my_delay + $poll_interval]
2546	}
2547
2548	log_user $log_user_prev
2549	return $rc
2550}
2551
2552
2553################################################################
2554#
2555# NAME
2556#	get_config - returns a dictionary of slurm configuration parameters
2557#
2558# SYNOPSIS
2559#	get_config ?options?
2560#
2561# OPTIONS
2562#	-dbd
2563#		uses `sacctmgr show config` to return slurmdbd configuration
2564#		parameters
2565#	-slurm
2566#		uses `scontrol show config` to return slurm configuration
2567#		parameters (this is the default)
2568#
2569# RETURN VALUE
2570#	Returns a dictionary of parameter values
2571#
2572################################################################
2573
2574proc get_config args {
2575	global sacctmgr scontrol
2576
2577	set command "$scontrol"
2578	while {[llength $args]} {
2579		switch -glob -- [lindex $args 0] {
2580			-slurm  {set command "$scontrol"; set args [lrange $args 1 end]}
2581			-dbd    {set command "$sacctmgr"; set args [lrange $args 1 end]}
2582			-*      {fail "Unknown option: [lindex $args 0]"}
2583			default break
2584		}
2585	}
2586	if {[llength $args] > 0} {
2587		fail "[lindex [info level 0] 0]: No arguments allowed: $args"
2588	}
2589
2590	set output [run_command_output -fail -nolog "$command show config"]
2591
2592	foreach line [split $output "\n"] {
2593		if {[regexp {^(\S+) += (.*)$} $line {} param_name param_value] == 1} {
2594			dict set config_dict $param_name $param_value
2595		}
2596	}
2597
2598	return $config_dict
2599}
2600
2601
2602################################################################
2603#
2604# NAME
2605#	get_config_param - returns a slurm configuration parameter value
2606#
2607# SYNOPSIS
2608#	get_config_param ?options? parameter_name
2609#
2610# OPTIONS
2611#	-dbd
2612#		uses `sacctmgr show config` to return the specified slurmdbd
2613#		configuration parameter value
2614#	-slurm
2615#		uses `scontrol show config` to return the specified slurm
2616#		configuration parameter value (this is the default)
2617#
2618# ARGUMENTS
2619#	parameter_name
2620#		the parameter to return the value for
2621#
2622# DESCRIPTION
2623#	Returns a specific configuration parameter value.
2624#
2625# RETURN VALUE
2626#	Returns the value of the specified parameter or MISSING if it does not
2627#	exist.
2628#
2629################################################################
2630
2631proc get_config_param args {
2632
2633	set options [list]
2634	while {[llength $args]} {
2635		switch -glob -- [lindex $args 0] {
2636			-* {
2637				lappend options [lindex $args 0]
2638				set args [lrange $args 1 end]
2639			}
2640			default break
2641		}
2642	}
2643	if {[llength $args] == 1} {
2644		lassign $args parameter_name
2645	} else {
2646		fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args"
2647	}
2648
2649	set config_dict [get_config {*}$options]
2650
2651	if [dict exists $config_dict $parameter_name] {
2652		return [dict get $config_dict $parameter_name]
2653	} else {
2654		return "MISSING"
2655	}
2656}
2657
2658
2659################################################################
2660#
2661# NAME
2662#	param_contains - test whether a comma-separated-list contains a specified value
2663#
2664# SYNOPSIS
2665#	param_contains haystack needle
2666#
2667# DESCRIPTION
2668#	Searches for the specified value (needle) in the comma-separated-list
2669#	string (haystack). Needle can be a glob-style pattern.
2670#
2671# RETURN VALUE
2672#	Returns a boolean value indicating whether the value (needle) was found
2673#	in the comma-separated-list string (haystack)
2674#
2675################################################################
2676
2677proc param_contains { haystack needle } {
2678	if {[lsearch [split $haystack ","] $needle] != -1} {
2679		return true
2680	} else {
2681		return false
2682	}
2683}
2684
2685
2686################################################################
2687#
2688# NAME
2689#	param_value - returns the value of a parameter in a comma-separated-list
2690#
2691# SYNOPSIS
2692#	param_value params_list param ?default?
2693#
2694# DESCRIPTION
2695#	Searches for the specified param in the comma-separated-list
2696#	string (params_list) and returns its value.
2697#
2698# RETURN VALUE
2699#	Returns the value found or the optional default value if not found.
2700#	If the param is found without a value, returns true (ie like
2701#	param_contains).
2702#
2703################################################################
2704
2705proc param_value {params_list param {default false}} {
2706	global re_word_str
2707	foreach pair [split $params_list ","] {
2708		if {[regexp "$param" $pair] == 1} {
2709			if {[regexp "$param=($re_word_str)" $pair - value] == 1} {
2710				return $value
2711			} else {
2712				return true
2713			}
2714		}
2715	}
2716	return $default
2717}
2718
2719
2720################################################################
2721#
2722# NAME
2723#	get_affinity_types - gets the task plugins running with task/ stripped
2724#
2725# SYNOPSIS
2726#	get_affinity_types
2727#
2728# RETURN VALUE
2729#	Returns comma separated list of task plugins running without the task/
2730#
2731################################################################
2732
2733proc get_affinity_types { } {
2734	global scontrol re_word_str
2735
2736	log_user 0
2737	set affinity ""
2738	spawn $scontrol show config
2739	expect {
2740		-re "TaskPlugin *= ($re_word_str)" {
2741			set parts [split $expect_out(1,string) ",/"]
2742			while 1 {
2743				set task_found [lsearch $parts "task"]
2744				if { $task_found == -1 } break
2745				set parts [lreplace $parts $task_found $task_found]
2746			}
2747			set affinity [join $parts ","]
2748			exp_continue
2749		}
2750		eof {
2751			wait
2752		}
2753	}
2754	log_user 1
2755
2756	return $affinity
2757}
2758
2759
2760################################################################
2761#
2762# NAME
2763#	get_mps_count_by_index - gets the count of a specific gres/mps device
2764#
2765# SYNOPSIS
2766#	get_mps_count_by_index index hostname
2767#
2768# RETURN VALUE
2769#	Returns the Count of a specific gres/mps device
2770#
2771################################################################
2772
2773proc get_mps_count_by_index { index hostname } {
2774	global slurmd number re_word_str
2775
2776	log_user 0
2777	set count 0
2778	spawn $slurmd -G -N $hostname
2779	expect {
2780		-re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" {
2781			set count $expect_out(1,string)
2782			exp_continue
2783		}
2784		eof {
2785			wait
2786		}
2787	}
2788	log_user 1
2789
2790	return $count
2791}
2792
2793
2794################################################################
2795#
2796# NAME
2797#	check_bb_emulate - determines if Cray burst buffers API is emulated
2798#
2799# SYNOPSIS
2800#	check_bb_emulate
2801#
2802# RETURN VALUE
2803#	Returns true if Cray burst buffers API is emulated, false otherwise
2804#
2805################################################################
2806
2807proc check_bb_emulate { } {
2808	global scontrol
2809
2810	log_user 0
2811	set bb_emulate false
2812	spawn $scontrol show burst
2813	expect {
2814		-re "EmulateCray" {
2815			set bb_emulate true
2816			exp_continue
2817		}
2818		eof {
2819			wait
2820		}
2821	}
2822	log_user 1
2823	return $bb_emulate
2824}
2825
2826
2827################################################################
2828#
2829# NAME
2830#	check_bb_persistent - determines if persistent burst buffers can be created by users
2831#
2832# SYNOPSIS
2833#	check_bb_persistent
2834#
2835# RETURN VALUE
2836#	Returns true if Cray burst buffers can be created by users,
2837#	false otherwise
2838#
2839################################################################
2840
2841proc check_bb_persistent { } {
2842	global scontrol
2843
2844	log_user 0
2845	set bb_persistent false
2846	spawn $scontrol show burst
2847	expect {
2848		-re "EnablePersistent" {
2849			set bb_persistent true
2850			exp_continue
2851		}
2852		eof {
2853			wait
2854		}
2855	}
2856	log_user 1
2857	return $bb_persistent
2858}
2859
2860
2861################################################################
2862#
2863# NAME
2864#	get_bb_types - gets the burst buffer plugins running with task/ stripped
2865#
2866# SYNOPSIS
2867#	get_bb_types
2868#
2869# RETURN VALUE
2870#	Returns comma separated list of task plugins running without the task/
2871#
2872################################################################
2873
2874proc get_bb_types { } {
2875	global scontrol re_word_str
2876
2877	log_user 0
2878	set bb_types ""
2879	spawn $scontrol show config
2880	expect {
2881		-re "BurstBufferType *= ($re_word_str)" {
2882			set parts [split $expect_out(1,string) ",/"]
2883			while 1 {
2884				set task_found [lsearch $parts "burst_buffer"]
2885				if { $task_found == -1 } break
2886				set parts [lreplace $parts $task_found $task_found]
2887			}
2888			set bb_types [join $parts ","]
2889			exp_continue
2890		}
2891		eof {
2892			wait
2893		}
2894	}
2895
2896	log_user 1
2897	return $bb_types
2898}
2899
2900
2901################################################################
2902#
2903# NAME
2904#	get_default_acct - gets user's default account
2905#
2906# SYNOPSIS
2907#	get_default_acct user
2908#
2909# RETURN VALUE
2910#	Returns name of default account if exists, NULL otherwise
2911#
2912################################################################
2913
2914proc get_default_acct { user } {
2915	global sacctmgr re_word_str bin_id
2916
2917	log_user 0
2918	set def_acct ""
2919
2920	if { !$user } {
2921		set user [get_my_user_name]
2922	}
2923
2924	spawn $sacctmgr -n list -P user $user format="DefaultAccount"
2925	expect {
2926		-re "($re_word_str)" {
2927			set def_acct $expect_out(1,string)
2928			exp_continue
2929		}
2930		eof {
2931			wait
2932		}
2933	}
2934	log_user 1
2935
2936	return $def_acct
2937}
2938
2939
2940################################################################
2941#
2942# NAME
2943#	get_cycle_count - get desired iteration count
2944#
2945# SYNOPSIS
2946#	get_cycle_count
2947#
2948# DESCRIPTION
2949#	For tests with iteration counts (e.g. test9.1, test9.2)
2950#	return the desired iteration count
2951#
2952# RETURN VALUE
2953#	Returns desired iteration count
2954#
2955################################################################
2956
2957proc get_cycle_count { } {
2958	global enable_memory_leak_debug
2959
2960	if {$enable_memory_leak_debug != 0} {
2961		return 2
2962	}
2963	return 100
2964}
2965
2966
2967################################################################
2968#
2969# NAME
2970#	get_select_type_params - determines SelectTypeParameters being used for a given partition
2971#
2972# SYNOPSIS
2973#	get_select_type_params ?partition?
2974#
2975# DESCRIPTION
2976#	Determine SelectTypeParameters being used for a given partition.
2977#	If the partition is not specified, the default partition will be used.
2978#
2979# RETURN VALUE
2980#	Returns a string containing SelectTypeParameters
2981#
2982################################################################
2983
2984proc get_select_type_params { {partition ""} } {
2985	global scontrol bin_bash bin_grep re_word_str
2986
2987	log_user 0
2988	set params ""
2989
2990	if {[string length $partition] == 0} {
2991		set partition [default_partition]
2992	}
2993
2994	if {[string compare $partition ""]} {
2995		spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters"
2996		expect {
2997			-re "SelectTypeParameters *= *NONE" {
2998				exp_continue
2999			}
3000			-re "SelectTypeParameters *= *($re_word_str)" {
3001				set params $expect_out(1,string)
3002				exp_continue
3003			}
3004			eof {
3005				wait
3006			}
3007		}
3008	}
3009	if { [string compare params ""] } {
3010		spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters"
3011		expect {
3012			-re "SelectTypeParameters *= *($re_word_str)" {
3013				set params $expect_out(1,string)
3014				exp_continue
3015			}
3016			eof {
3017				wait
3018			}
3019		}
3020	}
3021	log_user 1
3022
3023	return $params
3024}
3025
3026
3027################################################################
3028#
3029# NAME
3030#	check_config_select - checks if effectively using the select type
3031#
3032# SYNOPSIS
3033#	check_config_select type
3034#
3035# DESCRIPTION
3036#	Determine if SelectType is equivalent to the passed one by also
3037#	checking other_cons_res and other_cons_tres on SelectTypeParameters
3038#	in case that select/cray_aries is configured.
3039#
3040# ARGUMENTS
3041#	type
3042#		the desired SelectType to check (e.g. cons_tres)
3043#
3044# RETURN VALUE
3045#	Returns true if configured, false otherwise
3046#
3047################################################################
3048
3049proc check_config_select { type } {
3050	set select_type [get_config_param "SelectType"]
3051	set select_type_parameters [get_config_param "SelectTypeParameters"]
3052
3053	if {$select_type eq "select/$type"} {
3054		return true
3055	}
3056	if {$select_type eq "select/cray_aries"} {
3057		if {$type eq "linear" &&
3058		    ![param_contains $select_type_parameters "other_cons_res"] &&
3059		    ![param_contains $select_type_parameters "other_cons_tres"]} {
3060			return true
3061		}
3062		if {$type eq "cons_res" &&
3063		    [param_contains $select_type_parameters "other_cons_res"]} {
3064			return true
3065		}
3066		if {$type eq "cons_tres" &&
3067		    [param_contains $select_type_parameters "other_cons_tres"]} {
3068			return true
3069		}
3070	}
3071
3072	return false
3073}
3074
3075
3076################################################################
3077#
3078# NAME
3079#	get_total_cpus - gets the total amount of CPUs on the default partition
3080#
3081# SYNOPSIS
3082#	get_total_cpus
3083#
3084# RETURN VALUE
3085#	The total amount of CPUs on the default partition.
3086#
3087# NOTE
3088#	CoreSpecCount are not part of the total.
3089#
3090################################################################
3091
3092proc get_total_cpus {} {
3093	global sinfo scontrol re_word_str
3094
3095	set partition [default_partition]
3096	set cpu_cnt 0
3097	set nodes_matched [list]
3098
3099	set re_with_cs    "NodeName=($re_word_str).*CPUTot=(\\d+).*CoreSpecCount=(\\d+).*ThreadsPerCore=(\\d+)"
3100	set re_without_cs "NodeName=($re_word_str).*CPUTot=(\\d+)"
3101
3102	set out [run_command_output -fail "$sinfo -h -o \"%P %N\" -p $partition --state=idle"]
3103	if {[regexp "$partition\\* ($re_word_str)" $out - def_hostlist] != 1} {
3104		fail "Not able to get the hostlist from sinfo"
3105	}
3106
3107	set out [run_command_output "$scontrol --oneliner show node $def_hostlist"]
3108	foreach {re node_name tmp_cpu_cnt core_spec_cnt threads_per_core} [regexp -all -inline -linestop $re_with_cs $out] {
3109		set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt - $core_spec_cnt * $threads_per_core]
3110		lappend nodes_matched $node_name
3111	}
3112	foreach {re node_name tmp_cpu_cnt} [regexp -all -inline -linestop $re_without_cs $out] {
3113		# Not count nodes already counted
3114		if {[lsearch $nodes_matched $node_name] == -1} {
3115			set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt]
3116		}
3117	}
3118
3119	return $cpu_cnt
3120}
3121
3122
3123################################################################
3124#
3125# NAME
3126#	is_super_user - determines if user is root or SlurmUser
3127#
3128# SYNOPSIS
3129#	is_super_user ?user?
3130#
3131# DESCRIPTION
3132#	Determine if user is a Slurm super user (i.e. user
3133#	root or configured SlurmUser)
3134#
3135# RETURN VALUE
3136#	true is user is root or SlurmUser, false otherwise
3137#
3138################################################################
3139
3140proc is_super_user {{user ""}} {
3141	global number
3142
3143	if {$user == ""} {
3144		set user [get_my_user_name]
3145	}
3146
3147	# Check if user is root
3148	if {[string compare $user "root"] == 0} {
3149		return true
3150	}
3151
3152	# Check if user is SlurmUser
3153	set slurm_user [get_config_param "SlurmUser"]
3154	if {[regexp "${user}\\($number\\)" $slurm_user match]} {
3155		return true
3156	}
3157
3158	return false
3159}
3160
3161
3162################################################################
3163#
3164# NAME
3165#	dec2hex - creates a 32 bit hex number from a signed decimal number
3166#
3167# SYNOPSIS
3168#	dec2hex value
3169#
3170# DESCRIPTION
3171#	Create a 32 bit hex number from a signed decimal number
3172#
3173# RETURN VALUE
3174#	32 bit hex version of input 'value'
3175#
3176# SOURCE
3177#	Courtesy of Chris Cornish
3178#	http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
3179#
3180################################################################
3181# Replace all non-decimal characters
3182
3183proc dec2hex {value} {
3184	regsub -all {[^0-x\.-]} $value {} newtemp
3185	set value [string trim $newtemp]
3186	if {$value < 2147483647 && $value > -2147483648} {
3187		set tempvalue [format "%#010X" [expr $value]]
3188		return [string range $tempvalue 2 9]
3189	} elseif {$value < -2147483647} {
3190		return "80000000"
3191	} else {
3192		return "7FFFFFFF"
3193	}
3194}
3195
3196
3197################################################################
3198#
3199# NAME
3200#	uint2hex - creates a 32 bit hex number from an unsigned decimal
3201#
3202# SYNOPSIS
3203#	uint2hex value
3204#
3205# DESCRIPTION
3206#	Create a 32 bit hex number from an unsigned decimal number.
3207#
3208# ARGUMENTS
3209#	value
3210#		unsigneddecimal number to convert
3211#
3212# RETURN VALUE
3213#	32 bit hex version of input 'value'
3214#
3215# SOURCE
3216#	Courtesy of Chris Cornish
3217#	http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
3218#
3219################################################################
3220# Replace all non-decimal characters
3221
3222proc uint2hex {value} {
3223	regsub -all {[^0-x\.-]} $value {} newtemp
3224	set value [string trim $newtemp]
3225	if {$value <= 4294967295 && $value >= 0} {
3226		set tempvalue [format "%#010X" [expr $value]]
3227		return [string range $tempvalue 2 9]
3228	} else {
3229		return "FFFFFFFF"
3230	}
3231}
3232
3233
3234################################################################
3235#
3236# NAME
3237#	partition_oversubscribe - determines the oversubscribe configuration of the specified partition
3238#
3239# SYNOPSIS
3240#	partition_oversubscribe ?partition?
3241#
3242# DESCRIPTION
3243#	Determine the oversubscribe configuration of the specified partition.
3244#	If the partition is not specified, the default partition will be used.
3245#
3246# RETURN VALUE
3247#	Return the oversubscribe configuration of the specified partition.
3248#
3249################################################################
3250
3251proc partition_oversubscribe { {partition ""} } {
3252	global sinfo
3253
3254	if {[string length $partition] == 0} {
3255		set partition [default_partition]
3256	}
3257
3258	set oversubscribe "NO"
3259	log_debug "$sinfo --noheader --partition $partition --format %h"
3260	set fd [open "|$sinfo --noheader --partition $partition --format %h"]
3261	gets $fd line
3262	catch {close $fd}
3263	regexp {[a-zA-Z]+} $line oversubscribe
3264	return $oversubscribe
3265}
3266
3267
3268################################################################
3269#
3270# NAME
3271#	default_partition - determines the name of the default partition
3272#
3273# SYNOPSIS
3274#	default_partition
3275#
3276# DESCRIPTION
3277#	Use scontrol to determine the name of the default partition
3278#
3279# RETURN VALUE
3280#	Name of the current default partition
3281#
3282################################################################
3283
3284proc default_partition {} {
3285	global scontrol
3286
3287	set name ""
3288	set fd [open "|$scontrol --all --oneliner show partition"]
3289	while {[gets $fd line] != -1} {
3290		if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag name]
3291				== 1} {
3292			break
3293		}
3294	}
3295	catch {close $fd}
3296
3297	if {[string length $name] == 0} {
3298		log_error "Could not identify the default partition"
3299	}
3300
3301	return $name
3302}
3303
3304
3305################################################################
3306#
3307# NAME
3308#	default_part_exclusive - determines if the default partition allocates whole nodes to jobs
3309#
3310# SYNOPSIS
3311#	default_part_exclusive
3312#
3313# DESCRIPTION
3314#	Use scontrol to determine if the default partition
3315#	allocates whole nodes to jobs
3316#
3317# RETURN VALUE
3318#	Name of the current default partition
3319#
3320################################################################
3321
3322proc default_part_exclusive {} {
3323	set def_part [default_partition]
3324	set oversubscribe [partition_oversubscribe $def_part]
3325	if {[string compare $oversubscribe "EXCLUSIVE"] == 0} {
3326		return 1
3327	} else {
3328		return 0
3329	}
3330}
3331
3332
3333################################################################
3334#
3335# NAME
3336#	make_bash_script - creates a bash script
3337#
3338# SYNOPSIS
3339#	make_bash_script script_name script_contents
3340#
3341# DESCRIPTION
3342#	Create a bash script of name "script_name", and
3343#	make the body of the script "script_contents".
3344#	make_bash_script removes the file if it already exists,
3345#	then generates the #! line, and then dumps "script_contents"
3346#	to the file.  Finally, it makes certain that the script
3347#	is executable.
3348#
3349# ARGUMENTS
3350#	script_name
3351#		file name for the bash script
3352#	script_contents
3353#		body of the script, not including the initial #! line.
3354#
3355# RETURN VALUE
3356#	Nothing.
3357#
3358################################################################
3359
3360proc make_bash_script { script_name script_contents } {
3361	global bin_bash bin_chmod
3362
3363	file delete $script_name
3364	set fd [open $script_name "w"]
3365	puts $fd "#!$bin_bash"
3366	puts $fd $script_contents
3367	close $fd
3368	exec $bin_chmod 700 $script_name
3369}
3370
3371
3372################################################################
3373#
3374# NAME
3375#	get_suffix - given a hostname, returns it's numeric suffix
3376#
3377# SYNOPSIS
3378#	get_suffix hostname
3379#
3380# DESCRIPTION
3381#	Given a hostname, return it's numeric suffix
3382#
3383# RETURN VALUE
3384#	numerical suffix for input 'hostname' or -1 if not a number
3385#
3386################################################################
3387
3388proc get_suffix { hostname } {
3389	set host_len [string length $hostname]
3390	set host_inx [expr $host_len-1]
3391	set host_char [string index $hostname $host_inx]
3392	if {[string compare $host_char "0"] < 0 || [string compare $host_char "9"] > 0} {
3393		return -1
3394	}
3395
3396	for {set host_inx [expr $host_len-1]} {$host_inx >= 0} {incr host_inx -1} {
3397		set host_char [string index $hostname $host_inx]
3398		if {[string compare $host_char "0"] < 0} { break }
3399		if {[string compare $host_char "9"] > 0} { break }
3400	}
3401	incr host_inx
3402
3403	if {$host_inx == $host_len} {
3404		log_warn "Hostname lacks a suffix: $hostname"
3405		return "-1"
3406	}
3407
3408#	Strip off leading zeros to avoid doing octal arithmetic
3409	set suffix [string range $hostname $host_inx $host_len]
3410	set suffix_len [string length $suffix]
3411	for {set suffix_inx 0} {$suffix_inx < [expr $suffix_len - 1]} {incr suffix_inx} {
3412		set suffix_char [string index $suffix $suffix_inx]
3413		if {[string compare $suffix_char "0"] != 0} { break }
3414	}
3415
3416	return [string range $suffix $suffix_inx $suffix_len]
3417}
3418
3419
3420################################################################
3421#
3422# NAME
3423#	check_acct_associations - checks associations
3424#
3425# SYNOPSIS
3426#	check_acct_associations
3427#
3428# DESCRIPTION
3429#	Use sacctmgr to check associations
3430#
3431# RETURN VALUE
3432#	true if no error is found, false otherwise
3433#
3434################################################################
3435
3436proc check_acct_associations { } {
3437	global sacctmgr number re_word_str
3438
3439	set rc true
3440	log_user 0
3441	log_debug "Sanity-Checking Associations"
3442	#
3443	# Use sacctmgr to check associations
3444	#
3445	set s_pid [spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster]
3446	expect {
3447	       -re "($number)\\|($number)\\|($re_word_str)\\|" {
3448		      # Here we are checking if we have duplicates and
3449		      # setting up an array to check for holes later
3450
3451		      set cluster $expect_out(3,string)
3452		      if { ![info exists c_min($cluster)] } {
3453			      set c_min($cluster) -1
3454			      set c_max($cluster) -1
3455		      }
3456
3457		      set num1 $expect_out(1,string)
3458		      set num2 $expect_out(2,string)
3459		      set first [info exists found($cluster,$num1)]
3460		      set sec [info exists found($cluster,$num2)]
3461		      #log_debug "$first=$num1 $sec=$num2"
3462		      if { $first } {
3463			     log_error "$cluster found lft $num1 again"
3464			     set rc false
3465		      } elseif { $sec } {
3466			     log_error "$cluster found rgt $num2 again"
3467			     set rc false
3468		      } else {
3469			     set found($cluster,$num1) 1
3470			     set found($cluster,$num2) 1
3471			     if { $c_min($cluster) == -1
3472				  || $c_min($cluster) > $num1 } {
3473				    set c_min($cluster) $num1
3474			     }
3475			     if { $c_max($cluster) == -1
3476				  || $c_max($cluster) < $num2 } {
3477				    set c_max($cluster) $num2
3478			     }
3479		      }
3480		      exp_continue
3481	       }
3482	       timeout {
3483			slow_kill $s_pid
3484			fail "sacctmgr add not responding"
3485	       }
3486	       eof {
3487		      wait
3488	       }
3489	}
3490
3491	foreach cluster [array names c_min] {
3492		# Here we are checking for holes in the list from above
3493		for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} {
3494			if { ![info exists found($cluster,$inx)] } {
3495				log_error "$cluster No index at $inx"
3496				set rc false
3497			}
3498		}
3499	}
3500	log_user 1
3501	return $rc
3502}
3503
3504
3505################################################################
3506#
3507# NAME
3508#	get_job_acct_freq - gets the value of the job account gather frequency
3509#
3510# SYNOPSIS
3511#	get_job_acct_freq
3512#
3513# RETURN VALUE
3514#	job account gather frequency
3515#
3516################################################################
3517
3518proc get_job_acct_freq { } {
3519	global scontrol number
3520
3521	log_user 0
3522	set freq_val 0
3523
3524	spawn $scontrol show config
3525	expect {
3526		-re "JobAcctGatherFrequency *= ($number)" {
3527			set freq_val $expect_out(1,string)
3528			if {$freq_val == 0} {
3529				set freq_val 0
3530			}
3531		}
3532		-re "JobAcctGatherFrequency *= task=($number)" {
3533			set freq_val $expect_out(1,string)
3534			if {$freq_val == 0} {
3535				set freq_val 0
3536			}
3537		}
3538		eof {
3539			wait
3540		}
3541	}
3542
3543	log_user 1
3544	return $freq_val
3545}
3546
3547
3548################################################################
3549#
3550# NAME
3551#	get_admin_level - gets the AdminLevel of the user
3552#
3553# SYNOPSIS
3554#	get_admin_level ?user?
3555#
3556# RETURN VALUE
3557#	AdminLevel for the current user
3558#
3559################################################################
3560
3561proc get_admin_level {{user_name ""}} {
3562	global sacctmgr re_word_str re_word_str bin_id
3563
3564	set admin_level ""
3565
3566	if {$user_name == ""} {
3567		set user_name [get_my_user_name]
3568		if { ![string length $user_name] } {
3569			log_error "No name returned from id"
3570			return ""
3571		}
3572	}
3573
3574	if {[is_super_user $user_name]} {
3575		return "Administrator"
3576	}
3577
3578	#
3579	# Use sacctmgr to check admin_level
3580	#
3581	log_user 0
3582	set s_pid [spawn $sacctmgr -n -P list user $user_name format=admin]
3583	expect {
3584		-re "($re_word_str)" {
3585		      set admin_level $expect_out(1,string)
3586		      exp_continue
3587	       }
3588	       timeout {
3589		      slow_kill $s_pid
3590		      fail "sacctmgr add not responding"
3591	       }
3592	       eof {
3593		      wait
3594	       }
3595	}
3596	log_user 1
3597
3598	return $admin_level
3599}
3600
3601
3602###############################################################
3603#
3604# NAME
3605#	change_subbp_state - sets sub mid plane state
3606#
3607# SYNOPSIS
3608#	change_subbp_state node ionodes state
3609#
3610# RETURN VALUE
3611#	Returns SUCCESS if state of mid plane is changed
3612#
3613###############################################################
3614
3615proc change_subbp_state { node ionodes state } {
3616	global scontrol smap
3617
3618	set return_code 0
3619
3620	set my_pid [spawn $scontrol update subbpname=$node\[$ionodes\] state=$state]
3621	expect {
3622		-re "slurm_update error:" {
3623			set return_code 1
3624			exp_continue
3625		}
3626		-re "Unable to contact" {
3627			log_error "Slurm appears to be down"
3628			exp_continue
3629		}
3630		timeout {
3631			log_error "scontrol not responding"
3632			slow_kill $my_pid
3633			set return_code 1
3634		}
3635		eof {
3636			wait
3637		}
3638	}
3639
3640	if { $return_code } {
3641		return $return_code
3642	}
3643
3644	set match 0
3645	set my_pid [spawn $smap -Db -c -h -n $node -I $ionodes]
3646	expect {
3647		-nocase -re "$state" {
3648			incr match
3649			exp_continue
3650		}
3651		-re "$node" {
3652			incr match
3653			exp_continue
3654		}
3655		-re "Unable to contact" {
3656			log_error "Slurm appears to be down"
3657			exp_continue
3658		}
3659		timeout {
3660			log_error "smap not responding"
3661			slow_kill $my_pid
3662			set return_code 1
3663		}
3664		eof {
3665			wait
3666		}
3667	}
3668
3669	if {$match != 2} {
3670		log_error "Subbp did not go into $state state. $match"
3671		set return_code 1
3672	}
3673
3674	return $return_code
3675}
3676
3677
3678#################################################
3679#
3680# NAME
3681#	scale_to_megs - scales the value by the factor T|G|M to megabytes
3682#
3683# SYNOPSIS
3684#	scale_to_megs value factor
3685#
3686# DESCRIPTION
3687#	scale the value by the factor T|G|M to megabytes
3688#
3689# RETURN VALUE
3690#	the scaled variable
3691#
3692#################################################
3693
3694proc scale_to_megs { value factor } {
3695
3696	if {[string compare $factor "T"] == 0} {
3697		set value [expr $value * 1024 * 1024]
3698	} elseif {[string compare $factor "G"] == 0} {
3699		set value [expr $value * 1024]
3700	} elseif {[string compare $factor "M"] == 0} {
3701		set value [expr $value * 1]
3702	} elseif {[string compare $factor "K"] == 0} {
3703		set value [expr $value / 1024]
3704		set value [expr {round($value)}]
3705	} else {
3706		set value [expr $value / (1024 * 1024)]
3707		set value [expr {round($value)}]
3708	}
3709
3710	return $value
3711}
3712
3713
3714#################################################
3715#
3716# NAME
3717#	scale_to_ks - scales the value by the factor G|M|K to kilobytes
3718#
3719# SYNOPSIS
3720#	scale_to_ks value factor
3721#
3722# DESCRIPTION
3723#	scale the value by the factor G|M|K to kilobytes
3724#
3725# RETURN VALUE
3726#	the scaled variable
3727#
3728#################################################
3729
3730proc scale_to_ks { value factor } {
3731
3732	if {[string compare $factor "G"] == 0} {
3733		set value [expr $value * 1024 * 1024]
3734	} elseif {[string compare $factor "M"] == 0} {
3735		set value [expr $value * 1024]
3736	} elseif {[string compare $factor "K"] == 0} {
3737		set value [expr $value * 1]
3738	} else {
3739		set value [expr $value / 1024]
3740		set value [expr {round($value)}]
3741	}
3742
3743	return $value
3744}
3745
3746
3747############################################################
3748#
3749# NAME
3750#	check_config_node_mem - checks that the nodes have memory configured
3751#
3752# SYNOPSIS
3753#	check_config_node_mem
3754#
3755# RETURN VALUE
3756#	true if all nodes have memory, false otherwise
3757#
3758############################################################
3759
3760proc check_config_node_mem { } {
3761
3762	set nodes_dict [get_nodes]
3763	dict for {node_name node_dict} $nodes_dict {
3764		if [dict exists $node_dict "RealMemory"] {
3765			if {[dict get $node_dict "RealMemory"] == 1} {
3766				return false
3767			}
3768		} else {
3769			log_warn "Parameter RealMemory not found on node $node_name"
3770			return false
3771		}
3772	}
3773
3774	return true
3775}
3776
3777
3778################################################################
3779#
3780# NAME
3781#	wait_for_node - waits for nodes in a partition to reach a certain state
3782#
3783# SYNOPSIS
3784#	wait_for_node ?options? state num_nodes ?partition?
3785#
3786# DESCRIPTION
3787#	Wait for a certain number of nodes in a partition to reach a certain
3788#	state.
3789#
3790# OPTIONS
3791#	-timeout <integer_number>
3792#		time in seconds to wait for the node state before
3793#		timing out (default is 3)
3794#	-pollinterval <integer_number>
3795#		time in seconds between each node state check (default is 1)
3796#
3797# ARGUMENTS
3798#	state
3799#		The node state to wait for
3800#	num_nodes
3801#		The number of nodes we want to be in the specified state
3802#	partition
3803#		Partition name (the default partition is used if not specified)
3804#
3805# RETURN VALUE
3806#	RETURN_SUCCESS, or non-zero on failure
3807#
3808################################################################
3809
3810proc wait_for_node args {
3811	global sinfo number
3812
3813	set partition     ""
3814	set timeout       3
3815	set poll_interval 1
3816	set desired_state "PENDING"
3817	set desired_reason_list ""
3818	while {[llength $args]} {
3819		switch -glob -- [lindex $args 0] {
3820			-time*  {set args [lassign $args - timeout]}
3821			-poll*  {set args [lassign $args - poll_interval]}
3822			-*      {fail "Unknown option: [lindex $args 0]"}
3823			default break
3824		}
3825	}
3826	set argument_count [llength $args]
3827	if {$argument_count < 2} {
3828		fail "Too few arguments ($argument_count): $args"
3829	} else {
3830		lassign $args state num_nodes
3831	}
3832	if {$argument_count == 3} { set partition [lindex $args 2] }
3833	if {$argument_count > 3} {
3834		fail "Too many arguments ($argument_count): $args"
3835	}
3836
3837	set wait_time 0
3838	set done      0
3839	set cnt       0
3840	set rt        $::RETURN_SUCCESS
3841
3842	if {[string length $partition] == 0} {
3843		set partition [default_partition]
3844	}
3845
3846	while {$done != 1 && $wait_time < $timeout} {
3847
3848		log_user 0
3849		spawn $sinfo --noheader --partition $partition --state $state --format %D
3850		expect {
3851			-re "($number)" {
3852				set cnt $expect_out(1,string)
3853				exp_continue
3854			}
3855			timeout {
3856				log_error "sinfo is not responding"
3857				set rt $::RETURN_TIMEOUT
3858			}
3859			eof {
3860				wait
3861			}
3862		}
3863		log_user 1
3864
3865		if {$num_nodes <= $cnt} {
3866			set done 1
3867		} else {
3868			log_debug "Partition $partition has $cnt nodes idle and we want $num_nodes"
3869			sleep $poll_interval
3870			incr wait_time 1
3871		}
3872	}
3873	if {$done != 1} {
3874		set rt $::RETURN_ERROR
3875	}
3876
3877	return $rt
3878}
3879
3880
3881#####################################################################
3882#
3883# NAME
3884#	list2hostlist - converts a TCL list into a Slurm hostlist using scontrol
3885#
3886# SYNOPSIS
3887#	list2hostlist nodes_list
3888#
3889# ARGUMENTS
3890#
3891#	nodes_list
3892#		a TCL list of node names
3893#
3894# RETURN VALUE
3895#	the hostlist form returned by scrontrol show hostlist
3896#
3897#####################################################################
3898
3899proc list2hostlist {nodes_list} {
3900	global scontrol
3901
3902	set comalist [join $nodes_list ,]
3903	set hostlist [run_command_output -fail "$scontrol show hostlist $comalist"]
3904	set hostlist [string trimright $hostlist "\r\n"]
3905}
3906
3907
3908#####################################################################
3909#
3910# NAME
3911#	get_nodes_by_state - gets the list of node names in a given partition/states
3912#
3913# SYNOPSIS
3914#	get_nodes_by_state partition states
3915#
3916# DESCRIPTION
3917#	Get the list of node names in a given partition/states
3918#
3919# ARGUMENTS
3920#	partition
3921#		partition to get nodes off
3922#	states
3923#		states to filter on nodes
3924#
3925# RETURN VALUE
3926#	node names list, -1 on sinfo error
3927#
3928#####################################################################
3929
3930proc get_nodes_by_state {{states ""} {partition ""}} {
3931
3932	global sinfo re_word_str
3933	log_user 0
3934	set node_list ""
3935
3936	if {[string length $partition] == 0} {
3937		set partition [default_partition]
3938	}
3939
3940	if {[string length $states] == 0} {
3941		set states "Idle"
3942	}
3943
3944	set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -t $states -e]
3945	expect {
3946		-re "($re_word_str)" {
3947			lappend node_list $expect_out(1,string)
3948			exp_continue
3949		}
3950		timeout {
3951			slow_kill $sinfo_pid
3952			fail "sinfo not responding"
3953		}
3954		eof {
3955			wait
3956		}
3957	}
3958
3959	log_user 1
3960	return $node_list
3961}
3962
3963
3964#####################################################################
3965#
3966# NAME
3967#	set_partition_maximum_time_limit - sets the maximum time limit in a given partition
3968#
3969# SYNOPSIS
3970#	set_partition_maximum_time_limit partition limit
3971#
3972# RETURN VALUE
3973#	RETURN_SUCCESS, or non-zero on error
3974#
3975#####################################################################
3976
3977proc set_partition_maximum_time_limit {partition limit} {
3978	global scontrol
3979
3980	if {[string length $partition] == 0} {
3981		set partition [default_partition]
3982		if { $partition == "" } {
3983			return $::RETURN_ERROR
3984		}
3985	}
3986
3987	if { $limit < -1 } {
3988		log_error "Trying to set invalid partition time limit of $limit"
3989		return $::RETURN_ERROR
3990	}
3991	if { $limit == -1 } {
3992		set expected_lim "UNLIMITED"
3993	} else {
3994		set expected_lim limit
3995	}
3996
3997	spawn $scontrol update partitionname=$partition MaxTime=-1
3998	expect {
3999		timeout {
4000			log_error "scontrol not responding"
4001			return $::RETURN_TIMEOUT
4002		}
4003		eof {
4004			wait
4005		}
4006	}
4007
4008	set maxtime [get_partition_maximum_time_limit $partition]
4009	if { $maxtime != $limit } {
4010		log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit"
4011		return $::RETURN_ERROR
4012	}
4013
4014	return $::RETURN_SUCCESS
4015}
4016
4017
4018#####################################################################
4019#
4020# NAME
4021#	get_partition_maximum_time_limit - gets the maximum time limit in a given partition
4022#
4023# SYNOPSIS
4024#	get_partition_maximum_time_limit partition
4025#
4026# DESCRIPTION
4027#	Get the maximum time limit in a given partition
4028#
4029# RETURN VALUE
4030#	time limit in seconds, -1 if undefined or error
4031#
4032#####################################################################
4033
4034proc get_partition_maximum_time_limit {partition} {
4035	global sinfo number
4036
4037	if {[string length $partition] == 0} {
4038		set partition [default_partition]
4039	}
4040
4041	set secs 0
4042	log_user 0
4043	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O time -e]
4044	expect {
4045		-re "infinite" {
4046			set secs -1
4047			exp_continue
4048		}
4049		-re "n/a" {
4050			set secs -1
4051			exp_continue
4052		}
4053		-re "($number)-($number):($number):($number)" {
4054			set days  [expr $expect_out(1,string) * 24 * 60 * 60]
4055			set hours [expr $expect_out(2,string) * 60 * 60]
4056			set mins  [expr $expect_out(3,string) * 60]
4057			set secs  [expr $days + $hours + $mins + $expect_out(4,string)]
4058			exp_continue
4059		}
4060		-re "($number):($number):($number)" {
4061			set hours [expr $expect_out(1,string) * 60 * 60]
4062			set mins  [expr $expect_out(2,string) * 60]
4063			set secs  [expr $hours + $mins + $expect_out(3,string)]
4064			exp_continue
4065		}
4066		-re "($number):($number)" {
4067			set mins  [expr $expect_out(1,string) * 60]
4068			set secs  [expr $mins + $expect_out(2,string)]
4069			exp_continue
4070		}
4071		-re "($number)" {
4072			set secs  [expr $expect_out(1,string) * 60]
4073			exp_continue
4074		}
4075		timeout {
4076			slow_kill $sinfo_pid
4077			fail "sinfo not responding"
4078		}
4079		eof {
4080			wait
4081		}
4082	}
4083
4084	log_user 1
4085	return $secs
4086}
4087
4088
4089################################################################
4090#
4091# NAME
4092#	get_partition_default_time_limit - gets the default time limit in a given partition
4093#
4094# SYNOPSIS
4095#	get_partition_default_time_limit ?partition?
4096#
4097# DESCRIPTION
4098#	Get the default time limit in a given partition.
4099#	If the partition is not specified, the default partition will be used.
4100#
4101# RETURN VALUE
4102#	Returns: time limit in seconds, -1 if undefined or error.
4103#
4104################################################################
4105
4106proc get_partition_default_time_limit { {partition ""} } {
4107	global sinfo number
4108
4109	if {[string length $partition] == 0} {
4110		set partition [default_partition]
4111	}
4112
4113	set secs 0
4114	log_user 0
4115	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O defaulttime -e]
4116	expect {
4117		-re "infinite" {
4118			set secs -1
4119			exp_continue
4120		}
4121		-re "n/a" {
4122			set secs -1
4123			exp_continue
4124		}
4125		-re "($number)-($number):($number):($number)" {
4126			set days  [expr $expect_out(1,string) * 24 * 60 * 60]
4127			set hours [expr $expect_out(2,string) * 60 * 60]
4128			set mins  [expr $expect_out(3,string) * 60]
4129			set secs  [expr $days + $hours + $mins + $expect_out(4,string)]
4130			exp_continue
4131		}
4132		-re "($number):($number):($number)" {
4133			set hours [expr $expect_out(1,string) * 60 * 60]
4134			set mins  [expr $expect_out(2,string) * 60]
4135			set secs  [expr $hours + $mins + $expect_out(3,string)]
4136			exp_continue
4137		}
4138		-re "($number):($number)" {
4139			set mins  [expr $expect_out(1,string) * 60]
4140			set secs  [expr $mins + $expect_out(2,string)]
4141			exp_continue
4142		}
4143		-re "($number)" {
4144			set secs  [expr $expect_out(1,string) * 60]
4145			exp_continue
4146		}
4147		timeout {
4148			slow_kill $sinfo_pid
4149			fail "sinfo not responding"
4150		}
4151		eof {
4152			wait
4153		}
4154	}
4155
4156	log_user 1
4157	return $secs
4158}
4159
4160
4161#####################################################################
4162#
4163# NAME
4164#	get_node_cores - given a node, returns its total number of cores
4165#
4166# SYNOPSIS
4167#	get_node_cores node
4168#
4169# DESCRIPTION
4170#	Given a node, return its total number of cores
4171#	(not the CoresPerSocket, but the total cores)
4172#
4173# RETURN VALUE
4174#	node cores if retrieved, -1 otherwise
4175#
4176#####################################################################
4177
4178proc get_node_cores {node} {
4179
4180	global sinfo number
4181	set cores -1
4182	set sockets_per_node 0
4183	set cores_per_socket 0
4184
4185	if {[string length $node] == 0} {
4186		return $cores
4187	}
4188
4189	log_user 0
4190	set sinfo_pid [spawn -noecho $sinfo -o "%X %Y" -h -n $node]
4191
4192	expect {
4193		-re "($number)" {
4194			if {$sockets_per_node == 0} {
4195				set sockets_per_node $expect_out(1,string)
4196			} else {
4197				set cores_per_socket $expect_out(1,string)
4198			}
4199			exp_continue
4200		}
4201		timeout {
4202			slow_kill $sinfo_pid
4203			fail "sinfo not responding"
4204		}
4205		eof {
4206			wait
4207		}
4208	}
4209
4210	log_user 1
4211
4212	set cores [expr $sockets_per_node * $cores_per_socket]
4213
4214	return $cores
4215}
4216
4217
4218#####################################################################
4219#
4220# NAME
4221#	get_node_cpus - given a node, returns its total number of threads we account for
4222#
4223# SYNOPSIS
4224#	get_node_cpus node
4225#
4226# DESCRIPTION
4227#	Given a node, return its total number of threads we account for.
4228#	(not always ThreadsPerCore, but how many threads are in use.
4229#	i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would
4230#	result in only 1 thread we care about instead of the 2 listed.)
4231#
4232# RETURN VALUE
4233#	list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise
4234#
4235#####################################################################
4236
4237proc get_node_cpus {node} {
4238	global scontrol number
4239
4240	set nthreads -1
4241	set nsockets 0
4242	set ncores 0
4243	set totcpus -1
4244
4245	if {[string length $node] == 0} {
4246		return [list $totcpus $nthreads]
4247	}
4248
4249	# Get the number of CPUs on a node
4250	set scontrol_pid [spawn $scontrol show node $node]
4251	expect {
4252		-re "CoresPerSocket=($number)" {
4253			set ncores $expect_out(1,string)
4254			exp_continue
4255		}
4256		-re "CPUTot=($number)" {
4257			set totcpus $expect_out(1,string)
4258			exp_continue
4259		}
4260		-re "Sockets=($number)" {
4261			set nsockets $expect_out(1,string)
4262			exp_continue
4263		}
4264		-re "ThreadsPerCore=($number)" {
4265			set nthreads $expect_out(1,string)
4266			exp_continue
4267		}
4268		timeout {
4269			slow_kill $scontrol_pid
4270			fail "scontrol is not responding"
4271		}
4272		eof {
4273			wait
4274		}
4275	}
4276
4277	set core_cnt [expr $nsockets * $ncores]
4278	set thread_cnt [expr $ncores * $nthreads]
4279	if {$totcpus != $nthreads && $totcpus == $ncores} {
4280		log_debug "Cores rather than threads are being allocated"
4281		set nthreads 1
4282	}
4283
4284	return [list $totcpus $nthreads]
4285}
4286
4287
4288#####################################################################
4289#
4290# NAME
4291#	get_part_total_cores - given a partition and/or states, return its total cores
4292#
4293# SYNOPSIS
4294#	get_part_total_cores partition states
4295#
4296# DESCRIPTION
4297#	Given a partition and/or states, return its total cores
4298#
4299# ARGUMENTS
4300#	partition
4301#		partition to check cores
4302#	states
4303#		states to filter on partition cores
4304#
4305# RETURN VALUE
4306#	partition cores
4307#
4308#####################################################################
4309
4310proc get_part_total_cores {part states} {
4311	global sinfo number
4312	log_user 0
4313	set cores 0
4314	set tmp 0
4315	set i 0
4316
4317	if {[string length $part] == 0} {
4318		set part [default_partition]
4319	}
4320
4321	if {[string length $states] == 0} {
4322		set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -o "%X %Y"]
4323	} else {
4324		set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y"]
4325	}
4326
4327	expect {
4328		-re "($number)" {
4329			set is_even [expr {($i % 2) == 0}]
4330			if {$is_even == 1} {
4331				set tmp $expect_out(1,string)
4332			} else {
4333				set tmp [expr $tmp * $expect_out(1,string)]
4334				set cores [expr $cores + $tmp]
4335			}
4336			incr i
4337			exp_continue
4338		}
4339		timeout {
4340			slow_kill $sinfo_pid
4341			fail "sinfo not responding"
4342		}
4343		eof {
4344			wait
4345		}
4346	}
4347
4348	log_user 1
4349	return $cores
4350}
4351
4352
4353#####################################################################
4354#
4355# NAME
4356#	check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous
4357#
4358# SYNOPSIS
4359#	check_hosts_contiguous check_hosts_list partition
4360#
4361# DESCRIPTION
4362#	Given a partition and a list of hosts, verify if all
4363#	hosts belong to the partition and are contiguous.
4364#	If the partition argument is empty, the default partition
4365#	will be used.
4366#
4367# RETURN VALUE
4368#	Returns: true if hosts are contiguous, false otherwise.
4369#
4370#####################################################################
4371
4372proc check_hosts_contiguous { check_hosts_list {partition ""} } {
4373
4374	global sinfo re_word_str
4375
4376	if {[string length $partition] == 0} {
4377		set partition [default_partition]
4378	}
4379
4380	set part_hosts_list {}
4381
4382	log_user 0
4383	set sinfo_pid [spawn $sinfo --noheader -p $partition -N -o %N]
4384	expect {
4385		-re "($re_word_str)" {
4386			lappend part_hosts_list $expect_out(1,string)
4387			exp_continue
4388		}
4389		-re "Unable to contact" {
4390			log_user 1
4391			fail "Slurm appears to be down"
4392		}
4393		timeout {
4394			log_user 1
4395			slow_kill $sinfo_pid
4396			fail "sinfo not responding"
4397		}
4398		eof {
4399			wait
4400		}
4401	}
4402	log_user 1
4403
4404	foreach host $check_hosts_list {
4405		set idx_cur [lsearch $part_hosts_list $host]
4406		if {$idx_cur == -1} {
4407			fail "$host not found in list of hosts from partition $partition"
4408		}
4409		if {[info exists idx_old]} {
4410			if {$idx_cur != [expr $idx_old + 1]} {
4411				log_error "Node sequence number not contiguous"
4412				return false
4413			}
4414		}
4415		set idx_old $idx_cur
4416	}
4417
4418	return true
4419}
4420
4421
4422################################################################
4423#
4424# NAME
4425#	get_het_job_ids - gets list of component job ids for a het job
4426#
4427# SYNOPSIS
4428#	get_het_job_ids job_id ?use_offset?
4429#
4430# DESCRIPTION
4431#	Gets list of component job ids for a het job.
4432#
4433# ARGUMENTS
4434#	job_id
4435#		Slurm job id
4436#	use_offset
4437#		If zero, returns list of integer job ids, else returns ids in
4438#		the form of X+Y where X is het job master id and Y is the
4439#		offset.
4440#
4441# RETURN VALUE
4442#	A list of ids for a hetjob or an empty list if jobid
4443#	is not a het one.
4444#
4445################################################################
4446
4447proc get_het_job_ids { jobid {use_offset 0}} {
4448	global scontrol number
4449
4450	set id_list ""
4451	set log_user_save [log_user -info]
4452	log_user 0
4453	spawn $scontrol show job $jobid
4454	expect {
4455		-re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" {
4456			if { $use_offset } {
4457				lappend id_list "$expect_out(2,string)+$expect_out(3,string)"
4458			} else {
4459				lappend id_list $expect_out(1,string)
4460			}
4461			exp_continue
4462		}
4463		timeout {
4464			log_error "scontrol not responding"
4465			set id_list ""
4466		}
4467		eof {
4468			wait
4469		}
4470	}
4471	log_user $log_user_save
4472	return $id_list
4473}
4474
4475
4476################################################################
4477#
4478# NAME
4479#	reconfigure - calls scontrol reconfigure
4480#
4481# SYNOPSIS
4482#	reconfigure ?options? ?cluster?
4483#
4484# DESCRIPTION
4485#	Calls scontrol reconfigure. This routine takes the same options as
4486#	run_command, passing them to the underlying run_command invocation.
4487#	This command waits an additional 5 seconds before returning.
4488#
4489# OPTIONS
4490#	See OPTIONS of run_command proc.
4491#
4492# ARGUMENTS
4493#	cluster
4494#		The cluster to reconfigure
4495#
4496# RETURN VALUE
4497#	RETURN_SUCCESS on success, otherwise RETURN_ERROR
4498#
4499################################################################
4500
4501proc reconfigure args {
4502	global scontrol
4503
4504	set options [list]
4505	set cluster ""
4506
4507	while {[llength $args]} {
4508		switch -glob -- [lindex $args 0] {
4509			-* {
4510				lappend options {*}[lrange $args 0 1]
4511				set args [lrange $args 2 end]
4512			}
4513			default break
4514		}
4515	}
4516	set argument_count [llength $args]
4517	if {$argument_count > 1}  {
4518		fail "Too many arguments ($argument_count): $args"
4519	} elseif {$argument_count == 1} {
4520		lassign $args cluster
4521	}
4522
4523	set command $scontrol
4524	if {$cluster ne ""} {
4525		append command " -M$cluster"
4526	}
4527	append command " reconfigure"
4528	set rc [run_command_status {*}$options "$command"]
4529
4530	#
4531	# Wait 5 seconds for reconfigure to complete, then return.
4532	#
4533	sleep 5
4534	return $rc
4535}
4536
4537
4538#####################################################################
4539#
4540# NAME
4541#	log_fatal - prints a fatal message
4542#
4543# SYNOPSIS
4544#	log_fatal message
4545#
4546# SEE ALSO
4547#	_log_format for options governing the message format and colorization
4548#
4549#####################################################################
4550
4551proc log_fatal {message} {
4552	global testsuite_log_level LOG_LEVEL_FATAL
4553
4554	if {$testsuite_log_level >= $LOG_LEVEL_FATAL} {
4555		_log_format "fatal" "$message"
4556	}
4557}
4558
4559
4560#####################################################################
4561#
4562# NAME
4563#	log_error - prints an error message
4564#
4565# SYNOPSIS
4566#	log_error message
4567#
4568# SEE ALSO
4569#	_log_format for options governing the message format and colorization
4570#
4571#####################################################################
4572
4573proc log_error {message} {
4574	global testsuite_log_level LOG_LEVEL_ERROR
4575
4576	if {$testsuite_log_level >= $LOG_LEVEL_ERROR} {
4577		_log_format "error" "$message"
4578	}
4579}
4580
4581
4582#####################################################################
4583#
4584# NAME
4585#	log_warn - prints a warning message
4586#
4587# SYNOPSIS
4588#	log_warn message
4589#
4590# SEE ALSO
4591#	_log_format for options governing the message format and colorization
4592#
4593#####################################################################
4594
4595proc log_warn {message} {
4596	global testsuite_log_level LOG_LEVEL_WARNING
4597
4598	if {$testsuite_log_level >= $LOG_LEVEL_WARNING} {
4599		_log_format "warning" "$message"
4600	}
4601}
4602
4603
4604#####################################################################
4605#
4606# NAME
4607#	log_info - prints an information message
4608#
4609# SYNOPSIS
4610#	log_info message
4611#
4612# SEE ALSO
4613#	_log_format for options governing the message format and colorization
4614#
4615#####################################################################
4616
4617proc log_info {message} {
4618	global testsuite_log_level LOG_LEVEL_INFO
4619
4620	if {$testsuite_log_level >= $LOG_LEVEL_INFO} {
4621		_log_format "info" "$message"
4622	}
4623}
4624
4625
4626#####################################################################
4627#
4628# NAME
4629#	log_pass - prints a pass level message
4630#
4631# SYNOPSIS
4632#	log_pass message
4633#
4634# SEE ALSO
4635#	_log_format for options governing the message format and colorization
4636#
4637#####################################################################
4638
4639proc log_pass {message} {
4640	global testsuite_log_level LOG_LEVEL_PASS
4641
4642	if {$testsuite_log_level >= $LOG_LEVEL_PASS} {
4643		_log_format "pass" "$message"
4644	}
4645}
4646
4647
4648#####################################################################
4649#
4650# NAME
4651#	log_command - prints a command level message
4652#
4653# SYNOPSIS
4654#	log_command message
4655#
4656# SEE ALSO
4657#	_log_format for options governing the message format and colorization
4658#
4659#####################################################################
4660
4661proc log_command {message} {
4662	global testsuite_log_level LOG_LEVEL_COMMAND
4663
4664	if {$testsuite_log_level >= $LOG_LEVEL_COMMAND} {
4665		_log_format "command" "$message"
4666	}
4667}
4668
4669
4670#####################################################################
4671#
4672# NAME
4673#	log_debug - prints a debug level message
4674#
4675# SYNOPSIS
4676#	log_debug message
4677#
4678# SEE ALSO
4679#	_log_format for options governing the message format and colorization
4680#
4681#####################################################################
4682
4683proc log_debug {message} {
4684	global testsuite_log_level LOG_LEVEL_DEBUG
4685
4686	if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} {
4687		_log_format "debug" "$message"
4688	}
4689}
4690
4691
4692#####################################################################
4693#
4694# NAME
4695#	log_trace - prints a trace level message
4696#
4697# SYNOPSIS
4698#	log_trace message
4699#
4700# SEE ALSO
4701#	_log_format for options governing the message format and colorization
4702#
4703#####################################################################
4704
4705proc log_trace {message} {
4706	global testsuite_log_level LOG_LEVEL_TRACE
4707
4708	if {$testsuite_log_level >= $LOG_LEVEL_TRACE} {
4709		_log_format "trace" "$message"
4710	}
4711}
4712
4713
4714################################################################
4715#
4716# NAME
4717#	in_fed - checks whether this cluster is in a federation
4718#
4719# SYNOPSIS
4720#	in_fed
4721#
4722# RETURN VALUE
4723#	Returns true if this cluster is in a federation, false otherwise
4724#
4725################################################################
4726
4727proc in_fed {} {
4728	global scontrol
4729
4730	spawn $scontrol show fed
4731	expect {
4732		-re "Federation" {
4733			return true
4734		}
4735		timeout {
4736			log_error "scontrol not responding"
4737		}
4738		eof {
4739			wait
4740		}
4741	}
4742	return false
4743}
4744
4745
4746################################################################
4747#
4748# NAME
4749#	check_job_state - checks if the state of a job is the expected one
4750#
4751# SYNOPSIS
4752#	check_job_state job state ?het_job?
4753#
4754# DESCRIPTION
4755#	Checks if the state of a job is the expected one.
4756#
4757# ARGUMENTS
4758#	job
4759#		Job ID to check
4760#	state
4761#		Desired state of the job to match
4762#	het_job
4763#		If set, checks state of each component job if the
4764#		job is a hetjob.
4765#
4766# RETURN VALUE
4767#	true if job was on the desired state, or the number of job components
4768#	on that state if it's a hetjob and het_job option enabled, false
4769#	otherwise.
4770#
4771################################################################
4772
4773proc check_job_state { job state {het_job 0}} {
4774	global scontrol
4775
4776	set jid_list ""
4777	if { $het_job } {
4778		set jid_list [get_het_job_ids $job 1]
4779	}
4780
4781	if { $jid_list == "" } {
4782		# non-het job
4783		set jid_list $job
4784	}
4785
4786	foreach jid $jid_list {
4787		set state_match 0
4788		spawn $scontrol show job $jid
4789		expect {
4790			-re "JobState=($state)" {
4791				incr state_match
4792			}
4793			timeout {
4794				fail "scontrol not responding"
4795			}
4796			eof {
4797				wait
4798			}
4799		}
4800
4801		if {$state_match != 1} {
4802			log_error "job $jid should be in $state state, but is not"
4803			return false
4804		}
4805	}
4806
4807	return true
4808}
4809
4810
4811################################################################
4812#
4813# NAME
4814#	get_gres_count - returns a dict of nodes and GRES counts
4815#
4816# SYNOPSIS
4817#	get_gres_count gres_name ?node_list?
4818#
4819# DESCRIPTION
4820#	Returns a dict of node names and the count of a specifed
4821#	GRES aggregating all its types on each node.
4822#
4823# RETURN VALUE
4824#	If the node_list is not specified node name is specified,
4825#	this function will return a dict with the GRES count for all
4826#	the nodes of the default partition.
4827#	If specified, a dict only with the nodes of the node_list.
4828#
4829################################################################
4830
4831proc get_gres_count { gres_name {node_list ""} } {
4832	set nodes_dict [get_nodes $node_list]
4833	set nodes_gres_dict [dict create]
4834	dict for {node_name node_dict} $nodes_dict {
4835		if [dict exists $node_dict "Gres"] {
4836			set gres_param [dict get $node_dict "Gres"]
4837			set gres_dict  [count_gres $gres_param]
4838		}
4839		if [dict exists $gres_dict $gres_name] {
4840			set gres_count [dict get $gres_dict $gres_name]
4841			dict set nodes_gres_dict $node_name $gres_count
4842		}
4843	}
4844	return $nodes_gres_dict
4845}
4846
4847
4848################################################################
4849#
4850# NAME
4851#	count_gres - returns a dict of GRES names and their total counts
4852#
4853# SYNOPSIS
4854#	count_gres gres_param
4855#
4856# DESCRIPTION
4857#	Parses a GRES parameter string typically obtained from nodes or
4858#	jobs info, and returns a dict of GRES names and their count
4859#	aggregating all the types of each GRES.
4860#
4861# RETURN VALUE
4862#	A dict of GRES names and their count aggregating all types of
4863#	each GRES.
4864#
4865################################################################
4866
4867proc count_gres { gres_param } {
4868	global gres_regex
4869
4870	set gres_dict [dict create]
4871	foreach gres [split $gres_param ","] {
4872		if {[regexp $gres_regex $gres {} name type count] == 1} {
4873			if {$count eq ""} { set count $type }
4874
4875			if {[dict exists $gres_dict $name]} {
4876				dict set gres_dict $name [expr [dict get $gres_dict $name] + $count]
4877			} else {
4878				dict set gres_dict $name $count
4879			}
4880		}
4881	}
4882	return $gres_dict
4883}
4884
4885
4886################################################################
4887#
4888# NAME
4889#	get_highest_gres_count - returns highest number of GRES per node on node_count nodes
4890#
4891# SYNOPSIS
4892#	get_highest_gres_count node_count gres_name
4893#
4894# DESCRIPTION
4895#	For a given number of nodes, returns the highest GRES count per
4896#	node available on at least that number of nodes.
4897#
4898# EXAMPLE
4899#	For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs
4900#	[get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs)
4901#	[get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each)
4902#	[get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each)
4903#
4904################################################################
4905
4906proc get_highest_gres_count { node_count gres_name } {
4907
4908	set available_nodes [list2hostlist [get_nodes_by_state]]
4909	set gres_dict [get_gres_count $gres_name $available_nodes]
4910	set gres_count [list]
4911
4912	dict for {node gres} $gres_dict {
4913		lappend gres_count $gres
4914	}
4915
4916	set count [lindex [lsort -decreasing -integer $gres_count] [expr $node_count - 1]]
4917	return $count
4918}
4919
4920
4921################################################################
4922#
4923# NAME
4924#	_set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it
4925#
4926# SYNOPSIS
4927#	_set_gpu_socket_inx sock_inx
4928#
4929# DESCRIPTION
4930#	Add a socket index to the array gpu_sock_list if not already
4931#	on the list. Subroutine used by get_gpu_socket_count
4932#
4933################################################################
4934
4935proc _set_gpu_socket_inx { sock_inx } {
4936	global gpu_sock_list
4937
4938	if {$sock_inx == -1} {
4939		set gpu_sock_list [lreplace $gpu_sock_list 0 99]
4940		return
4941	}
4942
4943	set sock_cnt [llength $gpu_sock_list]
4944	for {set i 0} {$i < $sock_cnt} {incr i} {
4945		if {[lindex $gpu_sock_list $i] == $sock_inx} {
4946			return
4947		}
4948	}
4949	lappend gpu_sock_list $sock_inx
4950}
4951
4952
4953################################################################
4954# Subroutine used by get_gpu_socket_count
4955# Add a socket index to the array gpu_sock_list if not already
4956# on the list.
4957################################################################
4958
4959proc _set_gpu_socket_range { sock_first_inx sock_last_inx } {
4960	global gpu_sock_list
4961
4962	set sock_cnt [llength $gpu_sock_list]
4963	for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} {
4964		set found 0
4965		for {set i 0} {$i < $sock_cnt} {incr i} {
4966			if {[lindex $gpu_sock_list $i] == $s} {
4967				set found 1
4968				break
4969			}
4970		}
4971		if {$found == 0} {
4972			lappend gpu_sock_list $s
4973		}
4974	}
4975}
4976
4977
4978################################################################
4979#
4980# NAME
4981#	get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count
4982#
4983# SYNOPSIS
4984#	get_gpu_socket_count gpu_cnt sockets_per_node
4985#
4986# DESCRIPTION
4987#	Given a per-node GPU count, return the number of sockets with
4988#	GPUs on a node with the given per-node GPU count.
4989#	If the sockets_per_node has a value of 1 then just return 1
4990#	rather than determine the count (for performance reasons).
4991#
4992################################################################
4993
4994proc get_gpu_socket_count { gpu_cnt sockets_per_node } {
4995	global re_word_str bin_rm number scontrol srun
4996	global gpu_sock_list
4997	set sockets_with_gpus 1
4998	set file_in "test_get_gpu_socket_count.input"
4999
5000	if {$sockets_per_node == 1} {
5001		return 1
5002	}
5003
5004	log_user 0
5005	_set_gpu_socket_inx -1
5006	make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME"
5007	spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in
5008	expect {
5009		-re "gpu:${number}.S:($number)-($number)" {
5010			_set_gpu_socket_range $expect_out(1,string) $expect_out(2,string)
5011			exp_continue
5012		}
5013		-re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" {
5014			_set_gpu_socket_inx $expect_out(1,string)
5015			_set_gpu_socket_inx $expect_out(2,string)
5016			_set_gpu_socket_inx $expect_out(3,string)
5017			_set_gpu_socket_inx $expect_out(4,string)
5018			exp_continue
5019		}
5020		-re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" {
5021			_set_gpu_socket_inx $expect_out(1,string)
5022			_set_gpu_socket_inx $expect_out(2,string)
5023			_set_gpu_socket_inx $expect_out(3,string)
5024			exp_continue
5025		}
5026		-re "gpu:${re_word_str}:${number}.S:($number),($number)" {
5027			_set_gpu_socket_inx $expect_out(1,string)
5028			_set_gpu_socket_inx $expect_out(2,string)
5029			exp_continue
5030		}
5031		-re "gpu:${re_word_str}:${number}.S:($number)" {
5032			_set_gpu_socket_inx $expect_out(1,string)
5033			exp_continue
5034		}
5035		-re "gpu:${number}.S:($number),($number),($number),($number)" {
5036			_set_gpu_socket_inx $expect_out(1,string)
5037			_set_gpu_socket_inx $expect_out(2,string)
5038			_set_gpu_socket_inx $expect_out(3,string)
5039			_set_gpu_socket_inx $expect_out(4,string)
5040			exp_continue
5041		}
5042		-re "gpu:${number}.S:($number),($number),($number)" {
5043			_set_gpu_socket_inx $expect_out(1,string)
5044			_set_gpu_socket_inx $expect_out(2,string)
5045			_set_gpu_socket_inx $expect_out(3,string)
5046			exp_continue
5047		}
5048		-re "gpu:${number}.S:($number),($number)" {
5049			_set_gpu_socket_inx $expect_out(1,string)
5050			_set_gpu_socket_inx $expect_out(2,string)
5051			exp_continue
5052		}
5053		-re "gpu:${number}.S:($number)" {
5054			_set_gpu_socket_inx $expect_out(1,string)
5055			exp_continue
5056		}
5057		eof {
5058			wait
5059		}
5060	}
5061	log_user 1
5062	exec $bin_rm -f $file_in
5063
5064	set sock_cnt [llength $gpu_sock_list]
5065	if {$sock_cnt > 1} {
5066		set sockets_with_gpus $sock_cnt
5067	}
5068
5069	return $sockets_with_gpus
5070}
5071
5072
5073################################################################
5074#
5075# NAME
5076#	get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU"
5077#
5078# SYNOPSIS
5079#	get_highest_mps_count node_count
5080#
5081# DESCRIPTION
5082#	For a given number of nodes, returns the higest number of MPS per GPU
5083#	available at least on those number of nodes.
5084#
5085################################################################
5086
5087proc get_highest_mps_count { node_count } {
5088	# We cannot use get_highest_gres_count because we need "per gpu",
5089	# so we get all the mps per node and all gpus per node, to create
5090	# a mps_per_gpu list to sort and get the count.
5091	set available_nodes [list2hostlist [get_nodes_by_state]]
5092	set mps_dict [get_gres_count "mps" $available_nodes]
5093	set gpu_dict [get_gres_count "gpu" $available_nodes]
5094	set mps_per_gpu [list]
5095
5096	dict for {node mps} $mps_dict {
5097		if { $mps > 0 } {
5098			if [dict exists $gpu_dict $node] {
5099				set gpu [dict get $gpu_dict $node]
5100				if { $gpu > 0 } {
5101					lappend mps_per_gpu [expr $mps / $gpu]
5102				} else {
5103					fail "All nodes with MPS should have a GPU"
5104				}
5105			} else {
5106				fail "All nodes with MPS should have a GPU"
5107			}
5108		}
5109	}
5110
5111	set count [lindex [lsort -decreasing -integer $mps_per_gpu] [expr $node_count - 1]]
5112	return $count
5113}
5114
5115
5116################################################################
5117#
5118# NAME
5119#	get_mps_node_count - gets the number of nodes with a positive number of GRES MPS
5120#
5121# SYNOPSIS
5122#	get_mps_node_count
5123#
5124# RETURN VALUE
5125#	Return the count of nodes with a non-zero count of GRES MPS
5126#
5127################################################################
5128
5129proc get_mps_node_count { } {
5130	global number sinfo re_word_str
5131	set fini 0
5132	set node_inx 0
5133	set def_part [default_partition]
5134
5135	log_user 0
5136	spawn $sinfo -N -p$def_part -oGRES=%G -h
5137	expect {
5138		-re "GRES=($re_word_str)" {
5139			set mps_count 0
5140			set parts [split $expect_out(1,string) ",/"]
5141			while 1 {
5142				set mps_found [lsearch $parts "mps*"]
5143				if { $mps_found == -1 } break
5144
5145				set parts2 [split [lindex $parts $mps_found] ":(/"]
5146				set col [lsearch -regexp $parts2 ^$number$]
5147				if { $col == -1 } {
5148					incr mps_count
5149				} else {
5150					set mps_count [expr $mps_count + [lindex $parts2 $col]]
5151				}
5152				set parts [lreplace $parts $mps_found $mps_found]
5153			}
5154
5155			if {$mps_count > 0} {
5156				incr node_inx
5157			}
5158			exp_continue
5159		}
5160		eof {
5161			wait
5162		}
5163	}
5164	log_user 1
5165
5166	return $node_inx
5167}
5168
5169
5170################################################################
5171#
5172# NAME
5173#	cuda_count - determines the count of allocated GPUs
5174#
5175# SYNOPSIS
5176#	cuda_count cuda_string
5177#
5178# ARGUMENTS
5179#	cuda_string
5180#		Contents of a CUDA_VISIBLE_DEVICES environment variable
5181#
5182# RETURN VALUE
5183#	Return the number of GPUs or -1 on error
5184#
5185################################################################
5186
5187proc cuda_count { cuda_string } {
5188	set cuda_count 0
5189	set has_number 0
5190	set len [string length $cuda_string]
5191	for {set char_inx 0} {$char_inx < $len} {incr char_inx} {
5192		set cuda_char [string index $cuda_string $char_inx]
5193		if {[string match , $cuda_char]} {
5194			if {$has_number > 0} {
5195				incr cuda_count
5196				set has_number 0
5197			} else {
5198				log_error "Invalid input ($cuda_string)"
5199				return -1
5200			}
5201		} elseif {[string is digit $cuda_char]} {
5202			set has_number 1
5203		}
5204	}
5205	if {$has_number > 0} {
5206		incr cuda_count
5207	} else {
5208		log_error "Invalid input ($cuda_string)"
5209		return -1
5210	}
5211	return $cuda_count
5212}
5213
5214
5215################################################################
5216# NAME
5217#	get_conf_path - gets the path to the slurm.conf file
5218#
5219# SYNOPSIS
5220#	get_conf_path
5221#
5222# RETURN VALUE
5223#	Returns the path to the slurm.conf file
5224#
5225################################################################
5226
5227proc get_conf_path { } {
5228	global scontrol re_word_str eol
5229
5230	if [regexp {(.*)/slurm.conf} [get_config_param "SLURM_CONF"] {} config_dir] {
5231		return $config_dir
5232	} else {
5233		fail "Unable to determine config dir"
5234	}
5235}
5236
5237
5238################################################################
5239#
5240# NAME
5241#	save_conf - saves a backup of the specfied configuration file
5242#
5243# SYNOPSIS
5244#	save_conf file_name
5245#
5246# DESCRIPTION
5247#	If the specified file_name exists, a backup is made which will be
5248#	restored when restore_conf is called.
5249#	If the specified file_name does not exist, a special backup will be
5250#	made that will cause the file to be removed when restore_conf is
5251#	called.
5252#	If a backup already exists, a warning is issued and no backup is made
5253#	(honoring the existing backup).
5254#
5255# SEE ALSO
5256#	restore_conf
5257#
5258################################################################
5259
5260proc save_conf { file_name } {
5261	global test_id
5262	global bin_chmod bin_cp bin_mv bin_touch
5263
5264	log_debug "Saving backup of $file_name"
5265
5266	#
5267	# Check for existing backup
5268	# If a backup exists, issue a warning and return (honor existing backup)
5269	#
5270	set conf_dir [file dirname $file_name]
5271	set dir_files [glob -nocomplain -directory $conf_dir *]
5272	set preexisting_backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"]
5273	if {$preexisting_backup_file ne ""} {
5274		log_warn "Backup file already exists: ($preexisting_backup_file)"
5275		return
5276	}
5277
5278	#
5279	# Check if file to backup exists.
5280	# If it doesn't exist, warn the user, touch an empty backup file with
5281	# the sticky bit set and allow the test to continue.
5282	# restore_conf will remove the file.
5283	#
5284	set new_backup_file "$file_name.test$test_id"
5285	if {![file exists $file_name]} {
5286		log_warn "Backup of a nonexistent file requested: $file_name"
5287		run_command -fail -nolog "$bin_touch $new_backup_file"
5288		run_command -fail -nolog "$bin_chmod +t $new_backup_file"
5289		return
5290	}
5291
5292	run_command -fail -nolog "$bin_mv $file_name $new_backup_file"
5293	run_command -fail -nolog "$bin_cp $new_backup_file $file_name"
5294}
5295
5296
5297################################################################
5298#
5299# NAME
5300#	restore_conf - restores the original confiration file from backup
5301#
5302# SYNOPSIS
5303#	restore_conf file_name
5304#
5305# DESCRIPTION
5306#	If a backup exists for the specified file_name, it is restored.
5307#	If the specified file_name did not exist when originally backed up,
5308#	it will be removed.
5309#	If no backup exists, a warning is issued.
5310#
5311# SEE ALSO
5312#	save_conf
5313#
5314################################################################
5315
5316proc restore_conf { file_name } {
5317	global test_id
5318	global bin_mv bin_rm
5319
5320	log_debug "Restoring backup of $file_name"
5321
5322	set conf_dir [file dirname $file_name]
5323	set dir_files [glob -nocomplain -directory $conf_dir *]
5324	set backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"]
5325	if {$backup_file ne ""} {
5326		file stat $backup_file stat
5327
5328		# If the sticky bit is set and the file is empty, remove both
5329		if {! $stat(size) && [expr $stat(mode) & 512]} {
5330			log_debug "Removing file used for the test: $file_name"
5331			run_command -fail -nolog "$bin_rm -f $backup_file $file_name"
5332		# Else replace the original with the backup
5333		} else {
5334			run_command -fail -nolog "$bin_mv $backup_file $file_name"
5335		}
5336	} else {
5337		#
5338		# If backup file doesn't exist, it has probably already been
5339		# restored by a previous call to restore_conf
5340		#
5341		log_warn "Backup file does not exist for $file_name. It has probably already been restored"
5342		return
5343	}
5344}
5345
5346
5347################################################################
5348#
5349# NAME
5350#	have_nvml - checks if HAVE_NVML is set in config.h
5351#
5352# SYNOPSIS
5353#	have_nvml
5354#
5355# RETURN VALUE
5356#	Returns true if HAVE_NVML is set in config.h. Else, returns false
5357#
5358################################################################
5359
5360proc have_nvml { } {
5361	global bin_grep number config_h
5362
5363	return [expr [run_command_status -none -nolog "$bin_grep \"HAVE_NVML 1\" $config_h"] == 0]
5364}
5365
5366
5367################################################################
5368#
5369# NAME
5370#	delete_part - deletes partition on system
5371#
5372# SYNOPSIS
5373#	delete_part partition
5374#
5375################################################################
5376
5377proc delete_part { part_name } {
5378	global scontrol
5379
5380	# Remove part
5381	spawn $scontrol delete partition=$part_name
5382	expect {
5383		timeout {
5384			fail "scontrol is not responding"
5385		}
5386		eof {
5387			wait
5388		}
5389	}
5390}
5391
5392
5393################################################################
5394#
5395# NAME
5396#	have_lua - checks if HAVE_LUA is set in config.h
5397#
5398# SYNOPSIS
5399#	have_lua
5400#
5401# RETURN VALUE
5402#	Returns true if HAVE_LUA is set in config.h. Else, returns false
5403#
5404################################################################
5405
5406proc have_lua { } {
5407	global bin_grep config_h
5408
5409	return [expr [run_command_status -none -nolog "$bin_grep HAVE_LUA $config_h"] == 0]
5410}
5411
5412
5413################################################################
5414#
5415# NAME
5416#	get_reservations - returns a dictionary of dictionaries of reservation parameters
5417#
5418# SYNOPSIS
5419#	get_reservations ?resv_name?
5420#
5421# RETURN VALUE
5422#	Uses `scontrol show reservation` to return a dictionary of dictionaries
5423#	of job parameters. Specifying an invalid resv_name result in a failure.
5424#
5425################################################################
5426
5427proc get_reservations { {resv_name ""} } {
5428	global scontrol
5429
5430	set command "$scontrol show reservation -o"
5431	if {$resv_name ne ""} {
5432		append command " $resv_name"
5433	}
5434	set output [run_command_output -fail "$command"]
5435
5436	# Iterate over each reservation's parameter list
5437	foreach line [split $output "\n"] {
5438		if {$line eq ""} { continue }
5439
5440		# Peel off the resv parameters one at a time
5441		# The first quantifier sets the greediness for the whole RE
5442		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
5443			# Remove the consumed parameter from the line
5444			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
5445			# Add it to the temporary job dictionary
5446			dict set resv_dict $param_name $param_value
5447		}
5448
5449		set resv_name_dict [dict get $resv_dict "ReservationName"]
5450
5451		# Add the resv dictionary to resvs dictionary
5452		dict set resvs_dict $resv_name_dict $resv_dict
5453
5454		# Clear the resv dictionary for the next resv
5455		set resv_dict {}
5456	}
5457
5458	return $resvs_dict
5459}
5460
5461
5462################################################################
5463#
5464# NAME
5465#	get_reservation_param - returns a specific parameter value for a specific reservation
5466#
5467# SYNOPSIS
5468#	get_reservation_param resv_name parameter_name
5469#
5470# DESCRIPTION
5471#	Returns a specific parameter value for a specified reservation if the
5472#	parameter exists for the reservation, or MISSING if it does not exist.
5473#	Specifying an invalid reservation name will result in a failure.
5474#
5475################################################################
5476
5477proc get_reservation_param { resv_name parameter_name } {
5478
5479	set resvs_dict [get_reservations $resv_name]
5480
5481	if [dict exists $resvs_dict $resv_name $parameter_name] {
5482		return [dict get $resvs_dict $resv_name $parameter_name]
5483	} else {
5484		return "MISSING"
5485	}
5486}
5487
5488
5489################################################################
5490#
5491# NAME
5492#	create_res - create new reservation in system
5493#
5494# SYNOPSIS
5495#	create_res ?res_name? ?res_params?
5496#
5497# RETURN VALUE
5498#	the exit code of the scontrol command run
5499#
5500################################################################
5501
5502proc create_res { res_name res_params } {
5503	global scontrol
5504
5505	set result   [run_command "$scontrol create res ReservationName=$res_name $res_params"]
5506	set output   [dict get $result output]
5507	set ret_code [dict get $result exit_code]
5508
5509	if { $ret_code } {
5510		log_warn "[lindex [info level 0] 0]: error from scontrol: $output"
5511	} else {
5512		log_debug "[lindex [info level 0] 0]: success from scontrol: $output"
5513	}
5514
5515	return $ret_code
5516}
5517
5518
5519################################################################
5520#
5521# NAME
5522#	update_res - update exisiting reservation in system
5523#
5524# SYNOPSIS
5525#	update_res ?res_name? ?res_params?
5526#
5527# RETURN VALUE
5528#	the exit code of the scontrol command run
5529#
5530################################################################
5531
5532proc update_res { res_name res_params } {
5533	global scontrol
5534
5535	set result   [run_command "$scontrol update ReservationName=$res_name $res_params"]
5536	set output   [dict get $result output]
5537	set ret_code [dict get $result exit_code]
5538
5539	if { $ret_code } {
5540		log_warn "Return code from scontrol: $ret_code. Output: $output"
5541	}
5542
5543	return $ret_code
5544}
5545
5546
5547################################################################
5548#
5549# NAME
5550#	delete_res - delete reservation from system
5551#
5552# SYNOPSIS
5553#	delete_res ?res_name?
5554#
5555# RETURN VALUE
5556#	the exit code of the scontrol command run
5557#
5558################################################################
5559
5560proc delete_res { res_name } {
5561	global scontrol
5562
5563	set result   [run_command "$scontrol delete ReservationName=$res_name"]
5564	set output   [dict get $result output]
5565	set ret_code [dict get $result exit_code]
5566
5567	if { $ret_code } {
5568		log_warn "Return code from scontrol: $ret_code. Output: $output"
5569	}
5570
5571	return $ret_code
5572}
5573
5574
5575################################################################
5576#
5577# NAME
5578#	create_part - creates a partition
5579#
5580# SYNOPSIS
5581#	create_part partition num_nodes
5582#
5583# ARGUMENTS
5584#	partition
5585#		Name of partition to create
5586#	num_nodes
5587#		Number of nodes of partition to create
5588#
5589# RETURN VALUE
5590#	RETURN_SUCCESS, or non-zero on error
5591#
5592################################################################
5593
5594proc create_part { part_name num_nodes_in } {
5595	global scontrol srun bin_printenv number re_word_str
5596
5597	set nodes ""
5598	set num_nodes_out 0
5599
5600	set found 0
5601	spawn $scontrol show partitionname=$part_name
5602	expect {
5603		-re "PartitionName=$part_name" {
5604			set found 1
5605			exp_continue
5606		}
5607		timeout {
5608			fail "scontrol is not responding"
5609		}
5610		eof {
5611			wait
5612		}
5613	}
5614
5615	if {$found == 1} {
5616		log_error "There is already a partition $part_name"
5617		return $::RETURN_ERROR
5618	}
5619
5620	if {[string length [default_partition]] == 0} {
5621		log_warn "create_part does not work without a default partition"
5622		return $::RETURN_ERROR
5623	}
5624
5625	if { $num_nodes_in } {
5626		set num_nodes $num_nodes_in
5627	} else {
5628		set num_nodes [llength [get_nodes_by_state]]
5629	}
5630
5631	log_user 0
5632	# Get a list of nodes
5633	spawn $srun -t1 -N1-$num_nodes $bin_printenv
5634	expect {
5635		-re "SLURM_JOB_NUM_NODES=($number)" {
5636			set num_nodes_out $expect_out(1,string)
5637			exp_continue
5638		}
5639		-re "SLURM_NODELIST=($re_word_str)" {
5640			set nodes $expect_out(1,string)
5641			exp_continue
5642		}
5643		timeout {
5644			log_error "srun is not responding getting number of nodes creating part"
5645			return $::RETURN_TIMEOUT
5646		}
5647		eof {
5648			wait
5649		}
5650	}
5651
5652	if {[string length $nodes] == 0} {
5653		log_error "Did not get a valid node list"
5654		return $::RETURN_ERROR
5655	} elseif { $num_nodes_out != $num_nodes_in } {
5656		log_error "Did not get enough nodes ($num_nodes_out != $num_nodes_in) to run test"
5657		return $::RETURN_ERROR
5658	}
5659
5660	spawn $scontrol create partitionname=$part_name nodes=$nodes
5661	expect {
5662		timeout {
5663			log_error "scontrol is not responding creating partition"
5664			return $::RETURN_ERROR
5665		}
5666		eof {
5667			wait
5668		}
5669	}
5670
5671	set found 0
5672	spawn $scontrol show partitionname=$part_name
5673	expect {
5674		-re "PartitionName=$part_name" {
5675			set found 1
5676			exp_continue
5677		}
5678		timeout {
5679			fail "scontrol is not responding"
5680		}
5681		eof {
5682			wait
5683		}
5684	}
5685
5686	if { $found == 0 } {
5687		log_error "scontrol did not create partition $part_name"
5688		return $::RETURN_ERROR
5689	}
5690	log_user 1
5691
5692	log_debug "Created partition $part_name with $num_nodes_in nodes"
5693	return $::RETURN_SUCCESS
5694}
5695
5696
5697################################################################
5698#
5699# NAME
5700#	get_nodes - returns a dictionary of dictionaries of node parameters
5701#
5702# SYNOPSIS
5703#	get_nodes ?hostlist_expression?
5704#
5705# DESCRIPTION
5706#	Uses `scontrol show node` to query node parameters, returning a
5707#	dictionary of dictionaries with the node names as keys of the first
5708#	level dictionary and with the parameters as keys of the second level
5709#	dictionary. Specifying an invalid node name will result in a failure.
5710#
5711# RETURN VALUE
5712#	If the optional node expression argument is specified, the result will
5713#	be constrained by the specified hostlist expression. Otherwise, the
5714#	results for all nodes will be returned.
5715#
5716################################################################
5717
5718proc get_nodes { {hostlist_expression ""} } {
5719	global scontrol
5720
5721	set command "$scontrol show node -o"
5722	if {$hostlist_expression ne ""} {
5723		append command " $hostlist_expression"
5724	}
5725	set output [run_command_output -fail -nolog "$command"]
5726
5727	# Iterate over each node parameter line
5728	foreach line [split $output "\n"] {
5729		if {$line eq ""} { continue }
5730
5731		# Peel off the node parameters one at a time
5732		# The first quantifier sets the greediness for the whole RE
5733		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
5734			# Remove the consumed parameter from the line
5735			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
5736			# Add it to the temporary node dictionary
5737			dict set node_dict $param_name $param_value
5738		}
5739
5740		set node_name [dict get $node_dict "NodeName"]
5741
5742		# Add the node dictionary to nodes dictionary
5743		dict set nodes_dict $node_name $node_dict
5744
5745		# Clear the node dictionary for use by the next node
5746		set node_dict {}
5747	}
5748
5749	return $nodes_dict
5750}
5751
5752
5753################################################################
5754#
5755# NAME
5756#	get_node_param - returns a specific parameter value for a specific node
5757#
5758# SYNOPSIS
5759#	get_node_param node_name parameter_name
5760#
5761# DESCRIPTION
5762#	Returns a specific parameter value for a specified node if the
5763#	parameter exists for the node, or MISSING if it does not exist.
5764#	Specifying an invalid node name will result in a failure.
5765#
5766################################################################
5767
5768proc get_node_param { node_name parameter_name } {
5769
5770	set nodes_dict [get_nodes $node_name]
5771
5772	if [dict exists $nodes_dict $node_name $parameter_name] {
5773		return [dict get $nodes_dict $node_name $parameter_name]
5774	} else {
5775		return "MISSING"
5776	}
5777}
5778
5779
5780################################################################
5781#
5782# NAME
5783#	get_nodes_by_request - get a list of nodes satisfying requested resources
5784#
5785# SYNOPSIS
5786#	get_nodes_by_request ?options? ?request_args?
5787#
5788# DESCRIPTION
5789#	Using srun (optionally with the specified arguments), returns a list
5790#	of nodes having the requested resources.
5791#	If an error occurs, the invoking test will fail.
5792#
5793# OPTIONS
5794#	-fail
5795#		fail the test if the execution of srun results in an error or timeout
5796#
5797# ARGUMENTS
5798#	request_args
5799#		Desired resources of a node in form of srun arguments,
5800#		e.g. "--gres=gpu:1 -n1 -t1"
5801#
5802# RETURN VALUE
5803#	A list of nodes with at least the requested resources, or an empty
5804#	list otherwise.
5805#
5806################################################################
5807
5808proc get_nodes_by_request args {
5809	global srun
5810
5811	set options [list]
5812	while {[llength $args]} {
5813		switch -glob -- [lindex $args 0] {
5814			-fail {
5815				lappend options [lrange $args 0 0]
5816				set args [lrange $args 1 end]
5817			}
5818			default  break
5819		}
5820	}
5821
5822	if {[llength $args] == 1} {
5823		lassign $args request_args
5824	} elseif {[llength $args] == 0} {
5825		set request_args "-n1 -t1"
5826	} else {
5827		fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args"
5828	}
5829
5830	set command "$srun -Q $request_args printenv SLURMD_NODENAME"
5831	set result [run_command {*}$options $command]
5832
5833	if [dict get $result exit_code] {
5834		return {}
5835	}
5836
5837	set output [dict get $result output]
5838	foreach line [split $output "\n"] {
5839		if {$line eq ""} { continue }
5840		dict incr allocated_nodes $line
5841	}
5842
5843	return [lsort [dict keys $allocated_nodes]]
5844}
5845
5846
5847################################################################
5848#
5849# NAME
5850#	get_partitions - returns a dictionary of dictionaries of partition parameters
5851#
5852# SYNOPSIS
5853#	get_partitions ?partition_name?
5854#
5855# DESCRIPTION
5856#	Uses `scontrol show partitions` to query partition parameters,
5857#	returning a dictionary of dictionaries with the partition names
5858#	as keys of the first level dictionary and with the parameters as
5859#	keys of the second level dictionary. Specifying an invalid partition
5860#	name will result in a failure.
5861#
5862# RETURN VALUE
5863#	If the optional partition_name argument is specified, the result will
5864#	contain only the one patition. Otherwise, the results for all
5865#	partitions will be returned.
5866#
5867################################################################
5868
5869proc get_partitions { {partition_name ""} } {
5870	global scontrol
5871
5872	set command "$scontrol show partition -o"
5873	if {$partition_name ne ""} {
5874		append command " $partition_name"
5875	}
5876	set output [run_command_output -fail -nolog "$command"]
5877
5878	# Iterate over each partition parameter line
5879	foreach line [split $output "\n"] {
5880		if {$line eq ""} { continue }
5881
5882		# Peel off the partition parameters one at a time
5883		# The first quantifier sets the greediness for the whole RE
5884		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
5885			# Remove the consumed parameter from the line
5886			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
5887			# Add it to the temporary node dictionary
5888			dict set part_dict $param_name $param_value
5889		}
5890
5891		set part_name [dict get $part_dict "PartitionName"]
5892
5893		# Add the node dictionary to nodes dictionary
5894		dict set parts_dict $part_name $part_dict
5895
5896		# Clear the node dictionary for use by the next node
5897		set part_dict {}
5898	}
5899
5900	return $parts_dict
5901}
5902
5903
5904################################################################
5905#
5906# NAME
5907#	get_partition_param - returns a specific parameter value for a specific partition
5908#
5909# SYNOPSIS
5910#	get_partition_param partitoin_name parameter_name
5911#
5912# DESCRIPTION
5913#	Returns a specific parameter value for a specified partition if the
5914#	parameter exists for the partition, or MISSING if it does not exist.
5915#	Specifying an invalid partition name will result in a failure.
5916#
5917################################################################
5918
5919proc get_partition_param { partition_name parameter_name } {
5920
5921	set partitions_dict [get_partitions $partition_name]
5922
5923	if [dict exists $partitions_dict $partition_name $parameter_name] {
5924		return [dict get $partitions_dict $partition_name $parameter_name]
5925	} else {
5926		return "MISSING"
5927	}
5928}
5929
5930
5931################################################################
5932#
5933# NAME
5934#	get_jobs - returns a dictionary of dictionaries of job parameters
5935#
5936# SYNOPSIS
5937#	get_jobs ?job_id?
5938#
5939# DESCRIPTION
5940#	Uses `scontrol show job` to return a dictionary of dictionaries of job
5941#	parameters. Specifying an invalid job id will result in a failure.
5942#
5943################################################################
5944
5945proc get_jobs { {job_id_in ""} } {
5946	global scontrol
5947
5948	set command "$scontrol show job -d -o"
5949	if {$job_id_in ne ""} {
5950		append command " $job_id_in"
5951	}
5952	set output [run_command_output -fail "$command"]
5953
5954	# Iterate over each job's parameter list
5955	foreach line [split $output "\n"] {
5956		if {$line eq ""} { continue }
5957
5958		# Peel off the job parameters one at a time
5959		# The first quantifier sets the greediness for the whole RE
5960		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
5961			# Remove the consumed parameter from the line
5962			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]
5963			# Add it to the temporary job dictionary
5964			dict set job_dict $param_name $param_value
5965		}
5966
5967		set job_id [dict get $job_dict "JobId"]
5968
5969		# Add the job dictionary to jobs dictionary
5970		dict set jobs_dict $job_id $job_dict
5971
5972		# Clear the job dictionary for the next job
5973		set job_dict {}
5974	}
5975
5976	return $jobs_dict
5977}
5978
5979
5980################################################################
5981#
5982# NAME
5983#	get_job_param - returns a specific parameter value for a specific job
5984#
5985# SYNOPSIS
5986#	get_job_param job_id parameter_name
5987#
5988# DESCRIPTION
5989#	Returns a specific parameter value for a specified job if the
5990#	parameter exists for the job, or MISSING if it does not exist.
5991#	Specifying an invalid job id will result in a failure.
5992#
5993################################################################
5994
5995proc get_job_param { job_id parameter_name } {
5996
5997	set jobs_dict [get_jobs $job_id]
5998
5999	if [dict exists $jobs_dict $job_id $parameter_name] {
6000		return [dict get $jobs_dict $job_id $parameter_name]
6001	} else {
6002		return "MISSING"
6003	}
6004}
6005
6006proc check_reason { job_id reason } {
6007	global squeue
6008
6009	set found 0
6010	spawn $squeue -j $job_id --noheader -o "%r"
6011	expect {
6012		-re "$reason" {
6013			set found 1
6014			exp_continue
6015		}
6016		timeout {
6017			log_error "squeue not responding"
6018		}
6019		eof {
6020			wait
6021		}
6022	}
6023
6024	if {$found == 0} {
6025		log_error "Job $job_id should have a wait reason of $reason"
6026	}
6027	return $found
6028}
6029
6030
6031################################################################
6032#
6033# NAME
6034#	submit_job - submits a job with sbatch and returns its job id
6035#
6036# SYNOPSIS
6037#	submit_job ?options? job_args
6038#
6039# DESCRIPTION
6040#	Submits a job with sbatch and returns its jobid, or 0 if error.
6041#	It accepts all the options of run_command.
6042#
6043# OPTIONS
6044#	It accepts and passes all the options of/to run_command and also:
6045#	-env env
6046#		Prepend $env to the actual sbatch command to set environment
6047#		variables. For example "-env 'SLURM_NTASKS_PER_GPU=2'".
6048#
6049# ARGUMENTS
6050#	job_args
6051#		a string containing all the arguments to pass to sbatch
6052#
6053# RETURN VALUE
6054#	the job id, or 0 if an error happen
6055#
6056################################################################
6057
6058proc submit_job args {
6059	global sbatch
6060
6061	set env    ""
6062	set job_id 0
6063
6064	set idx [lsearch $args -env]
6065	if {$idx >= 0} {
6066		set env  [lindex   $args [expr $idx+1]]
6067		set args [lreplace $args $idx [expr $idx+1]]
6068	}
6069
6070	if {[llength $args] < 1} {
6071		fail "Wrong number of parameters, should be >=1"
6072	}
6073	set job_options [lindex $args [expr [llength $args] - 1 ]]
6074	set run_options   ""
6075	if {[llength $args] > 1} {
6076		set run_options [lrange $args 0 [expr [llength $args] - 2 ]]
6077	}
6078
6079	set output [run_command_output {*}$run_options "$env $sbatch $job_options"]
6080	regexp {Submitted \S+ job (\d+)} $output - job_id
6081
6082	return $job_id
6083}
6084
6085
6086################################################################################
6087#
6088# NAME
6089#	compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so
6090#
6091# SYNOPSIS
6092#	compile_against_libslurm ?options? test_prog ?build_args?
6093#
6094# DESCRIPTION
6095#	Compile a test program against either libslurm.so or libslurmfull.so.
6096#
6097# OPTIONS
6098#	-full
6099#		use libslurmfull.so instead of libslurm.so
6100#	-shared
6101#		produces a shared library (adds the -shared compiler option
6102#		and adds a .so suffix to the output file name)
6103#
6104# ARGUMENTS
6105#	test_prog
6106#		The name of the test program (and .c file)
6107#	build_args
6108#		Additional string to be appended to the build command.
6109#		E.g. "-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o"
6110#		(initial space will be added automatically).
6111#
6112################################################################################
6113
6114proc compile_against_libslurm args {
6115	global slurm_dir bin_cc src_dir build_dir bin_chmod
6116
6117	set use_full   false
6118	set shared     false
6119	set build_args ""
6120	while {[llength $args]} {
6121		switch -glob -- [lindex $args 0] {
6122			-full    {set use_full true; set args [lrange $args 1 end]}
6123			-shared  {set shared true; set args [lrange $args 1 end]}
6124			-*       {fail "Unknown option: [lindex $args 0]"}
6125			default  break
6126		}
6127	}
6128	set argument_count [llength $args]
6129	if {$argument_count < 1} {
6130		fail "Too few arguments ($argument_count): $args"
6131	} else {
6132		lassign $args test_prog
6133	}
6134	if {$argument_count == 2} { set build_args [lindex $args 1] }
6135	if {$argument_count > 2} {
6136		fail "Too many arguments ($argument_count): $args"
6137	}
6138
6139	if {$use_full} {
6140		set libfile "libslurmfull.so"
6141	} else {
6142		set libfile "libslurm.so"
6143	}
6144
6145	if [file exists $slurm_dir/lib64/slurm/$libfile] {
6146		set libdir "lib64"
6147	} else {
6148		set libdir "lib"
6149	}
6150
6151	if {$use_full} {
6152		set libline "$slurm_dir/$libdir/slurm"
6153		set libfile "slurmfull"
6154	} else {
6155		set libline "$slurm_dir/$libdir"
6156		set libfile "slurm"
6157	}
6158
6159	set build_cmd "$bin_cc ${test_prog}.c -g -pthread"
6160
6161	if {$shared} {
6162		set out "${test_prog}.so"
6163		append build_cmd " -fPIC -shared"
6164	} else {
6165		set out "${test_prog}"
6166	}
6167	append build_cmd " -o $out"
6168
6169	append build_cmd " -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$libline -L$libline -l$libfile -lresolv"
6170
6171	# Add additional arguments to the build command
6172	if {$build_args != ""} {
6173		append build_cmd " $build_args"
6174	}
6175
6176	log_debug "Build command: $build_cmd"
6177	catch {exec {*}$build_cmd} out_str out_dict
6178	if {[dict get $out_dict -code]} {
6179		log_error $out_str
6180		return $::RETURN_ERROR
6181	}
6182
6183	exec $bin_chmod 700 $out
6184	return $::RETURN_SUCCESS
6185}
6186
6187
6188################################################################
6189#
6190# NAME
6191#	subtest - tests a boolean condition and updates subtest tallies
6192#
6193# SYNOPSIS
6194#	subtest ?options? condition description ?diagnostics?
6195#
6196# DESCRIPTION
6197#	Based on the results of testing a boolean expression, increments the
6198#	relevant subtest count (pass, fail or skip) and logs a message.
6199#
6200# OPTIONS
6201#	-fatal
6202#		If the subtest fails, causes a fatal error ending the test
6203#
6204# ARGUMENTS
6205#	condition
6206#		The boolean expression to test
6207#	description
6208#		A single-line string describing what is being tested. This is
6209#		a subtest "name" that is displayed with the log message
6210#		whether the subtest passes or fails
6211#	diagnostics
6212#		A string providing additional diagnostic information that is
6213#		only included in the log message on failure
6214#
6215# RETURN VALUE
6216#	the boolean condition evaluated
6217#
6218# ENVIRONMENT
6219#	testsuite_subtest_fatal
6220#		Specifies whether first failing subtest aborts the test
6221#
6222################################################################
6223
6224proc subtest args {
6225
6226	set options [list]
6227	while {[llength $args]} {
6228		switch -glob -- [lindex $args 0] {
6229			-fatal -
6230			-fail  {
6231				lappend options -fatal
6232				set args [lrange $args 1 end]
6233			}
6234			-*      {fail "Unknown option: [lindex $args 0]"}
6235			default break
6236		}
6237	}
6238	set argument_count [llength $args]
6239	if {$argument_count < 2} {
6240		fail "Too few arguments ($argument_count): $args"
6241	} else {
6242		set args [lassign $args condition description]
6243	}
6244
6245	if [uplevel 1 expr [format "{%s}" $condition]] {
6246		subpass $description
6247		return true
6248	} else {
6249		subfail {*}$options $description {*}$args
6250		return false
6251	}
6252}
6253
6254
6255################################################################
6256#
6257# NAME
6258#	_is_testproc_included - returns if testproc_num was included or not excluded in argv
6259#
6260# SYNOPSIS
6261#	_is_testproc_included testproc_num
6262#
6263# DESCRIPTION
6264#	From command line the test runner can use -i and -e to include or
6265#	exclude some test functions by their number. This function must be
6266#	used to check if the test runner included or excluded the given
6267#	testproc_num.
6268#
6269# RETURN VALUE
6270#	true is testproc_num was included with -i or not excluded -e,
6271#	false otherwise
6272#
6273################################################################
6274
6275proc _is_testproc_included {testproc_num} {
6276	global _testproc_included _testproc_excluded
6277
6278	if {[llength $_testproc_included]} {
6279		if {[lsearch $_testproc_included $testproc_num] >= 0} {
6280			return true
6281		}
6282		return false
6283	}
6284
6285	if {[llength $_testproc_excluded]} {
6286		if {[lsearch $_testproc_excluded $testproc_num] >= 0} {
6287			return false
6288		}
6289	}
6290
6291	return true
6292}
6293
6294
6295################################################################
6296#
6297# NAME
6298#	skip_following_testprocs - the following testproc calls will be skipped
6299#
6300# SYNOPSIS
6301#	skip_following_testprocs reason
6302#
6303# ARGUMENTS
6304#	reason
6305#		The string with the reason message to add on the skip message
6306#		on each skipped testproc.
6307#
6308# DESCRIPTION
6309#	This function disables normal execution of testproc calls.
6310#	It is meant to be used when some testprocs cannot be run due config
6311#	limitations, but still call testproc to register what testprocs
6312#	are skipped for a given reason.
6313#	Use run_following_testprocs to reenable the norma execution of testprocs.
6314
6315################################################################
6316
6317proc skip_following_testprocs {reason} {
6318	global _testproc_skip_next _testproc_skip_reason
6319
6320	set _testproc_skip_next   true
6321	set _testproc_skip_reason $reason
6322}
6323
6324
6325################################################################
6326#
6327# NAME
6328#	run_following_testprocs - the following testproc call will be run (if not excluded from command line)
6329#
6330# SYNOPSIS
6331#	run_following_testprocs
6332#
6333# DESCRIPTION
6334#	This function reenables the normal execution of testproc calls.
6335#	It is meant to be used when skip_following_testprocs was called to skip
6336#	previous testproc calls, and we want to normally run the following ones.
6337#	Note that it does NOT overwrite what -i and -e included are passed
6338#	from command line.
6339#
6340################################################################
6341
6342proc run_following_testprocs {} {
6343	global _testproc_skip_next _testproc_skip_reason
6344
6345	set _testproc_skip_next   false
6346	set _testproc_skip_reason ""
6347}
6348
6349
6350################################################################
6351#
6352# NAME
6353#	testproc - launcher to run or skip a testproc_call
6354#
6355# SYNOPSIS
6356#	testproc testproc_call
6357#
6358# ARGUMENTS
6359#	A testproc_call is any normal call to a proc with any arguments that
6360#	could be done normally without the testproc launcher.
6361#	For example, we could normally do:
6362#
6363#	test_my_feature $some_args $expected_out
6364#
6365#	Or use the launcher like:
6366#
6367#	testproc test_my_feature $some_args $expected_out
6368#
6369# DESCRIPTION
6370#	Using the testproc launcher has the following main benefits:
6371#	a) Handles the -i and -e terminal options to include or exclude some
6372#	   testprocs numbers.
6373#	b) Runs or skips based on the last call of testproc_{skip,run}_following.
6374#	c) Creates extra sections in the status summary
6375#	   (see testsuite_testproc_details).
6376#
6377# RETURN VALUE
6378#	The rc of the testproc_call if it has been run, or $::RETURN_SUCCESS
6379#	otherwise. Using it is not recommended, though.
6380#
6381################################################################
6382
6383proc testproc args {
6384	global _testproc_pass_list _testproc_skip_list _testproc_fail_list
6385	global _testproc_skip_next _testproc_skip_reason
6386	global _testproc_messages
6387	global _subtest_pass_count
6388	global _subtest_skip_count
6389	global _subtest_fail_count
6390
6391	# Save previous subtest counts and next subtest num
6392	set prev_pass $_subtest_pass_count
6393	set prev_skip $_subtest_skip_count
6394	set prev_fail $_subtest_fail_count
6395	set prev_subtest [expr $_subtest_pass_count + \
6396	                       $_subtest_skip_count + \
6397	                       $_subtest_fail_count + 1]
6398
6399	# Get the testproc number
6400	set testproc_num [expr [llength $_testproc_pass_list] + \
6401	                       [llength $_testproc_skip_list] + \
6402	                       [llength $_testproc_fail_list] + 1]
6403
6404	# Run or skip the testproc
6405	set rc $::RETURN_SUCCESS
6406	set reason ""
6407	if {![_is_testproc_included $testproc_num]} {
6408		set reason "(Excluded from command line)"
6409		subskip "Skipping testproc $testproc_num: {$args} $reason"
6410	} else {
6411		if {$_testproc_skip_next} {
6412			set reason "($_testproc_skip_reason)"
6413			subskip  "Skipping testproc $testproc_num: {$args} $reason"
6414		} else {
6415			log_info "Running testproc $testproc_num: $args"
6416			set rc [{*}$args]
6417		}
6418	}
6419
6420	# Get current subtest counts
6421	set curr_subtest [expr $_subtest_pass_count + \
6422	                       $_subtest_skip_count + \
6423	                       $_subtest_fail_count]
6424
6425	# Register the testproc as fail, skip or pass (based on subtests)
6426	if {$_subtest_fail_count > $prev_fail} {
6427		set reason "(Subtests: $prev_subtest to $curr_subtest)"
6428		lappend  _testproc_fail_list $testproc_num
6429		dict set _testproc_messages  $testproc_num [list failed "{$args} $reason"]
6430	} elseif {$_subtest_skip_count > $prev_skip} {
6431		lappend  _testproc_skip_list $testproc_num
6432		dict set _testproc_messages  $testproc_num [list skipped "{$args} $reason"]
6433	} else {
6434		lappend  _testproc_pass_list $testproc_num
6435		dict set _testproc_messages  $testproc_num [list passed "{$args}"]
6436	}
6437
6438	return $rc
6439}
6440
6441
6442################################################################
6443#
6444# NAME
6445#	_log_format - prints a log message with colorization and formatting
6446#
6447# SYNOPSIS
6448#	_log_format log_level message
6449#
6450# DESCRIPTION
6451#	This procedure is called by the log_<level> procedures and
6452#	derives the relevant log level from the caller's procedure name.
6453#
6454# ARGUMENTS
6455#	log_level
6456#		The logging threshold that triggered the log statement
6457#	message
6458#		The message to print with colorization and formatting
6459#
6460# ENVIRONMENT
6461#	testsuite_log_format
6462#		Used as the template for the fields to be output.
6463#		Fields must be expressed in the form:
6464#			%{<field_name>}<format_conversion_specifier>
6465#		Supported fields include the following:
6466#			message
6467#				The log message
6468#			filename
6469#				The file name where the log_<log_level>
6470#				procedure was called from
6471#			lineno
6472#				The line number where the log_<log_level>
6473#				procedure was called from
6474#			timestamp
6475#				The date and time when the log_<log_level>
6476#				procedure was called at
6477#			msecs
6478#				The milliseconds when the log_<log_level>
6479#				procedure was called at
6480#			loglevel
6481#				The log level that triggers the log_<log_level>
6482#				procedure to be called
6483#			backtrace
6484#				An abbreviated call stack trace with line
6485#				numbers
6486#	testsuite_time_format
6487#		Used as a template for the timestamp. See the format groups
6488#		for the tcl clock format command.
6489#	testsuite_colorize
6490#		Boolean that turns colorization on or off
6491#	testsuite_color_<log_level>
6492#		Can be set to define the color used for each log level
6493#
6494################################################################
6495
6496proc _log_format { log_level message } {
6497	global testsuite_colorize testsuite_log_format testsuite_time_format
6498	global COLOR_NONE
6499	global testsuite_color_fatal testsuite_color_error testsuite_color_warn
6500	global testsuite_color_info testsuite_color_pass testsuite_color_command
6501	global testsuite_color_debug testsuite_color_trace
6502
6503	set format_string $testsuite_log_format
6504	set milliseconds_since_epoch [clock milliseconds]
6505	set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format "$testsuite_time_format"]
6506	set milliseconds [expr {$milliseconds_since_epoch % 1000}]
6507
6508	set frame_level -2
6509	while { [dict get [info frame $frame_level] type] != "source" } {
6510		incr frame_level -1
6511	}
6512
6513	set format_args {}
6514	while {[regexp "%{\[a-z]+}" $format_string format_field]} {
6515		if {$format_field eq "%{message}"} {
6516			lappend format_args $message
6517		} elseif {$format_field eq "%{filename}"} {
6518			lappend format_args [file tail [dict get [info frame $frame_level] file]]
6519		} elseif {$format_field eq "%{lineno}"} {
6520			lappend format_args [dict get [info frame $frame_level] line]
6521		} elseif {$format_field eq "%{timestamp}"} {
6522			lappend format_args $date_time
6523		} elseif {$format_field eq "%{msecs}" || $format_field eq "%{milliseconds}"} {
6524			lappend format_args $milliseconds
6525		} elseif {$format_field eq "%{loglevel}" || $format_field eq "%{levelname}"} {
6526			lappend format_args [string totitle $log_level]
6527		} elseif {$format_field eq "%{backtrace}"} {
6528			lappend format_args [_line_trace]
6529		} else {
6530			fail "Invalid field ($format_field) specified in testsuite_log_format"
6531		}
6532		regsub $format_field $format_string "%" format_string
6533	}
6534
6535	if ($testsuite_colorize) {
6536		switch $log_level {
6537			fatal   { append output $testsuite_color_fatal }
6538			error   { append output $testsuite_color_error }
6539			warning { append output $testsuite_color_warn }
6540			info    { append output $testsuite_color_info }
6541			pass    { append output $testsuite_color_pass }
6542			command { append output $testsuite_color_command }
6543			debug   { append output $testsuite_color_debug }
6544			trace   { append output $testsuite_color_trace }
6545		}
6546	}
6547	append output [format $format_string {*}$format_args]
6548	if ($testsuite_colorize) {
6549		append output $COLOR_NONE
6550	}
6551	puts $output
6552}
6553
6554
6555################################################################
6556#
6557# NAME
6558#	_print_header - prints a test header
6559#
6560# SYNOPSIS
6561#	_print_header
6562#
6563# ENVIRONMENT
6564#	testsuite_colorize
6565#		Boolean that turns colorization on or off
6566#	testsuite_color_header
6567#		Can be set to define the color used for the header
6568#
6569################################################################
6570
6571proc _print_header { } {
6572	global test_name testsuite_color_header testsuite_colorize COLOR_NONE
6573
6574	if ($testsuite_colorize) {
6575		append output $testsuite_color_header
6576	}
6577	append output [string repeat = 78]\n
6578	append output [format "%-9s" "TEST:"]${test_name}\n
6579	append output [string repeat = 78]
6580	if ($testsuite_colorize) {
6581		append output $COLOR_NONE
6582	}
6583	puts $output
6584}
6585
6586
6587################################################################
6588#
6589# NAME
6590#	_print_summary - prints the final status summary
6591#
6592# SYNOPSIS
6593#	_print_summary status completed
6594#
6595# ARGUMENTS
6596#	status
6597#		The final status of the test.
6598#		When status is zero, we print SUCCESS.
6599#		When status is negative, we print SKIPPED.
6600#		When status is positive, we print FAILURE.
6601#
6602#	completed
6603#		A boolean value that is true if the test completed and false
6604#		if aborted (ended early with exit status != 0)
6605#
6606# ENVIRONMENT
6607#	testsuite_colorize
6608#		Boolean that turns colorization on or off
6609#	testsuite_color_<test_status>
6610#		Can be set to define the color used for each test status
6611#
6612################################################################
6613
6614proc _print_summary {status completed} {
6615	global test_name testsuite_colorize COLOR_NONE
6616	global testsuite_color_success testsuite_color_skipped
6617	global testsuite_color_failure
6618	global _subtest_fail_count _subtest_pass_count _subtest_skip_count
6619	global _testproc_pass_list _testproc_skip_list _testproc_fail_list
6620	global _incomplete_reason
6621	global _subtest_messages _testproc_messages
6622	global testsuite_subtest_details testsuite_testproc_details
6623
6624	if {$status == 0} {
6625		set color $testsuite_color_success
6626		set header "SUCCESS"
6627	} elseif {$status < 0} {
6628		set color $testsuite_color_skipped
6629		set header "SKIPPED"
6630	} elseif {$status > 0} {
6631		set color $testsuite_color_failure;
6632		set header "FAILURE"
6633	}
6634
6635	if ($testsuite_colorize) {
6636		append output $color
6637	}
6638
6639	append output [string repeat = 78]\n
6640
6641	# Get subtest and testproc counts
6642	set testproc_fail  [llength $_testproc_fail_list]
6643	set testproc_skip  [llength $_testproc_skip_list]
6644	set testproc_pass  [llength $_testproc_pass_list]
6645	set testproc_count [expr $testproc_pass + \
6646	                         $testproc_skip + \
6647	                         $testproc_fail]
6648	set subtest_count  [expr $_subtest_fail_count + \
6649	                         $_subtest_pass_count + \
6650	                         $_subtest_skip_count]
6651
6652	# Initial summary
6653	append output [format "%s             : %s\n" $header  $test_name]
6654	if {$testproc_count > 0} {
6655		append output [format "  Testprocs failed  : %3d (%3d%%)%s\n" $testproc_fail [expr $testproc_fail * 100 / $testproc_count] \
6656		                                                             [expr {$testproc_fail ? " List: [join $_testproc_fail_list ,]" : ""}]]
6657		append output [format "  Testprocs skipped : %3d (%3d%%)%s\n" $testproc_skip [expr $testproc_skip * 100 / $testproc_count] \
6658		                                                             [expr {$testproc_skip ? " List: [join $_testproc_skip_list ,]" : ""}]]
6659		append output [format "  Testprocs passed  : %3d (%3d%%)\n"   $testproc_pass [expr $testproc_pass * 100 / $testproc_count]]
6660		append output [format "  Testprocs total   : %3d %s\n"        $testproc_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]]
6661	}
6662	if {$subtest_count > 0} {
6663		if {$testproc_count > 0} {
6664			append output \n
6665		}
6666		append output [format "  Subtests failed   : %3d (%3d%%)\n" $_subtest_fail_count [expr $_subtest_fail_count * 100 / $subtest_count]]
6667		append output [format "  Subtests skipped  : %3d (%3d%%)\n" $_subtest_skip_count [expr $_subtest_skip_count * 100 / $subtest_count]]
6668		append output [format "  Subtests passed   : %3d (%3d%%)\n" $_subtest_pass_count [expr $_subtest_pass_count * 100 / $subtest_count]]
6669		append output [format "  Subtests total    : %3d %s\n"      $subtest_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]]
6670	}
6671	append output [string repeat = 78]\n
6672
6673	# Detailed information
6674	if {$testsuite_subtest_details ne "none"} {
6675		set show_details false
6676		if {$testsuite_subtest_details eq "all" && $subtest_count > 0} {
6677			set show_details true
6678		} elseif {$testsuite_subtest_details eq "fail_skip" && $_subtest_skip_count > 0} {
6679			set show_details true
6680		} elseif {$_subtest_fail_count > 0} {
6681			set show_details true
6682		}
6683
6684		if {$show_details} {
6685			append output [format "SUBTESTS DETAILS     : %s\n" $test_name]
6686			dict for {id result_msg} $_subtest_messages {
6687				set result [lindex $result_msg 0]
6688				if {$testsuite_subtest_details eq "all"} {
6689					append output [format "  %s\n" [lindex $result_msg 1]]
6690				} elseif {$testsuite_subtest_details eq "fail_skip" && $result eq "skip"} {
6691					append output [format "  %s\n" [lindex $result_msg 1]]
6692				} elseif {$result eq "fail"} {
6693					append output [format "  %s\n" [lindex $result_msg 1]]
6694				}
6695			}
6696			append output [string repeat = 78]\n
6697		}
6698	}
6699	if {$testsuite_testproc_details ne "none"} {
6700		set show_details false
6701		if {$testsuite_testproc_details eq "all" && $testproc_count > 0} {
6702			set show_details true
6703		} elseif {$testsuite_testproc_details eq "fail_skip" && $testproc_skip > 0} {
6704			set show_details true
6705		} elseif {$testproc_fail > 0} {
6706			set show_details true
6707		}
6708
6709		if {$show_details} {
6710			append output [format "TESTPROCS DETAILS     : %s\n" $test_name]
6711			dict for {num result_msg} $_testproc_messages {
6712				set result [lindex $result_msg 0]
6713				if {$testsuite_testproc_details eq "all"} {
6714					append output [format "  Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
6715				} elseif {$testsuite_testproc_details eq "fail_skip" && $result eq "skipped"} {
6716					append output [format "  Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
6717				} elseif {$result eq "failed"} {
6718					append output [format "  Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]]
6719				}
6720			}
6721			append output [string repeat = 78]\n
6722		}
6723	}
6724
6725	if ($testsuite_colorize) {
6726		append output $COLOR_NONE
6727	}
6728
6729	puts -nonewline $output
6730}
6731
6732
6733################################################################
6734#
6735# NAME
6736#	_get_test_name - gets the name of the invoking source script
6737#
6738# SYNOPSIS
6739#	_get_test_name
6740#
6741# RETURN VALUE
6742#	The name of the originally called script
6743#
6744################################################################
6745
6746proc _get_test_name { } {
6747
6748	set test_name unknown
6749	set frame_level 1
6750	while { $frame_level <= [info frame] } {
6751		if { [dict get [info frame $frame_level] type] == "source" } {
6752			set test_name [file tail [dict get [info frame $frame_level] file]]
6753			break
6754		}
6755		incr frame_level
6756	}
6757
6758	return $test_name
6759}
6760
6761
6762################################################################
6763#
6764# NAME
6765#	_test_cleanup - performs the test cleanup
6766#
6767# SYNOPSIS
6768#	_test_cleanup
6769#
6770# DESCRIPTION
6771#	This procedure removes the temporary test_dir and calls the
6772#	test-defined cleanup procedure.
6773#
6774# NOTES
6775#	This function should be called only from _test_init and _test_fini.
6776#
6777################################################################
6778
6779proc _test_cleanup {} {
6780	global log_warn test_dir
6781
6782	set rc 0
6783
6784	# Call global cleanup procedure if it is defined by the test
6785	if {[info procs cleanup] eq "cleanup"} {
6786		if {[catch {cleanup} cleanup_error ]} {
6787			log_error "Cleanup had errors: $cleanup_error"
6788			set rc 1
6789		}
6790	 }
6791
6792	# Remove the temporary test directory
6793	exec rm -rf $test_dir
6794
6795	return $rc
6796}
6797
6798
6799################################################################
6800#
6801# NAME
6802#	_test_init - performs test initialization
6803#
6804# SYNOPSIS
6805#	_test_init
6806#
6807# DESCRIPTION
6808#	This procedure is called automatically at the beginning of each test.
6809#	It prints the header, creates the temporary test dir, etc.
6810#
6811################################################################
6812
6813proc _test_init {} {
6814	global test_dir test_id test_name testsuite_shared_dir
6815	global _testproc_included _testproc_excluded argv
6816
6817	# parse argv to get and remove _testproc_included and _testproc_excluded params
6818	set idx [expr {[info exists argv] ? [lsearch $argv -i] : -1}]
6819	if {$idx >= 0} {
6820		set _testproc_included [split [lindex $argv [expr $idx + 1]] ,]
6821		set argv [lreplace $argv $idx [expr $idx + 1]]
6822	}
6823	set idx [expr {[info exists argv] ? [lsearch $argv -e] : -1}]
6824	if {$idx >= 0} {
6825		set _testproc_excluded [split [lindex $argv [expr $idx + 1]] ,]
6826		set argv [lreplace $argv $idx [expr $idx + 1]]
6827	}
6828
6829	# Set test name to name of originally invoked test script, e.g. test1.1
6830	set test_name [_get_test_name]
6831
6832	# Set test id to suffix of the test script, e.g. 1.1
6833	set test_id [string map {test ""} $test_name]
6834
6835	# Temporary test directory used to stash saved configs, output files...
6836	set test_dir  "$testsuite_shared_dir/${test_name}dir"
6837
6838	# Print test header
6839	_print_header
6840
6841	# Cleanup in case test was not cleaned up on last execution
6842	if { [_test_cleanup] } {
6843		fail "Error in the initial cleanup"
6844	}
6845
6846	# Create temporary shared test directory
6847	exec mkdir -p $test_dir
6848}
6849
6850
6851################################################################
6852#
6853# NAME
6854#	_test_fini - performs test finalization
6855#
6856# SYNOPSIS
6857#	_test_fini ?status?
6858#
6859# DESCRIPTION
6860#	This procedure is called automatically from the ending functions
6861#	pass, skip and failure.
6862#	It cleans up based on the status and the testsuite_cleanup_on_failure
6863#	variable, prints the final test status/summary, and exits the test.
6864#
6865#	We will always cleanup for SUCCESS or SKIPPED tests.
6866#	Whether or not the cleanup procedure is called for FAILURE tests
6867#	depends on the setting of the $testsuite_cleanup_on_failure variable
6868#	which can be set in the globals.local file or overridden with the
6869#	SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable.
6870#
6871################################################################
6872
6873proc _test_fini { status } {
6874	global testsuite_cleanup_on_failure _test_fini_called
6875	global _subtest_fail_count _subtest_skip_count
6876
6877	# Avoid potential infinite recursive calls.
6878	# _test_fini should be called only once, but custom cleanup procs
6879	# called from _test_cleanup can potentially call it (eg fail)
6880	if {$_test_fini_called} {
6881		log_debug "Recursive _test_fini call detected, most probably a fail on a cleanup function"
6882		return
6883	}
6884	set _test_fini_called true
6885
6886
6887	# Determine if test completed or was aborted
6888	set completed [expr $status == 0 ? true : false]
6889
6890	# Override status with subtest status if available and necessary
6891	if {$status < 1} {
6892		if {$_subtest_fail_count > 0} {
6893			set status 1
6894		} elseif {$_subtest_skip_count > 0} {
6895			set status -1
6896		}
6897	}
6898
6899	# Only cleanup if test not failed or configured to do so
6900	if {$status != 1 || $testsuite_cleanup_on_failure} {
6901		_test_cleanup
6902	}
6903	_print_summary $status $completed
6904
6905	__exit $status
6906}
6907
6908
6909################################################################
6910#
6911# Overload the exit routine to ensure that no one is explicitly
6912# calling it, and to enforce _test_fini if exit is called when
6913# the test reach its EOF.
6914#
6915# All tests should exit using pass, skip or fail.
6916#
6917################################################################
6918
6919rename exit __exit
6920
6921proc exit { {status 0} } {
6922	global test_name
6923
6924	# To avoid failures when using "expect -c 'source globals'"
6925	if {$test_name eq "globals"} {
6926		pass
6927	}
6928
6929	#
6930	# Minor sanity check to detect if exit was explicitly called (not
6931	# allowed) or automatically executed when the test ends
6932	#
6933	if {[info level] > 1} {
6934		# exit was called from a function, and it shouldn't
6935		fail "Exit should not be directly called, use pass, skip or fail instead"
6936	} else {
6937		set frame_level 1
6938		while { $frame_level <= [info frame] } {
6939			if { [dict get [info frame $frame_level] type] == "source" } {
6940				if { [file tail [dict get [info frame $frame_level] file]] eq $test_name } {
6941					# exit was called explicitly from the
6942					# test, and it shouldn't
6943					fail "Exit should not be directly called, use pass, skip or fail instead"
6944				}
6945				break
6946			}
6947			incr frame_level
6948		}
6949	}
6950
6951	if {$status != 0} {
6952		fail "Some Expect/TCL exception happen"
6953	}
6954
6955	# The exit was called implicitly when the test ends, allowed but
6956	# _test_fini call enforced
6957	pass
6958}
6959
6960
6961# Call _test_init at the beginning of each test
6962_test_init
6963
6964