1#!/usr/bin/env expect 2############################################################################ 3# Purpose: Establish global state information for Slurm test suite 4# 5# To define site-specific state information, set the values in a file 6# named 'globals.local'. Those values will override any specified here. 7# for example: 8# 9# $ cat globals.local 10# set slurm_dir "/usr/local" 11# set build_dir "/home/mine/SLURM/build_smd" 12# set src_dir "/home/mine/SLURM/slurm.git" 13# set mpicc "/usr/local/bin/mpicc" 14# 15# If you want to have more than one test going at the same time for multiple 16# installs you can have multiple globals.local files and set the 17# SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct 18# globals.local file for your various installs. The file can be named anything, 19# not just globals.local. 20# 21############################################################################ 22# Copyright (C) 2002-2007 The Regents of the University of California. 23# Copyright (C) 2008-2010 Lawrence Livermore National Security. 24# Portions Copyright (C) 2010-2018 SchedMD LLC. 25# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 26# Written by Morris Jette <jette1@llnl.gov> 27# Additions by Joseph Donaghy <donaghy1@llnl.gov> 28# CODE-OCEC-09-009. All rights reserved. 29# 30# This file is part of Slurm, a resource management program. 31# For details, see <https://slurm.schedmd.com/>. 32# Please also read the supplied file: DISCLAIMER. 33# 34# Slurm is free software; you can redistribute it and/or modify it under 35# the terms of the GNU General Public License as published by the Free 36# Software Foundation; either version 2 of the License, or (at your option) 37# any later version. 38# 39# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 40# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 41# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 42# details. 43# 44# You should have received a copy of the GNU General Public License along 45# with Slurm; if not, write to the Free Software Foundation, Inc., 46# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 47############################################################################ 48 49# Avoid sourcing this file multiple times 50if {[info procs exit] eq "exit"} { 51 return 52} 53 54global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo 55global smd squeue sreport srun sstat strigger 56 57################################################################ 58# 59# NAME 60# cset - conditional set 61# 62# SYNOPSIS 63# cset name value 64# 65# DESCRIPTION 66# Conditional set. Only set variable if variable does not yet exist. 67# 68# Input: name -- name of the variable to set 69# value -- value to set to 'name' 70# 71################################################################ 72 73proc cset {name value} { 74 if {![uplevel 1 info exists $name]} { 75 upvar $name tmp 76 set tmp $value 77 } 78} 79 80 81# 82# Defining colors here to be able to use them in globals.local. 83# By default, these colors are bold 84# 85set COLOR_RED "\033\[1;31m" 86set COLOR_RED_NORMAL "\033\[31m" 87set COLOR_ORANGE "\033\[1;38;5;208m" 88set COLOR_YELLOW "\033\[1;33m" 89set COLOR_GREEN "\033\[1;32m" 90set COLOR_BLUE "\033\[1;34m" 91set COLOR_MAGENTA "\033\[1;35m" 92set COLOR_CYAN "\033\[1;36m" 93set COLOR_NONE "\033\[0m" 94 95cset local_globals_file "./globals.local" 96 97# Log level "enum" 98# Define log levels here so they are available in globals.local 99set LOG_LEVEL_QUIET 0 100set LOG_LEVEL_FATAL 1 101set LOG_LEVEL_ERROR 2 102set LOG_LEVEL_WARNING 3 103set LOG_LEVEL_INFO 4 104set LOG_LEVEL_PASS 4 105set LOG_LEVEL_COMMAND 4 106set LOG_LEVEL_DEBUG 5 107set LOG_LEVEL_TRACE 6 108 109if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} { 110 set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE) 111} 112 113if [file exists $local_globals_file] { 114 source $local_globals_file 115} 116 117# 118# Specify the slurm install directory. 119# Used to locate binaries, libraries, and header files. 120# 121cset slurm_dir "/usr" 122cset build_dir "../../" 123cset src_dir "../../" 124cset config_h "${build_dir}/config.h" 125cset sacctmgr "${slurm_dir}/bin/sacctmgr" 126cset sacct "${slurm_dir}/bin/sacct" 127cset salloc "${slurm_dir}/bin/salloc" 128cset sattach "${slurm_dir}/bin/sattach" 129cset sbatch "${slurm_dir}/bin/sbatch" 130cset sbcast "${slurm_dir}/bin/sbcast" 131cset scancel "${slurm_dir}/bin/scancel" 132cset scontrol "${slurm_dir}/bin/scontrol" 133cset sdiag "${slurm_dir}/bin/sdiag" 134cset sgather "${slurm_dir}/bin/sgather" 135cset sh5util "${slurm_dir}/bin/sh5util" 136cset sinfo "${slurm_dir}/bin/sinfo" 137cset smd "${slurm_dir}/bin/smd" 138cset sprio "${slurm_dir}/bin/sprio" 139cset squeue "${slurm_dir}/bin/squeue" 140cset srun "${slurm_dir}/bin/srun" 141cset sreport "${slurm_dir}/bin/sreport" 142cset sshare "${slurm_dir}/bin/sshare" 143cset sstat "${slurm_dir}/bin/sstat" 144cset strigger "${slurm_dir}/bin/strigger" 145 146cset slurmd "${slurm_dir}/sbin/slurmd" 147cset slurmrestd "${slurm_dir}/sbin/slurmrestd" 148 149cset pbsnodes "${slurm_dir}/bin/pbsnodes" 150cset qdel "${slurm_dir}/bin/qdel" 151cset qstat "${slurm_dir}/bin/qstat" 152cset qsub "${slurm_dir}/bin/qsub" 153cset qalter "${slurm_dir}/bin/qalter" 154cset qrerun "${slurm_dir}/bin/qrerun" 155 156cset seff "${slurm_dir}/bin/seff" 157 158cset lsid "${slurm_dir}/bin/lsid" 159cset bjobs "${slurm_dir}/bin/bjobs" 160cset bkill "${slurm_dir}/bin/bkill" 161cset bsub "${slurm_dir}/bin/bsub" 162 163# If using MPICH-2 or other version of MPI requiring pmi libary, use this 164#cset mpicc "/home/jette/mpich2-install/bin/mpicc" 165#cset use_pmi 1 166# OR for other versions of MPICH, use this 167cset mpicc "/usr/local/bin/mpicc" 168cset nvcc "/usr/bin/nvcc" 169cset use_pmi 0 170#cset upcc "/usr/local/bin/upcc" 171cset upcc "/usr/bin/xlupc" 172cset oshcc "/usr/local/bin/oshcc" 173 174cset mpirun "mpirun" 175cset totalviewcli "/usr/local/bin/totalviewcli" 176 177# Set if using "--enable-memory-leak-debug" configuration option 178cset enable_memory_leak_debug 0 179 180# test_prompt: to be used as prompt for interactive shells 181set test_prompt "TEST_PROMPT: " 182# reset_bash_prompt: to be used as command on scripts or interactive jobs 183set reset_bash_prompt "unset PROMPT_COMMAND; unset PS0; export PS1=\"$test_prompt\"" 184 185# 186# Specify locations of other executable files used 187# Only the shell names (e.g. bin_bash) must be full pathnames 188# 189cset bin_awk "awk" 190cset bin_bash [exec which bash | tail -n 1] 191cset bin_cat "cat" 192cset bin_cc "gcc" 193cset bin_chmod "chmod" 194cset bin_cmp "cmp" 195cset bin_cp "cp" 196cset bin_date "date" 197cset bin_diff "diff" 198cset bin_echo "echo" 199cset bin_env "env" 200cset bin_file "file" 201cset bin_id "id" 202cset bin_grep "grep" 203cset bin_head "head" 204cset bin_ln "ln" 205cset bin_perldoc "/usr/bin/perldoc" 206 207# Don't user $bin_hostname unless on a front-end system that 208# doesn't fully use the slurmd, use $bin_printenv SLURMD_NODENAME 209cset bin_hostname "hostname" 210 211cset bin_kill "kill" 212cset bin_make "make" 213cset bin_mv "mv" 214cset bin_od "od" 215cset bin_pkill "pkill" 216cset bin_printenv "printenv" 217cset bin_ps "ps" 218cset bin_pwd "pwd" 219cset bin_rm "rm" 220cset bin_sed "sed" 221cset bin_sleep "sleep" 222cset bin_sort "sort" 223cset bin_sum "sum" 224cset bin_sudo "sudo" 225cset bin_touch "touch" 226cset bin_true "true" 227cset bin_uname "uname" 228cset bin_uniq "uniq" 229cset bin_wc "wc" 230 231# 232# Let the commands complete without expect timing out waiting for a 233# response. Single node jobs submitted to the default partition should 234# be initiated within this number of seconds. 235# for interactive slurm jobs: cset timeout $max_job_delay 236# 237cset max_job_delay 120 238 239# 240# Specify the maximum number of tasks to use in the stress tests. 241# 242cset max_stress_tasks 4 243 244# 245# The error message that the "sleep" command prints when we run "sleep aaa". 246# 247cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)" 248 249# Force LANG, as the expect tests aren't localized 250set ::env(LANG) "en_US.UTF-8" 251 252# Testsuite level variables 253cset testsuite_shared_dir "[$bin_pwd]" 254 255# Testsuite non-privileged user (set it in globals.local) 256cset testsuite_user "" 257 258# Testsuite log variables 259cset testsuite_log_level $LOG_LEVEL_DEBUG 260cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] %{loglevel}-7s %{message}s \(%{backtrace}s)" 261cset testsuite_time_format "%Y-%m-%d %H:%M:%S" 262 263# Default to using color if writing to a terminal and not if writing to a file 264cset testsuite_colorize [dict exists [fconfigure stdout] -mode] 265cset testsuite_color_fatal $COLOR_RED 266cset testsuite_color_error $COLOR_RED_NORMAL 267cset testsuite_color_warn $COLOR_ORANGE 268cset testsuite_color_info $COLOR_YELLOW 269cset testsuite_color_pass $COLOR_GREEN 270cset testsuite_color_command $COLOR_CYAN 271cset testsuite_color_debug $COLOR_BLUE 272cset testsuite_color_trace $COLOR_MAGENTA 273cset testsuite_color_header $COLOR_NONE 274cset testsuite_color_success $COLOR_GREEN 275cset testsuite_color_failure $COLOR_RED 276cset testsuite_color_skipped $COLOR_ORANGE 277 278# Set to true to cause the first subtest failure to immediately end the test 279cset testsuite_subtest_fatal false 280# Set to all, fail_skip, fail or none print datails of subtests and testprocs 281cset testsuite_subtest_details fail 282cset testsuite_testproc_details fail 283 284# To automatically call cleanup or not when ending the test 285cset testsuite_cleanup_on_failure true 286if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} { 287 set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE) 288} 289 290# To avoid potential infinite loops due calls to fail/pass/skip inside 291# custom cleanup procs (_test_fini should be called only once) 292set _test_fini_called false 293 294# Testproc internal variables 295set _testproc_included [list] 296set _testproc_excluded [list] 297set _testproc_pass_list [list] 298set _testproc_skip_list [list] 299set _testproc_fail_list [list] 300set _testproc_messages [dict create] 301set _testproc_skip_next false 302set _testproc_skip_reason "" 303set _incomplete_reason "" 304set _subtest_pass_count 0 305set _subtest_skip_count 0 306set _subtest_fail_count 0 307set _subtest_messages [dict create] 308 309# Other common variables 310set re_word_str "\\S+" 311set digit "\\d" 312set eol "\r?\n" 313set float "\\d+\\.?\\d*" 314set number "\\d+" 315set format_time "\\d+\\:\\d+\\:\\d+" 316set number_with_suffix "\\d+\[KM\]*" 317set slash "/" 318set whitespace "\\s+" 319set controlmachine_regex "\\S+" 320# Any characters except ( , : newline 321set no_delim "\[^(,:\r\n\]" 322set no_delim_slash "\[^(,:/\r\n\]" 323# The first group matches GRES name 324# The second **optional** group matches GRES type. 325# The third group matches GRES count. 326# Test out the regex here: https://regex101.com/r/FlNYKM/7 327set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)" 328 329# 330# Global variable used in multiple functions in "globals" file 331# 332set gpu_sock_list {} 333 334# 335# Procedure return values 336# 337set RETURN_SUCCESS 0 338set RETURN_ERROR 1 339set RETURN_TIMEOUT 110 ; # ETIMEDOUT 340 341 342################################################################ 343# 344# NAME 345# fail - fails a test 346# 347# SYNOPSIS 348# fail message 349# 350# DESCRIPTION 351# To be used when an error is fatal for the test. This routine 352# prints the specified error message, optionally cleans up, prints 353# a final test failure message, and exits the test with exit code 1. 354# 355# ENVIRONMENT 356# Whether or not the cleanup procedure is called depends on the setting 357# of the $testsuite_cleanup_on_failure set in the globals.local file or 358# overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment 359# variable. 360# 361# NOTE 362# DO NOT call this within your local cleanup procedure. 363# 364################################################################ 365 366proc fail { message } { 367 global _incomplete_reason 368 369 # Avoid recursive calls from within cleanup 370 if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { 371 log_error "Local cleanup shouldn't call pass, fail or skip" 372 return 373 } 374 375 log_fatal $message 376 set _incomplete_reason $message 377 378 # _test_fini will handle cleanup and print the failure message. 379 _test_fini 1 380} 381 382 383################################################################ 384# 385# NAME 386# skip - skips a test 387# 388# SYNOPSIS 389# skip message 390# 391# DESCRIPTION 392# To be used when a precondition for the test fails and the test 393# should be skipped. This routine prints the specified warning message, 394# calls the cleanup procedure if defined, prints a final test skipped 395# message, and exits the test with exit code -1 (aka 255). 396# 397# NOTE 398# DO NOT call this within your local cleanup procedure. 399# 400################################################################ 401 402proc skip { message } { 403 global _incomplete_reason 404 405 # Avoid recursive calls from within cleanup 406 if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { 407 log_error "Local cleanup shouldn't call pass, fail or skip" 408 return 409 } 410 411 log_warn $message 412 set _incomplete_reason $message 413 414 # _test_fini will handle cleanup and print the skipped message. 415 _test_fini -1 416} 417 418 419################################################################ 420# 421# NAME 422# pass - passes a test 423# 424# SYNOPSIS 425# pass 426# 427# DESCRIPTION 428# To be used when a test passes and should complete with success. 429# This routine calls the cleanup procedure if defined, prints a final 430# test success message, and exits with exit code 0. 431# 432# NOTE 433# DO NOT call this within your local cleanup procedure. 434# 435################################################################ 436 437proc pass { } { 438 439 # Avoid recursive calls from within cleanup 440 if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { 441 log_error "Local cleanup shouldn't call pass, fail or skip" 442 return 443 } 444 445 # _test_fini will handle cleanup and print the success message. 446 _test_fini 0 447} 448 449 450################################################################ 451# 452# NAME 453# subpass - registers a passing subtest result 454# 455# SYNOPSIS 456# subpass ?description? 457# 458# DESCRIPTION 459# Increments the subtest pass count and logs a passing subtest message 460# 461# ARGUMENTS 462# description 463# A single-line string describing the subtest being verified 464# 465################################################################ 466 467proc subpass args { 468 global _subtest_fail_count _subtest_pass_count _subtest_skip_count 469 global _subtest_messages 470 471 set description "" 472 set argument_count [llength $args] 473 if {$argument_count == 1} { set args [lassign $args description] } 474 if {$argument_count > 1} { 475 fail "Too many arguments ($argument_count): $args" 476 } 477 478 set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] 479 incr _subtest_pass_count 480 set message [format "Subtest %2d passed" $subtest_count] 481 if {$description ne ""} { append message " : $description" } 482 log_pass $message 483 dict set _subtest_messages $subtest_count [list pass $message] 484} 485 486 487################################################################ 488# 489# NAME 490# subfail - registers a failing subtest result 491# 492# SYNOPSIS 493# subfail ?options? ?description? ?diagnostics? 494# 495# DESCRIPTION 496# Increments the subtest failure count and logs a failing subtest message 497# 498# OPTIONS 499# -fatal 500# Causes this subtest failure to be fatal, ending the test 501# ARGUMENTS 502# description 503# A single-line string describing the condition being verified 504# diagnostics 505# A string providing additional diagnostic information that will 506# be included with the log message 507# 508# ENVIRONMENT 509# testsuite_subtest_fatal 510# Specifies whether first failing subtest aborts the test 511# 512################################################################ 513 514proc subfail args { 515 global _subtest_fail_count _subtest_pass_count _subtest_skip_count 516 global testsuite_subtest_fatal 517 global _subtest_messages 518 519 set description "" 520 set fatal false 521 while {[llength $args]} { 522 switch -glob -- [lindex $args 0] { 523 -fatal {set fatal true; set args [lrange $args 1 end]} 524 -* {fail "Unknown option: [lindex $args 0]"} 525 default break 526 } 527 } 528 set argument_count [llength $args] 529 if {$argument_count >= 1} { set args [lassign $args description] } 530 531 set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] 532 incr _subtest_fail_count 533 set message [format "Subtest %2d failed" $subtest_count] 534 if {$description ne ""} { append message " : $description" } 535 if [llength $args] { append message " (" [join $args ", "] ")" } 536 if {$fatal || $testsuite_subtest_fatal} { 537 fail $message 538 } else { 539 log_error $message 540 } 541 dict set _subtest_messages $subtest_count [list fail $message] 542} 543 544 545################################################################ 546# 547# NAME 548# subskip - registers a skipped subtest result 549# 550# SYNOPSIS 551# subskip ?options? ?description? 552# 553# DESCRIPTION 554# Increments the subtest skip count and logs a skipped subtest message 555# 556# OPTIONS 557# -count NUMBER 558# When used with -skip, indicates the number of subtests that 559# were skipped 560# ARGUMENTS 561# description 562# A single-line string describing the reason the subtest is 563# being skipped 564# 565################################################################ 566 567proc subskip args { 568 global _subtest_fail_count _subtest_pass_count _subtest_skip_count 569 global _subtest_messages 570 571 set description "" 572 set count 1 573 while {[llength $args]} { 574 switch -glob -- [lindex $args 0] { 575 -count {set args [lassign $args - count]} 576 -* {fail "Unknown option: [lindex $args 0]"} 577 default break 578 } 579 } 580 set argument_count [llength $args] 581 if {$argument_count == 1} { set args [lassign $args description] } 582 if {$argument_count > 1} { 583 fail "Too many arguments ($argument_count): $args" 584 } 585 586 set subtest_count [expr $_subtest_pass_count + $_subtest_fail_count + $_subtest_skip_count + 1] 587 incr _subtest_skip_count $count 588 if {$count > 1} { 589 set message "Subtest $subtest_count-[expr $subtest_count+$count-1] skipped" 590 } else { 591 set message [format "Subtest %2d skipped" $subtest_count] 592 } 593 if {$description ne ""} { append message " : $description" } 594 log_warn $message 595 dict set _subtest_messages [expr $subtest_count] [list skip $message] 596} 597 598 599################################################################ 600# 601# NAME 602# print_time - prints the current date and time 603# 604# SYNOPSIS 605# print_time 606# 607################################################################ 608 609proc print_time { } { 610 global bin_date 611 612 spawn $bin_date 613 expect { 614 eof { 615 wait 616 } 617 } 618 619 return 620} 621 622################################################################ 623# 624# NAME 625# dict_getdef - 'dict get' with ability to specify the default value 626# 627# SYNOPSIS 628# dict_getdef dictionary_value key default_value 629# 630# DESCRIPTION 631# Tcl < 8.7 lacks a built in 'dict get' with ability to specify the 632# default value. Tcl 8.7 adds a dict getdef. 633# This proc returns the value from the dictionary corresponding to the 634# keys if it exists, or the default value otherwise. 635# 636# EXAMPLE 637# dict_getdef $option_dict action "warn" 638# 639# SOURCE 640# https://core.tcl-lang.org/tips/doc/trunk/tip/342.md 641# https://core.tcl-lang.org/tcl/tktview/2370575 642# 643################################################################ 644 645proc dict_getdef {D args} { 646 if {[dict exists $D {*}[lrange $args 0 end-1]]} then { 647 dict get $D {*}[lrange $args 0 end-1] 648 } else { 649 lindex $args end 650 } 651} 652 653 654################################################################ 655# 656# NAME 657# _line_trace - returns an abbreviated call stack trace with line numbers 658# 659# SYNOPSIS 660# _line_trace 661# 662################################################################ 663 664proc _line_trace {} { 665 set line_trace "" 666 set first_entry true 667 for {set f [expr [info frame] - 3]} {$f >= 1} {incr f -1} { 668 set frame_dict [info frame $f] 669 if [dict exists $frame_dict file] { 670 if [regexp uplevel [dict get $frame_dict cmd]] { 671 continue 672 } 673 if {$first_entry} { 674 set first_entry false 675 } else { 676 append line_trace "," 677 } 678 if [dict exists $frame_dict proc] { 679 set proc [namespace tail [dict get $frame_dict proc]] 680 if {$proc ne ""} { 681 append line_trace "$proc\@" 682 } 683 } 684 append line_trace [file tail [dict get $frame_dict file]] 685 if [dict exists $frame_dict line] { 686 append line_trace ":[dict get $frame_dict line]" 687 } 688 } 689 } 690 return $line_trace 691} 692 693 694################################################################ 695# 696# NAME 697# tolerance - determines whether a value is within a specified tolerance 698# 699# SYNOPSIS 700# tolerance expected observed tolerance_expression 701# 702# ARGUMENTS 703# expected 704# the expected (numeric) value 705# observed 706# the observed (numeric) value 707# tolerance_expression 708# a string of the form: [~][+|-]<tolerance>[%] 709# 710# DESCRIPTION 711# tolerance 712# A numeric tolerance 713# symmetry 714# By default the permitted range of values is symetric: 715# [expected - tolerance, expected + tolerance] 716# If the + sign is specified, the tolerance is limited to the 717# the higher side only: 718# [expected, expected + tolerance] 719# If the - sign is specified, the tolerance is limited to the 720# the lower side only: 721# [expected - tolerance, expected] 722# percent 723# By default the permitted range is computed as absolute values: 724# [expected - tolerance, expected + tolerance] 725# If % is specified, the permitted range is computed as a 726# percentage of the expected value: 727# [expected*(1-tolerance/100), expected*(1+tolerance/100)] 728# exclusivity 729# By default the permitted range of values is inclusive, ie 730# the min and max tolerated values are included in the range: 731# [expected - tolerance, expected + tolerance] 732# If ~ (exclusive) is specified, the tolerance limits are 733# exclusive, ie the min and max tolerated values are excluded: 734# (expected - tolerance, expected + tolerance) 735# expression 736# any combination of symetry, percent and exclusivity is allowed 737# 738# RETURN VALUE 739# Returns true if the observed value is within the specified tolerance 740# range of the expected value, otherwise false 741# 742# EXAMPLES 743# The indicated tolerance_expression is true if: 744# "5" expected - 5 <= observed <= expected + 5 745# "-5" expected - 5 <= observed <= expected 746# "+5" expected <= observed <= expected + 5 747# "5%" expected - 5% <= observed <= expected + 5% 748# "~5" expected - 5 < observed < expected + 5 749# "~+5%" expected <= observed < expected + 5% 750# 751################################################################ 752 753proc tolerance { expected observed tolerance_expression } { 754 if {![regexp {^(~?)([-+]?)([0-9\.]+)(%?)$} $tolerance_expression {} exclusive sign tolerance percent]} { 755 fail "Invalid tolerance expression ($tolerance_expression)" 756 } 757 758 set lower_bound_expression $observed 759 if {$sign eq "+" || $exclusive ne "~"} { 760 append lower_bound_expression " >=" 761 } else { 762 append lower_bound_expression " >" 763 } 764 append lower_bound_expression " $expected" 765 if {$sign eq "-" || $sign eq ""} { 766 if {$percent eq "%"} { 767 append lower_bound_expression " - $tolerance * $expected / 100" 768 } else { 769 append lower_bound_expression " - $tolerance" 770 } 771 } 772 773 set upper_bound_expression $observed 774 if {$sign eq "-" || $exclusive ne "~"} { 775 append upper_bound_expression " <=" 776 } else { 777 append upper_bound_expression " <" 778 } 779 append upper_bound_expression " $expected" 780 if {$sign eq "+" || $sign eq ""} { 781 if {$percent eq "%"} { 782 append upper_bound_expression " + $tolerance * $expected / 100" 783 } else { 784 append upper_bound_expression " + $tolerance" 785 } 786 } 787 788 if {[expr $lower_bound_expression] && [expr $upper_bound_expression]} { 789 log_debug "$observed is within tolerance $tolerance_expression of $expected" 790 return true 791 } else { 792 log_warn "$observed is not within tolerance $tolerance_expression of $expected" 793 return false 794 } 795} 796 797 798################################################################ 799# 800# NAME 801# check_run_as_user - check if the caller may run_command as the supplied user 802# 803# SYNOPSIS 804# check_run_as_user user 805# 806# DESCRIPTION 807# Note that a proper sudo config needs to be set in orther to pass this 808# check. Calling user should be permitted to run_commands as the 809# supplied user using sudo without password. 810# See the -user option of run_command. 811# This proc also log_warn a message if user already exists in the DB 812# because most probably this user is testsuite_user and that user is 813# expected NOT to be in the DB and could potentially be removed from it by 814# the test. 815# 816# RETURN VALUE 817# Returns a boolean value indicating whether the calling user may 818# run_command as user. 819# 820################################################################ 821 822proc check_run_as_user user { 823 global bin_id 824 825 if {$user eq ""} { 826 return false 827 } 828 if {[run_command_status -none -user $user "$bin_id -un"]} { 829 return false 830 } 831 832 if {[get_admin_level $user] != ""} { 833 log_warn "User $user already exists in DB, but it's probable that it's going to be removed by the test cleanup" 834 } 835 836 return true 837} 838 839 840################################################################ 841# 842# NAME 843# run_command - executes a command and returns a dictionary result 844# 845# SYNOPSIS 846# run_command ?options? command 847# 848# DESCRIPTION 849# Executes a command and returns a dictionary that includes the output, 850# exit code, etc. An action can be taken (fail, warn, subtest, none) if 851# the command's exit code is unexpected. By default, the action 852# will be applied if the command fails. If the -xfail option is 853# specified, the behavior will be reversed to apply the action if the 854# command ran successfully. 855# A timeout is always treated as unexpected, so log_error will be shown 856# by default, or fail/subfail will be called if -fail/-subtest are used. 857# 858# OPTIONS 859# -fail 860# If the exit code is unexpected, the action that will 861# be taken is to fail the test 862# -subtest 863# If the exit code is unexpected, the action that will 864# be taken is to subfail, otherwise subpass will be called 865# -warn 866# If the exit code is unexpected, the action that will 867# be taken is to log a warning (this is the default) 868# -none 869# If the exit code is unexpected, no action will be taken 870# -xfail 871# If the command exits with zero the action will be applied. 872# Without this option, the action will be applied if the 873# command exits with a non-zero exit code. 874# -timeout <float_number> 875# Time in seconds to wait for the command to complete before 876# timing out (default is 60.0) 877# -nolog 878# Logging for this command will occur at trace threshold only 879# -stdin 880# Provide standard in to be piped into command 881# -user <user> 882# Attempt to execute command as <user>. Note that sudo must be 883# properly configured to permit the caller to execute as <user>. 884# See check_run_as_user. 885# 886# ARGUMENTS 887# command 888# a string containing the command and arguments to execute 889# 890# RETURN VALUE 891# A dictionary containing the following elements: 892# command - The command that was invoked 893# exit_code - Exit code 894# output - The combined standard output and standard error 895# start_time - The time (with ms) the command was executed 896# duration - The duration (seconds and milliseconds) the 897# command took to run 898# 899################################################################ 900proc run_command args { 901 global bin_bash bin_sudo 902 903 set alt_user "" 904 set exit_status 0 905 set timedout false 906 set output "" 907 set action "warn" 908 set timeout 60 909 set expect_failure false 910 set log_at_trace_level false 911 set stdin "" 912 while {[llength $args]} { 913 switch -glob -- [lindex $args 0] { 914 -fail {set action "fail"; set args [lrange $args 1 end]} 915 -subtest {set action "subtest"; set args [lrange $args 1 end]} 916 -none {set action "none"; set args [lrange $args 1 end]} 917 -timeout {set args [lassign $args - timeout]} 918 -warn {set action "warn"; set args [lrange $args 1 end]} 919 -xfail {set expect_failure true; set args [lrange $args 1 end]} 920 -nolog {set log_at_trace_level true; set args [lrange $args 1 end]} 921 -stdin {set stdin [lindex $args 1]; set args [lassign $args - stdin]} 922 -user {set args [lassign $args - alt_user]} 923 -* {fail "Unknown option: [lindex $args 0]"} 924 default break 925 } 926 } 927 if {[llength $args] == 1} { 928 lassign $args command 929 } else { 930 fail "Invalid number of arguments [llength $args]: $args" 931 } 932 933 if {$action eq "subtest"} { 934 if {$expect_failure} { 935 set test_description "Command \"$command\" should fail" 936 } else { 937 set test_description "Command \"$command\" should succeed" 938 } 939 } 940 941 if {$log_at_trace_level} { 942 interp alias {} log_run {} log_trace 943 interp alias {} log_details {} log_trace 944 } else { 945 interp alias {} log_run {} log_command 946 interp alias {} log_details {} log_debug 947 } 948 949 set orig_log_user [log_user -info] 950 log_user 0 951 952 if {$alt_user ne ""} { 953 log_run "Run Command as user $alt_user: $command" 954 } else { 955 log_run "Run Command: $command" 956 } 957 set start_clock_ms [clock milliseconds] 958 set stty_init raw ; # Prevent the terminal from inserting \r 959 if {$alt_user ne ""} { 960 set expect_pid [spawn -noecho $bin_sudo -nu $alt_user $bin_bash -c "$command"] 961 } else { 962 set expect_pid [spawn -noecho $bin_bash -c "$command"] 963 } 964 if { $stdin != "" } { 965 exp_send "$stdin" 966 set command "$command <<< $stdin" 967 } 968 expect { 969 -re "(.+)" { 970 append output $expect_out(1,string) 971 exp_continue 972 } 973 timeout { 974 slow_kill $expect_pid 975 set exit_status $::RETURN_TIMEOUT 976 set timedout true 977 } 978 eof { 979 lassign [wait] pid spawnid os_error_flag errno 980 set exit_status [expr $errno > 128 ? $errno - 256 : $errno] 981 } 982 } 983 set start_time [format "%.3f" [expr $start_clock_ms / 1000.000]] 984 set end_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] 985 set duration [format "%.3f" [expr $end_time - $start_time]] 986 987 log_details "Command Results:" 988 log_details " Duration: $duration" 989 log_details " Exit Code: $exit_status" 990 if {[info exists output]} { 991 log_details " Output: $output" 992 } 993 994 if {$timedout} { 995 set message "Command \"$command\" timed out after $timeout seconds" 996 if {$action eq "fail"} { 997 fail $message 998 } elseif {$action eq "subtest"} { 999 subfail $test_description $message 1000 } else { 1001 log_error $message 1002 } 1003 } elseif {! $expect_failure && $exit_status != 0} { 1004 set message "Command \"$command\" failed with rc=$exit_status" 1005 if {[info exists output] && $output != ""} { 1006 append message ": [string trimright $output]" 1007 } 1008 if {$action eq "warn"} { 1009 log_warn $message 1010 } elseif {$action eq "subtest"} { 1011 subfail $test_description $message 1012 } elseif {$action eq "fail"} { 1013 fail $message 1014 } 1015 } elseif {$expect_failure && $exit_status == 0} { 1016 set message "Command \"$command\" was expected to fail but succeeeded" 1017 if {$action eq "warn"} { 1018 log_warn $message 1019 } elseif {$action eq "subtest"} { 1020 subfail $test_description $message 1021 } elseif {$action eq "fail"} { 1022 fail $message 1023 } 1024 } elseif {$action eq "subtest"} { 1025 subpass $test_description 1026 } 1027 log_user $orig_log_user 1028 1029 dict set result command $command 1030 dict set result exit_code $exit_status 1031 dict set result output $output 1032 dict set result start_time $start_time 1033 dict set result duration $duration 1034 1035 return $result 1036} 1037 1038 1039################################################################ 1040# 1041# NAME 1042# run_command_output - executes a command and returns the output 1043# 1044# SYNOPSIS 1045# run_command_output ?options? command 1046# 1047# DESCRIPTION 1048# Executes a command and returns a dictionary that includes the output, 1049# exit code, etc. An action can be taken (fail, warn, none) if the 1050# command's exit code or timeout is unexpected. By default, the action 1051# will be applied if the command fails. If the -xfail option is 1052# specified, the behavior will be reversed to apply the action if the 1053# command ran successfully. 1054# 1055# OPTIONS 1056# -fail 1057# if the exit code or timeout is unexpected, the action that will 1058# be taken is to fail the test 1059# -warn 1060# if the exit code or timeout is unexpected, the action that will 1061# be taken is to log a warning (this is the default) 1062# -none 1063# if the exit code or timeout is unexpected, no action will be 1064# taken 1065# -xfail 1066# if the command exits with zero and does not time out, the 1067# action will be applied. Without this option, the action will 1068# be applied if the command exits with a non-zero exit code or 1069# times out. 1070# -timeout <float_number> 1071# time in seconds to wait for the command to complete before 1072# timing out (default is 60.0) 1073# -stdin 1074# Provide standard in to be piped into command 1075# 1076# ARGUMENTS 1077# command 1078# a string containing the command and arguments to execute 1079# 1080# RETURN VALUE 1081# A string containing the combined standard output and standard error 1082# 1083################################################################ 1084 1085proc run_command_output args { 1086 1087 set result [run_command {*}$args] 1088 1089 if [dict exists $result output] { 1090 return [dict get $result output] 1091 } else { 1092 return "" 1093 } 1094} 1095 1096 1097################################################################ 1098# 1099# NAME 1100# run_command_status - executes a command and returns the exit code 1101# 1102# SYNOPSIS 1103# run_command_status ?options? command 1104# 1105# DESCRIPTION 1106# Executes a command and returns a dictionary that includes the output, 1107# exit code, etc. An action can be taken (fail, warn, none) if the 1108# command's exit code or timeout is unexpected. By default, the action 1109# will be applied if the command fails. If the -xfail option is 1110# specified, the behavior will be reversed to apply the action if the 1111# command ran successfully. 1112# 1113# OPTIONS 1114# -fail 1115# if the exit code or timeout is unexpected, the action that will 1116# be taken is to fail the test 1117# -warn 1118# if the exit code or timeout is unexpected, the action that will 1119# be taken is to log a warning (this is the default) 1120# -none 1121# if the exit code or timeout is unexpected, no action will be 1122# taken 1123# -xfail 1124# if the command exits with zero and does not time out, the 1125# action will be applied. Without this option, the action will 1126# be applied if the command exits with a non-zero exit code or 1127# times out. 1128# -timeout <float_number> 1129# time in seconds to wait for the command to complete before 1130# timing out (default is 60.0) 1131# -stdin 1132# Provide standard in to be piped into command 1133# 1134# ARGUMENTS 1135# command 1136# a string containing the command and arguments to execute 1137# 1138# RETURN VALUE 1139# The exit code for the invoked command 1140# 1141################################################################ 1142 1143proc run_command_status args { 1144 1145 set result [run_command {*}$args] 1146 1147 return [dict get $result exit_code] 1148} 1149 1150 1151################################################################ 1152# 1153# NAME 1154# cancel_job - cancels the specified job list 1155# 1156# SYNOPSIS 1157# cancel_job job_id_list ?het_job? 1158# 1159# ARGUMENTS 1160# job_id_list 1161# The list of Slurm job ids that we want to cancel 1162# het_job 1163# 1 if jobs are hetjobs and we want to confirm each 1164# component has completed 1165# 1166# DESCRIPTION 1167# Cancels one or more jobs. A job_id of 0 will be silently ignored. 1168# 1169# RETURN VALUE 1170# RETURN_SUCCESS if jobs are cancelled, or non-zero value otherwise. 1171# 1172################################################################ 1173 1174proc cancel_job { job_id_list {het_job 0}} { 1175 global scancel 1176 1177 set job_list_clean [list] 1178 foreach job_id $job_id_list { 1179 if {$job_id != 0} { 1180 lappend job_list_clean $job_id 1181 } 1182 } 1183 1184 log_debug "Cancelling $job_list_clean" 1185 if {[run_command_status "$scancel -Q $job_list_clean"]} { 1186 log_warn "scancel command returned error" 1187 return $::RETURN_ERROR 1188 } 1189 foreach job_id $job_list_clean { 1190 if {[wait_for_job $job_id "DONE" $het_job]} { 1191 log_warn "Job $job_id not ended" 1192 return $::RETURN_ERROR 1193 } 1194 } 1195 return $::RETURN_SUCCESS 1196} 1197 1198 1199################################################################ 1200# 1201# NAME 1202# get_line_cnt - returns the size of the specified file 1203# 1204# SYNOPSIS 1205# get_line_cnt file_name 1206# 1207# RETURN VALUE 1208# Number of lines in the specified file. 1209# 1210################################################################ 1211 1212proc get_line_cnt { file_name } { 1213 global bin_wc number 1214 set lines 0 1215 spawn $bin_wc -l $file_name 1216 expect { 1217 -re "($number) " { 1218 set lines $expect_out(1,string) 1219 exp_continue 1220 } 1221 eof { 1222 wait 1223 } 1224 } 1225 return $lines 1226} 1227 1228 1229################################################################ 1230# 1231# NAME 1232# slow_kill - kills a process slowly 1233# 1234# SYNOPSIS 1235# slow_kill pid 1236# 1237# DESCRIPTION 1238# Kill a process slowly, first trying SIGINT, pausing for 1239# a second, then sending SIGKILL. 1240# 1241# RETURN VALUE 1242# A non-zero return code indicates a failure. 1243# 1244################################################################ 1245 1246proc slow_kill { pid } { 1247 global bin_kill 1248 1249 catch {exec $bin_kill -INT $pid} 1250 catch {exec $bin_kill -INT $pid} 1251 sleep 1 1252 catch {exec $bin_kill -KILL $pid} 1253 1254 return 0 1255} 1256 1257 1258################################################################ 1259# 1260# NAME 1261# get_my_id - gets the id from the running user 1262# 1263# SYNOPSIS 1264# get_my_id 1265# 1266# RETURN VALUE 1267# output of id 1268# 1269################################################################ 1270 1271proc get_my_id {} { 1272 1273 global bin_id number 1274 set login_info -1 1275 1276 log_user 0 1277 1278 spawn $bin_id 1279 expect { 1280 -re "(uid=.*\n)" { 1281 set login_info $expect_out(1,string) 1282 exp_continue 1283 } 1284 eof { 1285 wait 1286 } 1287 } 1288 1289 log_user 1 1290 1291 if {$login_info == -1} { 1292 fail "Unable to get user info" 1293 } 1294 1295 return $login_info 1296} 1297 1298 1299################################################################ 1300# 1301# NAME 1302# get_my_user_name - gets the name from the running user 1303# 1304# SYNOPSIS 1305# get_my_user_name 1306# 1307# RETURN VALUE 1308# A non-zero return code indicates a failure. 1309# 1310################################################################ 1311 1312proc get_my_user_name { } { 1313 global bin_id re_word_str 1314 1315 set user_name -1 1316 1317 log_user 0 1318 spawn $bin_id -nu 1319 expect { 1320 -re "($re_word_str)" { 1321 set user_name $expect_out(1,string) 1322 exp_continue 1323 } 1324 eof { 1325 wait 1326 } 1327 } 1328 log_user 1 1329 1330 if {$user_name == -1} { 1331 fail "Unable to get user name" 1332 } 1333 1334 return $user_name 1335} 1336 1337 1338################################################################ 1339# 1340# NAME 1341# get_my_uid - gets the uid from the running user 1342# 1343# SYNOPSIS 1344# get_my_uid 1345# 1346# RETURN VALUE 1347# The uid of the current user, or fails. 1348# 1349################################################################ 1350 1351proc get_my_uid { } { 1352 global bin_id number 1353 1354 set out [run_command_output -nolog -fail "$bin_id -u"] 1355 if {![regexp "($number)" $out - uid]} { 1356 fail "Unable to get UID with $bin_id ($out)" 1357 } 1358 1359 return $uid 1360} 1361 1362 1363################################################################ 1364# 1365# NAME 1366# get_my_gid - gets the gid from the running user 1367# 1368# SYNOPSIS 1369# get_my_gid 1370# 1371# RETURN VALUE 1372# A non-zero return code indicates a failure. 1373# 1374################################################################ 1375 1376proc get_my_gid { } { 1377 global bin_id number 1378 1379 set gid -1 1380 1381 log_user 0 1382 spawn $bin_id -g 1383 expect { 1384 -re "($number)" { 1385 set gid $expect_out(1,string) 1386 exp_continue 1387 } 1388 eof { 1389 wait 1390 } 1391 } 1392 log_user 1 1393 1394 return $gid 1395} 1396 1397 1398################################################################ 1399# 1400# NAME 1401# kill_salloc - kills all salloc commands associated with this user 1402# 1403# SYNOPSIS 1404# kill_salloc 1405# 1406# DESCRIPTION 1407# Kill all salloc commands associated with this user. 1408# Issue two SIGINT, sleep 1 and a SIGKILL 1409# 1410# RETURN VALUE 1411# A non-zero return code indicates a failure. 1412# 1413# NOTE 1414# Use slow_kill instead of kill_salloc if you can capture 1415# the process id 1416# 1417################################################################ 1418 1419proc kill_salloc { } { 1420 global bin_id bin_pkill bin_sleep number 1421 1422 set uid [get_my_uid] 1423 catch {exec $bin_pkill -INT -u $uid salloc} 1424 catch {exec $bin_pkill -INT -u $uid salloc} 1425 sleep 1 1426 catch {exec $bin_pkill -KILL -u $uid salloc} 1427 1428 return 0 1429} 1430 1431 1432################################################################ 1433# 1434# NAME 1435# kill_srun - kills all srun commands associated with this user 1436# 1437# SYNOPSIS 1438# kill_srun 1439# 1440# DESCRIPTION 1441# Kill all srun commands associated with this user. 1442# Issue two SIGINT, sleep 1 and a SIGKILL 1443# 1444# RETURN VALUE 1445# A non-zero return code indicates a failure. 1446# 1447# NOTE 1448# Use slow_kill instead of kill_srun if you can capture 1449# the process id 1450# 1451################################################################ 1452 1453proc kill_srun { } { 1454 global bin_id bin_pkill bin_sleep number 1455 1456 set uid [get_my_uid] 1457 catch {exec $bin_pkill -INT -u $uid srun} 1458 catch {exec $bin_pkill -INT -u $uid srun} 1459 sleep 1 1460 catch {exec $bin_pkill -KILL -u $uid srun} 1461 1462 return 0 1463} 1464 1465 1466################################################################ 1467# 1468# NAME 1469# wait_for - generic wait utility 1470# 1471# SYNOPSIS 1472# wait_for ?options? condition body 1473# 1474# DESCRIPTION 1475# Generic wait utility allowing you to repeatedly execute a generic block 1476# of code until a specified boolean expression is met. The code block and 1477# condition check occur every poll interval until a timeout is reached. 1478# 1479# OPTIONS 1480# -fail 1481# abort the test with failure if the condition is not met 1482# -timeout <float_number> 1483# time in seconds to wait for the condition to be met before 1484# timing out (default is 60.0) 1485# -pollinterval <float_number> 1486# time in seconds between each loop execution and condition check 1487# (default is 1.0) 1488# 1489# ARGUMENTS 1490# condition 1491# The boolean expression to test 1492# body 1493# A block of code to evaluate in the invoking stack frame 1494# 1495# RETURN VALUE 1496# RETURN_SUCCESS if the condition is met before the timeout occurs, 1497# RETURN_TIMEOUT if the timeout occurs before the condition is met 1498# 1499################################################################ 1500 1501proc wait_for args { 1502 set fatal false 1503 set timeout 60 1504 set poll_interval 1 1505 while {[llength $args]} { 1506 switch -glob -- [lindex $args 0] { 1507 -fail {set fatal true; set args [lrange $args 1 end]} 1508 -time* {set args [lassign $args - timeout]} 1509 -poll* {set args [lassign $args - poll_interval]} 1510 -* {fail "Unknown option: [lindex $args 0]"} 1511 default break 1512 } 1513 } 1514 if {[llength $args] == 2} { 1515 lassign $args condition body 1516 } else { 1517 fail "Invalid number of arguments [llength $args]: $args" 1518 } 1519 1520 set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1521 1522 log_debug "Waiting for $condition" 1523 1524 while {1} { 1525 # Evaluate code block 1526 log_trace "Evaluating code block ([string trim $body])" 1527 uplevel $body 1528 1529 # Check condition 1530 if {[uplevel expr [format "{%s}" $condition]]} { 1531 set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1532 log_debug "Condition ($condition) was met" 1533 return $::RETURN_SUCCESS 1534 } else { 1535 log_trace "Condition ($condition) was not met" 1536 } 1537 1538 # Sleep poll interval 1539 log_trace "Sleeping for $poll_interval seconds" 1540 after [expr {int($poll_interval * 1000)}] 1541 1542 # Check if we have surpassed our timeout 1543 set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1544 log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" 1545 if {$now > $start_time + $timeout} { 1546 set message "Condition ($condition) did not occur before timeout ($timeout) seconds" 1547 if {$fatal} { 1548 fail $message 1549 } else { 1550 log_warn $message 1551 return $::RETURN_TIMEOUT 1552 } 1553 } 1554 } 1555} 1556 1557 1558################################################################ 1559# 1560# NAME 1561# wait_for_command - repeat a command until it is successful or meets a specified condition 1562# 1563# SYNOPSIS 1564# wait_for_command ?options? command ?condition? 1565# 1566# DESCRIPTION 1567# A command is repeated until it meets a condition or a timeout is reached. 1568# If a condition is not specified, the command will be repeated until it 1569# is successful (the exit code is zero). 1570# 1571# OPTIONS 1572# -fail 1573# abort the test with failure if the condition is not met by 1574# the timeout 1575# -timeout <float_number> 1576# time in seconds to wait for the condition to be met before 1577# timing out (default is 60.0) 1578# -pollinterval <float_number> 1579# time in seconds between each loop execution and condition 1580# check (default is 1.0) 1581# 1582# ARGUMENTS 1583# command 1584# a string containing the command and arguments to execute 1585# condition 1586# The boolean expression to test. For each command invocation, 1587# the result variable will be set to the dictionary returned 1588# from run_command. 1589# The condition expression will normally involve a comparison 1590# with one or more values of this dictionary. If a condition is 1591# not specified, this condition will be used: 1592# { [dict get $result exit_code] == 0 } 1593# 1594# RETURN VALUE 1595# RETURN_SUCCESS if the condition is met before the timeout occurs, 1596# RETURN_TIMEOUT if the timeout occurs before the condition is met 1597# 1598################################################################ 1599 1600proc wait_for_command args { 1601 set fatal false 1602 set timeout 60 1603 set poll_interval 1 1604 while {[llength $args]} { 1605 switch -glob -- [lindex $args 0] { 1606 -fail {set fatal true; set args [lrange $args 1 end]} 1607 -time* {set args [lassign $args - timeout]} 1608 -poll* {set args [lassign $args - poll_interval]} 1609 -* {fail "Unknown option: [lindex $args 0]"} 1610 default break 1611 } 1612 } 1613 1614 set argument_count [llength $args] 1615 if {$argument_count < 1} { 1616 fail "Too few arguments ($argument_count): $args" 1617 } elseif {$argument_count > 2} { 1618 fail "Too many arguments ($argument_count): $args" 1619 } 1620 lassign $args command 1621 if {$argument_count == 2} { 1622 set condition [lindex $args 1] 1623 } else { 1624 set condition { [dict get $result exit_code] == 0 } 1625 } 1626 1627 set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1628 1629 log_debug "Waiting for $condition" 1630 1631 while {1} { 1632 # Run command 1633 set result [run_command $command] 1634 1635 # Check condition 1636 if {[eval expr [format "{%s}" $condition]]} { 1637 set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1638 log_debug "Condition ($condition) was met" 1639 return $::RETURN_SUCCESS 1640 } else { 1641 log_trace "Condition ($condition) was not met" 1642 } 1643 1644 # Sleep poll interval 1645 log_trace "Sleeping for $poll_interval seconds" 1646 after [expr {int($poll_interval * 1000)}] 1647 1648 # Check if we have surpassed our timeout 1649 set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] 1650 log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" 1651 if {$now > $start_time + $timeout} { 1652 set message "Condition ($condition) did not occur before timeout ($timeout) seconds" 1653 if {$fatal} { 1654 fail $message 1655 } else { 1656 log_warn $message 1657 return $::RETURN_TIMEOUT 1658 } 1659 } 1660 } 1661} 1662 1663 1664################################################################ 1665# 1666# NAME 1667# wait_for_command_match - repeat a command until its output matches the specified pattern 1668# 1669# SYNOPSIS 1670# wait_for_command_match ?options? command pattern 1671# 1672# DESCRIPTION 1673# A command is repeated until its output matches the specified pattern 1674# 1675# OPTIONS 1676# -fail 1677# abort the test with failure if output does not match the pattern 1678# by the timeout 1679# -timeout <float_number> 1680# time in seconds to wait for the pattern to be matched before 1681# timing out (default is 60.0) 1682# -pollinterval <float_number> 1683# time in seconds between each loop execution and match check 1684# (default is 1.0) 1685# 1686# ARGUMENTS 1687# command 1688# a string containing the command and arguments to execute 1689# pattern 1690# The regular expression to match against the command output 1691# 1692# RETURN VALUE 1693# RETURN_SUCCESS if the pattern is matched before the timeout occurs, 1694# RETURN_TIMEOUT if the timeout occurs before the pattern is matched 1695# 1696################################################################ 1697 1698proc wait_for_command_match args { 1699 1700 set pattern [lindex $args end] 1701 set args [lrange $args 0 end-1] 1702 1703 return [wait_for_command {*}$args "\[regexp {$pattern} \[dict get \$result output\]\] == 1"] 1704} 1705 1706 1707################################################################ 1708# 1709# NAME 1710# wait_for_file - waits for a file to exist with non-zero size 1711# 1712# SYNOPSIS 1713# wait_for_file ?options? file_name 1714# 1715# OPTIONS 1716# -fail 1717# If an error occurs or the file does not become present 1718# by the timeout, fail the test rather than returning an error 1719# -timeout <integer_number> 1720# time in seconds to wait for the file to exist before 1721# timing out (default is 90) 1722# -pollinterval <integer_number> 1723# time in seconds between each file existence test (default is 1) 1724# 1725# DESCRIPTION 1726# Wait for the specified file to exist and have a non-zero size. 1727# Note that if JobFileAppend=0 is configured, a file can exist and 1728# be purged then be re-created. 1729# 1730# RETURN VALUE 1731# RETURN_SUCCESS if the file becomes present within the timeout, or 1732# non-zero value otherwise. 1733# 1734################################################################ 1735 1736proc wait_for_file args { 1737 global bin_sleep 1738 1739 set fatal false 1740 set timeout 90 1741 set poll_interval 1 1742 while {[llength $args]} { 1743 switch -glob -- [lindex $args 0] { 1744 -fatal - 1745 -fail {set fatal true; set args [lrange $args 1 end]} 1746 -time* {set args [lassign $args - timeout]} 1747 -poll* {set args [lassign $args - poll_interval]} 1748 -* {fail "Unknown option: [lindex $args 0]"} 1749 default break 1750 } 1751 } 1752 set argument_count [llength $args] 1753 if {$argument_count != 1} { 1754 fail "Invalid number of arguments ($argument_count): $args" 1755 } else { 1756 lassign $args file_name 1757 } 1758 1759 for {set my_delay 0} {$my_delay <= $timeout} \ 1760 {set my_delay [expr $my_delay + $poll_interval]} { 1761 if {[file exists $file_name]} { 1762 # Add small delay for I/O buffering 1763 exec $bin_sleep 1 1764 return $::RETURN_SUCCESS 1765 } 1766 exec $bin_sleep $poll_interval 1767 1768 # Expect may fail to load current NFS info. 1769 # Use the ls command to load current info. 1770 set slash_pos [string last $file_name "/"] 1771 if {$slash_pos < 1} { 1772 set dir_name "." 1773 } else { 1774 decr slash_pos 1775 set dir_name [string $file_name 0 $slash_pos] 1776 } 1777 exec /bin/ls $dir_name 1778 } 1779 set message "Timeout waiting for file $file_name" 1780 if {$fatal} { 1781 fail $message 1782 } 1783 1784 log_error $message 1785 return $::RETURN_TIMEOUT 1786} 1787 1788 1789################################################################ 1790# 1791# NAME 1792# _wait_for_single_job - waits for a job to reach the desired state 1793# 1794# SYNOPSIS 1795# _wait_for_single_job ?options? job_id desired_state 1796# 1797# DESCRIPTION 1798# Wait for a previously submitted Slurm job to reach the desired state. 1799# 1800# OPTIONS 1801# -fail 1802# If an error occurs or the job does not reach the desired state 1803# by the timeout, fail the test rather than returning an error 1804# -timeout <integer_number> 1805# time in seconds to wait for the job to be in the desired state 1806# before timing out (default is 360) 1807# -pollinterval <integer_number> 1808# time in seconds between each job state check (default is 1) 1809# 1810# ARGUMENTS 1811# job_id 1812# The Slurm job id of a job we want to wait for. 1813# desired_state 1814# The state you want the job to attain before 1815# returning. Currently supports: 1816# DONE any terminated state 1817# PENDING job is pending 1818# RUNNING job is running 1819# SPECIAL_EXIT 1820# SUSPENDED job is suspended 1821# 1822# RETURN VALUE 1823# RETURN_SUCCESS, or non-zero on error. 1824# 1825# NOTE: We sleep for two seconds before replying that a job is 1826# done to give time for I/O completion (stdout/stderr files) 1827# 1828################################################################ 1829 1830proc _wait_for_single_job args { 1831 global scontrol 1832 1833 set fatal false 1834 set timeout 360 1835 set poll_interval 1 1836 while {[llength $args]} { 1837 switch -glob -- [lindex $args 0] { 1838 -fatal - 1839 -fail {set fatal true; set args [lrange $args 1 end]} 1840 -time* {set args [lassign $args - timeout]} 1841 -poll* {set args [lassign $args - poll_interval]} 1842 -* {fail "Unknown option: [lindex $args 0]"} 1843 default break 1844 } 1845 } 1846 set argument_count [llength $args] 1847 if {$argument_count != 2} { 1848 fail "Invalid number of arguments ($argument_count): $args" 1849 } else { 1850 lassign $args job_id desired_state 1851 } 1852 1853 # First verify that desired_state is supported 1854 switch $desired_state { 1855 "DONE" {} 1856 "PENDING" {} 1857 "RUNNING" {} 1858 "SPECIAL_EXIT" {} 1859 "SUSPENDED" {} 1860 default { 1861 set message "Invalid desired state: $desired_state" 1862 if {$fatal} { 1863 fail $message 1864 } 1865 log_warn $message 1866 return $::RETURN_ERROR 1867 } 1868 } 1869 1870 if {$job_id == 0} { 1871 set message "Invalid job ID: $job_id" 1872 if {$fatal} { 1873 fail $message 1874 } 1875 log_warn $message 1876 return $::RETURN_ERROR 1877 } 1878 1879 set my_delay 0 1880 while 1 { 1881 set fd [open "|$scontrol -o show job $job_id"] 1882 gets $fd line 1883 catch {close $fd} 1884 if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} { 1885 set state "NOT_FOUND" 1886 } 1887 1888 switch $state { 1889 "NOT_FOUND" - 1890 "BOOT_FAIL" - 1891 "CANCELLED" - 1892 "COMPLETED" - 1893 "DEADLINE" - 1894 "FAILED" - 1895 "NODE_FAIL" - 1896 "OUT_OF_MEMORY" - 1897 "PREEMPTED" - 1898 "TIMEOUT" { 1899 if {[string compare $desired_state "DONE"] == 0} { 1900 log_debug "Job $job_id is DONE ($state)" 1901 sleep 2 1902 return $::RETURN_SUCCESS 1903 } 1904 if {[string compare $desired_state "RUNNING"] == 0} { 1905 set message "Job $job_id is $state, but we wanted RUNNING" 1906 } 1907 if {[string compare $desired_state "SUSPENDED"] == 0} { 1908 set message "Job $job_id is $state, but we wanted SUSPENDED" 1909 } 1910 if {$fatal} { 1911 fail $message 1912 } 1913 log_debug $message 1914 return $::RETURN_ERROR 1915 } 1916 "PENDING" { 1917 if {[string compare $desired_state "PENDING"] == 0} { 1918 log_debug "Job $job_id is PENDING" 1919 return $::RETURN_SUCCESS 1920 } 1921 log_debug "Job $job_id is in state $state, desire $desired_state" 1922 } 1923 "RUNNING" { 1924 if {[string compare $desired_state "RUNNING"] == 0} { 1925 log_debug "Job $job_id is RUNNING" 1926 return $::RETURN_SUCCESS 1927 } 1928 log_debug "Job $job_id is in state $state, desire $desired_state" 1929 } 1930 "SPECIAL_EXIT" { 1931 if {[string compare $desired_state "SPECIAL_EXIT"] == 0} { 1932 log_debug "Job $job_id is SPECIAL_EXIT" 1933 return $::RETURN_SUCCESS 1934 } 1935 log_debug "Job $job_id is in state $state, desire $desired_state" 1936 } 1937 "SUSPENDED" { 1938 if {[string compare $desired_state "SUSPENDED"] == 0} { 1939 log_debug "Job $job_id is SUSPENDED" 1940 return $::RETURN_SUCCESS 1941 } 1942 log_debug "Job $job_id is in state $state, desire $desired_state" 1943 } 1944 default { 1945 log_debug "Job $job_id is in state $state, desire $desired_state" 1946 } 1947 } 1948 1949 if { $my_delay > $timeout } { 1950 set message "Timeout waiting for job state $desired_state" 1951 if {$fatal} { 1952 fail $message 1953 } 1954 log_warn "Timeout waiting for job state $desired_state" 1955 return $::RETURN_TIMEOUT 1956 } 1957 1958 exec sleep $poll_interval 1959 set my_delay [expr $my_delay + $poll_interval] 1960 } 1961} 1962 1963 1964################################################################ 1965# 1966# NAME 1967# wait_for_job - waits for job to be in desired state 1968# 1969# SYNOPSIS 1970# wait_for_job ?options? job_id desired_state ?het_job? 1971# 1972# DESCRIPTION 1973# Wait for job to be in desired state. Can handle het job components. 1974# 1975# OPTIONS 1976# -fail 1977# If an error occurs or the job does not reach the desired state 1978# by the timeout, fail the test rather than returning an error 1979# -timeout <integer_number> 1980# time in seconds to wait for the job to be in the desired state 1981# before timing out (default is 90) 1982# -pollinterval <integer_number> 1983# time in seconds between each job state check (default is 1) 1984# 1985# ARGUMENTS 1986# job_id 1987# The Slurm job id of a job we want to wait for. 1988# desired_state 1989# The state you want the job to attain before returning. 1990# Currently supports: 1991# DONE any terminated state 1992# PENDING job is pending 1993# RUNNING job is running 1994# SPECIAL_EXIT 1995# SUSPENDED job is suspended 1996# het_job 1997# If set, checks the state of each component job if the job 1998# is a het one. 1999# 2000# RETURN VALUE 2001# RETURN_SUCCESS if job reaches the desired state, or non-zero value 2002# otherwise. 2003# 2004# SEE ALSO 2005# _wait_for_single_job 2006# 2007################################################################ 2008 2009proc wait_for_job args { 2010 2011 set options [list] 2012 set het_job 0 2013 2014 while {[llength $args]} { 2015 switch -glob -- [lindex $args 0] { 2016 -fatal - 2017 -fail { 2018 lappend options [lindex $args 0] 2019 set args [lrange $args 1 end] 2020 } 2021 -time* - 2022 -poll* { 2023 lappend options {*}[lrange $args 0 1] 2024 set args [lrange $args 2 end] 2025 } 2026 default break 2027 } 2028 } 2029 set argument_count [llength $args] 2030 if {$argument_count < 2} { 2031 fail "Too few arguments ($argument_count): $args" 2032 } elseif {$argument_count > 3} { 2033 fail "Too many arguments ($argument_count): $args" 2034 } else { 2035 lassign $args job_id desired_state 2036 } 2037 if {$argument_count == 3} { set hetjob [lindex $args 2] } 2038 2039 if { $het_job } { 2040 # get component job ids 2041 set jid_list [get_het_job_ids $job_id 1] 2042 } 2043 2044 set rc 0 2045 set jid_list "" 2046 if { $jid_list == "" } { 2047 # non-het job 2048 set jid_list $job_id 2049 } 2050 2051 foreach jid $jid_list { 2052 set rc [_wait_for_single_job {*}$options $jid $desired_state] 2053 if { $rc } { 2054 # bail out on first failure 2055 break 2056 } 2057 } 2058 return $rc 2059} 2060 2061 2062################################################################ 2063# 2064# NAME 2065# wait_for_account_done - cancels and waits on jobs in specified accounts 2066# 2067# SYNOPSIS 2068# wait_for_account_done ?options? accounts 2069# 2070# DESCRIPTION 2071# Cancel jobs on and wait for them to be finished in account(s) given. 2072# 2073# OPTIONS 2074# -timeout <integer_number> 2075# time in seconds to wait for the jobs to be finished before 2076# timing out (default is 360) 2077# -pollinterval <integer_number> 2078# time in seconds between each job state check (default is 1) 2079# 2080# ARGUMENTS 2081# accounts 2082# Comma-delimited list of accounts 2083# 2084# RETURN VALUE 2085# RETURN_SUCCESS if all jobs of the account are finished, or non-zero 2086# otherwise. 2087# 2088# NOTE 2089# We sleep for two seconds before replying that a job is 2090# done to give time for I/O completion (stdout/stderr files) 2091# 2092################################################################ 2093 2094proc wait_for_account_done args { 2095 global scancel squeue re_word_str 2096 2097 set timeout 360 2098 set poll_interval 1 2099 while {[llength $args]} { 2100 switch -glob -- [lindex $args 0] { 2101 -time* {set args [lassign $args - timeout]} 2102 -poll* {set args [lassign $args - poll_interval]} 2103 -* {fail "Unknown option: [lindex $args 0]"} 2104 default break 2105 } 2106 } 2107 set argument_count [llength $args] 2108 if {$argument_count != 1} { 2109 fail "Invalid number of arguments ($argument_count): $args" 2110 } else { 2111 lassign $args accounts 2112 } 2113 2114 if { $accounts == "" } { 2115 log_error "No account given" 2116 return $::RETURN_ERROR 2117 } 2118 2119 log_user 0 2120 set account_list [split $accounts ","] 2121 foreach item $account_list { 2122 spawn $scancel -A $item 2123 expect { 2124 timeout { 2125 log_warn "No response from scancel" 2126 } 2127 eof { 2128 wait 2129 } 2130 } 2131 } 2132 2133 set my_delay 0 2134 while 1 { 2135 set found 0 2136 spawn $squeue -o Account=%a -h -A$accounts 2137 expect { 2138 -re "Account=($re_word_str)" { 2139 set found 1 2140 exp_continue 2141 } 2142 eof { 2143 wait 2144 } 2145 } 2146 2147 if { !$found } { 2148 log_debug "Account(s) $accounts is/are empty" 2149 break 2150 } 2151 2152 if { $my_delay > $timeout } { 2153 log_error "Timeout waiting for account(s) '$accounts' to be finished" 2154 log_user 1 2155 return $::RETURN_TIMEOUT 2156 } 2157 2158 exec sleep $poll_interval 2159 set my_delay [expr $my_delay + $poll_interval] 2160 } 2161 log_user 1 2162 return $::RETURN_SUCCESS 2163} 2164 2165 2166################################################################ 2167# 2168# NAME 2169# wait_for_part_done - cancels and waits on jobs in specified partition 2170# 2171# SYNOPSIS 2172# wait_for_part_done ?options? partition 2173# 2174# DESCRIPTION 2175# Cancel jobs on and wait for them to be finished in partition given. 2176# 2177# OPTIONS 2178# -timeout <integer_number> 2179# time in seconds to wait for the jobs to be finished before 2180# timing out (default is 360) 2181# -pollinterval <integer_number> 2182# time in seconds between each job state check (default is 1) 2183# 2184# ARGUMENTS 2185# partition 2186# partition name 2187# 2188# RETURN VALUE 2189# RETURN_SUCCESS if all jobs of the partition are finished, or non-zero 2190# otherwise. 2191# 2192# NOTE 2193# We sleep for two seconds before replying that a job is 2194# done to give time for I/O completion (stdout/stderr files) 2195# 2196################################################################ 2197 2198proc wait_for_part_done args { 2199 global scancel squeue re_word_str 2200 2201 set timeout 360 2202 set poll_interval 1 2203 while {[llength $args]} { 2204 switch -glob -- [lindex $args 0] { 2205 -time* {set args [lassign $args - timeout]} 2206 -poll* {set args [lassign $args - poll_interval]} 2207 -* {fail "Unknown option: [lindex $args 0]"} 2208 default break 2209 } 2210 } 2211 set argument_count [llength $args] 2212 if {$argument_count != 1} { 2213 fail "Invalid number of arguments ($argument_count): $args" 2214 } else { 2215 lassign $args partition 2216 } 2217 2218 if { $partition == "" } { 2219 log_error "No partition given" 2220 return $::RETURN_ERROR 2221 } 2222 2223 log_user 0 2224 spawn $scancel -p $partition 2225 expect { 2226 timeout { 2227 log_error "No response from scancel" 2228 } 2229 eof { 2230 wait 2231 } 2232 } 2233 2234 set my_delay 0 2235 while 1 { 2236 set found 0 2237 spawn $squeue -o Part=%P -h -p$partition 2238 expect { 2239 -re "Part=($re_word_str)" { 2240 set found 1 2241 exp_continue 2242 } 2243 eof { 2244 wait 2245 } 2246 } 2247 2248 if { !$found } { 2249 log_debug "Partition $partition is empty" 2250 break 2251 } 2252 2253 if { $my_delay > $timeout } { 2254 log_error "Timeout waiting for partition '$partition' to be finished" 2255 log_user 1 2256 return $::RETURN_TIMEOUT 2257 } 2258 2259 exec sleep $poll_interval 2260 set my_delay [expr $my_delay + $poll_interval] 2261 } 2262 log_user 1 2263 return $::RETURN_SUCCESS 2264} 2265 2266 2267################################################################ 2268# 2269# NAME 2270# wait_for_step - waits for a job step to be found 2271# 2272# SYNOPSIS 2273# wait_for_step ?options? step_id 2274# 2275# DESCRIPTION 2276# Wait for a job step to be found. 2277# 2278# OPTIONS 2279# -timeout <integer_number> 2280# time in seconds to wait for the job step to be found before 2281# timing out (default is 360) 2282# -pollinterval <integer_number> 2283# time in seconds between each step existence check (default is 1) 2284# 2285# ARGUMENTS 2286# step_id 2287# job step id 2288# 2289# RETURN VALUE 2290# RETURN_SUCCESS if step_id is found, or non-zero otherwise. 2291# 2292################################################################ 2293 2294proc wait_for_step args { 2295 global scontrol 2296 2297 set timeout 360 2298 set poll_interval 1 2299 while {[llength $args]} { 2300 switch -glob -- [lindex $args 0] { 2301 -time* {set args [lassign $args - timeout]} 2302 -poll* {set args [lassign $args - poll_interval]} 2303 -* {fail "Unknown option: [lindex $args 0]"} 2304 default break 2305 } 2306 } 2307 set argument_count [llength $args] 2308 if {$argument_count != 1} { 2309 fail "Invalid number of arguments ($argument_count): $args" 2310 } else { 2311 lassign $args step_id 2312 } 2313 2314 set my_delay 0 2315 while 1 { 2316 set fd [open "|$scontrol -o show step $step_id"] 2317 gets $fd line 2318 catch {close $fd} 2319 if {[regexp {Nodes=} $line foo] == 1} { 2320 return $::RETURN_SUCCESS 2321 } 2322 if {[regexp {MidplaneList=} $line foo] == 1} { 2323 return $::RETURN_SUCCESS 2324 } 2325 if { $my_delay > $timeout } { 2326 log_error "Timeout waiting for job step" 2327 return $::RETURN_TIMEOUT 2328 } 2329 2330 log_debug "Step $step_id not done yet. Waiting for $poll_interval seconds" 2331 exec sleep $poll_interval 2332 set my_delay [expr $my_delay + $poll_interval] 2333 } 2334} 2335 2336 2337################################################################ 2338# 2339# NAME 2340# wait_for_all_jobs - waits for jobs to finish having a specified name 2341# 2342# SYNOPSIS 2343# wait_for_all_jobs ?options? job_name 2344# 2345# DESCRIPTION 2346# Wait for previously submitted Slurm jobs to finish of a certain name. 2347# 2348# OPTIONS 2349# -timeout <integer_number> 2350# time in seconds to wait for the jobs to finish before 2351# timing out (default is 30) 2352# -pollinterval <integer_number> 2353# time in seconds between each job state check (default is 1) 2354# 2355# ARGUMENTS 2356# job_name 2357# The name of job to wait for 2358# 2359# RETURN VALUE 2360# RETURN_SUCCESS if all jobs with the specified name are finished, 2361# or non-zero otherwise. 2362# If jobs are not completed after timeout, they are cancelled. 2363# 2364################################################################ 2365 2366proc wait_for_all_jobs args { 2367 global scancel squeue 2368 2369 set timeout 30 2370 set poll_interval 1 2371 while {[llength $args]} { 2372 switch -glob -- [lindex $args 0] { 2373 -time* {set args [lassign $args - timeout]} 2374 -poll* {set args [lassign $args - poll_interval]} 2375 -* {fail "Unknown option: [lindex $args 0]"} 2376 default break 2377 } 2378 } 2379 set argument_count [llength $args] 2380 if {$argument_count != 1} { 2381 fail "Invalid number of arguments ($argument_count): $args" 2382 } else { 2383 lassign $args job_name 2384 } 2385 2386 log_debug "Waiting for all jobs to terminate" 2387 set jobs_found -1 2388 set jobs_desired 0 2389 2390 wait_for -timeout $timeout -pollinterval $poll_interval {$jobs_found == $jobs_desired} { 2391 set jobs_found 0 2392 log_user 0 2393 spawn $squeue -o %j -n $job_name 2394 expect { 2395 -re "$job_name" { 2396 incr jobs_found 2397 exp_continue 2398 } 2399 timeout { 2400 fail "No response from squeue" 2401 } 2402 eof { 2403 wait 2404 } 2405 } 2406 log_user 1 2407 if {$jobs_found != $jobs_desired} { 2408 log_debug "Still $jobs_found jobs remaining" 2409 } 2410 } 2411 if {$jobs_found == $jobs_desired} { 2412 log_debug "All jobs complete" 2413 return $::RETURN_SUCCESS 2414 } else { 2415 log_debug "Cancelling uncompleted jobs" 2416 run_command "$scancel -n $job_name" 2417 return $::RETURN_ERROR 2418 } 2419} 2420 2421 2422################################################################ 2423# 2424# NAME 2425# wait_job_reason - waits for a desired job state and reason 2426# 2427# SYNOPSIS 2428# wait_job_reason ?options? job_id ?desired_state? ?desired_reason_list? 2429# 2430# DESCRIPTION 2431# Wait until the job is in desired state and reason is one 2432# of the desired ones or until the timeout. 2433# 2434# OPTIONS 2435# -timeout <integer_number> 2436# time in seconds to wait for the job state and reason before 2437# timing out (default is 360) 2438# -pollinterval <integer_number> 2439# time in seconds between each job state check (default is 1) 2440# 2441# ARGUMENTS 2442# job_id 2443# The job to wait for 2444# desired_state 2445# Desired state. 2446# desired_reason_list 2447# List of desired reasons. Empty list means that any reason 2448# is ok. 2449# 2450# RETURN VALUE 2451# RETURN_SUCCESS when job is in the desired state and reason is one 2452# of the desired ones, or non-zero otherwise. 2453# 2454################################################################ 2455 2456proc wait_job_reason args { 2457 global scontrol re_word_str 2458 2459 set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE 2460 OUT_OF_MEMORY" 2461 2462 set timeout 360 2463 set poll_interval 1 2464 set desired_state "PENDING" 2465 set desired_reason_list "" 2466 while {[llength $args]} { 2467 switch -glob -- [lindex $args 0] { 2468 -time* {set args [lassign $args - timeout]} 2469 -poll* {set args [lassign $args - poll_interval]} 2470 -* {fail "Unknown option: [lindex $args 0]"} 2471 default break 2472 } 2473 } 2474 set argument_count [llength $args] 2475 if {$argument_count < 1} { 2476 fail "Too few arguments ($argument_count): $args" 2477 } else { 2478 lassign $args job_id 2479 } 2480 if {$argument_count >= 2} { set desired_state [lindex $args 1] } 2481 if {$argument_count == 3} { set desired_reason_list [lindex $args 2] } 2482 if {$argument_count > 3} { 2483 fail "Too many arguments ($argument_count): $args" 2484 } 2485 2486 set log_user_prev [log_user -info] 2487 log_user 0 2488 2489 set my_delay 0 2490 set rc $::RETURN_ERROR 2491 while true { 2492 set pending 0 2493 set has_reason 1 2494 spawn $scontrol show job $job_id 2495 expect { 2496 -re "JobState=($re_word_str) Reason=(\\S+)" { 2497 set job_state $expect_out(1,string) 2498 set job_reason $expect_out(2,string) 2499 } 2500 timeout { 2501 log_error "No response from scontrol show job" 2502 set rc $::RETURN_TIMEOUT 2503 break 2504 } 2505 } 2506 2507 # Check if both state and reason are the desired ones 2508 if {$job_state == $desired_state} { 2509 set found 0 2510 if {$desired_reason_list == ""} { 2511 set found 1 2512 } 2513 foreach desired_reason $desired_reason_list { 2514 if {$job_reason == $desired_reason } { 2515 set found 1 2516 } 2517 } 2518 if {$found} { 2519 set rc $::RETURN_SUCCESS 2520 break 2521 } 2522 } elseif {[lsearch -exact final_state $job_state] >= 0} { 2523 # Job is in final step no need to wait longer 2524 log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ 2525 $job_state $job_reason \ 2526 $desired_state $desired_reason_list] 2527 set rc $::RETURN_ERROR 2528 break 2529 } 2530 2531 # Check if this was the last poll 2532 if {$my_delay > $timeout} { 2533 log_error "Timeout" 2534 set rc $::RETURN_TIMEOUT 2535 break 2536 } 2537 set remamining_sec [expr $timeout - $my_delay] 2538 log_debug [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ 2539 $job_state $job_reason \ 2540 $desired_state $desired_reason_list] 2541 log_debug [format "Polling again in %ss, %ss to timeout." \ 2542 $poll_interval $remamining_sec] 2543 2544 sleep $poll_interval 2545 set my_delay [expr $my_delay + $poll_interval] 2546 } 2547 2548 log_user $log_user_prev 2549 return $rc 2550} 2551 2552 2553################################################################ 2554# 2555# NAME 2556# get_config - returns a dictionary of slurm configuration parameters 2557# 2558# SYNOPSIS 2559# get_config ?options? 2560# 2561# OPTIONS 2562# -dbd 2563# uses `sacctmgr show config` to return slurmdbd configuration 2564# parameters 2565# -slurm 2566# uses `scontrol show config` to return slurm configuration 2567# parameters (this is the default) 2568# 2569# RETURN VALUE 2570# Returns a dictionary of parameter values 2571# 2572################################################################ 2573 2574proc get_config args { 2575 global sacctmgr scontrol 2576 2577 set command "$scontrol" 2578 while {[llength $args]} { 2579 switch -glob -- [lindex $args 0] { 2580 -slurm {set command "$scontrol"; set args [lrange $args 1 end]} 2581 -dbd {set command "$sacctmgr"; set args [lrange $args 1 end]} 2582 -* {fail "Unknown option: [lindex $args 0]"} 2583 default break 2584 } 2585 } 2586 if {[llength $args] > 0} { 2587 fail "[lindex [info level 0] 0]: No arguments allowed: $args" 2588 } 2589 2590 set output [run_command_output -fail -nolog "$command show config"] 2591 2592 foreach line [split $output "\n"] { 2593 if {[regexp {^(\S+) += (.*)$} $line {} param_name param_value] == 1} { 2594 dict set config_dict $param_name $param_value 2595 } 2596 } 2597 2598 return $config_dict 2599} 2600 2601 2602################################################################ 2603# 2604# NAME 2605# get_config_param - returns a slurm configuration parameter value 2606# 2607# SYNOPSIS 2608# get_config_param ?options? parameter_name 2609# 2610# OPTIONS 2611# -dbd 2612# uses `sacctmgr show config` to return the specified slurmdbd 2613# configuration parameter value 2614# -slurm 2615# uses `scontrol show config` to return the specified slurm 2616# configuration parameter value (this is the default) 2617# 2618# ARGUMENTS 2619# parameter_name 2620# the parameter to return the value for 2621# 2622# DESCRIPTION 2623# Returns a specific configuration parameter value. 2624# 2625# RETURN VALUE 2626# Returns the value of the specified parameter or MISSING if it does not 2627# exist. 2628# 2629################################################################ 2630 2631proc get_config_param args { 2632 2633 set options [list] 2634 while {[llength $args]} { 2635 switch -glob -- [lindex $args 0] { 2636 -* { 2637 lappend options [lindex $args 0] 2638 set args [lrange $args 1 end] 2639 } 2640 default break 2641 } 2642 } 2643 if {[llength $args] == 1} { 2644 lassign $args parameter_name 2645 } else { 2646 fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args" 2647 } 2648 2649 set config_dict [get_config {*}$options] 2650 2651 if [dict exists $config_dict $parameter_name] { 2652 return [dict get $config_dict $parameter_name] 2653 } else { 2654 return "MISSING" 2655 } 2656} 2657 2658 2659################################################################ 2660# 2661# NAME 2662# param_contains - test whether a comma-separated-list contains a specified value 2663# 2664# SYNOPSIS 2665# param_contains haystack needle 2666# 2667# DESCRIPTION 2668# Searches for the specified value (needle) in the comma-separated-list 2669# string (haystack). Needle can be a glob-style pattern. 2670# 2671# RETURN VALUE 2672# Returns a boolean value indicating whether the value (needle) was found 2673# in the comma-separated-list string (haystack) 2674# 2675################################################################ 2676 2677proc param_contains { haystack needle } { 2678 if {[lsearch [split $haystack ","] $needle] != -1} { 2679 return true 2680 } else { 2681 return false 2682 } 2683} 2684 2685 2686################################################################ 2687# 2688# NAME 2689# param_value - returns the value of a parameter in a comma-separated-list 2690# 2691# SYNOPSIS 2692# param_value params_list param ?default? 2693# 2694# DESCRIPTION 2695# Searches for the specified param in the comma-separated-list 2696# string (params_list) and returns its value. 2697# 2698# RETURN VALUE 2699# Returns the value found or the optional default value if not found. 2700# If the param is found without a value, returns true (ie like 2701# param_contains). 2702# 2703################################################################ 2704 2705proc param_value {params_list param {default false}} { 2706 global re_word_str 2707 foreach pair [split $params_list ","] { 2708 if {[regexp "$param" $pair] == 1} { 2709 if {[regexp "$param=($re_word_str)" $pair - value] == 1} { 2710 return $value 2711 } else { 2712 return true 2713 } 2714 } 2715 } 2716 return $default 2717} 2718 2719 2720################################################################ 2721# 2722# NAME 2723# get_affinity_types - gets the task plugins running with task/ stripped 2724# 2725# SYNOPSIS 2726# get_affinity_types 2727# 2728# RETURN VALUE 2729# Returns comma separated list of task plugins running without the task/ 2730# 2731################################################################ 2732 2733proc get_affinity_types { } { 2734 global scontrol re_word_str 2735 2736 log_user 0 2737 set affinity "" 2738 spawn $scontrol show config 2739 expect { 2740 -re "TaskPlugin *= ($re_word_str)" { 2741 set parts [split $expect_out(1,string) ",/"] 2742 while 1 { 2743 set task_found [lsearch $parts "task"] 2744 if { $task_found == -1 } break 2745 set parts [lreplace $parts $task_found $task_found] 2746 } 2747 set affinity [join $parts ","] 2748 exp_continue 2749 } 2750 eof { 2751 wait 2752 } 2753 } 2754 log_user 1 2755 2756 return $affinity 2757} 2758 2759 2760################################################################ 2761# 2762# NAME 2763# get_mps_count_by_index - gets the count of a specific gres/mps device 2764# 2765# SYNOPSIS 2766# get_mps_count_by_index index hostname 2767# 2768# RETURN VALUE 2769# Returns the Count of a specific gres/mps device 2770# 2771################################################################ 2772 2773proc get_mps_count_by_index { index hostname } { 2774 global slurmd number re_word_str 2775 2776 log_user 0 2777 set count 0 2778 spawn $slurmd -G -N $hostname 2779 expect { 2780 -re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" { 2781 set count $expect_out(1,string) 2782 exp_continue 2783 } 2784 eof { 2785 wait 2786 } 2787 } 2788 log_user 1 2789 2790 return $count 2791} 2792 2793 2794################################################################ 2795# 2796# NAME 2797# check_bb_emulate - determines if Cray burst buffers API is emulated 2798# 2799# SYNOPSIS 2800# check_bb_emulate 2801# 2802# RETURN VALUE 2803# Returns true if Cray burst buffers API is emulated, false otherwise 2804# 2805################################################################ 2806 2807proc check_bb_emulate { } { 2808 global scontrol 2809 2810 log_user 0 2811 set bb_emulate false 2812 spawn $scontrol show burst 2813 expect { 2814 -re "EmulateCray" { 2815 set bb_emulate true 2816 exp_continue 2817 } 2818 eof { 2819 wait 2820 } 2821 } 2822 log_user 1 2823 return $bb_emulate 2824} 2825 2826 2827################################################################ 2828# 2829# NAME 2830# check_bb_persistent - determines if persistent burst buffers can be created by users 2831# 2832# SYNOPSIS 2833# check_bb_persistent 2834# 2835# RETURN VALUE 2836# Returns true if Cray burst buffers can be created by users, 2837# false otherwise 2838# 2839################################################################ 2840 2841proc check_bb_persistent { } { 2842 global scontrol 2843 2844 log_user 0 2845 set bb_persistent false 2846 spawn $scontrol show burst 2847 expect { 2848 -re "EnablePersistent" { 2849 set bb_persistent true 2850 exp_continue 2851 } 2852 eof { 2853 wait 2854 } 2855 } 2856 log_user 1 2857 return $bb_persistent 2858} 2859 2860 2861################################################################ 2862# 2863# NAME 2864# get_bb_types - gets the burst buffer plugins running with task/ stripped 2865# 2866# SYNOPSIS 2867# get_bb_types 2868# 2869# RETURN VALUE 2870# Returns comma separated list of task plugins running without the task/ 2871# 2872################################################################ 2873 2874proc get_bb_types { } { 2875 global scontrol re_word_str 2876 2877 log_user 0 2878 set bb_types "" 2879 spawn $scontrol show config 2880 expect { 2881 -re "BurstBufferType *= ($re_word_str)" { 2882 set parts [split $expect_out(1,string) ",/"] 2883 while 1 { 2884 set task_found [lsearch $parts "burst_buffer"] 2885 if { $task_found == -1 } break 2886 set parts [lreplace $parts $task_found $task_found] 2887 } 2888 set bb_types [join $parts ","] 2889 exp_continue 2890 } 2891 eof { 2892 wait 2893 } 2894 } 2895 2896 log_user 1 2897 return $bb_types 2898} 2899 2900 2901################################################################ 2902# 2903# NAME 2904# get_default_acct - gets user's default account 2905# 2906# SYNOPSIS 2907# get_default_acct user 2908# 2909# RETURN VALUE 2910# Returns name of default account if exists, NULL otherwise 2911# 2912################################################################ 2913 2914proc get_default_acct { user } { 2915 global sacctmgr re_word_str bin_id 2916 2917 log_user 0 2918 set def_acct "" 2919 2920 if { !$user } { 2921 set user [get_my_user_name] 2922 } 2923 2924 spawn $sacctmgr -n list -P user $user format="DefaultAccount" 2925 expect { 2926 -re "($re_word_str)" { 2927 set def_acct $expect_out(1,string) 2928 exp_continue 2929 } 2930 eof { 2931 wait 2932 } 2933 } 2934 log_user 1 2935 2936 return $def_acct 2937} 2938 2939 2940################################################################ 2941# 2942# NAME 2943# get_cycle_count - get desired iteration count 2944# 2945# SYNOPSIS 2946# get_cycle_count 2947# 2948# DESCRIPTION 2949# For tests with iteration counts (e.g. test9.1, test9.2) 2950# return the desired iteration count 2951# 2952# RETURN VALUE 2953# Returns desired iteration count 2954# 2955################################################################ 2956 2957proc get_cycle_count { } { 2958 global enable_memory_leak_debug 2959 2960 if {$enable_memory_leak_debug != 0} { 2961 return 2 2962 } 2963 return 100 2964} 2965 2966 2967################################################################ 2968# 2969# NAME 2970# get_select_type_params - determines SelectTypeParameters being used for a given partition 2971# 2972# SYNOPSIS 2973# get_select_type_params ?partition? 2974# 2975# DESCRIPTION 2976# Determine SelectTypeParameters being used for a given partition. 2977# If the partition is not specified, the default partition will be used. 2978# 2979# RETURN VALUE 2980# Returns a string containing SelectTypeParameters 2981# 2982################################################################ 2983 2984proc get_select_type_params { {partition ""} } { 2985 global scontrol bin_bash bin_grep re_word_str 2986 2987 log_user 0 2988 set params "" 2989 2990 if {[string length $partition] == 0} { 2991 set partition [default_partition] 2992 } 2993 2994 if {[string compare $partition ""]} { 2995 spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters" 2996 expect { 2997 -re "SelectTypeParameters *= *NONE" { 2998 exp_continue 2999 } 3000 -re "SelectTypeParameters *= *($re_word_str)" { 3001 set params $expect_out(1,string) 3002 exp_continue 3003 } 3004 eof { 3005 wait 3006 } 3007 } 3008 } 3009 if { [string compare params ""] } { 3010 spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters" 3011 expect { 3012 -re "SelectTypeParameters *= *($re_word_str)" { 3013 set params $expect_out(1,string) 3014 exp_continue 3015 } 3016 eof { 3017 wait 3018 } 3019 } 3020 } 3021 log_user 1 3022 3023 return $params 3024} 3025 3026 3027################################################################ 3028# 3029# NAME 3030# check_config_select - checks if effectively using the select type 3031# 3032# SYNOPSIS 3033# check_config_select type 3034# 3035# DESCRIPTION 3036# Determine if SelectType is equivalent to the passed one by also 3037# checking other_cons_res and other_cons_tres on SelectTypeParameters 3038# in case that select/cray_aries is configured. 3039# 3040# ARGUMENTS 3041# type 3042# the desired SelectType to check (e.g. cons_tres) 3043# 3044# RETURN VALUE 3045# Returns true if configured, false otherwise 3046# 3047################################################################ 3048 3049proc check_config_select { type } { 3050 set select_type [get_config_param "SelectType"] 3051 set select_type_parameters [get_config_param "SelectTypeParameters"] 3052 3053 if {$select_type eq "select/$type"} { 3054 return true 3055 } 3056 if {$select_type eq "select/cray_aries"} { 3057 if {$type eq "linear" && 3058 ![param_contains $select_type_parameters "other_cons_res"] && 3059 ![param_contains $select_type_parameters "other_cons_tres"]} { 3060 return true 3061 } 3062 if {$type eq "cons_res" && 3063 [param_contains $select_type_parameters "other_cons_res"]} { 3064 return true 3065 } 3066 if {$type eq "cons_tres" && 3067 [param_contains $select_type_parameters "other_cons_tres"]} { 3068 return true 3069 } 3070 } 3071 3072 return false 3073} 3074 3075 3076################################################################ 3077# 3078# NAME 3079# get_total_cpus - gets the total amount of CPUs on the default partition 3080# 3081# SYNOPSIS 3082# get_total_cpus 3083# 3084# RETURN VALUE 3085# The total amount of CPUs on the default partition. 3086# 3087# NOTE 3088# CoreSpecCount are not part of the total. 3089# 3090################################################################ 3091 3092proc get_total_cpus {} { 3093 global sinfo scontrol re_word_str 3094 3095 set partition [default_partition] 3096 set cpu_cnt 0 3097 set nodes_matched [list] 3098 3099 set re_with_cs "NodeName=($re_word_str).*CPUTot=(\\d+).*CoreSpecCount=(\\d+).*ThreadsPerCore=(\\d+)" 3100 set re_without_cs "NodeName=($re_word_str).*CPUTot=(\\d+)" 3101 3102 set out [run_command_output -fail "$sinfo -h -o \"%P %N\" -p $partition --state=idle"] 3103 if {[regexp "$partition\\* ($re_word_str)" $out - def_hostlist] != 1} { 3104 fail "Not able to get the hostlist from sinfo" 3105 } 3106 3107 set out [run_command_output "$scontrol --oneliner show node $def_hostlist"] 3108 foreach {re node_name tmp_cpu_cnt core_spec_cnt threads_per_core} [regexp -all -inline -linestop $re_with_cs $out] { 3109 set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt - $core_spec_cnt * $threads_per_core] 3110 lappend nodes_matched $node_name 3111 } 3112 foreach {re node_name tmp_cpu_cnt} [regexp -all -inline -linestop $re_without_cs $out] { 3113 # Not count nodes already counted 3114 if {[lsearch $nodes_matched $node_name] == -1} { 3115 set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt] 3116 } 3117 } 3118 3119 return $cpu_cnt 3120} 3121 3122 3123################################################################ 3124# 3125# NAME 3126# is_super_user - determines if user is root or SlurmUser 3127# 3128# SYNOPSIS 3129# is_super_user ?user? 3130# 3131# DESCRIPTION 3132# Determine if user is a Slurm super user (i.e. user 3133# root or configured SlurmUser) 3134# 3135# RETURN VALUE 3136# true is user is root or SlurmUser, false otherwise 3137# 3138################################################################ 3139 3140proc is_super_user {{user ""}} { 3141 global number 3142 3143 if {$user == ""} { 3144 set user [get_my_user_name] 3145 } 3146 3147 # Check if user is root 3148 if {[string compare $user "root"] == 0} { 3149 return true 3150 } 3151 3152 # Check if user is SlurmUser 3153 set slurm_user [get_config_param "SlurmUser"] 3154 if {[regexp "${user}\\($number\\)" $slurm_user match]} { 3155 return true 3156 } 3157 3158 return false 3159} 3160 3161 3162################################################################ 3163# 3164# NAME 3165# dec2hex - creates a 32 bit hex number from a signed decimal number 3166# 3167# SYNOPSIS 3168# dec2hex value 3169# 3170# DESCRIPTION 3171# Create a 32 bit hex number from a signed decimal number 3172# 3173# RETURN VALUE 3174# 32 bit hex version of input 'value' 3175# 3176# SOURCE 3177# Courtesy of Chris Cornish 3178# http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 3179# 3180################################################################ 3181# Replace all non-decimal characters 3182 3183proc dec2hex {value} { 3184 regsub -all {[^0-x\.-]} $value {} newtemp 3185 set value [string trim $newtemp] 3186 if {$value < 2147483647 && $value > -2147483648} { 3187 set tempvalue [format "%#010X" [expr $value]] 3188 return [string range $tempvalue 2 9] 3189 } elseif {$value < -2147483647} { 3190 return "80000000" 3191 } else { 3192 return "7FFFFFFF" 3193 } 3194} 3195 3196 3197################################################################ 3198# 3199# NAME 3200# uint2hex - creates a 32 bit hex number from an unsigned decimal 3201# 3202# SYNOPSIS 3203# uint2hex value 3204# 3205# DESCRIPTION 3206# Create a 32 bit hex number from an unsigned decimal number. 3207# 3208# ARGUMENTS 3209# value 3210# unsigneddecimal number to convert 3211# 3212# RETURN VALUE 3213# 32 bit hex version of input 'value' 3214# 3215# SOURCE 3216# Courtesy of Chris Cornish 3217# http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 3218# 3219################################################################ 3220# Replace all non-decimal characters 3221 3222proc uint2hex {value} { 3223 regsub -all {[^0-x\.-]} $value {} newtemp 3224 set value [string trim $newtemp] 3225 if {$value <= 4294967295 && $value >= 0} { 3226 set tempvalue [format "%#010X" [expr $value]] 3227 return [string range $tempvalue 2 9] 3228 } else { 3229 return "FFFFFFFF" 3230 } 3231} 3232 3233 3234################################################################ 3235# 3236# NAME 3237# partition_oversubscribe - determines the oversubscribe configuration of the specified partition 3238# 3239# SYNOPSIS 3240# partition_oversubscribe ?partition? 3241# 3242# DESCRIPTION 3243# Determine the oversubscribe configuration of the specified partition. 3244# If the partition is not specified, the default partition will be used. 3245# 3246# RETURN VALUE 3247# Return the oversubscribe configuration of the specified partition. 3248# 3249################################################################ 3250 3251proc partition_oversubscribe { {partition ""} } { 3252 global sinfo 3253 3254 if {[string length $partition] == 0} { 3255 set partition [default_partition] 3256 } 3257 3258 set oversubscribe "NO" 3259 log_debug "$sinfo --noheader --partition $partition --format %h" 3260 set fd [open "|$sinfo --noheader --partition $partition --format %h"] 3261 gets $fd line 3262 catch {close $fd} 3263 regexp {[a-zA-Z]+} $line oversubscribe 3264 return $oversubscribe 3265} 3266 3267 3268################################################################ 3269# 3270# NAME 3271# default_partition - determines the name of the default partition 3272# 3273# SYNOPSIS 3274# default_partition 3275# 3276# DESCRIPTION 3277# Use scontrol to determine the name of the default partition 3278# 3279# RETURN VALUE 3280# Name of the current default partition 3281# 3282################################################################ 3283 3284proc default_partition {} { 3285 global scontrol 3286 3287 set name "" 3288 set fd [open "|$scontrol --all --oneliner show partition"] 3289 while {[gets $fd line] != -1} { 3290 if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag name] 3291 == 1} { 3292 break 3293 } 3294 } 3295 catch {close $fd} 3296 3297 if {[string length $name] == 0} { 3298 log_error "Could not identify the default partition" 3299 } 3300 3301 return $name 3302} 3303 3304 3305################################################################ 3306# 3307# NAME 3308# default_part_exclusive - determines if the default partition allocates whole nodes to jobs 3309# 3310# SYNOPSIS 3311# default_part_exclusive 3312# 3313# DESCRIPTION 3314# Use scontrol to determine if the default partition 3315# allocates whole nodes to jobs 3316# 3317# RETURN VALUE 3318# Name of the current default partition 3319# 3320################################################################ 3321 3322proc default_part_exclusive {} { 3323 set def_part [default_partition] 3324 set oversubscribe [partition_oversubscribe $def_part] 3325 if {[string compare $oversubscribe "EXCLUSIVE"] == 0} { 3326 return 1 3327 } else { 3328 return 0 3329 } 3330} 3331 3332 3333################################################################ 3334# 3335# NAME 3336# make_bash_script - creates a bash script 3337# 3338# SYNOPSIS 3339# make_bash_script script_name script_contents 3340# 3341# DESCRIPTION 3342# Create a bash script of name "script_name", and 3343# make the body of the script "script_contents". 3344# make_bash_script removes the file if it already exists, 3345# then generates the #! line, and then dumps "script_contents" 3346# to the file. Finally, it makes certain that the script 3347# is executable. 3348# 3349# ARGUMENTS 3350# script_name 3351# file name for the bash script 3352# script_contents 3353# body of the script, not including the initial #! line. 3354# 3355# RETURN VALUE 3356# Nothing. 3357# 3358################################################################ 3359 3360proc make_bash_script { script_name script_contents } { 3361 global bin_bash bin_chmod 3362 3363 file delete $script_name 3364 set fd [open $script_name "w"] 3365 puts $fd "#!$bin_bash" 3366 puts $fd $script_contents 3367 close $fd 3368 exec $bin_chmod 700 $script_name 3369} 3370 3371 3372################################################################ 3373# 3374# NAME 3375# get_suffix - given a hostname, returns it's numeric suffix 3376# 3377# SYNOPSIS 3378# get_suffix hostname 3379# 3380# DESCRIPTION 3381# Given a hostname, return it's numeric suffix 3382# 3383# RETURN VALUE 3384# numerical suffix for input 'hostname' or -1 if not a number 3385# 3386################################################################ 3387 3388proc get_suffix { hostname } { 3389 set host_len [string length $hostname] 3390 set host_inx [expr $host_len-1] 3391 set host_char [string index $hostname $host_inx] 3392 if {[string compare $host_char "0"] < 0 || [string compare $host_char "9"] > 0} { 3393 return -1 3394 } 3395 3396 for {set host_inx [expr $host_len-1]} {$host_inx >= 0} {incr host_inx -1} { 3397 set host_char [string index $hostname $host_inx] 3398 if {[string compare $host_char "0"] < 0} { break } 3399 if {[string compare $host_char "9"] > 0} { break } 3400 } 3401 incr host_inx 3402 3403 if {$host_inx == $host_len} { 3404 log_warn "Hostname lacks a suffix: $hostname" 3405 return "-1" 3406 } 3407 3408# Strip off leading zeros to avoid doing octal arithmetic 3409 set suffix [string range $hostname $host_inx $host_len] 3410 set suffix_len [string length $suffix] 3411 for {set suffix_inx 0} {$suffix_inx < [expr $suffix_len - 1]} {incr suffix_inx} { 3412 set suffix_char [string index $suffix $suffix_inx] 3413 if {[string compare $suffix_char "0"] != 0} { break } 3414 } 3415 3416 return [string range $suffix $suffix_inx $suffix_len] 3417} 3418 3419 3420################################################################ 3421# 3422# NAME 3423# check_acct_associations - checks associations 3424# 3425# SYNOPSIS 3426# check_acct_associations 3427# 3428# DESCRIPTION 3429# Use sacctmgr to check associations 3430# 3431# RETURN VALUE 3432# true if no error is found, false otherwise 3433# 3434################################################################ 3435 3436proc check_acct_associations { } { 3437 global sacctmgr number re_word_str 3438 3439 set rc true 3440 log_user 0 3441 log_debug "Sanity-Checking Associations" 3442 # 3443 # Use sacctmgr to check associations 3444 # 3445 set s_pid [spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster] 3446 expect { 3447 -re "($number)\\|($number)\\|($re_word_str)\\|" { 3448 # Here we are checking if we have duplicates and 3449 # setting up an array to check for holes later 3450 3451 set cluster $expect_out(3,string) 3452 if { ![info exists c_min($cluster)] } { 3453 set c_min($cluster) -1 3454 set c_max($cluster) -1 3455 } 3456 3457 set num1 $expect_out(1,string) 3458 set num2 $expect_out(2,string) 3459 set first [info exists found($cluster,$num1)] 3460 set sec [info exists found($cluster,$num2)] 3461 #log_debug "$first=$num1 $sec=$num2" 3462 if { $first } { 3463 log_error "$cluster found lft $num1 again" 3464 set rc false 3465 } elseif { $sec } { 3466 log_error "$cluster found rgt $num2 again" 3467 set rc false 3468 } else { 3469 set found($cluster,$num1) 1 3470 set found($cluster,$num2) 1 3471 if { $c_min($cluster) == -1 3472 || $c_min($cluster) > $num1 } { 3473 set c_min($cluster) $num1 3474 } 3475 if { $c_max($cluster) == -1 3476 || $c_max($cluster) < $num2 } { 3477 set c_max($cluster) $num2 3478 } 3479 } 3480 exp_continue 3481 } 3482 timeout { 3483 slow_kill $s_pid 3484 fail "sacctmgr add not responding" 3485 } 3486 eof { 3487 wait 3488 } 3489 } 3490 3491 foreach cluster [array names c_min] { 3492 # Here we are checking for holes in the list from above 3493 for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} { 3494 if { ![info exists found($cluster,$inx)] } { 3495 log_error "$cluster No index at $inx" 3496 set rc false 3497 } 3498 } 3499 } 3500 log_user 1 3501 return $rc 3502} 3503 3504 3505################################################################ 3506# 3507# NAME 3508# get_job_acct_freq - gets the value of the job account gather frequency 3509# 3510# SYNOPSIS 3511# get_job_acct_freq 3512# 3513# RETURN VALUE 3514# job account gather frequency 3515# 3516################################################################ 3517 3518proc get_job_acct_freq { } { 3519 global scontrol number 3520 3521 log_user 0 3522 set freq_val 0 3523 3524 spawn $scontrol show config 3525 expect { 3526 -re "JobAcctGatherFrequency *= ($number)" { 3527 set freq_val $expect_out(1,string) 3528 if {$freq_val == 0} { 3529 set freq_val 0 3530 } 3531 } 3532 -re "JobAcctGatherFrequency *= task=($number)" { 3533 set freq_val $expect_out(1,string) 3534 if {$freq_val == 0} { 3535 set freq_val 0 3536 } 3537 } 3538 eof { 3539 wait 3540 } 3541 } 3542 3543 log_user 1 3544 return $freq_val 3545} 3546 3547 3548################################################################ 3549# 3550# NAME 3551# get_admin_level - gets the AdminLevel of the user 3552# 3553# SYNOPSIS 3554# get_admin_level ?user? 3555# 3556# RETURN VALUE 3557# AdminLevel for the current user 3558# 3559################################################################ 3560 3561proc get_admin_level {{user_name ""}} { 3562 global sacctmgr re_word_str re_word_str bin_id 3563 3564 set admin_level "" 3565 3566 if {$user_name == ""} { 3567 set user_name [get_my_user_name] 3568 if { ![string length $user_name] } { 3569 log_error "No name returned from id" 3570 return "" 3571 } 3572 } 3573 3574 if {[is_super_user $user_name]} { 3575 return "Administrator" 3576 } 3577 3578 # 3579 # Use sacctmgr to check admin_level 3580 # 3581 log_user 0 3582 set s_pid [spawn $sacctmgr -n -P list user $user_name format=admin] 3583 expect { 3584 -re "($re_word_str)" { 3585 set admin_level $expect_out(1,string) 3586 exp_continue 3587 } 3588 timeout { 3589 slow_kill $s_pid 3590 fail "sacctmgr add not responding" 3591 } 3592 eof { 3593 wait 3594 } 3595 } 3596 log_user 1 3597 3598 return $admin_level 3599} 3600 3601 3602############################################################### 3603# 3604# NAME 3605# change_subbp_state - sets sub mid plane state 3606# 3607# SYNOPSIS 3608# change_subbp_state node ionodes state 3609# 3610# RETURN VALUE 3611# Returns SUCCESS if state of mid plane is changed 3612# 3613############################################################### 3614 3615proc change_subbp_state { node ionodes state } { 3616 global scontrol smap 3617 3618 set return_code 0 3619 3620 set my_pid [spawn $scontrol update subbpname=$node\[$ionodes\] state=$state] 3621 expect { 3622 -re "slurm_update error:" { 3623 set return_code 1 3624 exp_continue 3625 } 3626 -re "Unable to contact" { 3627 log_error "Slurm appears to be down" 3628 exp_continue 3629 } 3630 timeout { 3631 log_error "scontrol not responding" 3632 slow_kill $my_pid 3633 set return_code 1 3634 } 3635 eof { 3636 wait 3637 } 3638 } 3639 3640 if { $return_code } { 3641 return $return_code 3642 } 3643 3644 set match 0 3645 set my_pid [spawn $smap -Db -c -h -n $node -I $ionodes] 3646 expect { 3647 -nocase -re "$state" { 3648 incr match 3649 exp_continue 3650 } 3651 -re "$node" { 3652 incr match 3653 exp_continue 3654 } 3655 -re "Unable to contact" { 3656 log_error "Slurm appears to be down" 3657 exp_continue 3658 } 3659 timeout { 3660 log_error "smap not responding" 3661 slow_kill $my_pid 3662 set return_code 1 3663 } 3664 eof { 3665 wait 3666 } 3667 } 3668 3669 if {$match != 2} { 3670 log_error "Subbp did not go into $state state. $match" 3671 set return_code 1 3672 } 3673 3674 return $return_code 3675} 3676 3677 3678################################################# 3679# 3680# NAME 3681# scale_to_megs - scales the value by the factor T|G|M to megabytes 3682# 3683# SYNOPSIS 3684# scale_to_megs value factor 3685# 3686# DESCRIPTION 3687# scale the value by the factor T|G|M to megabytes 3688# 3689# RETURN VALUE 3690# the scaled variable 3691# 3692################################################# 3693 3694proc scale_to_megs { value factor } { 3695 3696 if {[string compare $factor "T"] == 0} { 3697 set value [expr $value * 1024 * 1024] 3698 } elseif {[string compare $factor "G"] == 0} { 3699 set value [expr $value * 1024] 3700 } elseif {[string compare $factor "M"] == 0} { 3701 set value [expr $value * 1] 3702 } elseif {[string compare $factor "K"] == 0} { 3703 set value [expr $value / 1024] 3704 set value [expr {round($value)}] 3705 } else { 3706 set value [expr $value / (1024 * 1024)] 3707 set value [expr {round($value)}] 3708 } 3709 3710 return $value 3711} 3712 3713 3714################################################# 3715# 3716# NAME 3717# scale_to_ks - scales the value by the factor G|M|K to kilobytes 3718# 3719# SYNOPSIS 3720# scale_to_ks value factor 3721# 3722# DESCRIPTION 3723# scale the value by the factor G|M|K to kilobytes 3724# 3725# RETURN VALUE 3726# the scaled variable 3727# 3728################################################# 3729 3730proc scale_to_ks { value factor } { 3731 3732 if {[string compare $factor "G"] == 0} { 3733 set value [expr $value * 1024 * 1024] 3734 } elseif {[string compare $factor "M"] == 0} { 3735 set value [expr $value * 1024] 3736 } elseif {[string compare $factor "K"] == 0} { 3737 set value [expr $value * 1] 3738 } else { 3739 set value [expr $value / 1024] 3740 set value [expr {round($value)}] 3741 } 3742 3743 return $value 3744} 3745 3746 3747############################################################ 3748# 3749# NAME 3750# check_config_node_mem - checks that the nodes have memory configured 3751# 3752# SYNOPSIS 3753# check_config_node_mem 3754# 3755# RETURN VALUE 3756# true if all nodes have memory, false otherwise 3757# 3758############################################################ 3759 3760proc check_config_node_mem { } { 3761 3762 set nodes_dict [get_nodes] 3763 dict for {node_name node_dict} $nodes_dict { 3764 if [dict exists $node_dict "RealMemory"] { 3765 if {[dict get $node_dict "RealMemory"] == 1} { 3766 return false 3767 } 3768 } else { 3769 log_warn "Parameter RealMemory not found on node $node_name" 3770 return false 3771 } 3772 } 3773 3774 return true 3775} 3776 3777 3778################################################################ 3779# 3780# NAME 3781# wait_for_node - waits for nodes in a partition to reach a certain state 3782# 3783# SYNOPSIS 3784# wait_for_node ?options? state num_nodes ?partition? 3785# 3786# DESCRIPTION 3787# Wait for a certain number of nodes in a partition to reach a certain 3788# state. 3789# 3790# OPTIONS 3791# -timeout <integer_number> 3792# time in seconds to wait for the node state before 3793# timing out (default is 3) 3794# -pollinterval <integer_number> 3795# time in seconds between each node state check (default is 1) 3796# 3797# ARGUMENTS 3798# state 3799# The node state to wait for 3800# num_nodes 3801# The number of nodes we want to be in the specified state 3802# partition 3803# Partition name (the default partition is used if not specified) 3804# 3805# RETURN VALUE 3806# RETURN_SUCCESS, or non-zero on failure 3807# 3808################################################################ 3809 3810proc wait_for_node args { 3811 global sinfo number 3812 3813 set partition "" 3814 set timeout 3 3815 set poll_interval 1 3816 set desired_state "PENDING" 3817 set desired_reason_list "" 3818 while {[llength $args]} { 3819 switch -glob -- [lindex $args 0] { 3820 -time* {set args [lassign $args - timeout]} 3821 -poll* {set args [lassign $args - poll_interval]} 3822 -* {fail "Unknown option: [lindex $args 0]"} 3823 default break 3824 } 3825 } 3826 set argument_count [llength $args] 3827 if {$argument_count < 2} { 3828 fail "Too few arguments ($argument_count): $args" 3829 } else { 3830 lassign $args state num_nodes 3831 } 3832 if {$argument_count == 3} { set partition [lindex $args 2] } 3833 if {$argument_count > 3} { 3834 fail "Too many arguments ($argument_count): $args" 3835 } 3836 3837 set wait_time 0 3838 set done 0 3839 set cnt 0 3840 set rt $::RETURN_SUCCESS 3841 3842 if {[string length $partition] == 0} { 3843 set partition [default_partition] 3844 } 3845 3846 while {$done != 1 && $wait_time < $timeout} { 3847 3848 log_user 0 3849 spawn $sinfo --noheader --partition $partition --state $state --format %D 3850 expect { 3851 -re "($number)" { 3852 set cnt $expect_out(1,string) 3853 exp_continue 3854 } 3855 timeout { 3856 log_error "sinfo is not responding" 3857 set rt $::RETURN_TIMEOUT 3858 } 3859 eof { 3860 wait 3861 } 3862 } 3863 log_user 1 3864 3865 if {$num_nodes <= $cnt} { 3866 set done 1 3867 } else { 3868 log_debug "Partition $partition has $cnt nodes idle and we want $num_nodes" 3869 sleep $poll_interval 3870 incr wait_time 1 3871 } 3872 } 3873 if {$done != 1} { 3874 set rt $::RETURN_ERROR 3875 } 3876 3877 return $rt 3878} 3879 3880 3881##################################################################### 3882# 3883# NAME 3884# list2hostlist - converts a TCL list into a Slurm hostlist using scontrol 3885# 3886# SYNOPSIS 3887# list2hostlist nodes_list 3888# 3889# ARGUMENTS 3890# 3891# nodes_list 3892# a TCL list of node names 3893# 3894# RETURN VALUE 3895# the hostlist form returned by scrontrol show hostlist 3896# 3897##################################################################### 3898 3899proc list2hostlist {nodes_list} { 3900 global scontrol 3901 3902 set comalist [join $nodes_list ,] 3903 set hostlist [run_command_output -fail "$scontrol show hostlist $comalist"] 3904 set hostlist [string trimright $hostlist "\r\n"] 3905} 3906 3907 3908##################################################################### 3909# 3910# NAME 3911# get_nodes_by_state - gets the list of node names in a given partition/states 3912# 3913# SYNOPSIS 3914# get_nodes_by_state partition states 3915# 3916# DESCRIPTION 3917# Get the list of node names in a given partition/states 3918# 3919# ARGUMENTS 3920# partition 3921# partition to get nodes off 3922# states 3923# states to filter on nodes 3924# 3925# RETURN VALUE 3926# node names list, -1 on sinfo error 3927# 3928##################################################################### 3929 3930proc get_nodes_by_state {{states ""} {partition ""}} { 3931 3932 global sinfo re_word_str 3933 log_user 0 3934 set node_list "" 3935 3936 if {[string length $partition] == 0} { 3937 set partition [default_partition] 3938 } 3939 3940 if {[string length $states] == 0} { 3941 set states "Idle" 3942 } 3943 3944 set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -t $states -e] 3945 expect { 3946 -re "($re_word_str)" { 3947 lappend node_list $expect_out(1,string) 3948 exp_continue 3949 } 3950 timeout { 3951 slow_kill $sinfo_pid 3952 fail "sinfo not responding" 3953 } 3954 eof { 3955 wait 3956 } 3957 } 3958 3959 log_user 1 3960 return $node_list 3961} 3962 3963 3964##################################################################### 3965# 3966# NAME 3967# set_partition_maximum_time_limit - sets the maximum time limit in a given partition 3968# 3969# SYNOPSIS 3970# set_partition_maximum_time_limit partition limit 3971# 3972# RETURN VALUE 3973# RETURN_SUCCESS, or non-zero on error 3974# 3975##################################################################### 3976 3977proc set_partition_maximum_time_limit {partition limit} { 3978 global scontrol 3979 3980 if {[string length $partition] == 0} { 3981 set partition [default_partition] 3982 if { $partition == "" } { 3983 return $::RETURN_ERROR 3984 } 3985 } 3986 3987 if { $limit < -1 } { 3988 log_error "Trying to set invalid partition time limit of $limit" 3989 return $::RETURN_ERROR 3990 } 3991 if { $limit == -1 } { 3992 set expected_lim "UNLIMITED" 3993 } else { 3994 set expected_lim limit 3995 } 3996 3997 spawn $scontrol update partitionname=$partition MaxTime=-1 3998 expect { 3999 timeout { 4000 log_error "scontrol not responding" 4001 return $::RETURN_TIMEOUT 4002 } 4003 eof { 4004 wait 4005 } 4006 } 4007 4008 set maxtime [get_partition_maximum_time_limit $partition] 4009 if { $maxtime != $limit } { 4010 log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit" 4011 return $::RETURN_ERROR 4012 } 4013 4014 return $::RETURN_SUCCESS 4015} 4016 4017 4018##################################################################### 4019# 4020# NAME 4021# get_partition_maximum_time_limit - gets the maximum time limit in a given partition 4022# 4023# SYNOPSIS 4024# get_partition_maximum_time_limit partition 4025# 4026# DESCRIPTION 4027# Get the maximum time limit in a given partition 4028# 4029# RETURN VALUE 4030# time limit in seconds, -1 if undefined or error 4031# 4032##################################################################### 4033 4034proc get_partition_maximum_time_limit {partition} { 4035 global sinfo number 4036 4037 if {[string length $partition] == 0} { 4038 set partition [default_partition] 4039 } 4040 4041 set secs 0 4042 log_user 0 4043 set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O time -e] 4044 expect { 4045 -re "infinite" { 4046 set secs -1 4047 exp_continue 4048 } 4049 -re "n/a" { 4050 set secs -1 4051 exp_continue 4052 } 4053 -re "($number)-($number):($number):($number)" { 4054 set days [expr $expect_out(1,string) * 24 * 60 * 60] 4055 set hours [expr $expect_out(2,string) * 60 * 60] 4056 set mins [expr $expect_out(3,string) * 60] 4057 set secs [expr $days + $hours + $mins + $expect_out(4,string)] 4058 exp_continue 4059 } 4060 -re "($number):($number):($number)" { 4061 set hours [expr $expect_out(1,string) * 60 * 60] 4062 set mins [expr $expect_out(2,string) * 60] 4063 set secs [expr $hours + $mins + $expect_out(3,string)] 4064 exp_continue 4065 } 4066 -re "($number):($number)" { 4067 set mins [expr $expect_out(1,string) * 60] 4068 set secs [expr $mins + $expect_out(2,string)] 4069 exp_continue 4070 } 4071 -re "($number)" { 4072 set secs [expr $expect_out(1,string) * 60] 4073 exp_continue 4074 } 4075 timeout { 4076 slow_kill $sinfo_pid 4077 fail "sinfo not responding" 4078 } 4079 eof { 4080 wait 4081 } 4082 } 4083 4084 log_user 1 4085 return $secs 4086} 4087 4088 4089################################################################ 4090# 4091# NAME 4092# get_partition_default_time_limit - gets the default time limit in a given partition 4093# 4094# SYNOPSIS 4095# get_partition_default_time_limit ?partition? 4096# 4097# DESCRIPTION 4098# Get the default time limit in a given partition. 4099# If the partition is not specified, the default partition will be used. 4100# 4101# RETURN VALUE 4102# Returns: time limit in seconds, -1 if undefined or error. 4103# 4104################################################################ 4105 4106proc get_partition_default_time_limit { {partition ""} } { 4107 global sinfo number 4108 4109 if {[string length $partition] == 0} { 4110 set partition [default_partition] 4111 } 4112 4113 set secs 0 4114 log_user 0 4115 set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O defaulttime -e] 4116 expect { 4117 -re "infinite" { 4118 set secs -1 4119 exp_continue 4120 } 4121 -re "n/a" { 4122 set secs -1 4123 exp_continue 4124 } 4125 -re "($number)-($number):($number):($number)" { 4126 set days [expr $expect_out(1,string) * 24 * 60 * 60] 4127 set hours [expr $expect_out(2,string) * 60 * 60] 4128 set mins [expr $expect_out(3,string) * 60] 4129 set secs [expr $days + $hours + $mins + $expect_out(4,string)] 4130 exp_continue 4131 } 4132 -re "($number):($number):($number)" { 4133 set hours [expr $expect_out(1,string) * 60 * 60] 4134 set mins [expr $expect_out(2,string) * 60] 4135 set secs [expr $hours + $mins + $expect_out(3,string)] 4136 exp_continue 4137 } 4138 -re "($number):($number)" { 4139 set mins [expr $expect_out(1,string) * 60] 4140 set secs [expr $mins + $expect_out(2,string)] 4141 exp_continue 4142 } 4143 -re "($number)" { 4144 set secs [expr $expect_out(1,string) * 60] 4145 exp_continue 4146 } 4147 timeout { 4148 slow_kill $sinfo_pid 4149 fail "sinfo not responding" 4150 } 4151 eof { 4152 wait 4153 } 4154 } 4155 4156 log_user 1 4157 return $secs 4158} 4159 4160 4161##################################################################### 4162# 4163# NAME 4164# get_node_cores - given a node, returns its total number of cores 4165# 4166# SYNOPSIS 4167# get_node_cores node 4168# 4169# DESCRIPTION 4170# Given a node, return its total number of cores 4171# (not the CoresPerSocket, but the total cores) 4172# 4173# RETURN VALUE 4174# node cores if retrieved, -1 otherwise 4175# 4176##################################################################### 4177 4178proc get_node_cores {node} { 4179 4180 global sinfo number 4181 set cores -1 4182 set sockets_per_node 0 4183 set cores_per_socket 0 4184 4185 if {[string length $node] == 0} { 4186 return $cores 4187 } 4188 4189 log_user 0 4190 set sinfo_pid [spawn -noecho $sinfo -o "%X %Y" -h -n $node] 4191 4192 expect { 4193 -re "($number)" { 4194 if {$sockets_per_node == 0} { 4195 set sockets_per_node $expect_out(1,string) 4196 } else { 4197 set cores_per_socket $expect_out(1,string) 4198 } 4199 exp_continue 4200 } 4201 timeout { 4202 slow_kill $sinfo_pid 4203 fail "sinfo not responding" 4204 } 4205 eof { 4206 wait 4207 } 4208 } 4209 4210 log_user 1 4211 4212 set cores [expr $sockets_per_node * $cores_per_socket] 4213 4214 return $cores 4215} 4216 4217 4218##################################################################### 4219# 4220# NAME 4221# get_node_cpus - given a node, returns its total number of threads we account for 4222# 4223# SYNOPSIS 4224# get_node_cpus node 4225# 4226# DESCRIPTION 4227# Given a node, return its total number of threads we account for. 4228# (not always ThreadsPerCore, but how many threads are in use. 4229# i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would 4230# result in only 1 thread we care about instead of the 2 listed.) 4231# 4232# RETURN VALUE 4233# list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise 4234# 4235##################################################################### 4236 4237proc get_node_cpus {node} { 4238 global scontrol number 4239 4240 set nthreads -1 4241 set nsockets 0 4242 set ncores 0 4243 set totcpus -1 4244 4245 if {[string length $node] == 0} { 4246 return [list $totcpus $nthreads] 4247 } 4248 4249 # Get the number of CPUs on a node 4250 set scontrol_pid [spawn $scontrol show node $node] 4251 expect { 4252 -re "CoresPerSocket=($number)" { 4253 set ncores $expect_out(1,string) 4254 exp_continue 4255 } 4256 -re "CPUTot=($number)" { 4257 set totcpus $expect_out(1,string) 4258 exp_continue 4259 } 4260 -re "Sockets=($number)" { 4261 set nsockets $expect_out(1,string) 4262 exp_continue 4263 } 4264 -re "ThreadsPerCore=($number)" { 4265 set nthreads $expect_out(1,string) 4266 exp_continue 4267 } 4268 timeout { 4269 slow_kill $scontrol_pid 4270 fail "scontrol is not responding" 4271 } 4272 eof { 4273 wait 4274 } 4275 } 4276 4277 set core_cnt [expr $nsockets * $ncores] 4278 set thread_cnt [expr $ncores * $nthreads] 4279 if {$totcpus != $nthreads && $totcpus == $ncores} { 4280 log_debug "Cores rather than threads are being allocated" 4281 set nthreads 1 4282 } 4283 4284 return [list $totcpus $nthreads] 4285} 4286 4287 4288##################################################################### 4289# 4290# NAME 4291# get_part_total_cores - given a partition and/or states, return its total cores 4292# 4293# SYNOPSIS 4294# get_part_total_cores partition states 4295# 4296# DESCRIPTION 4297# Given a partition and/or states, return its total cores 4298# 4299# ARGUMENTS 4300# partition 4301# partition to check cores 4302# states 4303# states to filter on partition cores 4304# 4305# RETURN VALUE 4306# partition cores 4307# 4308##################################################################### 4309 4310proc get_part_total_cores {part states} { 4311 global sinfo number 4312 log_user 0 4313 set cores 0 4314 set tmp 0 4315 set i 0 4316 4317 if {[string length $part] == 0} { 4318 set part [default_partition] 4319 } 4320 4321 if {[string length $states] == 0} { 4322 set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -o "%X %Y"] 4323 } else { 4324 set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y"] 4325 } 4326 4327 expect { 4328 -re "($number)" { 4329 set is_even [expr {($i % 2) == 0}] 4330 if {$is_even == 1} { 4331 set tmp $expect_out(1,string) 4332 } else { 4333 set tmp [expr $tmp * $expect_out(1,string)] 4334 set cores [expr $cores + $tmp] 4335 } 4336 incr i 4337 exp_continue 4338 } 4339 timeout { 4340 slow_kill $sinfo_pid 4341 fail "sinfo not responding" 4342 } 4343 eof { 4344 wait 4345 } 4346 } 4347 4348 log_user 1 4349 return $cores 4350} 4351 4352 4353##################################################################### 4354# 4355# NAME 4356# check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous 4357# 4358# SYNOPSIS 4359# check_hosts_contiguous check_hosts_list partition 4360# 4361# DESCRIPTION 4362# Given a partition and a list of hosts, verify if all 4363# hosts belong to the partition and are contiguous. 4364# If the partition argument is empty, the default partition 4365# will be used. 4366# 4367# RETURN VALUE 4368# Returns: true if hosts are contiguous, false otherwise. 4369# 4370##################################################################### 4371 4372proc check_hosts_contiguous { check_hosts_list {partition ""} } { 4373 4374 global sinfo re_word_str 4375 4376 if {[string length $partition] == 0} { 4377 set partition [default_partition] 4378 } 4379 4380 set part_hosts_list {} 4381 4382 log_user 0 4383 set sinfo_pid [spawn $sinfo --noheader -p $partition -N -o %N] 4384 expect { 4385 -re "($re_word_str)" { 4386 lappend part_hosts_list $expect_out(1,string) 4387 exp_continue 4388 } 4389 -re "Unable to contact" { 4390 log_user 1 4391 fail "Slurm appears to be down" 4392 } 4393 timeout { 4394 log_user 1 4395 slow_kill $sinfo_pid 4396 fail "sinfo not responding" 4397 } 4398 eof { 4399 wait 4400 } 4401 } 4402 log_user 1 4403 4404 foreach host $check_hosts_list { 4405 set idx_cur [lsearch $part_hosts_list $host] 4406 if {$idx_cur == -1} { 4407 fail "$host not found in list of hosts from partition $partition" 4408 } 4409 if {[info exists idx_old]} { 4410 if {$idx_cur != [expr $idx_old + 1]} { 4411 log_error "Node sequence number not contiguous" 4412 return false 4413 } 4414 } 4415 set idx_old $idx_cur 4416 } 4417 4418 return true 4419} 4420 4421 4422################################################################ 4423# 4424# NAME 4425# get_het_job_ids - gets list of component job ids for a het job 4426# 4427# SYNOPSIS 4428# get_het_job_ids job_id ?use_offset? 4429# 4430# DESCRIPTION 4431# Gets list of component job ids for a het job. 4432# 4433# ARGUMENTS 4434# job_id 4435# Slurm job id 4436# use_offset 4437# If zero, returns list of integer job ids, else returns ids in 4438# the form of X+Y where X is het job master id and Y is the 4439# offset. 4440# 4441# RETURN VALUE 4442# A list of ids for a hetjob or an empty list if jobid 4443# is not a het one. 4444# 4445################################################################ 4446 4447proc get_het_job_ids { jobid {use_offset 0}} { 4448 global scontrol number 4449 4450 set id_list "" 4451 set log_user_save [log_user -info] 4452 log_user 0 4453 spawn $scontrol show job $jobid 4454 expect { 4455 -re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" { 4456 if { $use_offset } { 4457 lappend id_list "$expect_out(2,string)+$expect_out(3,string)" 4458 } else { 4459 lappend id_list $expect_out(1,string) 4460 } 4461 exp_continue 4462 } 4463 timeout { 4464 log_error "scontrol not responding" 4465 set id_list "" 4466 } 4467 eof { 4468 wait 4469 } 4470 } 4471 log_user $log_user_save 4472 return $id_list 4473} 4474 4475 4476################################################################ 4477# 4478# NAME 4479# reconfigure - calls scontrol reconfigure 4480# 4481# SYNOPSIS 4482# reconfigure ?options? ?cluster? 4483# 4484# DESCRIPTION 4485# Calls scontrol reconfigure. This routine takes the same options as 4486# run_command, passing them to the underlying run_command invocation. 4487# This command waits an additional 5 seconds before returning. 4488# 4489# OPTIONS 4490# See OPTIONS of run_command proc. 4491# 4492# ARGUMENTS 4493# cluster 4494# The cluster to reconfigure 4495# 4496# RETURN VALUE 4497# RETURN_SUCCESS on success, otherwise RETURN_ERROR 4498# 4499################################################################ 4500 4501proc reconfigure args { 4502 global scontrol 4503 4504 set options [list] 4505 set cluster "" 4506 4507 while {[llength $args]} { 4508 switch -glob -- [lindex $args 0] { 4509 -* { 4510 lappend options {*}[lrange $args 0 1] 4511 set args [lrange $args 2 end] 4512 } 4513 default break 4514 } 4515 } 4516 set argument_count [llength $args] 4517 if {$argument_count > 1} { 4518 fail "Too many arguments ($argument_count): $args" 4519 } elseif {$argument_count == 1} { 4520 lassign $args cluster 4521 } 4522 4523 set command $scontrol 4524 if {$cluster ne ""} { 4525 append command " -M$cluster" 4526 } 4527 append command " reconfigure" 4528 set rc [run_command_status {*}$options "$command"] 4529 4530 # 4531 # Wait 5 seconds for reconfigure to complete, then return. 4532 # 4533 sleep 5 4534 return $rc 4535} 4536 4537 4538##################################################################### 4539# 4540# NAME 4541# log_fatal - prints a fatal message 4542# 4543# SYNOPSIS 4544# log_fatal message 4545# 4546# SEE ALSO 4547# _log_format for options governing the message format and colorization 4548# 4549##################################################################### 4550 4551proc log_fatal {message} { 4552 global testsuite_log_level LOG_LEVEL_FATAL 4553 4554 if {$testsuite_log_level >= $LOG_LEVEL_FATAL} { 4555 _log_format "fatal" "$message" 4556 } 4557} 4558 4559 4560##################################################################### 4561# 4562# NAME 4563# log_error - prints an error message 4564# 4565# SYNOPSIS 4566# log_error message 4567# 4568# SEE ALSO 4569# _log_format for options governing the message format and colorization 4570# 4571##################################################################### 4572 4573proc log_error {message} { 4574 global testsuite_log_level LOG_LEVEL_ERROR 4575 4576 if {$testsuite_log_level >= $LOG_LEVEL_ERROR} { 4577 _log_format "error" "$message" 4578 } 4579} 4580 4581 4582##################################################################### 4583# 4584# NAME 4585# log_warn - prints a warning message 4586# 4587# SYNOPSIS 4588# log_warn message 4589# 4590# SEE ALSO 4591# _log_format for options governing the message format and colorization 4592# 4593##################################################################### 4594 4595proc log_warn {message} { 4596 global testsuite_log_level LOG_LEVEL_WARNING 4597 4598 if {$testsuite_log_level >= $LOG_LEVEL_WARNING} { 4599 _log_format "warning" "$message" 4600 } 4601} 4602 4603 4604##################################################################### 4605# 4606# NAME 4607# log_info - prints an information message 4608# 4609# SYNOPSIS 4610# log_info message 4611# 4612# SEE ALSO 4613# _log_format for options governing the message format and colorization 4614# 4615##################################################################### 4616 4617proc log_info {message} { 4618 global testsuite_log_level LOG_LEVEL_INFO 4619 4620 if {$testsuite_log_level >= $LOG_LEVEL_INFO} { 4621 _log_format "info" "$message" 4622 } 4623} 4624 4625 4626##################################################################### 4627# 4628# NAME 4629# log_pass - prints a pass level message 4630# 4631# SYNOPSIS 4632# log_pass message 4633# 4634# SEE ALSO 4635# _log_format for options governing the message format and colorization 4636# 4637##################################################################### 4638 4639proc log_pass {message} { 4640 global testsuite_log_level LOG_LEVEL_PASS 4641 4642 if {$testsuite_log_level >= $LOG_LEVEL_PASS} { 4643 _log_format "pass" "$message" 4644 } 4645} 4646 4647 4648##################################################################### 4649# 4650# NAME 4651# log_command - prints a command level message 4652# 4653# SYNOPSIS 4654# log_command message 4655# 4656# SEE ALSO 4657# _log_format for options governing the message format and colorization 4658# 4659##################################################################### 4660 4661proc log_command {message} { 4662 global testsuite_log_level LOG_LEVEL_COMMAND 4663 4664 if {$testsuite_log_level >= $LOG_LEVEL_COMMAND} { 4665 _log_format "command" "$message" 4666 } 4667} 4668 4669 4670##################################################################### 4671# 4672# NAME 4673# log_debug - prints a debug level message 4674# 4675# SYNOPSIS 4676# log_debug message 4677# 4678# SEE ALSO 4679# _log_format for options governing the message format and colorization 4680# 4681##################################################################### 4682 4683proc log_debug {message} { 4684 global testsuite_log_level LOG_LEVEL_DEBUG 4685 4686 if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} { 4687 _log_format "debug" "$message" 4688 } 4689} 4690 4691 4692##################################################################### 4693# 4694# NAME 4695# log_trace - prints a trace level message 4696# 4697# SYNOPSIS 4698# log_trace message 4699# 4700# SEE ALSO 4701# _log_format for options governing the message format and colorization 4702# 4703##################################################################### 4704 4705proc log_trace {message} { 4706 global testsuite_log_level LOG_LEVEL_TRACE 4707 4708 if {$testsuite_log_level >= $LOG_LEVEL_TRACE} { 4709 _log_format "trace" "$message" 4710 } 4711} 4712 4713 4714################################################################ 4715# 4716# NAME 4717# in_fed - checks whether this cluster is in a federation 4718# 4719# SYNOPSIS 4720# in_fed 4721# 4722# RETURN VALUE 4723# Returns true if this cluster is in a federation, false otherwise 4724# 4725################################################################ 4726 4727proc in_fed {} { 4728 global scontrol 4729 4730 spawn $scontrol show fed 4731 expect { 4732 -re "Federation" { 4733 return true 4734 } 4735 timeout { 4736 log_error "scontrol not responding" 4737 } 4738 eof { 4739 wait 4740 } 4741 } 4742 return false 4743} 4744 4745 4746################################################################ 4747# 4748# NAME 4749# check_job_state - checks if the state of a job is the expected one 4750# 4751# SYNOPSIS 4752# check_job_state job state ?het_job? 4753# 4754# DESCRIPTION 4755# Checks if the state of a job is the expected one. 4756# 4757# ARGUMENTS 4758# job 4759# Job ID to check 4760# state 4761# Desired state of the job to match 4762# het_job 4763# If set, checks state of each component job if the 4764# job is a hetjob. 4765# 4766# RETURN VALUE 4767# true if job was on the desired state, or the number of job components 4768# on that state if it's a hetjob and het_job option enabled, false 4769# otherwise. 4770# 4771################################################################ 4772 4773proc check_job_state { job state {het_job 0}} { 4774 global scontrol 4775 4776 set jid_list "" 4777 if { $het_job } { 4778 set jid_list [get_het_job_ids $job 1] 4779 } 4780 4781 if { $jid_list == "" } { 4782 # non-het job 4783 set jid_list $job 4784 } 4785 4786 foreach jid $jid_list { 4787 set state_match 0 4788 spawn $scontrol show job $jid 4789 expect { 4790 -re "JobState=($state)" { 4791 incr state_match 4792 } 4793 timeout { 4794 fail "scontrol not responding" 4795 } 4796 eof { 4797 wait 4798 } 4799 } 4800 4801 if {$state_match != 1} { 4802 log_error "job $jid should be in $state state, but is not" 4803 return false 4804 } 4805 } 4806 4807 return true 4808} 4809 4810 4811################################################################ 4812# 4813# NAME 4814# get_gres_count - returns a dict of nodes and GRES counts 4815# 4816# SYNOPSIS 4817# get_gres_count gres_name ?node_list? 4818# 4819# DESCRIPTION 4820# Returns a dict of node names and the count of a specifed 4821# GRES aggregating all its types on each node. 4822# 4823# RETURN VALUE 4824# If the node_list is not specified node name is specified, 4825# this function will return a dict with the GRES count for all 4826# the nodes of the default partition. 4827# If specified, a dict only with the nodes of the node_list. 4828# 4829################################################################ 4830 4831proc get_gres_count { gres_name {node_list ""} } { 4832 set nodes_dict [get_nodes $node_list] 4833 set nodes_gres_dict [dict create] 4834 dict for {node_name node_dict} $nodes_dict { 4835 if [dict exists $node_dict "Gres"] { 4836 set gres_param [dict get $node_dict "Gres"] 4837 set gres_dict [count_gres $gres_param] 4838 } 4839 if [dict exists $gres_dict $gres_name] { 4840 set gres_count [dict get $gres_dict $gres_name] 4841 dict set nodes_gres_dict $node_name $gres_count 4842 } 4843 } 4844 return $nodes_gres_dict 4845} 4846 4847 4848################################################################ 4849# 4850# NAME 4851# count_gres - returns a dict of GRES names and their total counts 4852# 4853# SYNOPSIS 4854# count_gres gres_param 4855# 4856# DESCRIPTION 4857# Parses a GRES parameter string typically obtained from nodes or 4858# jobs info, and returns a dict of GRES names and their count 4859# aggregating all the types of each GRES. 4860# 4861# RETURN VALUE 4862# A dict of GRES names and their count aggregating all types of 4863# each GRES. 4864# 4865################################################################ 4866 4867proc count_gres { gres_param } { 4868 global gres_regex 4869 4870 set gres_dict [dict create] 4871 foreach gres [split $gres_param ","] { 4872 if {[regexp $gres_regex $gres {} name type count] == 1} { 4873 if {$count eq ""} { set count $type } 4874 4875 if {[dict exists $gres_dict $name]} { 4876 dict set gres_dict $name [expr [dict get $gres_dict $name] + $count] 4877 } else { 4878 dict set gres_dict $name $count 4879 } 4880 } 4881 } 4882 return $gres_dict 4883} 4884 4885 4886################################################################ 4887# 4888# NAME 4889# get_highest_gres_count - returns highest number of GRES per node on node_count nodes 4890# 4891# SYNOPSIS 4892# get_highest_gres_count node_count gres_name 4893# 4894# DESCRIPTION 4895# For a given number of nodes, returns the highest GRES count per 4896# node available on at least that number of nodes. 4897# 4898# EXAMPLE 4899# For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs 4900# [get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs) 4901# [get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each) 4902# [get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each) 4903# 4904################################################################ 4905 4906proc get_highest_gres_count { node_count gres_name } { 4907 4908 set available_nodes [list2hostlist [get_nodes_by_state]] 4909 set gres_dict [get_gres_count $gres_name $available_nodes] 4910 set gres_count [list] 4911 4912 dict for {node gres} $gres_dict { 4913 lappend gres_count $gres 4914 } 4915 4916 set count [lindex [lsort -decreasing -integer $gres_count] [expr $node_count - 1]] 4917 return $count 4918} 4919 4920 4921################################################################ 4922# 4923# NAME 4924# _set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it 4925# 4926# SYNOPSIS 4927# _set_gpu_socket_inx sock_inx 4928# 4929# DESCRIPTION 4930# Add a socket index to the array gpu_sock_list if not already 4931# on the list. Subroutine used by get_gpu_socket_count 4932# 4933################################################################ 4934 4935proc _set_gpu_socket_inx { sock_inx } { 4936 global gpu_sock_list 4937 4938 if {$sock_inx == -1} { 4939 set gpu_sock_list [lreplace $gpu_sock_list 0 99] 4940 return 4941 } 4942 4943 set sock_cnt [llength $gpu_sock_list] 4944 for {set i 0} {$i < $sock_cnt} {incr i} { 4945 if {[lindex $gpu_sock_list $i] == $sock_inx} { 4946 return 4947 } 4948 } 4949 lappend gpu_sock_list $sock_inx 4950} 4951 4952 4953################################################################ 4954# Subroutine used by get_gpu_socket_count 4955# Add a socket index to the array gpu_sock_list if not already 4956# on the list. 4957################################################################ 4958 4959proc _set_gpu_socket_range { sock_first_inx sock_last_inx } { 4960 global gpu_sock_list 4961 4962 set sock_cnt [llength $gpu_sock_list] 4963 for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} { 4964 set found 0 4965 for {set i 0} {$i < $sock_cnt} {incr i} { 4966 if {[lindex $gpu_sock_list $i] == $s} { 4967 set found 1 4968 break 4969 } 4970 } 4971 if {$found == 0} { 4972 lappend gpu_sock_list $s 4973 } 4974 } 4975} 4976 4977 4978################################################################ 4979# 4980# NAME 4981# get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count 4982# 4983# SYNOPSIS 4984# get_gpu_socket_count gpu_cnt sockets_per_node 4985# 4986# DESCRIPTION 4987# Given a per-node GPU count, return the number of sockets with 4988# GPUs on a node with the given per-node GPU count. 4989# If the sockets_per_node has a value of 1 then just return 1 4990# rather than determine the count (for performance reasons). 4991# 4992################################################################ 4993 4994proc get_gpu_socket_count { gpu_cnt sockets_per_node } { 4995 global re_word_str bin_rm number scontrol srun 4996 global gpu_sock_list 4997 set sockets_with_gpus 1 4998 set file_in "test_get_gpu_socket_count.input" 4999 5000 if {$sockets_per_node == 1} { 5001 return 1 5002 } 5003 5004 log_user 0 5005 _set_gpu_socket_inx -1 5006 make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME" 5007 spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in 5008 expect { 5009 -re "gpu:${number}.S:($number)-($number)" { 5010 _set_gpu_socket_range $expect_out(1,string) $expect_out(2,string) 5011 exp_continue 5012 } 5013 -re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" { 5014 _set_gpu_socket_inx $expect_out(1,string) 5015 _set_gpu_socket_inx $expect_out(2,string) 5016 _set_gpu_socket_inx $expect_out(3,string) 5017 _set_gpu_socket_inx $expect_out(4,string) 5018 exp_continue 5019 } 5020 -re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" { 5021 _set_gpu_socket_inx $expect_out(1,string) 5022 _set_gpu_socket_inx $expect_out(2,string) 5023 _set_gpu_socket_inx $expect_out(3,string) 5024 exp_continue 5025 } 5026 -re "gpu:${re_word_str}:${number}.S:($number),($number)" { 5027 _set_gpu_socket_inx $expect_out(1,string) 5028 _set_gpu_socket_inx $expect_out(2,string) 5029 exp_continue 5030 } 5031 -re "gpu:${re_word_str}:${number}.S:($number)" { 5032 _set_gpu_socket_inx $expect_out(1,string) 5033 exp_continue 5034 } 5035 -re "gpu:${number}.S:($number),($number),($number),($number)" { 5036 _set_gpu_socket_inx $expect_out(1,string) 5037 _set_gpu_socket_inx $expect_out(2,string) 5038 _set_gpu_socket_inx $expect_out(3,string) 5039 _set_gpu_socket_inx $expect_out(4,string) 5040 exp_continue 5041 } 5042 -re "gpu:${number}.S:($number),($number),($number)" { 5043 _set_gpu_socket_inx $expect_out(1,string) 5044 _set_gpu_socket_inx $expect_out(2,string) 5045 _set_gpu_socket_inx $expect_out(3,string) 5046 exp_continue 5047 } 5048 -re "gpu:${number}.S:($number),($number)" { 5049 _set_gpu_socket_inx $expect_out(1,string) 5050 _set_gpu_socket_inx $expect_out(2,string) 5051 exp_continue 5052 } 5053 -re "gpu:${number}.S:($number)" { 5054 _set_gpu_socket_inx $expect_out(1,string) 5055 exp_continue 5056 } 5057 eof { 5058 wait 5059 } 5060 } 5061 log_user 1 5062 exec $bin_rm -f $file_in 5063 5064 set sock_cnt [llength $gpu_sock_list] 5065 if {$sock_cnt > 1} { 5066 set sockets_with_gpus $sock_cnt 5067 } 5068 5069 return $sockets_with_gpus 5070} 5071 5072 5073################################################################ 5074# 5075# NAME 5076# get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU" 5077# 5078# SYNOPSIS 5079# get_highest_mps_count node_count 5080# 5081# DESCRIPTION 5082# For a given number of nodes, returns the higest number of MPS per GPU 5083# available at least on those number of nodes. 5084# 5085################################################################ 5086 5087proc get_highest_mps_count { node_count } { 5088 # We cannot use get_highest_gres_count because we need "per gpu", 5089 # so we get all the mps per node and all gpus per node, to create 5090 # a mps_per_gpu list to sort and get the count. 5091 set available_nodes [list2hostlist [get_nodes_by_state]] 5092 set mps_dict [get_gres_count "mps" $available_nodes] 5093 set gpu_dict [get_gres_count "gpu" $available_nodes] 5094 set mps_per_gpu [list] 5095 5096 dict for {node mps} $mps_dict { 5097 if { $mps > 0 } { 5098 if [dict exists $gpu_dict $node] { 5099 set gpu [dict get $gpu_dict $node] 5100 if { $gpu > 0 } { 5101 lappend mps_per_gpu [expr $mps / $gpu] 5102 } else { 5103 fail "All nodes with MPS should have a GPU" 5104 } 5105 } else { 5106 fail "All nodes with MPS should have a GPU" 5107 } 5108 } 5109 } 5110 5111 set count [lindex [lsort -decreasing -integer $mps_per_gpu] [expr $node_count - 1]] 5112 return $count 5113} 5114 5115 5116################################################################ 5117# 5118# NAME 5119# get_mps_node_count - gets the number of nodes with a positive number of GRES MPS 5120# 5121# SYNOPSIS 5122# get_mps_node_count 5123# 5124# RETURN VALUE 5125# Return the count of nodes with a non-zero count of GRES MPS 5126# 5127################################################################ 5128 5129proc get_mps_node_count { } { 5130 global number sinfo re_word_str 5131 set fini 0 5132 set node_inx 0 5133 set def_part [default_partition] 5134 5135 log_user 0 5136 spawn $sinfo -N -p$def_part -oGRES=%G -h 5137 expect { 5138 -re "GRES=($re_word_str)" { 5139 set mps_count 0 5140 set parts [split $expect_out(1,string) ",/"] 5141 while 1 { 5142 set mps_found [lsearch $parts "mps*"] 5143 if { $mps_found == -1 } break 5144 5145 set parts2 [split [lindex $parts $mps_found] ":(/"] 5146 set col [lsearch -regexp $parts2 ^$number$] 5147 if { $col == -1 } { 5148 incr mps_count 5149 } else { 5150 set mps_count [expr $mps_count + [lindex $parts2 $col]] 5151 } 5152 set parts [lreplace $parts $mps_found $mps_found] 5153 } 5154 5155 if {$mps_count > 0} { 5156 incr node_inx 5157 } 5158 exp_continue 5159 } 5160 eof { 5161 wait 5162 } 5163 } 5164 log_user 1 5165 5166 return $node_inx 5167} 5168 5169 5170################################################################ 5171# 5172# NAME 5173# cuda_count - determines the count of allocated GPUs 5174# 5175# SYNOPSIS 5176# cuda_count cuda_string 5177# 5178# ARGUMENTS 5179# cuda_string 5180# Contents of a CUDA_VISIBLE_DEVICES environment variable 5181# 5182# RETURN VALUE 5183# Return the number of GPUs or -1 on error 5184# 5185################################################################ 5186 5187proc cuda_count { cuda_string } { 5188 set cuda_count 0 5189 set has_number 0 5190 set len [string length $cuda_string] 5191 for {set char_inx 0} {$char_inx < $len} {incr char_inx} { 5192 set cuda_char [string index $cuda_string $char_inx] 5193 if {[string match , $cuda_char]} { 5194 if {$has_number > 0} { 5195 incr cuda_count 5196 set has_number 0 5197 } else { 5198 log_error "Invalid input ($cuda_string)" 5199 return -1 5200 } 5201 } elseif {[string is digit $cuda_char]} { 5202 set has_number 1 5203 } 5204 } 5205 if {$has_number > 0} { 5206 incr cuda_count 5207 } else { 5208 log_error "Invalid input ($cuda_string)" 5209 return -1 5210 } 5211 return $cuda_count 5212} 5213 5214 5215################################################################ 5216# NAME 5217# get_conf_path - gets the path to the slurm.conf file 5218# 5219# SYNOPSIS 5220# get_conf_path 5221# 5222# RETURN VALUE 5223# Returns the path to the slurm.conf file 5224# 5225################################################################ 5226 5227proc get_conf_path { } { 5228 global scontrol re_word_str eol 5229 5230 if [regexp {(.*)/slurm.conf} [get_config_param "SLURM_CONF"] {} config_dir] { 5231 return $config_dir 5232 } else { 5233 fail "Unable to determine config dir" 5234 } 5235} 5236 5237 5238################################################################ 5239# 5240# NAME 5241# save_conf - saves a backup of the specfied configuration file 5242# 5243# SYNOPSIS 5244# save_conf file_name 5245# 5246# DESCRIPTION 5247# If the specified file_name exists, a backup is made which will be 5248# restored when restore_conf is called. 5249# If the specified file_name does not exist, a special backup will be 5250# made that will cause the file to be removed when restore_conf is 5251# called. 5252# If a backup already exists, a warning is issued and no backup is made 5253# (honoring the existing backup). 5254# 5255# SEE ALSO 5256# restore_conf 5257# 5258################################################################ 5259 5260proc save_conf { file_name } { 5261 global test_id 5262 global bin_chmod bin_cp bin_mv bin_touch 5263 5264 log_debug "Saving backup of $file_name" 5265 5266 # 5267 # Check for existing backup 5268 # If a backup exists, issue a warning and return (honor existing backup) 5269 # 5270 set conf_dir [file dirname $file_name] 5271 set dir_files [glob -nocomplain -directory $conf_dir *] 5272 set preexisting_backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] 5273 if {$preexisting_backup_file ne ""} { 5274 log_warn "Backup file already exists: ($preexisting_backup_file)" 5275 return 5276 } 5277 5278 # 5279 # Check if file to backup exists. 5280 # If it doesn't exist, warn the user, touch an empty backup file with 5281 # the sticky bit set and allow the test to continue. 5282 # restore_conf will remove the file. 5283 # 5284 set new_backup_file "$file_name.test$test_id" 5285 if {![file exists $file_name]} { 5286 log_warn "Backup of a nonexistent file requested: $file_name" 5287 run_command -fail -nolog "$bin_touch $new_backup_file" 5288 run_command -fail -nolog "$bin_chmod +t $new_backup_file" 5289 return 5290 } 5291 5292 run_command -fail -nolog "$bin_mv $file_name $new_backup_file" 5293 run_command -fail -nolog "$bin_cp $new_backup_file $file_name" 5294} 5295 5296 5297################################################################ 5298# 5299# NAME 5300# restore_conf - restores the original confiration file from backup 5301# 5302# SYNOPSIS 5303# restore_conf file_name 5304# 5305# DESCRIPTION 5306# If a backup exists for the specified file_name, it is restored. 5307# If the specified file_name did not exist when originally backed up, 5308# it will be removed. 5309# If no backup exists, a warning is issued. 5310# 5311# SEE ALSO 5312# save_conf 5313# 5314################################################################ 5315 5316proc restore_conf { file_name } { 5317 global test_id 5318 global bin_mv bin_rm 5319 5320 log_debug "Restoring backup of $file_name" 5321 5322 set conf_dir [file dirname $file_name] 5323 set dir_files [glob -nocomplain -directory $conf_dir *] 5324 set backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] 5325 if {$backup_file ne ""} { 5326 file stat $backup_file stat 5327 5328 # If the sticky bit is set and the file is empty, remove both 5329 if {! $stat(size) && [expr $stat(mode) & 512]} { 5330 log_debug "Removing file used for the test: $file_name" 5331 run_command -fail -nolog "$bin_rm -f $backup_file $file_name" 5332 # Else replace the original with the backup 5333 } else { 5334 run_command -fail -nolog "$bin_mv $backup_file $file_name" 5335 } 5336 } else { 5337 # 5338 # If backup file doesn't exist, it has probably already been 5339 # restored by a previous call to restore_conf 5340 # 5341 log_warn "Backup file does not exist for $file_name. It has probably already been restored" 5342 return 5343 } 5344} 5345 5346 5347################################################################ 5348# 5349# NAME 5350# have_nvml - checks if HAVE_NVML is set in config.h 5351# 5352# SYNOPSIS 5353# have_nvml 5354# 5355# RETURN VALUE 5356# Returns true if HAVE_NVML is set in config.h. Else, returns false 5357# 5358################################################################ 5359 5360proc have_nvml { } { 5361 global bin_grep number config_h 5362 5363 return [expr [run_command_status -none -nolog "$bin_grep \"HAVE_NVML 1\" $config_h"] == 0] 5364} 5365 5366 5367################################################################ 5368# 5369# NAME 5370# delete_part - deletes partition on system 5371# 5372# SYNOPSIS 5373# delete_part partition 5374# 5375################################################################ 5376 5377proc delete_part { part_name } { 5378 global scontrol 5379 5380 # Remove part 5381 spawn $scontrol delete partition=$part_name 5382 expect { 5383 timeout { 5384 fail "scontrol is not responding" 5385 } 5386 eof { 5387 wait 5388 } 5389 } 5390} 5391 5392 5393################################################################ 5394# 5395# NAME 5396# have_lua - checks if HAVE_LUA is set in config.h 5397# 5398# SYNOPSIS 5399# have_lua 5400# 5401# RETURN VALUE 5402# Returns true if HAVE_LUA is set in config.h. Else, returns false 5403# 5404################################################################ 5405 5406proc have_lua { } { 5407 global bin_grep config_h 5408 5409 return [expr [run_command_status -none -nolog "$bin_grep HAVE_LUA $config_h"] == 0] 5410} 5411 5412 5413################################################################ 5414# 5415# NAME 5416# get_reservations - returns a dictionary of dictionaries of reservation parameters 5417# 5418# SYNOPSIS 5419# get_reservations ?resv_name? 5420# 5421# RETURN VALUE 5422# Uses `scontrol show reservation` to return a dictionary of dictionaries 5423# of job parameters. Specifying an invalid resv_name result in a failure. 5424# 5425################################################################ 5426 5427proc get_reservations { {resv_name ""} } { 5428 global scontrol 5429 5430 set command "$scontrol show reservation -o" 5431 if {$resv_name ne ""} { 5432 append command " $resv_name" 5433 } 5434 set output [run_command_output -fail "$command"] 5435 5436 # Iterate over each reservation's parameter list 5437 foreach line [split $output "\n"] { 5438 if {$line eq ""} { continue } 5439 5440 # Peel off the resv parameters one at a time 5441 # The first quantifier sets the greediness for the whole RE 5442 while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { 5443 # Remove the consumed parameter from the line 5444 set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] 5445 # Add it to the temporary job dictionary 5446 dict set resv_dict $param_name $param_value 5447 } 5448 5449 set resv_name_dict [dict get $resv_dict "ReservationName"] 5450 5451 # Add the resv dictionary to resvs dictionary 5452 dict set resvs_dict $resv_name_dict $resv_dict 5453 5454 # Clear the resv dictionary for the next resv 5455 set resv_dict {} 5456 } 5457 5458 return $resvs_dict 5459} 5460 5461 5462################################################################ 5463# 5464# NAME 5465# get_reservation_param - returns a specific parameter value for a specific reservation 5466# 5467# SYNOPSIS 5468# get_reservation_param resv_name parameter_name 5469# 5470# DESCRIPTION 5471# Returns a specific parameter value for a specified reservation if the 5472# parameter exists for the reservation, or MISSING if it does not exist. 5473# Specifying an invalid reservation name will result in a failure. 5474# 5475################################################################ 5476 5477proc get_reservation_param { resv_name parameter_name } { 5478 5479 set resvs_dict [get_reservations $resv_name] 5480 5481 if [dict exists $resvs_dict $resv_name $parameter_name] { 5482 return [dict get $resvs_dict $resv_name $parameter_name] 5483 } else { 5484 return "MISSING" 5485 } 5486} 5487 5488 5489################################################################ 5490# 5491# NAME 5492# create_res - create new reservation in system 5493# 5494# SYNOPSIS 5495# create_res ?res_name? ?res_params? 5496# 5497# RETURN VALUE 5498# the exit code of the scontrol command run 5499# 5500################################################################ 5501 5502proc create_res { res_name res_params } { 5503 global scontrol 5504 5505 set result [run_command "$scontrol create res ReservationName=$res_name $res_params"] 5506 set output [dict get $result output] 5507 set ret_code [dict get $result exit_code] 5508 5509 if { $ret_code } { 5510 log_warn "[lindex [info level 0] 0]: error from scontrol: $output" 5511 } else { 5512 log_debug "[lindex [info level 0] 0]: success from scontrol: $output" 5513 } 5514 5515 return $ret_code 5516} 5517 5518 5519################################################################ 5520# 5521# NAME 5522# update_res - update exisiting reservation in system 5523# 5524# SYNOPSIS 5525# update_res ?res_name? ?res_params? 5526# 5527# RETURN VALUE 5528# the exit code of the scontrol command run 5529# 5530################################################################ 5531 5532proc update_res { res_name res_params } { 5533 global scontrol 5534 5535 set result [run_command "$scontrol update ReservationName=$res_name $res_params"] 5536 set output [dict get $result output] 5537 set ret_code [dict get $result exit_code] 5538 5539 if { $ret_code } { 5540 log_warn "Return code from scontrol: $ret_code. Output: $output" 5541 } 5542 5543 return $ret_code 5544} 5545 5546 5547################################################################ 5548# 5549# NAME 5550# delete_res - delete reservation from system 5551# 5552# SYNOPSIS 5553# delete_res ?res_name? 5554# 5555# RETURN VALUE 5556# the exit code of the scontrol command run 5557# 5558################################################################ 5559 5560proc delete_res { res_name } { 5561 global scontrol 5562 5563 set result [run_command "$scontrol delete ReservationName=$res_name"] 5564 set output [dict get $result output] 5565 set ret_code [dict get $result exit_code] 5566 5567 if { $ret_code } { 5568 log_warn "Return code from scontrol: $ret_code. Output: $output" 5569 } 5570 5571 return $ret_code 5572} 5573 5574 5575################################################################ 5576# 5577# NAME 5578# create_part - creates a partition 5579# 5580# SYNOPSIS 5581# create_part partition num_nodes 5582# 5583# ARGUMENTS 5584# partition 5585# Name of partition to create 5586# num_nodes 5587# Number of nodes of partition to create 5588# 5589# RETURN VALUE 5590# RETURN_SUCCESS, or non-zero on error 5591# 5592################################################################ 5593 5594proc create_part { part_name num_nodes_in } { 5595 global scontrol srun bin_printenv number re_word_str 5596 5597 set nodes "" 5598 set num_nodes_out 0 5599 5600 set found 0 5601 spawn $scontrol show partitionname=$part_name 5602 expect { 5603 -re "PartitionName=$part_name" { 5604 set found 1 5605 exp_continue 5606 } 5607 timeout { 5608 fail "scontrol is not responding" 5609 } 5610 eof { 5611 wait 5612 } 5613 } 5614 5615 if {$found == 1} { 5616 log_error "There is already a partition $part_name" 5617 return $::RETURN_ERROR 5618 } 5619 5620 if {[string length [default_partition]] == 0} { 5621 log_warn "create_part does not work without a default partition" 5622 return $::RETURN_ERROR 5623 } 5624 5625 if { $num_nodes_in } { 5626 set num_nodes $num_nodes_in 5627 } else { 5628 set num_nodes [llength [get_nodes_by_state]] 5629 } 5630 5631 log_user 0 5632 # Get a list of nodes 5633 spawn $srun -t1 -N1-$num_nodes $bin_printenv 5634 expect { 5635 -re "SLURM_JOB_NUM_NODES=($number)" { 5636 set num_nodes_out $expect_out(1,string) 5637 exp_continue 5638 } 5639 -re "SLURM_NODELIST=($re_word_str)" { 5640 set nodes $expect_out(1,string) 5641 exp_continue 5642 } 5643 timeout { 5644 log_error "srun is not responding getting number of nodes creating part" 5645 return $::RETURN_TIMEOUT 5646 } 5647 eof { 5648 wait 5649 } 5650 } 5651 5652 if {[string length $nodes] == 0} { 5653 log_error "Did not get a valid node list" 5654 return $::RETURN_ERROR 5655 } elseif { $num_nodes_out != $num_nodes_in } { 5656 log_error "Did not get enough nodes ($num_nodes_out != $num_nodes_in) to run test" 5657 return $::RETURN_ERROR 5658 } 5659 5660 spawn $scontrol create partitionname=$part_name nodes=$nodes 5661 expect { 5662 timeout { 5663 log_error "scontrol is not responding creating partition" 5664 return $::RETURN_ERROR 5665 } 5666 eof { 5667 wait 5668 } 5669 } 5670 5671 set found 0 5672 spawn $scontrol show partitionname=$part_name 5673 expect { 5674 -re "PartitionName=$part_name" { 5675 set found 1 5676 exp_continue 5677 } 5678 timeout { 5679 fail "scontrol is not responding" 5680 } 5681 eof { 5682 wait 5683 } 5684 } 5685 5686 if { $found == 0 } { 5687 log_error "scontrol did not create partition $part_name" 5688 return $::RETURN_ERROR 5689 } 5690 log_user 1 5691 5692 log_debug "Created partition $part_name with $num_nodes_in nodes" 5693 return $::RETURN_SUCCESS 5694} 5695 5696 5697################################################################ 5698# 5699# NAME 5700# get_nodes - returns a dictionary of dictionaries of node parameters 5701# 5702# SYNOPSIS 5703# get_nodes ?hostlist_expression? 5704# 5705# DESCRIPTION 5706# Uses `scontrol show node` to query node parameters, returning a 5707# dictionary of dictionaries with the node names as keys of the first 5708# level dictionary and with the parameters as keys of the second level 5709# dictionary. Specifying an invalid node name will result in a failure. 5710# 5711# RETURN VALUE 5712# If the optional node expression argument is specified, the result will 5713# be constrained by the specified hostlist expression. Otherwise, the 5714# results for all nodes will be returned. 5715# 5716################################################################ 5717 5718proc get_nodes { {hostlist_expression ""} } { 5719 global scontrol 5720 5721 set command "$scontrol show node -o" 5722 if {$hostlist_expression ne ""} { 5723 append command " $hostlist_expression" 5724 } 5725 set output [run_command_output -fail -nolog "$command"] 5726 5727 # Iterate over each node parameter line 5728 foreach line [split $output "\n"] { 5729 if {$line eq ""} { continue } 5730 5731 # Peel off the node parameters one at a time 5732 # The first quantifier sets the greediness for the whole RE 5733 while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { 5734 # Remove the consumed parameter from the line 5735 set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] 5736 # Add it to the temporary node dictionary 5737 dict set node_dict $param_name $param_value 5738 } 5739 5740 set node_name [dict get $node_dict "NodeName"] 5741 5742 # Add the node dictionary to nodes dictionary 5743 dict set nodes_dict $node_name $node_dict 5744 5745 # Clear the node dictionary for use by the next node 5746 set node_dict {} 5747 } 5748 5749 return $nodes_dict 5750} 5751 5752 5753################################################################ 5754# 5755# NAME 5756# get_node_param - returns a specific parameter value for a specific node 5757# 5758# SYNOPSIS 5759# get_node_param node_name parameter_name 5760# 5761# DESCRIPTION 5762# Returns a specific parameter value for a specified node if the 5763# parameter exists for the node, or MISSING if it does not exist. 5764# Specifying an invalid node name will result in a failure. 5765# 5766################################################################ 5767 5768proc get_node_param { node_name parameter_name } { 5769 5770 set nodes_dict [get_nodes $node_name] 5771 5772 if [dict exists $nodes_dict $node_name $parameter_name] { 5773 return [dict get $nodes_dict $node_name $parameter_name] 5774 } else { 5775 return "MISSING" 5776 } 5777} 5778 5779 5780################################################################ 5781# 5782# NAME 5783# get_nodes_by_request - get a list of nodes satisfying requested resources 5784# 5785# SYNOPSIS 5786# get_nodes_by_request ?options? ?request_args? 5787# 5788# DESCRIPTION 5789# Using srun (optionally with the specified arguments), returns a list 5790# of nodes having the requested resources. 5791# If an error occurs, the invoking test will fail. 5792# 5793# OPTIONS 5794# -fail 5795# fail the test if the execution of srun results in an error or timeout 5796# 5797# ARGUMENTS 5798# request_args 5799# Desired resources of a node in form of srun arguments, 5800# e.g. "--gres=gpu:1 -n1 -t1" 5801# 5802# RETURN VALUE 5803# A list of nodes with at least the requested resources, or an empty 5804# list otherwise. 5805# 5806################################################################ 5807 5808proc get_nodes_by_request args { 5809 global srun 5810 5811 set options [list] 5812 while {[llength $args]} { 5813 switch -glob -- [lindex $args 0] { 5814 -fail { 5815 lappend options [lrange $args 0 0] 5816 set args [lrange $args 1 end] 5817 } 5818 default break 5819 } 5820 } 5821 5822 if {[llength $args] == 1} { 5823 lassign $args request_args 5824 } elseif {[llength $args] == 0} { 5825 set request_args "-n1 -t1" 5826 } else { 5827 fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args" 5828 } 5829 5830 set command "$srun -Q $request_args printenv SLURMD_NODENAME" 5831 set result [run_command {*}$options $command] 5832 5833 if [dict get $result exit_code] { 5834 return {} 5835 } 5836 5837 set output [dict get $result output] 5838 foreach line [split $output "\n"] { 5839 if {$line eq ""} { continue } 5840 dict incr allocated_nodes $line 5841 } 5842 5843 return [lsort [dict keys $allocated_nodes]] 5844} 5845 5846 5847################################################################ 5848# 5849# NAME 5850# get_partitions - returns a dictionary of dictionaries of partition parameters 5851# 5852# SYNOPSIS 5853# get_partitions ?partition_name? 5854# 5855# DESCRIPTION 5856# Uses `scontrol show partitions` to query partition parameters, 5857# returning a dictionary of dictionaries with the partition names 5858# as keys of the first level dictionary and with the parameters as 5859# keys of the second level dictionary. Specifying an invalid partition 5860# name will result in a failure. 5861# 5862# RETURN VALUE 5863# If the optional partition_name argument is specified, the result will 5864# contain only the one patition. Otherwise, the results for all 5865# partitions will be returned. 5866# 5867################################################################ 5868 5869proc get_partitions { {partition_name ""} } { 5870 global scontrol 5871 5872 set command "$scontrol show partition -o" 5873 if {$partition_name ne ""} { 5874 append command " $partition_name" 5875 } 5876 set output [run_command_output -fail -nolog "$command"] 5877 5878 # Iterate over each partition parameter line 5879 foreach line [split $output "\n"] { 5880 if {$line eq ""} { continue } 5881 5882 # Peel off the partition parameters one at a time 5883 # The first quantifier sets the greediness for the whole RE 5884 while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { 5885 # Remove the consumed parameter from the line 5886 set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] 5887 # Add it to the temporary node dictionary 5888 dict set part_dict $param_name $param_value 5889 } 5890 5891 set part_name [dict get $part_dict "PartitionName"] 5892 5893 # Add the node dictionary to nodes dictionary 5894 dict set parts_dict $part_name $part_dict 5895 5896 # Clear the node dictionary for use by the next node 5897 set part_dict {} 5898 } 5899 5900 return $parts_dict 5901} 5902 5903 5904################################################################ 5905# 5906# NAME 5907# get_partition_param - returns a specific parameter value for a specific partition 5908# 5909# SYNOPSIS 5910# get_partition_param partitoin_name parameter_name 5911# 5912# DESCRIPTION 5913# Returns a specific parameter value for a specified partition if the 5914# parameter exists for the partition, or MISSING if it does not exist. 5915# Specifying an invalid partition name will result in a failure. 5916# 5917################################################################ 5918 5919proc get_partition_param { partition_name parameter_name } { 5920 5921 set partitions_dict [get_partitions $partition_name] 5922 5923 if [dict exists $partitions_dict $partition_name $parameter_name] { 5924 return [dict get $partitions_dict $partition_name $parameter_name] 5925 } else { 5926 return "MISSING" 5927 } 5928} 5929 5930 5931################################################################ 5932# 5933# NAME 5934# get_jobs - returns a dictionary of dictionaries of job parameters 5935# 5936# SYNOPSIS 5937# get_jobs ?job_id? 5938# 5939# DESCRIPTION 5940# Uses `scontrol show job` to return a dictionary of dictionaries of job 5941# parameters. Specifying an invalid job id will result in a failure. 5942# 5943################################################################ 5944 5945proc get_jobs { {job_id_in ""} } { 5946 global scontrol 5947 5948 set command "$scontrol show job -d -o" 5949 if {$job_id_in ne ""} { 5950 append command " $job_id_in" 5951 } 5952 set output [run_command_output -fail "$command"] 5953 5954 # Iterate over each job's parameter list 5955 foreach line [split $output "\n"] { 5956 if {$line eq ""} { continue } 5957 5958 # Peel off the job parameters one at a time 5959 # The first quantifier sets the greediness for the whole RE 5960 while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { 5961 # Remove the consumed parameter from the line 5962 set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] 5963 # Add it to the temporary job dictionary 5964 dict set job_dict $param_name $param_value 5965 } 5966 5967 set job_id [dict get $job_dict "JobId"] 5968 5969 # Add the job dictionary to jobs dictionary 5970 dict set jobs_dict $job_id $job_dict 5971 5972 # Clear the job dictionary for the next job 5973 set job_dict {} 5974 } 5975 5976 return $jobs_dict 5977} 5978 5979 5980################################################################ 5981# 5982# NAME 5983# get_job_param - returns a specific parameter value for a specific job 5984# 5985# SYNOPSIS 5986# get_job_param job_id parameter_name 5987# 5988# DESCRIPTION 5989# Returns a specific parameter value for a specified job if the 5990# parameter exists for the job, or MISSING if it does not exist. 5991# Specifying an invalid job id will result in a failure. 5992# 5993################################################################ 5994 5995proc get_job_param { job_id parameter_name } { 5996 5997 set jobs_dict [get_jobs $job_id] 5998 5999 if [dict exists $jobs_dict $job_id $parameter_name] { 6000 return [dict get $jobs_dict $job_id $parameter_name] 6001 } else { 6002 return "MISSING" 6003 } 6004} 6005 6006proc check_reason { job_id reason } { 6007 global squeue 6008 6009 set found 0 6010 spawn $squeue -j $job_id --noheader -o "%r" 6011 expect { 6012 -re "$reason" { 6013 set found 1 6014 exp_continue 6015 } 6016 timeout { 6017 log_error "squeue not responding" 6018 } 6019 eof { 6020 wait 6021 } 6022 } 6023 6024 if {$found == 0} { 6025 log_error "Job $job_id should have a wait reason of $reason" 6026 } 6027 return $found 6028} 6029 6030 6031################################################################ 6032# 6033# NAME 6034# submit_job - submits a job with sbatch and returns its job id 6035# 6036# SYNOPSIS 6037# submit_job ?options? job_args 6038# 6039# DESCRIPTION 6040# Submits a job with sbatch and returns its jobid, or 0 if error. 6041# It accepts all the options of run_command. 6042# 6043# OPTIONS 6044# It accepts and passes all the options of/to run_command and also: 6045# -env env 6046# Prepend $env to the actual sbatch command to set environment 6047# variables. For example "-env 'SLURM_NTASKS_PER_GPU=2'". 6048# 6049# ARGUMENTS 6050# job_args 6051# a string containing all the arguments to pass to sbatch 6052# 6053# RETURN VALUE 6054# the job id, or 0 if an error happen 6055# 6056################################################################ 6057 6058proc submit_job args { 6059 global sbatch 6060 6061 set env "" 6062 set job_id 0 6063 6064 set idx [lsearch $args -env] 6065 if {$idx >= 0} { 6066 set env [lindex $args [expr $idx+1]] 6067 set args [lreplace $args $idx [expr $idx+1]] 6068 } 6069 6070 if {[llength $args] < 1} { 6071 fail "Wrong number of parameters, should be >=1" 6072 } 6073 set job_options [lindex $args [expr [llength $args] - 1 ]] 6074 set run_options "" 6075 if {[llength $args] > 1} { 6076 set run_options [lrange $args 0 [expr [llength $args] - 2 ]] 6077 } 6078 6079 set output [run_command_output {*}$run_options "$env $sbatch $job_options"] 6080 regexp {Submitted \S+ job (\d+)} $output - job_id 6081 6082 return $job_id 6083} 6084 6085 6086################################################################################ 6087# 6088# NAME 6089# compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so 6090# 6091# SYNOPSIS 6092# compile_against_libslurm ?options? test_prog ?build_args? 6093# 6094# DESCRIPTION 6095# Compile a test program against either libslurm.so or libslurmfull.so. 6096# 6097# OPTIONS 6098# -full 6099# use libslurmfull.so instead of libslurm.so 6100# -shared 6101# produces a shared library (adds the -shared compiler option 6102# and adds a .so suffix to the output file name) 6103# 6104# ARGUMENTS 6105# test_prog 6106# The name of the test program (and .c file) 6107# build_args 6108# Additional string to be appended to the build command. 6109# E.g. "-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o" 6110# (initial space will be added automatically). 6111# 6112################################################################################ 6113 6114proc compile_against_libslurm args { 6115 global slurm_dir bin_cc src_dir build_dir bin_chmod 6116 6117 set use_full false 6118 set shared false 6119 set build_args "" 6120 while {[llength $args]} { 6121 switch -glob -- [lindex $args 0] { 6122 -full {set use_full true; set args [lrange $args 1 end]} 6123 -shared {set shared true; set args [lrange $args 1 end]} 6124 -* {fail "Unknown option: [lindex $args 0]"} 6125 default break 6126 } 6127 } 6128 set argument_count [llength $args] 6129 if {$argument_count < 1} { 6130 fail "Too few arguments ($argument_count): $args" 6131 } else { 6132 lassign $args test_prog 6133 } 6134 if {$argument_count == 2} { set build_args [lindex $args 1] } 6135 if {$argument_count > 2} { 6136 fail "Too many arguments ($argument_count): $args" 6137 } 6138 6139 if {$use_full} { 6140 set libfile "libslurmfull.so" 6141 } else { 6142 set libfile "libslurm.so" 6143 } 6144 6145 if [file exists $slurm_dir/lib64/slurm/$libfile] { 6146 set libdir "lib64" 6147 } else { 6148 set libdir "lib" 6149 } 6150 6151 if {$use_full} { 6152 set libline "$slurm_dir/$libdir/slurm" 6153 set libfile "slurmfull" 6154 } else { 6155 set libline "$slurm_dir/$libdir" 6156 set libfile "slurm" 6157 } 6158 6159 set build_cmd "$bin_cc ${test_prog}.c -g -pthread" 6160 6161 if {$shared} { 6162 set out "${test_prog}.so" 6163 append build_cmd " -fPIC -shared" 6164 } else { 6165 set out "${test_prog}" 6166 } 6167 append build_cmd " -o $out" 6168 6169 append build_cmd " -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$libline -L$libline -l$libfile -lresolv" 6170 6171 # Add additional arguments to the build command 6172 if {$build_args != ""} { 6173 append build_cmd " $build_args" 6174 } 6175 6176 log_debug "Build command: $build_cmd" 6177 catch {exec {*}$build_cmd} out_str out_dict 6178 if {[dict get $out_dict -code]} { 6179 log_error $out_str 6180 return $::RETURN_ERROR 6181 } 6182 6183 exec $bin_chmod 700 $out 6184 return $::RETURN_SUCCESS 6185} 6186 6187 6188################################################################ 6189# 6190# NAME 6191# subtest - tests a boolean condition and updates subtest tallies 6192# 6193# SYNOPSIS 6194# subtest ?options? condition description ?diagnostics? 6195# 6196# DESCRIPTION 6197# Based on the results of testing a boolean expression, increments the 6198# relevant subtest count (pass, fail or skip) and logs a message. 6199# 6200# OPTIONS 6201# -fatal 6202# If the subtest fails, causes a fatal error ending the test 6203# 6204# ARGUMENTS 6205# condition 6206# The boolean expression to test 6207# description 6208# A single-line string describing what is being tested. This is 6209# a subtest "name" that is displayed with the log message 6210# whether the subtest passes or fails 6211# diagnostics 6212# A string providing additional diagnostic information that is 6213# only included in the log message on failure 6214# 6215# RETURN VALUE 6216# the boolean condition evaluated 6217# 6218# ENVIRONMENT 6219# testsuite_subtest_fatal 6220# Specifies whether first failing subtest aborts the test 6221# 6222################################################################ 6223 6224proc subtest args { 6225 6226 set options [list] 6227 while {[llength $args]} { 6228 switch -glob -- [lindex $args 0] { 6229 -fatal - 6230 -fail { 6231 lappend options -fatal 6232 set args [lrange $args 1 end] 6233 } 6234 -* {fail "Unknown option: [lindex $args 0]"} 6235 default break 6236 } 6237 } 6238 set argument_count [llength $args] 6239 if {$argument_count < 2} { 6240 fail "Too few arguments ($argument_count): $args" 6241 } else { 6242 set args [lassign $args condition description] 6243 } 6244 6245 if [uplevel 1 expr [format "{%s}" $condition]] { 6246 subpass $description 6247 return true 6248 } else { 6249 subfail {*}$options $description {*}$args 6250 return false 6251 } 6252} 6253 6254 6255################################################################ 6256# 6257# NAME 6258# _is_testproc_included - returns if testproc_num was included or not excluded in argv 6259# 6260# SYNOPSIS 6261# _is_testproc_included testproc_num 6262# 6263# DESCRIPTION 6264# From command line the test runner can use -i and -e to include or 6265# exclude some test functions by their number. This function must be 6266# used to check if the test runner included or excluded the given 6267# testproc_num. 6268# 6269# RETURN VALUE 6270# true is testproc_num was included with -i or not excluded -e, 6271# false otherwise 6272# 6273################################################################ 6274 6275proc _is_testproc_included {testproc_num} { 6276 global _testproc_included _testproc_excluded 6277 6278 if {[llength $_testproc_included]} { 6279 if {[lsearch $_testproc_included $testproc_num] >= 0} { 6280 return true 6281 } 6282 return false 6283 } 6284 6285 if {[llength $_testproc_excluded]} { 6286 if {[lsearch $_testproc_excluded $testproc_num] >= 0} { 6287 return false 6288 } 6289 } 6290 6291 return true 6292} 6293 6294 6295################################################################ 6296# 6297# NAME 6298# skip_following_testprocs - the following testproc calls will be skipped 6299# 6300# SYNOPSIS 6301# skip_following_testprocs reason 6302# 6303# ARGUMENTS 6304# reason 6305# The string with the reason message to add on the skip message 6306# on each skipped testproc. 6307# 6308# DESCRIPTION 6309# This function disables normal execution of testproc calls. 6310# It is meant to be used when some testprocs cannot be run due config 6311# limitations, but still call testproc to register what testprocs 6312# are skipped for a given reason. 6313# Use run_following_testprocs to reenable the norma execution of testprocs. 6314 6315################################################################ 6316 6317proc skip_following_testprocs {reason} { 6318 global _testproc_skip_next _testproc_skip_reason 6319 6320 set _testproc_skip_next true 6321 set _testproc_skip_reason $reason 6322} 6323 6324 6325################################################################ 6326# 6327# NAME 6328# run_following_testprocs - the following testproc call will be run (if not excluded from command line) 6329# 6330# SYNOPSIS 6331# run_following_testprocs 6332# 6333# DESCRIPTION 6334# This function reenables the normal execution of testproc calls. 6335# It is meant to be used when skip_following_testprocs was called to skip 6336# previous testproc calls, and we want to normally run the following ones. 6337# Note that it does NOT overwrite what -i and -e included are passed 6338# from command line. 6339# 6340################################################################ 6341 6342proc run_following_testprocs {} { 6343 global _testproc_skip_next _testproc_skip_reason 6344 6345 set _testproc_skip_next false 6346 set _testproc_skip_reason "" 6347} 6348 6349 6350################################################################ 6351# 6352# NAME 6353# testproc - launcher to run or skip a testproc_call 6354# 6355# SYNOPSIS 6356# testproc testproc_call 6357# 6358# ARGUMENTS 6359# A testproc_call is any normal call to a proc with any arguments that 6360# could be done normally without the testproc launcher. 6361# For example, we could normally do: 6362# 6363# test_my_feature $some_args $expected_out 6364# 6365# Or use the launcher like: 6366# 6367# testproc test_my_feature $some_args $expected_out 6368# 6369# DESCRIPTION 6370# Using the testproc launcher has the following main benefits: 6371# a) Handles the -i and -e terminal options to include or exclude some 6372# testprocs numbers. 6373# b) Runs or skips based on the last call of testproc_{skip,run}_following. 6374# c) Creates extra sections in the status summary 6375# (see testsuite_testproc_details). 6376# 6377# RETURN VALUE 6378# The rc of the testproc_call if it has been run, or $::RETURN_SUCCESS 6379# otherwise. Using it is not recommended, though. 6380# 6381################################################################ 6382 6383proc testproc args { 6384 global _testproc_pass_list _testproc_skip_list _testproc_fail_list 6385 global _testproc_skip_next _testproc_skip_reason 6386 global _testproc_messages 6387 global _subtest_pass_count 6388 global _subtest_skip_count 6389 global _subtest_fail_count 6390 6391 # Save previous subtest counts and next subtest num 6392 set prev_pass $_subtest_pass_count 6393 set prev_skip $_subtest_skip_count 6394 set prev_fail $_subtest_fail_count 6395 set prev_subtest [expr $_subtest_pass_count + \ 6396 $_subtest_skip_count + \ 6397 $_subtest_fail_count + 1] 6398 6399 # Get the testproc number 6400 set testproc_num [expr [llength $_testproc_pass_list] + \ 6401 [llength $_testproc_skip_list] + \ 6402 [llength $_testproc_fail_list] + 1] 6403 6404 # Run or skip the testproc 6405 set rc $::RETURN_SUCCESS 6406 set reason "" 6407 if {![_is_testproc_included $testproc_num]} { 6408 set reason "(Excluded from command line)" 6409 subskip "Skipping testproc $testproc_num: {$args} $reason" 6410 } else { 6411 if {$_testproc_skip_next} { 6412 set reason "($_testproc_skip_reason)" 6413 subskip "Skipping testproc $testproc_num: {$args} $reason" 6414 } else { 6415 log_info "Running testproc $testproc_num: $args" 6416 set rc [{*}$args] 6417 } 6418 } 6419 6420 # Get current subtest counts 6421 set curr_subtest [expr $_subtest_pass_count + \ 6422 $_subtest_skip_count + \ 6423 $_subtest_fail_count] 6424 6425 # Register the testproc as fail, skip or pass (based on subtests) 6426 if {$_subtest_fail_count > $prev_fail} { 6427 set reason "(Subtests: $prev_subtest to $curr_subtest)" 6428 lappend _testproc_fail_list $testproc_num 6429 dict set _testproc_messages $testproc_num [list failed "{$args} $reason"] 6430 } elseif {$_subtest_skip_count > $prev_skip} { 6431 lappend _testproc_skip_list $testproc_num 6432 dict set _testproc_messages $testproc_num [list skipped "{$args} $reason"] 6433 } else { 6434 lappend _testproc_pass_list $testproc_num 6435 dict set _testproc_messages $testproc_num [list passed "{$args}"] 6436 } 6437 6438 return $rc 6439} 6440 6441 6442################################################################ 6443# 6444# NAME 6445# _log_format - prints a log message with colorization and formatting 6446# 6447# SYNOPSIS 6448# _log_format log_level message 6449# 6450# DESCRIPTION 6451# This procedure is called by the log_<level> procedures and 6452# derives the relevant log level from the caller's procedure name. 6453# 6454# ARGUMENTS 6455# log_level 6456# The logging threshold that triggered the log statement 6457# message 6458# The message to print with colorization and formatting 6459# 6460# ENVIRONMENT 6461# testsuite_log_format 6462# Used as the template for the fields to be output. 6463# Fields must be expressed in the form: 6464# %{<field_name>}<format_conversion_specifier> 6465# Supported fields include the following: 6466# message 6467# The log message 6468# filename 6469# The file name where the log_<log_level> 6470# procedure was called from 6471# lineno 6472# The line number where the log_<log_level> 6473# procedure was called from 6474# timestamp 6475# The date and time when the log_<log_level> 6476# procedure was called at 6477# msecs 6478# The milliseconds when the log_<log_level> 6479# procedure was called at 6480# loglevel 6481# The log level that triggers the log_<log_level> 6482# procedure to be called 6483# backtrace 6484# An abbreviated call stack trace with line 6485# numbers 6486# testsuite_time_format 6487# Used as a template for the timestamp. See the format groups 6488# for the tcl clock format command. 6489# testsuite_colorize 6490# Boolean that turns colorization on or off 6491# testsuite_color_<log_level> 6492# Can be set to define the color used for each log level 6493# 6494################################################################ 6495 6496proc _log_format { log_level message } { 6497 global testsuite_colorize testsuite_log_format testsuite_time_format 6498 global COLOR_NONE 6499 global testsuite_color_fatal testsuite_color_error testsuite_color_warn 6500 global testsuite_color_info testsuite_color_pass testsuite_color_command 6501 global testsuite_color_debug testsuite_color_trace 6502 6503 set format_string $testsuite_log_format 6504 set milliseconds_since_epoch [clock milliseconds] 6505 set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format "$testsuite_time_format"] 6506 set milliseconds [expr {$milliseconds_since_epoch % 1000}] 6507 6508 set frame_level -2 6509 while { [dict get [info frame $frame_level] type] != "source" } { 6510 incr frame_level -1 6511 } 6512 6513 set format_args {} 6514 while {[regexp "%{\[a-z]+}" $format_string format_field]} { 6515 if {$format_field eq "%{message}"} { 6516 lappend format_args $message 6517 } elseif {$format_field eq "%{filename}"} { 6518 lappend format_args [file tail [dict get [info frame $frame_level] file]] 6519 } elseif {$format_field eq "%{lineno}"} { 6520 lappend format_args [dict get [info frame $frame_level] line] 6521 } elseif {$format_field eq "%{timestamp}"} { 6522 lappend format_args $date_time 6523 } elseif {$format_field eq "%{msecs}" || $format_field eq "%{milliseconds}"} { 6524 lappend format_args $milliseconds 6525 } elseif {$format_field eq "%{loglevel}" || $format_field eq "%{levelname}"} { 6526 lappend format_args [string totitle $log_level] 6527 } elseif {$format_field eq "%{backtrace}"} { 6528 lappend format_args [_line_trace] 6529 } else { 6530 fail "Invalid field ($format_field) specified in testsuite_log_format" 6531 } 6532 regsub $format_field $format_string "%" format_string 6533 } 6534 6535 if ($testsuite_colorize) { 6536 switch $log_level { 6537 fatal { append output $testsuite_color_fatal } 6538 error { append output $testsuite_color_error } 6539 warning { append output $testsuite_color_warn } 6540 info { append output $testsuite_color_info } 6541 pass { append output $testsuite_color_pass } 6542 command { append output $testsuite_color_command } 6543 debug { append output $testsuite_color_debug } 6544 trace { append output $testsuite_color_trace } 6545 } 6546 } 6547 append output [format $format_string {*}$format_args] 6548 if ($testsuite_colorize) { 6549 append output $COLOR_NONE 6550 } 6551 puts $output 6552} 6553 6554 6555################################################################ 6556# 6557# NAME 6558# _print_header - prints a test header 6559# 6560# SYNOPSIS 6561# _print_header 6562# 6563# ENVIRONMENT 6564# testsuite_colorize 6565# Boolean that turns colorization on or off 6566# testsuite_color_header 6567# Can be set to define the color used for the header 6568# 6569################################################################ 6570 6571proc _print_header { } { 6572 global test_name testsuite_color_header testsuite_colorize COLOR_NONE 6573 6574 if ($testsuite_colorize) { 6575 append output $testsuite_color_header 6576 } 6577 append output [string repeat = 78]\n 6578 append output [format "%-9s" "TEST:"]${test_name}\n 6579 append output [string repeat = 78] 6580 if ($testsuite_colorize) { 6581 append output $COLOR_NONE 6582 } 6583 puts $output 6584} 6585 6586 6587################################################################ 6588# 6589# NAME 6590# _print_summary - prints the final status summary 6591# 6592# SYNOPSIS 6593# _print_summary status completed 6594# 6595# ARGUMENTS 6596# status 6597# The final status of the test. 6598# When status is zero, we print SUCCESS. 6599# When status is negative, we print SKIPPED. 6600# When status is positive, we print FAILURE. 6601# 6602# completed 6603# A boolean value that is true if the test completed and false 6604# if aborted (ended early with exit status != 0) 6605# 6606# ENVIRONMENT 6607# testsuite_colorize 6608# Boolean that turns colorization on or off 6609# testsuite_color_<test_status> 6610# Can be set to define the color used for each test status 6611# 6612################################################################ 6613 6614proc _print_summary {status completed} { 6615 global test_name testsuite_colorize COLOR_NONE 6616 global testsuite_color_success testsuite_color_skipped 6617 global testsuite_color_failure 6618 global _subtest_fail_count _subtest_pass_count _subtest_skip_count 6619 global _testproc_pass_list _testproc_skip_list _testproc_fail_list 6620 global _incomplete_reason 6621 global _subtest_messages _testproc_messages 6622 global testsuite_subtest_details testsuite_testproc_details 6623 6624 if {$status == 0} { 6625 set color $testsuite_color_success 6626 set header "SUCCESS" 6627 } elseif {$status < 0} { 6628 set color $testsuite_color_skipped 6629 set header "SKIPPED" 6630 } elseif {$status > 0} { 6631 set color $testsuite_color_failure; 6632 set header "FAILURE" 6633 } 6634 6635 if ($testsuite_colorize) { 6636 append output $color 6637 } 6638 6639 append output [string repeat = 78]\n 6640 6641 # Get subtest and testproc counts 6642 set testproc_fail [llength $_testproc_fail_list] 6643 set testproc_skip [llength $_testproc_skip_list] 6644 set testproc_pass [llength $_testproc_pass_list] 6645 set testproc_count [expr $testproc_pass + \ 6646 $testproc_skip + \ 6647 $testproc_fail] 6648 set subtest_count [expr $_subtest_fail_count + \ 6649 $_subtest_pass_count + \ 6650 $_subtest_skip_count] 6651 6652 # Initial summary 6653 append output [format "%s : %s\n" $header $test_name] 6654 if {$testproc_count > 0} { 6655 append output [format " Testprocs failed : %3d (%3d%%)%s\n" $testproc_fail [expr $testproc_fail * 100 / $testproc_count] \ 6656 [expr {$testproc_fail ? " List: [join $_testproc_fail_list ,]" : ""}]] 6657 append output [format " Testprocs skipped : %3d (%3d%%)%s\n" $testproc_skip [expr $testproc_skip * 100 / $testproc_count] \ 6658 [expr {$testproc_skip ? " List: [join $_testproc_skip_list ,]" : ""}]] 6659 append output [format " Testprocs passed : %3d (%3d%%)\n" $testproc_pass [expr $testproc_pass * 100 / $testproc_count]] 6660 append output [format " Testprocs total : %3d %s\n" $testproc_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]] 6661 } 6662 if {$subtest_count > 0} { 6663 if {$testproc_count > 0} { 6664 append output \n 6665 } 6666 append output [format " Subtests failed : %3d (%3d%%)\n" $_subtest_fail_count [expr $_subtest_fail_count * 100 / $subtest_count]] 6667 append output [format " Subtests skipped : %3d (%3d%%)\n" $_subtest_skip_count [expr $_subtest_skip_count * 100 / $subtest_count]] 6668 append output [format " Subtests passed : %3d (%3d%%)\n" $_subtest_pass_count [expr $_subtest_pass_count * 100 / $subtest_count]] 6669 append output [format " Subtests total : %3d %s\n" $subtest_count [expr {$completed ? "COMPLETE" : "INCOMPLETE: $_incomplete_reason"}]] 6670 } 6671 append output [string repeat = 78]\n 6672 6673 # Detailed information 6674 if {$testsuite_subtest_details ne "none"} { 6675 set show_details false 6676 if {$testsuite_subtest_details eq "all" && $subtest_count > 0} { 6677 set show_details true 6678 } elseif {$testsuite_subtest_details eq "fail_skip" && $_subtest_skip_count > 0} { 6679 set show_details true 6680 } elseif {$_subtest_fail_count > 0} { 6681 set show_details true 6682 } 6683 6684 if {$show_details} { 6685 append output [format "SUBTESTS DETAILS : %s\n" $test_name] 6686 dict for {id result_msg} $_subtest_messages { 6687 set result [lindex $result_msg 0] 6688 if {$testsuite_subtest_details eq "all"} { 6689 append output [format " %s\n" [lindex $result_msg 1]] 6690 } elseif {$testsuite_subtest_details eq "fail_skip" && $result eq "skip"} { 6691 append output [format " %s\n" [lindex $result_msg 1]] 6692 } elseif {$result eq "fail"} { 6693 append output [format " %s\n" [lindex $result_msg 1]] 6694 } 6695 } 6696 append output [string repeat = 78]\n 6697 } 6698 } 6699 if {$testsuite_testproc_details ne "none"} { 6700 set show_details false 6701 if {$testsuite_testproc_details eq "all" && $testproc_count > 0} { 6702 set show_details true 6703 } elseif {$testsuite_testproc_details eq "fail_skip" && $testproc_skip > 0} { 6704 set show_details true 6705 } elseif {$testproc_fail > 0} { 6706 set show_details true 6707 } 6708 6709 if {$show_details} { 6710 append output [format "TESTPROCS DETAILS : %s\n" $test_name] 6711 dict for {num result_msg} $_testproc_messages { 6712 set result [lindex $result_msg 0] 6713 if {$testsuite_testproc_details eq "all"} { 6714 append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] 6715 } elseif {$testsuite_testproc_details eq "fail_skip" && $result eq "skipped"} { 6716 append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] 6717 } elseif {$result eq "failed"} { 6718 append output [format " Testproc %2d %-7s : %s\n" $num [lindex $result_msg 0] [lindex $result_msg 1]] 6719 } 6720 } 6721 append output [string repeat = 78]\n 6722 } 6723 } 6724 6725 if ($testsuite_colorize) { 6726 append output $COLOR_NONE 6727 } 6728 6729 puts -nonewline $output 6730} 6731 6732 6733################################################################ 6734# 6735# NAME 6736# _get_test_name - gets the name of the invoking source script 6737# 6738# SYNOPSIS 6739# _get_test_name 6740# 6741# RETURN VALUE 6742# The name of the originally called script 6743# 6744################################################################ 6745 6746proc _get_test_name { } { 6747 6748 set test_name unknown 6749 set frame_level 1 6750 while { $frame_level <= [info frame] } { 6751 if { [dict get [info frame $frame_level] type] == "source" } { 6752 set test_name [file tail [dict get [info frame $frame_level] file]] 6753 break 6754 } 6755 incr frame_level 6756 } 6757 6758 return $test_name 6759} 6760 6761 6762################################################################ 6763# 6764# NAME 6765# _test_cleanup - performs the test cleanup 6766# 6767# SYNOPSIS 6768# _test_cleanup 6769# 6770# DESCRIPTION 6771# This procedure removes the temporary test_dir and calls the 6772# test-defined cleanup procedure. 6773# 6774# NOTES 6775# This function should be called only from _test_init and _test_fini. 6776# 6777################################################################ 6778 6779proc _test_cleanup {} { 6780 global log_warn test_dir 6781 6782 set rc 0 6783 6784 # Call global cleanup procedure if it is defined by the test 6785 if {[info procs cleanup] eq "cleanup"} { 6786 if {[catch {cleanup} cleanup_error ]} { 6787 log_error "Cleanup had errors: $cleanup_error" 6788 set rc 1 6789 } 6790 } 6791 6792 # Remove the temporary test directory 6793 exec rm -rf $test_dir 6794 6795 return $rc 6796} 6797 6798 6799################################################################ 6800# 6801# NAME 6802# _test_init - performs test initialization 6803# 6804# SYNOPSIS 6805# _test_init 6806# 6807# DESCRIPTION 6808# This procedure is called automatically at the beginning of each test. 6809# It prints the header, creates the temporary test dir, etc. 6810# 6811################################################################ 6812 6813proc _test_init {} { 6814 global test_dir test_id test_name testsuite_shared_dir 6815 global _testproc_included _testproc_excluded argv 6816 6817 # parse argv to get and remove _testproc_included and _testproc_excluded params 6818 set idx [expr {[info exists argv] ? [lsearch $argv -i] : -1}] 6819 if {$idx >= 0} { 6820 set _testproc_included [split [lindex $argv [expr $idx + 1]] ,] 6821 set argv [lreplace $argv $idx [expr $idx + 1]] 6822 } 6823 set idx [expr {[info exists argv] ? [lsearch $argv -e] : -1}] 6824 if {$idx >= 0} { 6825 set _testproc_excluded [split [lindex $argv [expr $idx + 1]] ,] 6826 set argv [lreplace $argv $idx [expr $idx + 1]] 6827 } 6828 6829 # Set test name to name of originally invoked test script, e.g. test1.1 6830 set test_name [_get_test_name] 6831 6832 # Set test id to suffix of the test script, e.g. 1.1 6833 set test_id [string map {test ""} $test_name] 6834 6835 # Temporary test directory used to stash saved configs, output files... 6836 set test_dir "$testsuite_shared_dir/${test_name}dir" 6837 6838 # Print test header 6839 _print_header 6840 6841 # Cleanup in case test was not cleaned up on last execution 6842 if { [_test_cleanup] } { 6843 fail "Error in the initial cleanup" 6844 } 6845 6846 # Create temporary shared test directory 6847 exec mkdir -p $test_dir 6848} 6849 6850 6851################################################################ 6852# 6853# NAME 6854# _test_fini - performs test finalization 6855# 6856# SYNOPSIS 6857# _test_fini ?status? 6858# 6859# DESCRIPTION 6860# This procedure is called automatically from the ending functions 6861# pass, skip and failure. 6862# It cleans up based on the status and the testsuite_cleanup_on_failure 6863# variable, prints the final test status/summary, and exits the test. 6864# 6865# We will always cleanup for SUCCESS or SKIPPED tests. 6866# Whether or not the cleanup procedure is called for FAILURE tests 6867# depends on the setting of the $testsuite_cleanup_on_failure variable 6868# which can be set in the globals.local file or overridden with the 6869# SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable. 6870# 6871################################################################ 6872 6873proc _test_fini { status } { 6874 global testsuite_cleanup_on_failure _test_fini_called 6875 global _subtest_fail_count _subtest_skip_count 6876 6877 # Avoid potential infinite recursive calls. 6878 # _test_fini should be called only once, but custom cleanup procs 6879 # called from _test_cleanup can potentially call it (eg fail) 6880 if {$_test_fini_called} { 6881 log_debug "Recursive _test_fini call detected, most probably a fail on a cleanup function" 6882 return 6883 } 6884 set _test_fini_called true 6885 6886 6887 # Determine if test completed or was aborted 6888 set completed [expr $status == 0 ? true : false] 6889 6890 # Override status with subtest status if available and necessary 6891 if {$status < 1} { 6892 if {$_subtest_fail_count > 0} { 6893 set status 1 6894 } elseif {$_subtest_skip_count > 0} { 6895 set status -1 6896 } 6897 } 6898 6899 # Only cleanup if test not failed or configured to do so 6900 if {$status != 1 || $testsuite_cleanup_on_failure} { 6901 _test_cleanup 6902 } 6903 _print_summary $status $completed 6904 6905 __exit $status 6906} 6907 6908 6909################################################################ 6910# 6911# Overload the exit routine to ensure that no one is explicitly 6912# calling it, and to enforce _test_fini if exit is called when 6913# the test reach its EOF. 6914# 6915# All tests should exit using pass, skip or fail. 6916# 6917################################################################ 6918 6919rename exit __exit 6920 6921proc exit { {status 0} } { 6922 global test_name 6923 6924 # To avoid failures when using "expect -c 'source globals'" 6925 if {$test_name eq "globals"} { 6926 pass 6927 } 6928 6929 # 6930 # Minor sanity check to detect if exit was explicitly called (not 6931 # allowed) or automatically executed when the test ends 6932 # 6933 if {[info level] > 1} { 6934 # exit was called from a function, and it shouldn't 6935 fail "Exit should not be directly called, use pass, skip or fail instead" 6936 } else { 6937 set frame_level 1 6938 while { $frame_level <= [info frame] } { 6939 if { [dict get [info frame $frame_level] type] == "source" } { 6940 if { [file tail [dict get [info frame $frame_level] file]] eq $test_name } { 6941 # exit was called explicitly from the 6942 # test, and it shouldn't 6943 fail "Exit should not be directly called, use pass, skip or fail instead" 6944 } 6945 break 6946 } 6947 incr frame_level 6948 } 6949 } 6950 6951 if {$status != 0} { 6952 fail "Some Expect/TCL exception happen" 6953 } 6954 6955 # The exit was called implicitly when the test ends, allowed but 6956 # _test_fini call enforced 6957 pass 6958} 6959 6960 6961# Call _test_init at the beginning of each test 6962_test_init 6963 6964