#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test of sinfo cpu total and allocated
############################################################################
# Copyright (C) 2009 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Joseph Donaghy <donaghy1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set file_in     "test$test_id.in"
set exit_code   0
set pnumsc      0
set pnumsi      0
set aprocsc     0
set aprocsi     0
set inode_found 0
set inode_name  ""
set smallest    1
set layout      "static"
set select_type ""

# Run "scontrol show node <node>" inside the interactive shell of the
# caller's spawned process and verify the reported CPUAlloc value.
#
# node     - name of the node to query
# proc_cnt - number of CPUs expected to be allocated on that node
#
# Returns 0 when CPUAlloc was seen and equals proc_cnt, 1 otherwise
# (mismatch, no CPUAlloc in the output, or timeout).
proc scontrol_test { node proc_cnt } {
	global scontrol number test_prompt
	upvar spawn_id spawn_id
	# The timeout branch kills the caller's spawned process, so its pid
	# has to be linked in from the caller's frame just like spawn_id.
	upvar mypid mypid

	set found 0
	set rc    0

	send "$scontrol show node $node\r"
	expect {
		-re "CPUAlloc=($number)" {
			set num_alloc $expect_out(1,string)
			set found 1
			if {$proc_cnt != $num_alloc} {
				log_error "Requested $proc_cnt but got $num_alloc instead"
				set rc 1
			}
			exp_continue
		}
		-re $test_prompt {
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $mypid
			return 1
		}
		eof {
			wait
		}
	}

	if {!$found} {
		log_error "Didn't get expected output from scontrol"
		set rc 1
	}

	return $rc
}

# Run sinfo with the "%C %A %N" format for one node inside the caller's
# interactive shell and verify the allocated, idle and total CPU counts.
# A trailing "K" on any count is treated as a multiplier of 1024.
#
# node        - name of the node to query
# proc_cnt    - expected number of allocated CPUs
# total_procs - expected total number of CPUs
# idle_cpus   - expected number of idle CPUs
#
# Returns 0 when the expected line was seen and all three counts match,
# 1 otherwise (mismatch, no matching output, or timeout).
proc sinfo_test_1 { node proc_cnt total_procs idle_cpus } {
	global float number test_prompt sinfo slash
	upvar spawn_id spawn_id
	# Needed by the timeout branch; mypid lives in the caller's frame.
	upvar mypid mypid

	set found 0
	set rc    0

	send "$sinfo -o \"%C %A %N\" -h -n $node\r"
	expect {
		-re "($float)(K?)($slash)($float)(K?)($slash)($float)(K?)($slash)($float)(K?) ($number)($slash)($number) $node" {
			set found 1
			# Capture groups: 1/2 allocated, 4/5 idle, 7/8 other,
			# 10/11 total (value / optional K suffix).
			set num_alloc $expect_out(1,string)
			if {$expect_out(2,string) ne ""} {
				set num_alloc [expr {$num_alloc * 1024}]
			}
			set num_idle $expect_out(4,string)
			if {$expect_out(5,string) ne ""} {
				set num_idle [expr {$num_idle * 1024}]
			}
			set num_other $expect_out(7,string)
			if {$expect_out(8,string) ne ""} {
				set num_other [expr {$num_other * 1024}]
			}
			set num_total $expect_out(10,string)
			if {$expect_out(11,string) ne ""} {
				set num_total [expr {$num_total * 1024}]
			}

			if {$num_alloc != $proc_cnt} {
				log_error "sinfo 1 allocated cpus wrong, got $num_alloc but needed $proc_cnt"
				set rc 1
			} elseif {$num_idle != $idle_cpus} {
				log_error "sinfo 1 idle cpus wrong, got $num_idle but needed $idle_cpus"
				set rc 1
			} elseif {$num_total != $total_procs} {
				log_error "sinfo 1 total cpus wrong, got $num_total but needed $total_procs"
				set rc 1
			}
			exp_continue
		}
		-re $test_prompt {
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $mypid
			return 1
		}
		eof {
			wait
		}
	}

	if {!$found} {
		log_error "Didn't get expected output from sinfo"
		set rc 1
	}

	return $rc
}

# Run sinfo with the "%t %D %N" format for one node inside the caller's
# interactive shell and verify the node counts: exactly one node must be
# reported in the "alloc" or "mix" state and none in the "idle" state.
# A trailing "K" on a count is treated as a multiplier of 1024.
#
# node        - name of the node to query
# proc_cnt    - unused, kept for interface compatibility
# total_procs - unused, kept for interface compatibility
#
# Returns 0 when the counts match, 1 on mismatch or timeout.
proc sinfo_test_2 { node proc_cnt total_procs } {
	global sinfo number test_prompt
	upvar spawn_id spawn_id
	# Needed by the timeout branch; mypid lives in the caller's frame.
	upvar mypid mypid

	set rc          0
	set num_alloc   0
	set num_idle    0
	set alloc_nodes 1
	set total_nodes 1

	set idle_nodes [expr {$total_nodes - $alloc_nodes}]

	send "$sinfo -o \"%t %D %N\" -h -n $node\r"
	expect {
		-re "alloc ($number)(K?) $node" {
			set num_alloc $expect_out(1,string)
			if {$expect_out(2,string) ne ""} {
				# Scale the value just parsed (the original scaled
				# an out-of-scope, unrelated variable).
				set num_alloc [expr {$num_alloc * 1024}]
			}
			exp_continue
		}
		-re "idle ($number)(K?) $node" {
			set num_idle $expect_out(1,string)
			if {$expect_out(2,string) ne ""} {
				set num_idle [expr {$num_idle * 1024}]
			}
			exp_continue
		}
		-re "mix ($number)(K?) $node" {
			set num_alloc $expect_out(1,string)
			if {$expect_out(2,string) ne ""} {
				set num_alloc [expr {$num_alloc * 1024}]
			}
			exp_continue
		}
		-re $test_prompt {
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $mypid
			return 1
		}
		eof {
			wait
		}
	}

	if {$num_alloc != $alloc_nodes} {
		log_error "sinfo 2 allocated nodes wrong, got $num_alloc but needed $alloc_nodes"
		set rc 1
	} elseif {$num_idle != $idle_nodes} {
		log_error "sinfo 2 idle nodes wrong, got $num_idle but needed $idle_nodes"
		set rc 1
	}

	return $rc
}

# Allocate proc_cnt CPUs on the given node with salloc, run the scontrol
# and sinfo checks from inside the allocation's shell, then exit the shell
# so the allocation is released.
#
# node        - node to allocate on (passed to salloc -w)
# proc_cnt    - number of CPUs to request (salloc -n)
# total_procs - total number of CPUs on the node
#
# Returns 0 when every check passed, 1 otherwise.  Calls fail if salloc
# itself stops responding.
proc allocate_and_quit { node proc_cnt total_procs } {
	global bin_bash salloc number test_prompt test_id bin_rm
	global reset_bash_prompt

	set file_in   "test$test_id.input"
	set job_id    0
	set rc        0
	set timeout   60
	set idle_cpus [expr {$total_procs - $proc_cnt}]

	make_bash_script $file_in "
		$reset_bash_prompt
		$bin_bash --norc
	"

	set mypid [spawn $salloc -w $node -N 1 -t 5 -n $proc_cnt ./$file_in]
	expect {
		-re "Granted job allocation ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re $test_prompt {
			# Check, in order and stopping at the first failure:
			#  - scontrol reports the correct cpu count
			#  - sinfo reports the correct cpu counts
			#  - sinfo reports the correct node counts
			# On failure we only record it and still leave the shell
			# normally, so the process is reaped and the script file
			# is removed below.
			if {[scontrol_test $node $proc_cnt]
			    || [sinfo_test_1 $node $proc_cnt $total_procs $idle_cpus]
			    || [sinfo_test_2 $node $proc_cnt $total_procs]} {
				set rc 1
			}
			send "exit\r"
			exp_continue
		}
		-re "Unable to contact" {
			log_error "Slurm appears to be down"
			set rc 1
			exp_continue
		}
		timeout {
			cancel_job $job_id
			slow_kill $mypid
			fail "salloc not responding"
		}
		eof {
			wait
		}
	}
	exec $bin_rm -f $file_in

	return $rc
}

############################################################################
# test starts here
############################################################################

set select_type_parameters [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_parameters "CR_ONE_TASK_PER_CORE"]} {
	skip "This test is incompatible SelectTypeParameters=CR_ONE_TASK_PER_CORE"
}

# find the default partition
set def_part [default_partition]

# find the nodes in the default partition; only the "name[lo-hi]" style
# of node list is supported by this test
set def_node_name ""
spawn $sinfo -h -o "=%N=" -p $def_part
expect {
	-re "=($re_word_str).($number)-($number).=" {
		set def_node_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sinfo not responding"
	}
	eof {
		wait
	}
}
if {$def_node_name eq ""} {
	skip "Node name format not supported for this test"
}

# collect the partition's host list and whether it is exclusive
log_user 0
set def_hostlist   ""
set part_exclusive 0
spawn $scontrol show part $def_part
expect {
	-re " Nodes=($re_word_str)" {
		set def_hostlist $expect_out(1,string)
		exp_continue
	}
	-re " BasePartitions=($re_word_str)" {
		set def_hostlist $expect_out(1,string)
		exp_continue
	}
	-re " OverSubscribe=EXCLUSIVE" {
		set part_exclusive 1
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}

# expand the host list into individual host names
set host_cnt 0
spawn $scontrol show hostnames $def_hostlist
expect {
	-re "($re_word_str)" {
		set host_name($host_cnt) $expect_out(1,string)
		incr host_cnt
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}
log_user 1
if {$host_cnt == 0} {
	fail "Could not find any nodes in default partition"
}

# find me an idle node in default partition
log_user 0
set inode_name             ""
set inode_cores_per_socket 0
set inode_procs            0
set units                  ""
set inode_sockets          0
set inode_threads_per_core 0

# Read "scontrol --oneliner show node" line by line and stop at the first
# node whose state is IDLE (or IDLE with a CLOUD flag), capturing its
# topology.  No expect matching happens here, so no exp_internal tracing.
set fd [open "|$scontrol --oneliner show node $def_hostlist"]
while {[gets $fd line] != -1} {
	if {[regexp {NodeName=(\w+).*CoresPerSocket=(\d+).*CPUTot=(\d+)(K?).*Sockets=(\d+).*State=IDLE ThreadsPerCore=(\d+)} $line frag inode_name inode_cores_per_socket inode_procs units inode_sockets inode_threads_per_core] == 1} {
		break
	}
	if {[regexp {NodeName=(\w+).*CoresPerSocket=(\d+).*CPUTot=(\d+)(K?).*Sockets=(\d+).*State=IDLE.CLOUD ThreadsPerCore=(\d+)} $line frag inode_name inode_cores_per_socket inode_procs units inode_sockets inode_threads_per_core] == 1} {
		break
	}
}
if {$units ne ""} {
	set inode_procs [expr {$inode_procs * 1024}]
}
# closing a command pipeline can raise if the child exited non-zero or
# was cut short by the early break above; that is not a test failure
catch {close $fd}

log_user 1

if {!$inode_procs} {
	fail "Couldn't find an idle node in the default partition"
}

log_debug "Found idle node $inode_name with $inode_procs processors"

# figure out the select plugin we are using, and from it the smallest
# number of CPUs that can be allocated on the node
if {[check_config_select "linear"]} {
	set smallest $inode_procs
} elseif {$part_exclusive == 1} {
	set smallest $inode_procs
} elseif {[param_contains $select_type_parameters "CR_CPU*"]
	  || [param_contains $select_type_parameters "NONE"]
	  || [param_contains $select_type_parameters "CR_CORE*"]} {
	set smallest $inode_threads_per_core
} elseif {[param_contains $select_type_parameters "CR_SOCKET*"]} {
	set smallest [expr {$inode_cores_per_socket * $inode_threads_per_core}]
} else {
	log_warn "Failed to parse SelectTypeParameters '$select_type_parameters'"
	set smallest $inode_procs
}

set exit_code [allocate_and_quit $inode_name $smallest $inode_procs]
if {!$exit_code && $smallest != $inode_procs} {
	# just to make sure we get a clean state we will sleep a bit
	sleep 1
	set exit_code [allocate_and_quit $inode_name $inode_procs $inode_procs]
}
if {$exit_code != 0} {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}