1#!/usr/bin/perl 2############################################################################### 3# 4# sjstat - List attributes of jobs under SLURM control 5# 6############################################################################### 7# Copyright (C) 2007 The Regents of the University of California. 8# Copyright (C) 2008-2009 Lawrence Livermore National Security. 9# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 10# Written by Phil Eckert <eckert21@llnl.gov>. 11# CODE-OCEC-09-009. All rights reserved. 12# 13# This file is part of SLURM, a resource management program. 14# For details, see <https://slurm.schedmd.com/>. 15# Please also read the included file: DISCLAIMER. 16# 17# SLURM is free software; you can redistribute it and/or modify it under 18# the terms of the GNU General Public License as published by the Free 19# Software Foundation; either version 2 of the License, or (at your option) 20# any later version. 21# 22# In addition, as a special exception, the copyright holders give permission 23# to link the code of portions of this program with the OpenSSL library under 24# certain conditions as described in each individual source file, and 25# distribute linked combinations including the two. You must obey the GNU 26# General Public License in all respects for all of the code used other than 27# OpenSSL. If you modify file(s) with this exception, you may extend this 28# exception to your version of the file(s), but you are not obligated to do 29# so. If you do not wish to do so, delete this exception statement from your 30# version. If you delete this exception statement from all source files in 31# the program, then also delete it here. 32# 33# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY 34# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 35# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 36# details. 
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Based off code with permission copyright 2006, 2007 Cluster Resources, Inc.
###############################################################################

#
# Man page stuff.
#
BEGIN {
	# Just dump the man page in *roff format and exit if --roff specified.
	foreach my $arg (@ARGV) {
		if ($arg eq "--") {
			last;
		} elsif ($arg eq "--roff") {
			use Pod::Man;
			my $parser = Pod::Man->new (section => 1);
			$parser->parse_from_file($0, \*STDOUT);
			exit 0;
		}
	}
}

use strict;
use Getopt::Long 2.24 qw(:config no_ignore_case);
use autouse 'Pod::Usage' => qw(pod2usage);

#
# Global Variables.
#
	my ($help, $man, $pool, $running, $verbose);
	my (%MaxNodes, %MaxTime);

#
# Check SLURM status.
#
	isslurmup();

#
# Get user options.
#
	get_options();

#
# Get partition information from scontrol, used
# currently in conjunction with the sinfo data.
#
	do_scontrol_part();

#
# Get and display the sinfo data.
#
	do_sinfo();

#
# If the -c option was entered, stop here.
#
	exit if ($pool);

#
# Get and display the squeue data.
#
	do_squeue();

	exit;


#
# Gather partition/node data from sinfo and print the
# "Scheduling pool data" table.  In verbose mode (-v) the table also
# shows the MaxNodes/MaxTime limits gathered by do_scontrol_part().
#
sub do_sinfo
{

	my (@s_part, @s_mem, @s_cpu, @s_feat, @s_active, @s_idle,
	    @s_out, @s_total, @s_usable);
#
#	Get the partition and node info.  The %F field expands to the
#	"active/idle/out/total" node-count summary parsed below.
#
	my $options = "\"%9P %7m %.4c %.22F %f\"";

	my $ct = 0;
	my @sin = `sinfo -e -o $options`;
	foreach my $tmp (@sin) {
		next if ($tmp =~ /^PARTITION/);	# Skip the header line.
		chomp $tmp;
		my @line = split(' ',$tmp);
		$s_part[$ct] = $line[0];
		$s_mem[$ct]  = $line[1];
		$s_cpu[$ct]  = $line[2];
#
#		Split the active/idle/out/total status summary into
#		its various components.
#
		my @fields = split(/\//, $line[3]);
		$s_active[$ct] = $fields[0];	# (active count currently unused)
		$s_idle[$ct]   = $fields[1];
		$s_out[$ct]    = $fields[2];
		$s_total[$ct]  = $fields[3];

		$s_usable[$ct] = $s_total[$ct] - $s_out[$ct];

#		Pad the features field and drop "(null)" placeholders.
		$s_feat[$ct] = ($line[4] .= " ");
		$s_feat[$ct] =~ s/\(null\)//g;
		$ct++;
	}

	printf("\nScheduling pool data:\n");
	if ($verbose) {
		printf("----------------------------------------------------------------------------------\n");
		printf("                           Total  Usable   Free   Node       Time Other\n");
		printf("Pool         Memory  Cpus  Nodes   Nodes  Nodes  Limit      Limit traits\n");
		printf("----------------------------------------------------------------------------------\n");
	} else {
		printf("-------------------------------------------------------------\n");
		printf("Pool         Memory  Cpus  Total Usable   Free Other Traits\n");
		printf("-------------------------------------------------------------\n");
	}

	for (my $i = 0; $i < $ct; $i++) {
		if ($verbose) {
#			The limits hashes are keyed without the default-partition
#			"*" marker sinfo appends to the partition name.
			my $p = $s_part[$i];
			$p =~ s/\*//;
			printf("%-9s %7dMb %5s %6s %7s %6s %6s %10s %-s\n",
				$s_part[$i], $s_mem[$i], $s_cpu[$i],
				$s_total[$i], $s_usable[$i],
				$s_idle[$i], $MaxNodes{$p},
				$MaxTime{$p}, $s_feat[$i]);
		} else {
			printf("%-9s %7dMb %5s %6s %6s %6s %-s\n",
				$s_part[$i], $s_mem[$i], $s_cpu[$i],
				$s_total[$i], $s_usable[$i],
				$s_idle[$i], $s_feat[$i]);
		}
	}
	printf("\n");

	return;
}


#
# Gather the job list from squeue and print the "Running job data"
# table.  Honors -r (skip pending jobs) and -v (add time limit and
# start time columns).
#
sub do_squeue
{

	my (@s_job, @s_user, @s_nodes, @s_status, @s_begin, @s_limit,
	    @s_pool, @s_used, @s_master);
#
#	Base options on whether this cluster is node or process
#	scheduled: if the consumable-resources plugin (cons_res) is
#	configured, report processor counts (%C) instead of node
#	counts (%D).
#
	my ($type, $options);
	my $rval = system("scontrol show config | grep cons_res > /dev/null");
	if ($rval) {
		$type    = "Nodes";
		$options = "\"%8i %8u %.6D %2t %S %.12l %.9P %.11M %1000R\"";
	} else {
		$type    = "Procs";
		$options = "\"%8i %8u %.6C %2t %S %.12l %.9P %.11M %1000R\"";
	}

#
#	Get the job information.
#
	my $ct = 0;
	my @sout = `squeue -o $options`;
	foreach my $tmp (@sout) {
		next if ($tmp =~ /^JOBID/);		# Skip the header line.
		next if ($running && $tmp =~ / PD /);	# -r: running jobs only.
		chomp $tmp;
		my @line = split(' ', $tmp);
		$s_job[$ct]    = $line[0];
		$s_user[$ct]   = $line[1];
		$s_nodes[$ct]  = $line[2];
		$s_status[$ct] = $line[3];
#		Strip the leading "yyyy-" from the start time, and show
#		N/A for jobs that have not started yet.
		$line[4] =~ s/^.....//;
		$line[4] = "N/A" if ($line[3] =~ /PD/);
		$s_begin[$ct] = $line[4];
		$s_limit[$ct] = $line[5];
		if ($line[5] eq "UNLIMITED") {
			$s_limit[$ct] = $line[5];
		} else {
			$s_limit[$ct] = convert_time($line[5]);
		}

		$s_pool[$ct] = $line[6];
		$s_used[$ct] = $line[7];
#
#		Only keep the master node from the nodes list
#		(e.g. "atlas[89-92]" becomes "atlas89").
#
		$line[8] =~ s/\[([0-9.]*).*/$1/;
		$s_master[$ct] = $line[8];
		$ct++;
	}


	printf("Running job data:\n");

	if ($verbose) {
		printf("---------------------------------------------------------------------------------------------------\n");
		printf("                                                 Time        Time           Time\n");
		printf("JobID    User      $type Pool      Status        Used       Limit        Started Master/Other\n");
		printf("---------------------------------------------------------------------------------------------------\n");
	} else {
		printf("----------------------------------------------------------------------\n");
		printf("JobID    User      $type Pool      Status        Used Master/Other\n");
		printf("----------------------------------------------------------------------\n");
	}

	for (my $i = 0; $i < $ct; $i++) {
		if ($verbose) {
			printf("%-8s %-8s %6s %-9s %-7s %10s %11s %14s %.12s\n",
				$s_job[$i], $s_user[$i], $s_nodes[$i],
				$s_pool[$i], $s_status[$i],
				$s_used[$i], $s_limit[$i], $s_begin[$i],
				$s_master[$i]);
		} else {
			printf("%-8s %-8s %6s %-9s %-7s %10s %.12s\n",
				$s_job[$i], $s_user[$i], $s_nodes[$i],
				$s_pool[$i], $s_status[$i],
				$s_used[$i], $s_master[$i]);
		}
	}
	printf("\n");

	return;
}

#
# Harvest the per-partition MaxNodes/MaxTime limits from
# "scontrol show part" into the file-level %MaxNodes/%MaxTime
# hashes (keyed by partition name), abbreviating UNLIMITED
# so it fits the verbose table columns.
#
sub do_scontrol_part
{

#
#	Get All partition data.  Don't need it all now, but
#	it may be useful later.
#
	my @scon = `scontrol show part`;
	my $part;
	foreach my $tmp (@scon) {
		chomp $tmp;
		($part) = ($tmp =~ m/PartitionName=(\S+)/) if ($tmp =~ /PartitionName=/);

		($MaxTime{$part})  = ($tmp =~ m/MaxTime=(\S+)\s+/)  if ($tmp =~ /MaxTime=/);
		($MaxNodes{$part}) = ($tmp =~ m/MaxNodes=(\S+)\s+/) if ($tmp =~ /MaxNodes=/);
		$MaxTime{$part}  =~ s/UNLIMITED/UNLIM/ if ($MaxTime{$part});
		$MaxNodes{$part} =~ s/UNLIMITED/UNLIM/ if ($MaxNodes{$part});
	}

	return;
}


#
# Show the man page.
#
# Display this script's man page (the POD below) and exit.
#
# perldoc will not run as root, so refuse in that case; otherwise
# drop any setuid privilege and sanitize the environment before
# handing $0 to Pod::Usage.
#
sub show_man
{

	if ($< == 0) {    # Cannot invoke perldoc as root
		my $id = eval { getpwnam("nobody") };
		$id = eval { getpwnam("nouser") } unless defined $id;
		$id = -2 unless defined $id;
		$< = $id;
		printf("\n You can not do this as root!\n\n");
		exit 1;
	}
	$> = $<;			# Disengage setuid
	$ENV{PATH} = "/bin:/usr/bin";	# Untaint PATH
	delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
	if ($0 =~ /^([-\/\w\.]+)$/) { $0 = $1; }	# Untaint $0
	else { die "Illegal characters were found in \$0 ($0)\n"; }
	pod2usage(-exitstatus => 0, -verbose => 2);

	return;
}


#
# Convert a SLURM time limit to a better format.
#
# A "days-hours:minutes:seconds" value is folded into
# "hours:minutes:seconds" (days merged into the hours field);
# any other value is returned right-justified in 8 columns.
#
sub convert_time
{
	my $val = shift(@_);

	my $tmp;
	my @field = split(/-|:/, $val);
	if (@field == 4) {
		$tmp = ($field[0]*24)+$field[1] . ':' . $field[2] . ':' . $field[3];
	} else {
		$tmp = sprintf("%8s",$val);
	}

	return($tmp);
}


#
# Parse the command line options into the file-level flag
# variables; bad options print the usage message and exit.
#
sub get_options
{
	GetOptions(
		'help|h|?' => \$help,
		'man'      => \$man,
		'v'        => \$verbose,
		'r'        => \$running,
		'c'        => \$pool,
	) or usage(1);

	show_man() if ($man);
	usage(0) if ($help);

	return;
}


#
# Print usage instructions and exit with the given status.
#
sub usage
{
	my $eval = shift(@_);

#
#	Print usage instructions and exit.
#
	print STDERR "\nUsage: sjstat [-h] [-c] [-man] [-r] [-v]\n";

	printf("\
	-h    shows usage.
	-c    shows computing resources info only.
	-man  shows man page.
	-r    show only running jobs.
	-v    is for the verbose mode.\n

	Output is very similar to that of squeue.
	\n\n");

	exit($eval);
}


#
# Determine if SLURM is available; exit if scontrol cannot
# communicate with the controller.
#
sub isslurmup
{
	my $out = `scontrol show part 2>&1`;
	if ($?) {
		printf("\n SLURM is not communicating.\n\n");
		exit(1);
	}

	return;
}


__END__

=head1 NAME

B<sjstat> - List attributes of jobs under the SLURM control

=head1 SYNOPSIS

B<sjstat> [B<-h>] [B<-c>] [B<-man>] [B<-r>] [B<-v>]

=head1 DESCRIPTION

The B<sjstat> command is used to display statistics of jobs under control of SLURM.
The output is designed to give information on the resource usage and availability,
as well as information about jobs that are currently active on the machine. This output
is built using the SLURM utilities, sinfo, squeue and scontrol, the man pages for these
utilities will provide more information and greater depth of understanding.

=head1 OPTIONS

=over 4

=item B<-h>

Display a brief help message

=item B<-c>

Display the computing resource information only.

=item B<-man>

Show the man page.

=item B<-r>

Display only the running jobs.

=item B<-v>

Display more verbose information.

=back

=head1 EXAMPLE

The following is a basic request for status.

 > sjstat

 Scheduling pool data:
 ------------------------------------------------------------
 Pool         Memory  Cpus  Total Usable   Free Other Traits
 ------------------------------------------------------------
 pdebug    15000Mb     8     32     32     24 (null)
 pbatch*   15000Mb     8   1072   1070    174 (null)


 Running job data:
 -------------------------------------------------------------------
 JobID    User      Nodes Pool      Status        Used Master/Other
 -------------------------------------------------------------------
 395      mary       1000 pbatch    PD            0:00 (JobHeld)
 396      mary       1000 pbatch    PD            0:00 (JobHeld)
 375      sam        1000 pbatch    CG            0:00 (JobHeld)
 388      fred         32 pbatch    R            25:27 atlas89
 361      harry       512 pbatch    R          1:01:12 atlas618
 1077742  sally         8 pdebug    R            20:16 atlas18


 The Scheduling data contains information pertaining to the:

    Pool     a set of nodes
    Memory   the amount of memory on each node
    Cpus     the number of cpus on each node
    Total    the total number of nodes in the pool
    Usable   total usable nodes in the pool
    Free     total nodes that are currently free

 The Running job data contains information pertaining to the:

    JobID         the SLURM job id
    User          owner of the job
    Nodes         nodes required, or in use by the job
                  (Note: On cpu scheduled machines, this field
                  will be labeled "Procs" showing the number of
                  processors the job is using.)
    Pool          the Pool required or in use by the job
    Status        current status of the job
    Used          Wallclock time used by the job
    Master/Other  Either the Master (head) node used by the job, or may
                  indicate further status of a pending, or completing job.

 The common status values are:

    R    The job is running
    PD   The job is Pending
    CG   The job is Completing

 These are states reported by SLURM and more elaborate documentation
 can be found in the squeue/sinfo man pages.


 An example of the -v option.
504 505 Scheduling pool data: 506 ----------------------------------------------------------------------------- 507 Total Usable Free Node Time Other 508 Pool Memory Cpus Nodes Nodes Nodes Limit Limit Traits 509 ----------------------------------------------------------------------------- 510 pdebug 15000Mb 8 32 32 24 16 30 (null) 511 pbatch* 15000Mb 8 1072 1070 174 UNLIM UNLIM (null) 512 513 Running job data: 514 --------------------------------------------------------------------------------------------------- 515 Time Time Time 516 JobID User Nodes Pool Status Used Limit Started Master/Other 517 --------------------------------------------------------------------------------------------------- 518 38562 tom 4 pbatch PD 0:00 1:00:00 01-14T18:11:22 (JobHeld) 519 520 The added fields to the "Scheduling pool data" are: 521 522 Node Limit SLURM imposed node limit. 523 Time Limit SLURM imposed time limit, value in minutes. 524 525 The added fields to the "Running job data" are: 526 527 Limit Time limit of job. 528 Start Start time of job. 529 530=head1 REPORTING BUGS 531 532Report bugs to <eckert2@llnl.gov> 533 534=cut 535