1 /*****************************************************************************\
2 * gpu_rsmi.c - Support rsmi interface to an AMD GPU.
3 *****************************************************************************
4 * Copyright (C) 2019 SchedMD LLC
5 * Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved.
6 * Written by Advanced Micro Devices,
7 * who borrowed heavily from SLURM gpu and nvml plugin.
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 \*****************************************************************************/
38
39 #define _GNU_SOURCE
40
41 #include "src/common/slurm_xlator.h"
42 #include "src/common/gres.h"
43 #include "src/common/log.h"
44 #include <rocm_smi/rocm_smi.h>
45
/*
 * #defines needed to test rsmi.
 */
/*
 * Frequency lists with more entries than this are printed in summarized form
 * by _rsmi_print_freqs(). The summary indexes five entries (first, second,
 * middle, penultimate, last), hence the lower bound of 5.
 */
#define FREQS_CONCISE 5 // This must never be smaller than 5, or error

/*
 * Symbolic frequency requests ("low", "medium", "highm1", "high") as returned
 * by _xlate_freq_code(). They occupy the top of the unsigned int range and are
 * resolved to real frequencies by _get_nearest_freq().
 */
#define GPU_LOW ((unsigned int) -1)
#define GPU_MEDIUM ((unsigned int) -2)
#define GPU_HIGH_M1 ((unsigned int) -3)
#define GPU_HIGH ((unsigned int) -4)

/*
 * NOTE(review): not referenced in the portion of the file visible here;
 * presumably remembers the GPUs whose frequencies were changed - confirm
 * against the code that uses it.
 */
static bitstr_t *saved_gpus;

/*
 * Buffer size large enough for RSMI string
 */
#define RSMI_STRING_BUFFER_SIZE 80
62
/*
 * PCI information about a GPU device.
 *
 * Overlays named bit-fields on the 64-bit BDFID value so that the individual
 * PCI address components can be read directly. The layout follows the BDFID
 * encoding documented for rsmi_dev_pci_id_get():
 *
 *	bits [63:32] domain
 *	bits [31:16] reserved
 *	bits [15: 8] bus      (a PCI bus number is 8 bits wide)
 *	bits [ 7: 3] device
 *	bits [ 2: 0] function
 *
 * The bit-fields are declared in opposite order depending on
 * SLURM_BIGENDIAN so that "function" always maps to the least significant
 * bits of bdfid.
 */
typedef struct rsmiPciInfo_st {
	union {
		struct {
#ifdef SLURM_BIGENDIAN
			uint64_t domain : 32;
			uint64_t reserved : 16;
			uint64_t bus : 8;
			uint64_t device : 5;
			uint64_t function : 3;
#else
			uint64_t function : 3;
			uint64_t device : 5;
			uint64_t bus : 8;
			uint64_t reserved : 16;
			uint64_t domain : 32;
#endif
		};
		uint64_t bdfid;	/* Raw 64-bit value; aliases the fields above */
	};
} rsmiPciInfo_t;
86
87 /*
88 * These variables are required by the generic plugin interface. If they
89 * are not found in the plugin, the plugin loader will ignore it.
90 *
91 * plugin_name - A string giving a human-readable description of the
92 * plugin. There is no maximum length, but the symbol must refer to
93 * a valid string.
94 *
95 * plugin_type - A string suggesting the type of the plugin or its
96 * applicability to a particular form of data or method of data handling.
97 * If the low-level plugin API is used, the contents of this string are
98 * unimportant and may be anything. Slurm uses the higher-level plugin
99 * interface which requires this string to be of the form
100 *
101 * <application>/<method>
102 *
103 * where <application> is a description of the intended application of
104 * the plugin (e.g., "auth" for Slurm authentication) and <method> is a
105 * description of how this plugin satisfies that application. Slurm will
106 * only load authentication plugins if the plugin_type string has a prefix
107 * of "auth/".
108 *
109 * plugin_version - an unsigned 32-bit integer containing the Slurm version
110 * (major.minor.micro combined into a single number).
111 */
const char *plugin_name = "GPU RSMI plugin";
const char *plugin_type = "gpu/rsmi";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * Level passed to log_var() for this plugin's informational messages.
 * init() raises it to LOG_LEVEL_INFO when DEBUG_FLAG_GRES is set.
 */
static log_level_t log_lvl = LOG_LEVEL_DEBUG5;
116
init(void)117 extern int init(void)
118 {
119 debug("%s: %s loaded", __func__, plugin_name);
120
121 if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
122 log_lvl = LOG_LEVEL_INFO;
123
124 return SLURM_SUCCESS;
125 }
126
fini(void)127 extern int fini(void)
128 {
129 debug("%s: unloading %s", __func__, plugin_name);
130
131 return SLURM_SUCCESS;
132 }
133
134 //TODO: Duplicated from NVML plugin. Move to a common directory
/*
 * Translate a numeric frequency specification into its value.
 *
 * gpu_freq	(IN) Frequency string; may be NULL.
 *
 * Returns the decimal number at the start of gpu_freq, or 0 if gpu_freq is
 * NULL, empty, or does not begin with a decimal digit (e.g. "low").
 */
static unsigned int _xlate_freq_value(char *gpu_freq)
{
	/*
	 * Any one of these conditions makes the input non-numeric, so they
	 * must be OR-ed; the NULL test has to come first to guard the
	 * dereferences that follow.
	 */
	if (!gpu_freq || (gpu_freq[0] < '0') || (gpu_freq[0] > '9'))
		return 0;	/* Not a numeric value */

	return (unsigned int) strtoul(gpu_freq, NULL, 10);
}
144
145 //TODO: Duplicated from NVML plugin. Move to a common directory
_xlate_freq_code(char * gpu_freq)146 static unsigned int _xlate_freq_code(char *gpu_freq)
147 {
148 //TODO: To be moved to common directory
149 if (!gpu_freq || !gpu_freq[0])
150 return 0;
151 if ((gpu_freq[0] >= '0') && (gpu_freq[0] <= '9'))
152 return 0; /* Pure numeric value */
153 if (!strcasecmp(gpu_freq, "low"))
154 return GPU_LOW;
155 else if (!strcasecmp(gpu_freq, "medium"))
156 return GPU_MEDIUM;
157 else if (!strcasecmp(gpu_freq, "highm1"))
158 return GPU_HIGH_M1;
159 else if (!strcasecmp(gpu_freq, "high"))
160 return GPU_HIGH;
161
162 debug("%s: %s: Invalid job GPU frequency (%s)",
163 plugin_type, __func__, gpu_freq);
164 return 0; /* Bad user input */
165 }
166
167 //TODO: Duplicated from NVML plugin. Move to a common directory
_parse_gpu_freq2(char * gpu_freq,unsigned int * gpu_freq_code,unsigned int * gpu_freq_value,unsigned int * mem_freq_code,unsigned int * mem_freq_value,bool * verbose_flag)168 static void _parse_gpu_freq2(char *gpu_freq, unsigned int *gpu_freq_code,
169 unsigned int *gpu_freq_value,
170 unsigned int *mem_freq_code,
171 unsigned int *mem_freq_value, bool *verbose_flag)
172 {
173 char *tmp, *tok, *sep, *save_ptr = NULL;
174
175 if (!gpu_freq || !gpu_freq[0])
176 return;
177 tmp = xstrdup(gpu_freq);
178 tok = strtok_r(tmp, ",", &save_ptr);
179 while (tok) {
180 sep = strchr(tok, '=');
181 if (sep) {
182 sep[0] = '\0';
183 sep++;
184 if (!strcasecmp(tok, "memory")) {
185 *mem_freq_code = _xlate_freq_code(sep);
186 *mem_freq_value = _xlate_freq_value(sep);
187 if (!(*mem_freq_code) && !(*mem_freq_value)) {
188 debug("Invalid job GPU memory frequency: %s",
189 tok);
190 }
191 } else {
192 debug("%s: %s: Invalid job device frequency type: %s",
193 plugin_type, __func__, tok);
194 }
195 } else if (!strcasecmp(tok, "verbose")) {
196 *verbose_flag = true;
197 } else {
198 *gpu_freq_code = _xlate_freq_code(tok);
199 *gpu_freq_value = _xlate_freq_value(tok);
200 if (!(*gpu_freq_code) && !(*gpu_freq_value))
201 debug("Invalid job GPU frequency: %s", tok);
202 }
203 tok = strtok_r(NULL, ",", &save_ptr);
204 }
205 xfree(tmp);
206 }
207
208 //TODO: Duplicated from NVML plugin. Move to a common directory
/*
 * Work out which graphics and memory frequencies to request.
 *
 * Both the job's specification and the configured default specification are
 * parsed. For each clock the first non-zero entry of
 *	job code, job value, default code, default value
 * is used.
 *
 * gpu_freq	(IN) The job's frequency specification (may be NULL)
 * gpu_freq_num	(OUT) Graphics frequency or GPU_* code; left untouched when
 *		neither the job nor the default supplies one
 * mem_freq_num	(OUT) Memory frequency or GPU_* code; left untouched when
 *		neither the job nor the default supplies one
 * verbose_flag	(OUT) Set to true if either specification contains "verbose"
 */
static void _parse_gpu_freq(char *gpu_freq, unsigned int *gpu_freq_num,
			    unsigned int *mem_freq_num, bool *verbose_flag)
{
	/* For each pair: [0] is the symbolic code, [1] the numeric value */
	unsigned int job_gpu[2] = { 0, 0 }, job_mem[2] = { 0, 0 };
	unsigned int def_gpu[2] = { 0, 0 }, def_mem[2] = { 0, 0 };
	unsigned int chosen;
	char *def_freq;

	_parse_gpu_freq2(gpu_freq, &job_gpu[0], &job_gpu[1],
			 &job_mem[0], &job_mem[1], verbose_flag);

	// Defaults to high for both mem and gfx
	def_freq = slurm_get_gpu_freq_def();
	_parse_gpu_freq2(def_freq, &def_gpu[0], &def_gpu[1],
			 &def_mem[0], &def_mem[1], verbose_flag);
	xfree(def_freq);

	chosen = job_gpu[0] ? job_gpu[0] :
		 job_gpu[1] ? job_gpu[1] :
		 def_gpu[0] ? def_gpu[0] : def_gpu[1];
	if (chosen)
		*gpu_freq_num = chosen;

	chosen = job_mem[0] ? job_mem[0] :
		 job_mem[1] ? job_mem[1] :
		 def_mem[0] ? def_mem[0] : def_mem[1];
	if (chosen)
		*mem_freq_num = chosen;
}
245
246 //TODO: Duplicated from NVML plugin. Move to a common directory
/*
 * qsort() comparator ordering unsigned int frequencies from highest to lowest.
 *
 * a, b	(IN) Pointers to the two unsigned int elements being compared
 *
 * Returns a negative value if *a must come before *b (i.e. *a > *b), a
 * positive value if *a must come after *b, and 0 if they are equal.
 */
static int _sort_freq_descending(const void *a, const void *b)
{
	/*
	 * The arrays sorted with this comparator hold unsigned int, so the
	 * elements must be read with that exact type. Compare explicitly
	 * rather than subtracting: an unsigned difference does not fit in int.
	 */
	const unsigned int lhs = *(const unsigned int *) a;
	const unsigned int rhs = *(const unsigned int *) b;

	return (lhs < rhs) - (lhs > rhs);
}
251
252 /*
253 * Get all possible memory frequencies for the device
254 *
255 * dv_ind (IN) The device index
256 * mem_freqs_size (IN/OUT) The size of the mem_freqs array; this will be
257 * overwritten with the number of memory freqs found.
258 * mem_freqs (OUT) The possible memory frequencies in MHz.
259 *
260 * Return true if successful, false if not.
261 */
_rsmi_get_mem_freqs(uint32_t dv_ind,unsigned int * mem_freqs_size,unsigned int * mem_freqs)262 static bool _rsmi_get_mem_freqs(uint32_t dv_ind,
263 unsigned int *mem_freqs_size,
264 unsigned int *mem_freqs)
265 {
266 const char *status_string;
267 rsmi_status_t rsmi_rc;
268 rsmi_frequencies_t rsmi_freqs;
269
270 DEF_TIMERS;
271 START_TIMER;
272 rsmi_rc = rsmi_dev_gpu_clk_freq_get(
273 dv_ind, RSMI_CLK_TYPE_MEM, &rsmi_freqs);
274 END_TIMER;
275 debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds",
276 DELTA_TIMER);
277
278 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
279 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
280 error("RSMI: Failed to get memory frequencies error: %s",
281 status_string);
282 return false;
283 }
284
285 *mem_freqs_size = rsmi_freqs.num_supported;
286 for (int i = 0; i < *mem_freqs_size; i++)
287 mem_freqs[i] = rsmi_freqs.frequency[i]/1000000;
288
289 return true;
290 }
291
292 /*
293 * Get all possible graphics frequencies for the device
294 *
295 * dv_ind (IN) The device index
296 * gfx_freqs_size (IN/OUT) The size of the gfx_freqs array; this will
297 * be overwritten with the number of graphics freqs found.
298 * gfx_freqs (OUT) The possible graphics frequencies in MHz.
299 *
300 * Return true if successful, false if not.
301 */
_rsmi_get_gfx_freqs(uint32_t dv_ind,unsigned int * gfx_freqs_size,unsigned int * gfx_freqs)302 static bool _rsmi_get_gfx_freqs(uint32_t dv_ind,
303 unsigned int *gfx_freqs_size,
304 unsigned int *gfx_freqs)
305 {
306 const char *status_string;
307 rsmi_status_t rsmi_rc;
308 rsmi_frequencies_t rsmi_freqs;
309
310 DEF_TIMERS;
311 START_TIMER;
312 rsmi_rc = rsmi_dev_gpu_clk_freq_get(
313 dv_ind, RSMI_CLK_TYPE_SYS, &rsmi_freqs);
314 END_TIMER;
315 debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds",
316 DELTA_TIMER);
317
318 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
319 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
320 error("RSMI: Failed to get graphics frequencies error: %s",
321 status_string);
322 return false;
323 }
324
325 *gfx_freqs_size = rsmi_freqs.num_supported;
326 for (int i = 0; i < *gfx_freqs_size; i++)
327 gfx_freqs[i] = rsmi_freqs.frequency[i]/1000000;
328
329 return true;
330 }
331
332 /*
333 * Print out all possible memory and graphics frequencies for the given device.
334 * If there are more than FREQS_CONCISE frequencies, prints a summary instead
335 *
336 * dv_ind (IN) The device index
337 * l (IN) The log level at which to print
338 */
_rsmi_print_freqs(uint32_t dv_ind,log_level_t l)339 static void _rsmi_print_freqs(uint32_t dv_ind, log_level_t l)
340 {
341 unsigned int mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0};
342 unsigned int gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0};
343 unsigned int size = RSMI_MAX_NUM_FREQUENCIES;
344 bool concise = false;
345 unsigned int i;
346
347 if (!_rsmi_get_mem_freqs(dv_ind, &size, mem_freqs))
348 return;
349
350 qsort(mem_freqs, size,
351 sizeof(unsigned int), _sort_freq_descending);
352 if ((size > 1) && (mem_freqs[0] <= mem_freqs[(size)-1])) {
353 error("%s: memory frequencies are not stored in descending order!",
354 __func__);
355 return;
356 }
357
358 if (size > FREQS_CONCISE)
359 concise = true;
360
361 log_var(l, " Possible GPU Memory Frequencies (%u):", size);
362 log_var(l, " ---------------------------------");
363 if (!concise) {
364 for (i = 0; i < size; ++i)
365 log_var(l, " *%u MHz [%u]", mem_freqs[i], i);
366 } else {
367 // first, next, ..., middle, ..., penultimate, last
368 log_var(l, " *%u MHz [0]", mem_freqs[0]);
369 log_var(l, " *%u MHz [1]", mem_freqs[1]);
370 log_var(l, " ...");
371 log_var(l, " *%u MHz [%u]", mem_freqs[(size - 1) / 2],
372 (size - 1) / 2);
373 log_var(l, " ...");
374 log_var(l, " *%u MHz [%u]",
375 mem_freqs[size - 2], size - 2);
376 log_var(l, " *%u MHz [%u]",
377 mem_freqs[size - 1], size - 1);
378 }
379
380 size = RSMI_MAX_NUM_FREQUENCIES;
381 if (!_rsmi_get_gfx_freqs(dv_ind, &size, gfx_freqs))
382 return;
383
384 qsort(gfx_freqs, size,
385 sizeof(unsigned int), _sort_freq_descending);
386 if ((size > 1) && (gfx_freqs[0] <= gfx_freqs[(size)-1])) {
387 error("%s: Graphics frequencies are not stored in descending order!",
388 __func__);
389 return;
390 }
391
392 if (size > FREQS_CONCISE)
393 concise = true;
394
395 log_var(l, " Possible GPU Graphics Frequencies (%u):", size);
396 log_var(l, " ---------------------------------");
397 if (!concise) {
398 for (i = 0; i < size; ++i)
399 log_var(l, " *%u MHz [%u]", gfx_freqs[i], i);
400 return;
401 }
402 // first, next, ..., middle, ..., penultimate, last
403 log_var(l, " *%u MHz [0]", gfx_freqs[0]);
404 log_var(l, " *%u MHz [1]", gfx_freqs[1]);
405 log_var(l, " ...");
406 log_var(l, " *%u MHz [%u]", gfx_freqs[(size - 1) / 2],
407 (size - 1) / 2);
408 log_var(l, " ...");
409 log_var(l, " *%u MHz [%u]", gfx_freqs[size - 2], size - 2);
410 log_var(l, " *%u MHz [%u]", gfx_freqs[size - 1], size - 1);
411 }
412
413 /*
414 * Convert frequency to nearest valid frequency found in frequency array
415 *
416 * freq (IN/OUT) The frequency to check, in MHz. Also the output, if
417 * it needs to be changed.
418 * freqs_size (IN) The size of the freqs array
419 * freqs (IN) An array of frequency values in MHz, sorted highest to
420 * lowest
421 *
422 * Inspired by src/common/cpu_frequency#_cpu_freq_freqspec_num()
423 */
424 //TODO: Duplicated from NVML plugin. Move to a common directory
_get_nearest_freq(unsigned int * freq,unsigned int freqs_size,unsigned int * freqs)425 static void _get_nearest_freq(unsigned int *freq, unsigned int freqs_size,
426 unsigned int *freqs)
427 {
428 unsigned int i;
429
430 if (!freq || !(*freq)) {
431 log_var(log_lvl, "%s: No frequency supplied", __func__);
432 return;
433 }
434 if (!freqs || !(*freqs)) {
435 log_var(log_lvl, "%s: No frequency list supplied", __func__);
436 return;
437 }
438 if (freqs_size <= 0) {
439 log_var(log_lvl, "%s: Frequency list is empty", __func__);
440 return;
441 }
442
443 // Check for special case values; freqs is sorted in descending order
444 switch ((*freq)) {
445 case GPU_LOW:
446 *freq = freqs[freqs_size - 1];
447 debug2("Frequency GPU_LOW: %u MHz", *freq);
448 return;
449
450 case GPU_MEDIUM:
451 *freq = freqs[(freqs_size - 1) / 2];
452 debug2("Frequency GPU_MEDIUM: %u MHz", *freq);
453 return;
454
455 case GPU_HIGH_M1:
456 if (freqs_size == 1)
457 *freq = freqs[0];
458 else
459 *freq = freqs[1];
460 debug2("Frequency GPU_HIGH_M1: %u MHz", *freq);
461 return;
462
463 case GPU_HIGH:
464 *freq = freqs[0];
465 debug2("Frequency GPU_HIGH: %u MHz", *freq);
466 return;
467
468 default:
469 debug2("Freq is not a special case. Continue...");
470 break;
471 }
472
473 /* check if freq is out of bounds of freqs */
474 if (*freq > freqs[0]) {
475 log_var(log_lvl, "Rounding frequency %u MHz down to %u MHz",
476 *freq, freqs[0]);
477 *freq = freqs[0];
478 return;
479 } else if (*freq < freqs[freqs_size - 1]) {
480 log_var(log_lvl, "Rounding frequency %u MHz up to %u MHz",
481 *freq, freqs[freqs_size - 1]);
482 *freq = freqs[freqs_size - 1];
483 return;
484 }
485
486 /* check for frequency, and round up if no exact match */
487 for (i = 0; i < freqs_size - 1;) {
488 if (*freq == freqs[i])
489 // No change necessary
490 debug2("No change necessary. Freq: %u MHz", *freq);
491 return;
492 i++;
493 /*
494 * Step down to next element to round up.
495 * Safe to advance due to bounds checks above here
496 */
497 if (*freq > freqs[i]) {
498 log_var(log_lvl, "Rounding frequency %u MHz up to %u MHz",
499 *freq, freqs[i - 1]);
500 *freq = freqs[i - 1];
501 return;
502 }
503 }
504 error("%s: Got to the end of the function. Freq: %u MHz",
505 __func__, *freq);
506 }
507
508 /*
509 * Get the nearest valid memory and graphics frequencies
510 * Return bit masks indicating the indices of the
511 * frequencies that are to be enabled (1) and disabled (0).
512 *
513 * dv_ind (IN) the device index
514 * mem_freq (IN/OUT) requested/nearest valid memory frequency
515 * mem_bitmask (OUT) bit mask for the nearest valid memory frequency
516 * gfx_freq (IN/OUT) requested/nearest valid graphics frequency
517 * gfx_bitmask (OUT) bit mask for the nearest valid graphics frequency
518 */
_rsmi_get_nearest_freqs(uint32_t dv_ind,unsigned int * mem_freq,uint64_t * mem_bitmask,unsigned int * gfx_freq,uint64_t * gfx_bitmask)519 static void _rsmi_get_nearest_freqs(uint32_t dv_ind,
520 unsigned int *mem_freq,
521 uint64_t *mem_bitmask,
522 unsigned int *gfx_freq,
523 uint64_t *gfx_bitmask)
524 {
525 unsigned int mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0};
526 unsigned int mem_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0};
527 unsigned int mem_freqs_size = RSMI_MAX_NUM_FREQUENCIES;
528 unsigned int gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0};
529 unsigned int gfx_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0};
530 unsigned int gfx_freqs_size = RSMI_MAX_NUM_FREQUENCIES;
531
532 // Get the memory frequencies
533 if (!_rsmi_get_mem_freqs(dv_ind, &mem_freqs_size, mem_freqs))
534 return;
535
536 memcpy(mem_freqs_sort, mem_freqs, mem_freqs_size*sizeof(unsigned int));
537 qsort(mem_freqs_sort, mem_freqs_size,
538 sizeof(unsigned int), _sort_freq_descending);
539 if ((mem_freqs_size > 1) &&
540 (mem_freqs_sort[0] <= mem_freqs_sort[(mem_freqs_size)-1])) {
541 error("%s: memory frequencies are not stored in descending order!",
542 __func__);
543 return;
544 }
545
546 // Set the nearest valid memory frequency for the requested frequency
547 _get_nearest_freq(mem_freq, mem_freqs_size, mem_freqs_sort);
548
549 // convert the frequency to bit mask
550 for (int i = 0; i < mem_freqs_size; i++)
551 if (*mem_freq == mem_freqs[i]) {
552 *mem_bitmask = (1 << i);
553 break;
554 }
555
556 // Get the graphics frequencies
557 if (!_rsmi_get_gfx_freqs(dv_ind, &gfx_freqs_size, gfx_freqs))
558 return;
559
560 memcpy(gfx_freqs_sort, gfx_freqs, gfx_freqs_size*sizeof(unsigned int));
561 qsort(gfx_freqs_sort, gfx_freqs_size,
562 sizeof(unsigned int), _sort_freq_descending);
563 if ((gfx_freqs_size > 1) &&
564 (gfx_freqs_sort[0] <= gfx_freqs_sort[(gfx_freqs_size)-1])) {
565 error("%s: graphics frequencies are not stored in descending order!",
566 __func__);
567 return;
568 }
569
570 // Set the nearest valid graphics frequency for the requested frequency
571 _get_nearest_freq(gfx_freq, gfx_freqs_size, gfx_freqs_sort);
572
573 // convert the frequency to bit mask
574 for (int i = 0; i < gfx_freqs_size; i++)
575 if (*gfx_freq == gfx_freqs[i]) {
576 *gfx_bitmask = (1 << i);
577 break;
578 }
579 }
580
581 /*
582 * Set the memory and graphics clock frequencies for the GPU
583 *
584 * dv_ind (IN) The device index
585 * mem_bitmask (IN) bit mask for the memory frequency.
586 * gfx_bitmask (IN) bit mask for the graphics frequency.
587 *
588 * Returns true if successful, false if not
589 */
_rsmi_set_freqs(uint32_t dv_ind,uint64_t mem_bitmask,uint64_t gfx_bitmask)590 static bool _rsmi_set_freqs(uint32_t dv_ind, uint64_t mem_bitmask,
591 uint64_t gfx_bitmask)
592 {
593 const char *status_string;
594 rsmi_status_t rsmi_rc;
595
596 DEF_TIMERS;
597 START_TIMER;
598 rsmi_rc = rsmi_dev_gpu_clk_freq_set(
599 dv_ind, RSMI_CLK_TYPE_MEM, mem_bitmask);
600 END_TIMER;
601 debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for memory took %ld microseconds",
602 mem_bitmask, DELTA_TIMER);
603 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
604 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
605 error("RSMI: Failed to set memory frequency GPU %u error: %s",
606 dv_ind, status_string);
607 return false;
608 }
609
610 START_TIMER;
611 rsmi_rc = rsmi_dev_gpu_clk_freq_set(dv_ind,
612 RSMI_CLK_TYPE_SYS, gfx_bitmask);
613 debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for graphics took %ld microseconds",
614 gfx_bitmask, DELTA_TIMER);
615 END_TIMER;
616 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
617 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
618 error("RSMI: Failed to set graphic frequency GPU %u error: %s",
619 dv_ind, status_string);
620 return false;
621 }
622 return true;
623 }
624
625 /*
626 * Reset the memory and graphics clock frequencies for the GPU to the same
627 * default frequencies that are used after system reboot or driver reload. This
628 * default cannot be changed.
629 *
630 * dv_ind (IN) The device index
631 *
632 * Returns true if successful, false if not
633 */
_rsmi_reset_freqs(uint32_t dv_ind)634 static bool _rsmi_reset_freqs(uint32_t dv_ind)
635 {
636 const char *status_string;
637 rsmi_status_t rsmi_rc;
638
639 DEF_TIMERS;
640
641 START_TIMER;
642 rsmi_rc = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO);
643 END_TIMER;
644 debug3("rsmi_dev_perf_level_set() took %ld microseconds",
645 DELTA_TIMER);
646 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
647 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
648 error("RSMI: Failed to reset frequencies error: %s",
649 status_string);
650 return false;
651 }
652 return true;
653 }
654
655 /*
656 * Get the memory or graphics clock frequency that the GPU is currently running
657 * at
658 *
659 * dv_ind (IN) The device index
660 * type (IN) The clock type to query. Either RSMI_CLK_TYPE_SYS or
661 * RSMI_CLK_TYPE_MEM.
662 *
663 * Returns the clock frequency in MHz if successful, or 0 if not
664 */
_rsmi_get_freq(uint32_t dv_ind,rsmi_clk_type_t type)665 static unsigned int _rsmi_get_freq(uint32_t dv_ind, rsmi_clk_type_t type)
666 {
667 const char *status_string;
668 rsmi_status_t rsmi_rc;
669 rsmi_frequencies_t rsmi_freqs;
670 char *type_str = "unknown";
671
672 DEF_TIMERS;
673
674 switch (type) {
675 case RSMI_CLK_TYPE_SYS:
676 type_str = "graphics";
677 break;
678 case RSMI_CLK_TYPE_MEM:
679 type_str = "memory";
680 break;
681 default:
682 error("%s: Unsupported clock type", __func__);
683 break;
684 }
685
686 START_TIMER;
687 rsmi_rc = rsmi_dev_gpu_clk_freq_get(dv_ind, type, &rsmi_freqs);
688 END_TIMER;
689 debug3("rsmi_dev_gpu_clk_freq_get(%s) took %ld microseconds",
690 type_str, DELTA_TIMER);
691 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
692 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
693 error("RSMI: Failed to get the GPU frequency type %s, error: %s",
694 type_str, status_string);
695 return 0;
696 }
697 return (rsmi_freqs.frequency[rsmi_freqs.current]/1000000);
698 }
699
_rsmi_get_gfx_freq(uint32_t dv_ind)700 static unsigned int _rsmi_get_gfx_freq(uint32_t dv_ind)
701 {
702 return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_SYS);
703 }
704
_rsmi_get_mem_freq(uint32_t dv_ind)705 static unsigned int _rsmi_get_mem_freq(uint32_t dv_ind)
706 {
707 return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_MEM);
708 }
709
710 /*
711 * Convert a frequency value to a string
712 * Returned string must be xfree()'ed
713 */
714 //TODO: Duplicated from NVML plugin. Move to a common directory
_freq_value_to_string(unsigned int freq)715 static char *_freq_value_to_string(unsigned int freq)
716 {
717 switch (freq) {
718 case GPU_LOW:
719 return xstrdup("low");
720 case GPU_MEDIUM:
721 return xstrdup("medium");
722 case GPU_HIGH:
723 return xstrdup("high");
724 case GPU_HIGH_M1:
725 return xstrdup("highm1");
726 default:
727 return xstrdup_printf("%u", freq);
728 }
729 }
730
731 /*
732 * Reset the frequencies of each GPU in the step to the hardware default
733 * NOTE: RSMI must be initialized beforehand
734 *
735 * gpus (IN) A bitmap specifying the GPUs on which to operate.
736 */
_reset_freq(bitstr_t * gpus)737 static void _reset_freq(bitstr_t *gpus)
738 {
739 int gpu_len = bit_size(gpus);
740 int i = -1, count = 0, count_set = 0;
741 bool freq_reset = false;
742
743 // Reset the frequency of each device allocated to the step
744 for (i = 0; i < gpu_len; i++) {
745 if (!bit_test(gpus, i))
746 continue;
747 count++;
748
749 debug2("Memory frequency before reset: %u",
750 _rsmi_get_mem_freq(i));
751 debug2("Graphics frequency before reset: %u",
752 _rsmi_get_gfx_freq(i));
753 freq_reset = _rsmi_reset_freqs(i);
754 debug2("Memory frequency after reset: %u",
755 _rsmi_get_mem_freq(i));
756 debug2("Graphics frequency after reset: %u",
757 _rsmi_get_gfx_freq(i));
758
759 // TODO: Check to make sure that the frequency reset
760
761 if (freq_reset) {
762 log_var(log_lvl, "Successfully reset GPU[%d]", i);
763 count_set++;
764 } else {
765 log_var(log_lvl, "Failed to reset GPU[%d]", i);
766 }
767 }
768
769 if (count_set != count) {
770 log_var(log_lvl,
771 "%s: Could not reset frequencies for all GPUs %d/%d total GPUs",
772 __func__, count_set, count);
773 fprintf(stderr, "Could not reset frequencies for all GPUs %d/%d total GPUs\n",
774 count_set, count);
775 }
776 }
777
778 /*
779 * Set the frequencies of each GPU specified for the step
780 * NOTE: RSMI must be initialized beforehand
781 *
782 * gpus (IN) A bitmap specifying the GPUs on which to operate.
783 * gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or
784 * empty memory or graphics frequency is specified, then GpuFreqDef
785 * will be consulted, which defaults to "high,memory=high" if not
786 * set.
787 */
_set_freq(bitstr_t * gpus,char * gpu_freq)788 static void _set_freq(bitstr_t *gpus, char *gpu_freq)
789 {
790 bool verbose_flag = false;
791 int gpu_len = 0;
792 int i = -1, count = 0, count_set = 0;
793 unsigned int gpu_freq_num = 0, mem_freq_num = 0;
794 uint64_t mem_bitmask = 0, gpu_bitmask = 0;
795 bool freq_set = false, freq_logged = false;
796 char *tmp = NULL;
797 slurm_cgroup_conf_t *cg_conf;
798 bool task_cgroup = false;
799 bool constrained_devices = false;
800 bool cgroups_active = false;
801 char *task_plugin_type = NULL;
802
803 // Parse frequency information
804 debug2("_parse_gpu_freq(%s)", gpu_freq);
805 _parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num, &verbose_flag);
806 if (verbose_flag)
807 debug2("verbose_flag ON");
808
809 tmp = _freq_value_to_string(mem_freq_num);
810 debug2("Requested GPU memory frequency: %s", tmp);
811 xfree(tmp);
812 tmp = _freq_value_to_string(gpu_freq_num);
813 debug2("Requested GPU graphics frequency: %s", tmp);
814 xfree(tmp);
815
816 if (!mem_freq_num || !gpu_freq_num) {
817 debug2("%s: No frequencies to set", __func__);
818 return;
819 }
820
821 // Check if GPUs are constrained by cgroups
822 slurm_mutex_lock(&xcgroup_config_read_mutex);
823 cg_conf = xcgroup_get_slurm_cgroup_conf();
824 if (cg_conf && cg_conf->constrain_devices)
825 constrained_devices = true;
826 slurm_mutex_unlock(&xcgroup_config_read_mutex);
827
828 // Check if task/cgroup plugin is loaded
829 task_plugin_type = slurm_get_task_plugin();
830 if (strstr(task_plugin_type, "cgroup"))
831 task_cgroup = true;
832 xfree(task_plugin_type);
833
834 // If both of these are true, then GPUs will be constrained
835 if (constrained_devices && task_cgroup) {
836 cgroups_active = true;
837 gpu_len = bit_set_count(gpus);
838 debug2("%s: cgroups are configured. Using LOCAL GPU IDs",
839 __func__);
840 } else {
841 gpu_len = bit_size(gpus);
842 debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs",
843 __func__);
844 }
845
846 // Set the frequency of each device allocated to the step
847 for (i = 0; i < gpu_len; i++) {
848 char *sep = "";
849
850 // Only check the global GPU bitstring if not using cgroups
851 if (!cgroups_active && !bit_test(gpus, i)) {
852 debug2("Passing over RSMI device %u", i);
853 continue;
854 }
855 count++;
856
857 debug2("Setting frequency of RSMI device %u", i);
858 _rsmi_get_nearest_freqs(i, &mem_freq_num, &mem_bitmask,
859 &gpu_freq_num, &gpu_bitmask);
860
861 debug2("Memory frequency before set: %u",
862 _rsmi_get_mem_freq(i));
863 debug2("Graphics frequency before set: %u",
864 _rsmi_get_gfx_freq(i));
865 freq_set = _rsmi_set_freqs(i, mem_bitmask, gpu_bitmask);
866 debug2("Memory frequency after set: %u",
867 _rsmi_get_mem_freq(i));
868 debug2("Graphics frequency after set: %u",
869 _rsmi_get_gfx_freq(i));
870
871 if (mem_freq_num) {
872 xstrfmtcat(tmp, "%smemory_freq:%u", sep, mem_freq_num);
873 sep = ",";
874 }
875 if (gpu_freq_num) {
876 xstrfmtcat(tmp, "%sgraphics_freq:%u", sep,
877 gpu_freq_num);
878 }
879
880 if (freq_set) {
881 log_var(log_lvl, "Successfully set GPU[%d] %s", i, tmp);
882 count_set++;
883 } else {
884 log_var(log_lvl, "Failed to set GPU[%d] %s", i, tmp);
885 }
886
887 if (verbose_flag && !freq_logged) {
888 fprintf(stderr, "GpuFreq=%s\n", tmp);
889 freq_logged = true; /* Just log for first GPU */
890 }
891 xfree(tmp);
892 }
893
894 if (count_set != count) {
895 log_var(log_lvl,
896 "%s: Could not set frequencies for all GPUs %d/%d total GPUs",
897 __func__, count_set, count);
898 fprintf(stderr, "Could not set frequencies for all GPUs %d/%d total GPUs\n",
899 count_set, count);
900 }
901 }
902
903 /*
904 * Get the version of the AMD Graphics driver
905 *
906 * driver (OUT) A string to return version of AMD GPU driver
907 * len (OUT) Length for version of AMD GPU driver
908 */
_rsmi_get_driver(char * driver,unsigned int len)909 static void _rsmi_get_driver(char *driver, unsigned int len)
910 {
911 rsmi_version_str_get(RSMI_SW_COMP_DRIVER, driver, len);
912 }
913
914 /*
915 * Get the version of the ROCM-SMI library
916 *
917 * version (OUT) A string to return version of RSMI
918 * len (OUT) Length for version of RSMI
919 */
_rsmi_get_version(char * version,unsigned int len)920 static void _rsmi_get_version(char *version, unsigned int len)
921 {
922 const char *status_string;
923 rsmi_version_t rsmi_version;
924 rsmi_status_t rsmi_rc = rsmi_version_get(&rsmi_version);
925
926 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
927 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
928 error("RSMI: Failed to get the version error: %s",
929 status_string);
930 version[0] = '\0';
931 } else
932 sprintf(version, "%s", rsmi_version.build);
933 }
934
935 /*
936 * Get the total # of GPUs in the system
937 *
938 * device_count (OUT) Number of available GPU devices
939 */
_rsmi_get_device_count(unsigned int * device_count)940 static void _rsmi_get_device_count(unsigned int *device_count)
941 {
942 const char *status_string;
943 rsmi_status_t rsmi_rc = rsmi_num_monitor_devices(device_count);
944
945 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
946 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
947 error("RSMI: Failed to get device count: %s", status_string);
948 *device_count = 0;
949 }
950 }
951
952 /*
953 * Get the name of the GPU
954 *
955 * dv_ind (IN) The device index
956 * device_name (OUT) Name of GPU devices
957 * size (OUT) Size of name
958 */
_rsmi_get_device_name(uint32_t dv_ind,char * device_name,unsigned int size)959 static void _rsmi_get_device_name(uint32_t dv_ind, char *device_name,
960 unsigned int size)
961 {
962 const char *status_string;
963 rsmi_status_t rsmi_rc = rsmi_dev_name_get(dv_ind, device_name, size);
964
965 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
966 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
967 error("RSMI: Failed to get name of the GPU: %s", status_string);
968 }
969 }
970
971 /*
972 * Get the brand of the GPU
973 *
974 * dv_ind (IN) The device index
975 * device_brand (OUT) Brand of GPU devices
976 * size (OUT) Size of name
977 */
_rsmi_get_device_brand(uint32_t dv_ind,char * device_brand,unsigned int size)978 static void _rsmi_get_device_brand(uint32_t dv_ind, char *device_brand,
979 unsigned int size)
980 {
981 const char *status_string;
982 rsmi_status_t rsmi_rc = rsmi_dev_brand_get(dv_ind, device_brand, size);
983
984 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
985 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
986 error("RSMI: Failed to get brand of the GPU: %s",
987 status_string);
988 }
989 }
990
991 /*
992 * Retrieves minor number of the render device. Each AMD GPU will have a device node file
993 * in form /dev/dri/renderD[minor_number].
994 *
995 * dv_ind (IN) The device index
996 * minor (OUT) minor number of device node
997 */
_rsmi_get_device_minor_number(uint32_t dv_ind,unsigned int * minor)998 static void _rsmi_get_device_minor_number(uint32_t dv_ind,
999 unsigned int *minor)
1000 {
1001 const char *status_string;
1002 rsmi_status_t rsmi_rc = rsmi_dev_drm_render_minor_get(dv_ind, minor);
1003
1004 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
1005 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
1006 error("RSMI: Failed to get minor number of GPU: %s",
1007 status_string);
1008 }
1009 }
1010
1011 /*
1012 * Get the PCI Info of the GPU
1013 *
1014 * dv_ind (IN) The device index
1015 * pci (OUT) PCI Info of GPU devices
1016 */
_rsmi_get_device_pci_info(uint32_t dv_ind,rsmiPciInfo_t * pci)1017 static void _rsmi_get_device_pci_info(uint32_t dv_ind, rsmiPciInfo_t *pci)
1018 {
1019 const char *status_string;
1020 rsmi_status_t rsmi_rc = rsmi_dev_pci_id_get(dv_ind, &(pci->bdfid));
1021
1022 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
1023 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
1024 error("RSMI: Failed to get PCI Info of the GPU: %s",
1025 status_string);
1026 }
1027 }
1028
1029 /*
1030 * Get the Unique ID of the GPU
1031 *
1032 * dv_ind (IN) The device index
1033 * id (OUT) Unique ID of GPU devices
1034 */
_rsmi_get_device_unique_id(uint32_t dv_ind,uint64_t * id)1035 static void _rsmi_get_device_unique_id(uint32_t dv_ind, uint64_t *id)
1036 {
1037 const char *status_string;
1038 rsmi_status_t rsmi_rc = rsmi_dev_unique_id_get(dv_ind, id);
1039
1040 if (rsmi_rc != RSMI_STATUS_SUCCESS) {
1041 rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
1042 error("RSMI: Failed to get Unique ID of the GPU: %s",
1043 status_string);
1044 }
1045 }
1046
1047 /*
1048 * Creates and returns a gres conf list of detected AMD gpus on the node.
1049 * If an error occurs, return NULL
1050 * Caller is responsible for freeing the list.
1051 *
1052 * If the AMD ROCM-SMI API exists, then query GPU info,
1053 * so the user doesn't need to specify manually in gres.conf.
1054 *
1055 * node_config (IN/OUT) pointer of node_config_load_t passed down
1056 */
_get_system_gpu_list_rsmi(node_config_load_t * node_config)1057 static List _get_system_gpu_list_rsmi(node_config_load_t *node_config)
1058 {
1059 unsigned int i;
1060 unsigned int device_count = 0;
1061 List gres_list_system = list_create(destroy_gres_slurmd_conf);
1062 char driver[RSMI_STRING_BUFFER_SIZE];
1063 char version[RSMI_STRING_BUFFER_SIZE];
1064
1065 rsmi_init(0);
1066
1067 _rsmi_get_driver(driver, RSMI_STRING_BUFFER_SIZE);
1068 _rsmi_get_version(version, RSMI_STRING_BUFFER_SIZE);
1069 debug("AMD Graphics Driver Version: %s", driver);
1070 debug("RSMI Library Version: %s", version);
1071
1072 _rsmi_get_device_count(&device_count);
1073 debug2("Device count: %d", device_count);
1074
1075 // Loop through all the GPUs on the system and add to gres_list_system
1076 for (i = 0; i < device_count; ++i) {
1077 unsigned int minor_number = 0;
1078 char *device_file = NULL;
1079 char device_name[RSMI_STRING_BUFFER_SIZE] = {0};
1080 char device_brand[RSMI_STRING_BUFFER_SIZE] = {0};
1081 rsmiPciInfo_t pci_info;
1082 uint64_t uuid = 0;
1083
1084 _rsmi_get_device_name(i, device_name, RSMI_STRING_BUFFER_SIZE);
1085 _rsmi_get_device_brand(i, device_brand,
1086 RSMI_STRING_BUFFER_SIZE);
1087 _rsmi_get_device_minor_number(i, &minor_number);
1088 pci_info.bdfid = 0;
1089 _rsmi_get_device_pci_info(i, &pci_info);
1090 _rsmi_get_device_unique_id(i, &uuid);
1091
1092 xstrfmtcat(device_file, "/dev/dri/renderD%u", minor_number);
1093
1094 debug2("GPU index %u:", i);
1095 debug2(" Name: %s", device_name);
1096 debug2(" Brand/Type: %s", device_brand);
1097 debug2(" UUID: %lx", uuid);
1098 debug2(" PCI Domain/Bus/Device/Function: %u:%u:%u.%u",
1099 pci_info.domain,
1100 pci_info.bus, pci_info.device, pci_info.function);
1101 debug2(" Device File (minor number): %s", device_file);
1102 if (minor_number != i+128)
1103 debug("Note: GPU index %u is different from minor # %u",
1104 i, minor_number);
1105
1106 // Print out possible memory frequencies for this device
1107 _rsmi_print_freqs(i, LOG_LEVEL_DEBUG2);
1108
1109 add_gres_to_list(gres_list_system, "gpu", 1,
1110 node_config->cpu_cnt, NULL, NULL,
1111 device_file, device_brand, NULL);
1112
1113 xfree(device_file);
1114 }
1115
1116 rsmi_shut_down();
1117
1118 info("%u GPU system device(s) detected", device_count);
1119 return gres_list_system;
1120 }
1121
gpu_p_reconfig(void)1122 extern int gpu_p_reconfig(void)
1123 {
1124 if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
1125 log_lvl = LOG_LEVEL_INFO;
1126 else
1127 log_lvl = LOG_LEVEL_DEBUG5;
1128
1129 return SLURM_SUCCESS;
1130 }
1131
1132
gpu_p_get_system_gpu_list(node_config_load_t * node_config)1133 extern List gpu_p_get_system_gpu_list(node_config_load_t *node_config)
1134 {
1135 List gres_list_system = _get_system_gpu_list_rsmi(node_config);
1136
1137 if (!gres_list_system)
1138 error("System GPU detection failed");
1139
1140 return gres_list_system;
1141 }
1142
/*
 * Apply the step's requested GPU frequencies.
 *
 * The frequency request is the text following "gpu:" in tres_freq, up to the
 * first ';' (or the end of the string). usable_gpus is copied into saved_gpus
 * so gpu_p_step_hardware_fini() can reset the same devices afterwards.
 *
 * usable_gpus	(IN) Bitmap of the GPUs whose frequencies are to be set
 * tres_freq	(IN) TRES frequency specification string
 *
 * Returns without touching the hardware when either argument is NULL or
 * when tres_freq contains no "gpu:" entry.
 */
extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq)
{
	char *gpu_spec;
	char *gpu_freq;
	char *end;

	xassert(tres_freq);
	xassert(usable_gpus);

	/* Job allocated no GPUs, or there is no TRES frequency spec */
	if (!usable_gpus || !tres_freq)
		return;

	gpu_spec = strstr(tres_freq, "gpu:");
	if (!gpu_spec)
		return;		/* No GPU frequency spec */

	/* Copy what follows "gpu:" and cut it at the first ';', if any */
	gpu_freq = xstrdup(gpu_spec + 4);
	end = strchr(gpu_freq, ';');
	if (end)
		*end = '\0';

	// Save a copy of the GPUs affected, so we can reset things afterwards
	FREE_NULL_BITMAP(saved_gpus);
	saved_gpus = bit_copy(usable_gpus);

	rsmi_init(0);

	// Set the frequency of each GPU index specified in the bitstr
	_set_freq(usable_gpus, gpu_freq);
	xfree(gpu_freq);
}
1176
gpu_p_step_hardware_fini(void)1177 extern void gpu_p_step_hardware_fini(void)
1178 {
1179 if (!saved_gpus)
1180 return;
1181
1182 // Reset the frequencies back to the hardware default
1183 _reset_freq(saved_gpus);
1184 FREE_NULL_BITMAP(saved_gpus);
1185 rsmi_shut_down();
1186 }
1187
/*
 * Stub: this plugin performs no CPU range conversion.
 *
 * cpu_range	(IN) not examined
 *
 * RET NULL (always)
 */
extern char *gpu_p_test_cpu_conv(char *cpu_range)
{
	char *converted = NULL;

	return converted;
}
1192