1 /*****************************************************************************\
2 * switch_cray_aries.c - Library for managing a switch on a Cray/Aries system.
3 *****************************************************************************
4 * Copyright (C) 2013 SchedMD LLC
5 * Copyright 2013 Cray Inc. All Rights Reserved.
6 * Written by Danny Auble <da@schedmd.com>
7 *
8 * This file is part of Slurm, a resource management program.
9 * For details, see <https://slurm.schedmd.com/>.
10 * Please also read the included file: DISCLAIMER.
11 *
12 * Slurm is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 * In addition, as a special exception, the copyright holders give permission
18 * to link the code of portions of this program with the OpenSSL library under
19 * certain conditions as described in each individual source file, and
20 * distribute linked combinations including the two. You must obey the GNU
21 * General Public License in all respects for all of the code used other than
22 * OpenSSL. If you modify file(s) with this exception, you may extend this
23 * exception to your version of the file(s), but you are not obligated to do
24 * so. If you do not wish to do so, delete this exception statement from your
25 * version. If you delete this exception statement from all source files in
26 * the program, then also delete it here.
27 *
28 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
31 * details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with Slurm; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 \*****************************************************************************/
37
38 #include "config.h"
39
40 #define _GNU_SOURCE
41
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <inttypes.h>
45 #include <limits.h>
46 #ifdef __linux__
47 #include <linux/limits.h>
48 #endif
49 #include <math.h>
50 #include <sched.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <sys/stat.h>
56 #include <unistd.h>
57
58 #include "switch_cray_aries.h"
59 #include "slurm/slurm.h"
60 #include "slurm/slurm_errno.h"
61 #include "src/common/pack.h"
62 #include "src/common/gres.h"
63
64 #ifdef HAVE_NATIVE_CRAY
65 #include <job.h> /* Cray's job module component */
66 #endif
67
/*
 * NOTE(review): neither of these two macros is referenced by the code
 * visible in this file -- confirm they are still needed.
 */
#define SWITCH_BUF_SIZE (PORT_CNT + 128)
#define SWITCH_CRAY_STATE_VERSION "PROTOCOL_VERSION"

/*
 * Cached result of slurm_get_debug_flags(); refreshed by init() and
 * switch_p_reconfig() and tested against DEBUG_FLAG_* bits before logging.
 */
uint64_t debug_flags = 0;

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
/*
 * Set to true by switch_p_job_init() when the launch parameters contain
 * "lustre_no_flush"; while true, switch_p_job_postfini() skips the Lustre
 * and virtual memory cache flush.
 */
static bool lustre_no_flush = false;
#endif
76
77 /*
78 * These variables are required by the generic plugin interface. If they
79 * are not found in the plugin, the plugin loader will ignore it.
80 *
81 * plugin_name - a string giving a human-readable description of the
82 * plugin. There is no maximum length, but the symbol must refer to
83 * a valid string.
84 *
85 * plugin_type - a string suggesting the type of the plugin or its
86 * applicability to a particular form of data or method of data handling.
87 * If the low-level plugin API is used, the contents of this string are
88 * unimportant and may be anything. Slurm uses the higher-level plugin
89 * interface which requires this string to be of the form
90 *
91 * <application>/<method>
92 *
93 * where <application> is a description of the intended application of
94 * the plugin (e.g., "switch" for Slurm switch) and <method> is a description
95 * of how this plugin satisfies that application. Slurm will only load
96 * a switch plugin if the plugin_type string has a prefix of "switch/".
97 *
98 * plugin_version - an unsigned 32-bit integer containing the Slurm version
99 * (major.minor.micro combined into a single number).
100 */
const char plugin_name[] = "switch Cray/Aries plugin";
const char plugin_type[] = "switch/cray_aries";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * NOTE(review): plugin_id is not described by the comment above; presumably
 * it is the numeric identifier for this switch plugin -- confirm.
 */
const uint32_t plugin_id = SWITCH_PLUGIN_CRAY;
105
106 /*
107 * init() is called when the plugin is loaded, before any other functions
108 * are called. Put global initialization here.
109 */
init(void)110 int init(void)
111 {
112 debug("%s loaded.", plugin_name);
113 debug_flags = slurm_get_debug_flags();
114
115 #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
116 start_lease_extender();
117 #endif
118 return SLURM_SUCCESS;
119 }
120
fini(void)121 int fini(void)
122 {
123 #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
124 cleanup_lease_extender();
125 #endif
126
127 return SLURM_SUCCESS;
128 }
129
switch_p_reconfig(void)130 extern int switch_p_reconfig(void)
131 {
132 debug_flags = slurm_get_debug_flags();
133 return SLURM_SUCCESS;
134 }
135
136 /*
137 * switch functions for global state save/restore
138 */
switch_p_libstate_save(char * dir_name)139 extern int switch_p_libstate_save(char *dir_name)
140 {
141 return SLURM_SUCCESS;
142 }
143
switch_p_libstate_restore(char * dir_name,bool recover)144 extern int switch_p_libstate_restore(char *dir_name, bool recover)
145 {
146 #ifdef HAVE_NATIVE_CRAY
147 char *file_name;
148 struct stat st;
149
150 file_name = xstrdup(dir_name);
151 xstrcat(file_name, "/switch_cray_state");
152 if (stat(file_name, &st) == 0) {
153 error("%s no longer used, please remove it, kill all running "
154 "jobs, and set MpiParams in slurm.conf", file_name);
155 xfree(file_name);
156 return SLURM_ERROR;
157 }
158 xfree(file_name);
159 #endif
160 return SLURM_SUCCESS;
161 }
162
switch_p_libstate_clear(void)163 extern int switch_p_libstate_clear(void)
164 {
165 return SLURM_SUCCESS;
166 }
167
168 /*
169 * switch functions for job step specific credential
170 */
/*
 * Allocate a jobinfo structure with no cookies and store it in *switch_job.
 *
 * switch_job - out: receives the new structure (must not be NULL)
 * job_id     - job id, combined with step_id by SLURM_ID_HASH() into the apid
 * step_id    - step id
 *
 * Always returns SLURM_SUCCESS.
 */
extern int switch_p_alloc_jobinfo(
	switch_jobinfo_t **switch_job, uint32_t job_id, uint32_t step_id)
{
	slurm_cray_jobinfo_t *jobinfo;

	xassert(switch_job);

	jobinfo = xmalloc(sizeof(*jobinfo));
	jobinfo->apid = SLURM_ID_HASH(job_id, step_id);
	jobinfo->cookie_ids = NULL;
	jobinfo->cookies = NULL;
	jobinfo->num_cookies = 0;
	jobinfo->magic = CRAY_JOBINFO_MAGIC;

	*switch_job = (switch_jobinfo_t *) jobinfo;

	return SLURM_SUCCESS;
}
186
/*
 * Lease network cookies for a step (Cray builds only; a no-op elsewhere).
 *
 * switch_job  - jobinfo to fill in; NULL or a "null magic" jobinfo is
 *               accepted and left untouched
 * step_layout - supplies the node list and the expected node count
 * network     - not used
 *
 * Returns SLURM_SUCCESS, SLURM_ERROR when the node list cannot be converted,
 * or the return code of lease_cookies() when it fails.
 */
extern int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
				  slurm_step_layout_t *step_layout,
				  char *network)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	int rc, cnt = 0;
	int32_t *nodes = NULL;
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;
	DEF_TIMERS;

	START_TIMER;

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

	/* Get the list of nodes used for the cookie lease */
	rc = list_str_to_array(step_layout->node_list, &cnt, &nodes);
	if (rc < 0) {
		CRAY_ERR("list_str_to_array failed");
		return SLURM_ERROR;
	}
	if (step_layout->node_cnt != cnt) {
		/*
		 * cnt is an int and the expected count is unsigned: each
		 * value gets the conversion that matches its type. The
		 * mismatch is reported but, as before, not treated as fatal.
		 * NOTE(review): lease_cookies() is still handed node_cnt
		 * entries; confirm that is safe when cnt is smaller.
		 */
		CRAY_ERR("list_str_to_array returned count %d"
			 " does not match expected count %" PRIu32,
			 cnt, (uint32_t) step_layout->node_cnt);
	}

	/* Get cookies for network configuration */
	rc = lease_cookies(job, nodes, step_layout->node_cnt);

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	xfree(nodes);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}
#endif
	return SLURM_SUCCESS;
}
232
/*
 * Deep-copy a jobinfo structure.
 *
 * source - structure to copy (must not be NULL)
 * dest   - out: receives a newly allocated copy that owns its own cookie,
 *          cookie id and ptag arrays
 *
 * Always returns SLURM_SUCCESS.
 */
extern int switch_p_duplicate_jobinfo(switch_jobinfo_t *source,
				      switch_jobinfo_t **dest)
{
	slurm_cray_jobinfo_t *new;
	slurm_cray_jobinfo_t *old = (slurm_cray_jobinfo_t *) source;

	xassert(old);

	new = xmalloc(sizeof(slurm_cray_jobinfo_t));
	memcpy(new, old, sizeof(slurm_cray_jobinfo_t));

	/*
	 * The memcpy above also copied the source's array pointers. Drop
	 * them so the copy never aliases memory owned by the source, then
	 * rebuild each array that is actually present.
	 */
	new->cookie_ids = NULL;
	new->cookies = NULL;
	new->ptags = NULL;

	if (old->num_cookies) {
		uint32_t i;

		if (old->cookie_ids) {
			new->cookie_ids = xcalloc(old->num_cookies,
						  sizeof(uint32_t));
			memcpy(new->cookie_ids, old->cookie_ids,
			       sizeof(uint32_t) * old->num_cookies);
		}
		if (old->cookies) {
			new->cookies = xcalloc(old->num_cookies,
					       sizeof(char *));
			for (i = 0; i < old->num_cookies; i++)
				new->cookies[i] = xstrdup(old->cookies[i]);
		}
	}

	if (old->num_ptags && old->ptags) {
		new->ptags = xcalloc(old->num_ptags, sizeof(int));
		memcpy(new->ptags, old->ptags, sizeof(int) * old->num_ptags);
	}

	*dest = (switch_jobinfo_t *) new;
	return SLURM_SUCCESS;
}
262
263 /*
264 *
265 */
switch_p_free_jobinfo(switch_jobinfo_t * switch_job)266 extern void switch_p_free_jobinfo(switch_jobinfo_t *switch_job)
267 {
268 slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;
269 int i;
270 DEF_TIMERS;
271
272 START_TIMER;
273
274 if (!job) {
275 CRAY_DEBUG("switch_job was NULL");
276 return;
277 }
278
279 if (job->magic == CRAY_NULL_JOBINFO_MAGIC) {
280 CRAY_DEBUG("switch_job was NULL MAGIC");
281 goto endit;
282 } else if (job->magic != CRAY_JOBINFO_MAGIC) {
283 CRAY_ERR("job is not a switch/cray slurm_cray_jobinfo_t");
284 return;
285 }
286
287 job->magic = 0;
288
289 /*
290 * Free the cookies and the cookie_ids.
291 */
292 if (job->num_cookies != 0) {
293 xfree(job->cookie_ids);
294
295 if (job->cookies) {
296 // Free the individual cookie strings.
297 for (i = 0; i < job->num_cookies; i++) {
298 xfree(job->cookies[i]);
299 }
300 xfree(job->cookies);
301 }
302 }
303 if (job->num_ptags)
304 xfree(job->ptags);
305 endit:
306 xfree(job);
307 END_TIMER;
308 if (debug_flags & DEBUG_FLAG_TIME_CRAY)
309 INFO_LINE("call took: %s", TIME_STR);
310
311 return;
312 }
313
/*
 * Serialize a jobinfo structure into buffer.
 *
 * A NULL or "null magic" jobinfo is encoded as the single value
 * CRAY_NULL_JOBINFO_MAGIC. Otherwise the magic, cookie count, cookies,
 * cookie ids and apid are packed in that order, provided protocol_version
 * is at least SLURM_MIN_PROTOCOL_VERSION.
 *
 * Always returns 0.
 */
extern int switch_p_pack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
				 uint16_t protocol_version)
{
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;

	xassert(buffer);

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		/* Nothing to send; the marker stops the unpack side early. */
		pack32(CRAY_NULL_JOBINFO_MAGIC, buffer);
		return 0;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("switch_jobinfo_t contents:");
		print_jobinfo(job);
	}

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
		return 0;

	pack32(job->magic, buffer);
	pack32(job->num_cookies, buffer);
	packstr_array(job->cookies, job->num_cookies, buffer);
	pack32_array(job->cookie_ids, job->num_cookies, buffer);
	pack64(job->apid, buffer);

	return 0;
}
347
/*
 * Rebuild a jobinfo structure from buffer.
 *
 * switch_job       - out: receives the new structure, or NULL on error;
 *                    when the pointer itself is NULL nothing is unpacked
 * buffer           - source buffer (must not be NULL)
 * protocol_version - nothing is read for versions older than
 *                    SLURM_MIN_PROTOCOL_VERSION
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the buffer is truncated, the
 * magic is not recognized, or the array lengths disagree with the packed
 * cookie count.
 */
extern int switch_p_unpack_jobinfo(switch_jobinfo_t **switch_job, Buf buffer,
				   uint16_t protocol_version)
{
	uint32_t num_cookies = 0, num_cookie_ids = 0;
	slurm_cray_jobinfo_t *job;

	if (!switch_job) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(buffer);

	job = xmalloc(sizeof(slurm_cray_jobinfo_t));
	*switch_job = (switch_jobinfo_t *)job;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack32(&job->magic, buffer);

		if (job->magic == CRAY_NULL_JOBINFO_MAGIC) {
			CRAY_DEBUG("Nothing to unpack");
			return SLURM_SUCCESS;
		}

		/*
		 * The magic comes off the wire, so it is checked at run time
		 * instead of only by an assertion.
		 */
		if (job->magic != CRAY_JOBINFO_MAGIC) {
			CRAY_ERR("Unexpected jobinfo magic received");
			goto unpack_error;
		}

		safe_unpack32(&(job->num_cookies), buffer);
		safe_unpackstr_array(&(job->cookies), &num_cookies, buffer);
		if (num_cookies != job->num_cookies) {
			CRAY_ERR("Wrong number of cookies received."
				 " Expected: %" PRIu32 " Received: %" PRIu32,
				 job->num_cookies, num_cookies);
			/*
			 * Record the length of the array that was actually
			 * built so the cleanup does not walk past its end.
			 */
			job->num_cookies = num_cookies;
			goto unpack_error;
		}
		safe_unpack32_array(&(job->cookie_ids), &num_cookie_ids,
				    buffer);
		if (num_cookie_ids != job->num_cookies) {
			CRAY_ERR("Wrong number of cookie IDs received."
				 " Expected: %" PRIu32 " Received: %" PRIu32,
				 job->num_cookies, num_cookie_ids);
			/*
			 * switch_p_free_jobinfo() skips cookie_ids when
			 * num_cookies is 0, so release the array here.
			 */
			xfree(job->cookie_ids);
			goto unpack_error;
		}
		safe_unpack64(&job->apid, buffer);
	}

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	/*
	 * On recovery, we want to keep extending the life of
	 * cookies still in use. So lets track these cookies
	 * with the lease extender. Duplicate cookies are ignored.
	 */
	track_cookies(job);
#endif

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("Unpacked jobinfo");
		print_jobinfo(job);
	}

	return SLURM_SUCCESS;

unpack_error:

	CRAY_ERR("Unpacking error");
	if (job->magic == CRAY_JOBINFO_MAGIC) {
		switch_p_free_jobinfo(*switch_job);
	} else {
		/*
		 * switch_p_free_jobinfo() will not free a structure whose
		 * magic is invalid. Nothing but the structure itself has
		 * been allocated when the magic is missing or wrong.
		 */
		xfree(job);
	}
	*switch_job = NULL;

	return SLURM_ERROR;
}
415
/* Printing to a stream is not implemented; both arguments are ignored. */
extern void switch_p_print_jobinfo(FILE *fp, switch_jobinfo_t *jobinfo)
{
	(void) fp;
	(void) jobinfo;
}
420
/*
 * Write an empty string into buf.
 *
 * Returns buf when it is non-NULL and size is non-zero, NULL otherwise.
 * switch_jobinfo is not examined.
 */
extern char *switch_p_sprint_jobinfo(switch_jobinfo_t *switch_jobinfo,
				     char *buf, size_t size)
{
	if (!buf || !size)
		return NULL;

	buf[0] = '\0';
	return buf;
}
431
432 /*
433 * switch functions for job initiation
434 */
switch_p_node_init(void)435 extern int switch_p_node_init(void)
436 {
437 return SLURM_SUCCESS;
438 }
439
switch_p_node_fini(void)440 extern int switch_p_node_fini(void)
441 {
442 return SLURM_SUCCESS;
443 }
444
/* Nothing happens before job init; jobinfo is ignored. Returns SLURM_SUCCESS. */
extern int switch_p_job_preinit(switch_jobinfo_t *jobinfo)
{
	(void) jobinfo;

	return SLURM_SUCCESS;
}
449
/*
 * Prepare the network and related state for a step (Cray builds only; a
 * no-op returning SLURM_SUCCESS elsewhere).
 *
 * In order, as far as the build options enable each part:
 *   - attach to the cncu container and create the apid directory
 *   - build the alpsc PE info
 *   - read the launch parameters ("cray_net_exclusive", "lustre_no_flush")
 *   - configure the NIC and keep a copy of the returned ptags in sw_job
 *   - write the IAA file
 *   - set network performance counter permissions
 *   - write the placement file(s)
 *   - export the job environment, set up the GPU, set the job's apid
 *
 * job - step record; job->switch_job may be NULL or carry a "null magic"
 *       jobinfo, in which case nothing is done
 *
 * Returns SLURM_SUCCESS, SLURM_ERROR, or the failing helper's return code.
 */
extern int switch_p_job_init(stepd_step_rec_t *job)
{

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *sw_job = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	int rc, num_ptags = 0;
	char *launch_params;
	int exclusive = 0, mem_scaling = 100, cpu_scaling = 100;
	int *ptags = NULL;
	char *err_msg = NULL;
	uint64_t cont_id = job->cont_id;
	alpsc_peInfo_t alpsc_pe_info = {-1, -1, -1, -1, NULL, NULL, NULL};
	int cmd_index = 0;
#ifdef HAVE_NATIVE_CRAY
	uint64_t gpu_cnt = 0;
	int control_nid = 0, num_branches = 0;
	struct sockaddr_in control_soc;
	alpsc_branchInfo_t alpsc_branch_info;
	uint32_t jobid;
#endif

#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	char *npc = "none";
	int access = ALPSC_NET_PERF_CTR_NONE;
#endif
	DEF_TIMERS;

	START_TIMER;

#ifdef HAVE_CRAY_NETWORK
	/* No PAGG job containers; uid used instead to configure network */
	cont_id = (uint64_t)job->uid;
#endif

	if (!sw_job || (sw_job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("job->switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->msg);
	xassert(sw_job->magic == CRAY_JOBINFO_MAGIC);

#ifdef HAVE_NATIVE_CRAY
	/* Attach to the cncu container */
	if (job->het_job_id && (job->het_job_id != NO_VAL))
		jobid = job->het_job_id;
	else
		jobid = job->jobid;
	rc = alpsc_attach_cncu_container(&err_msg, jobid, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_attach_cncu_container");
	if (rc != 1) {
		return SLURM_ERROR;
	}

	/* Create the apid directory */
	rc = create_apid_dir(sw_job->apid, job->uid, job->gid);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}

	/*
	 * Not defined yet -- This one may be skipped because we may not need to
	 * find the PAGG JOB container based on the APID. It is part of the
	 * stepd_step_rec_t struct in the cont_id member, so if we have access
	 * to the struct, then we have access to the JOB container.
	 */

	/* alpsc_set_PAGG_apid() */
#endif
	/*
	 * Fill in the alpsc_pe_info structure
	 */
	rc = build_alpsc_pe_info(job, &alpsc_pe_info, &cmd_index);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}

	/*
	 * Configure the network
	 *
	 * Cray shmem still uses the network, even when it's using only one
	 * node, so we must always configure the network.
	 */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "cray_net_exclusive")) {
		/*
		 * Grant exclusive access and all aries resources to the job.
		 * Not recommended if you may run multiple steps within
		 * the job, and will cause problems if you suspend or allow
		 * nodes to be shared across multiple jobs.
		 */
		/*
		 * TODO: determine if this can be managed per-job, rather
		 * than globally across the cluster.
		 */
		exclusive = 1;
	}
	if (launch_params && strstr(launch_params, "lustre_no_flush")) {
		/* Lustre cache flush can cause job bus errors, see bug 4309 */
		lustre_no_flush = true;
	}
	xfree(launch_params);

	if (!exclusive) {
		/*
		 * Calculate percentages of cpu and mem to assign to
		 * non-exclusive jobs.
		 */

		cpu_scaling = get_cpu_scaling(job);
		if (cpu_scaling == -1) {
			/* Release the PE info like the later error paths do */
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}

		mem_scaling = get_mem_scaling(job);
		if (mem_scaling == -1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("Network Scaling: Exclusive %d CPU %d Memory %d",
			  exclusive, cpu_scaling, mem_scaling);
	}

	rc = alpsc_configure_nic(&err_msg, exclusive, cpu_scaling, mem_scaling,
				 cont_id, sw_job->num_cookies,
				 (const char **) sw_job->cookies,
				 &num_ptags, &ptags, NULL);
	ALPSC_CN_DEBUG("alpsc_configure_nic");
	if (rc != 1) {
		free(ptags);
		free_alpsc_pe_info(&alpsc_pe_info);
		return SLURM_ERROR;
	}
	/*
	 * xmalloc the ptags and copy the ptag array to the xmalloced
	 * space, so they can be xfreed later
	 */
	if (num_ptags) {
		sw_job->ptags = xcalloc(num_ptags, sizeof(int));
		memcpy(sw_job->ptags, ptags, sizeof(int) * num_ptags);
		sw_job->num_ptags = num_ptags;
	}
	/*
	 * Release the library's array whether or not it held entries, as
	 * the error path above already does; free(NULL) is a no-op.
	 */
	free(ptags);
	ptags = NULL;

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	/* Write the IAA file */
	rc = write_iaa_file(job, sw_job, sw_job->ptags, sw_job->num_ptags,
			    &alpsc_pe_info);
	if (rc != SLURM_SUCCESS) {
		free_alpsc_pe_info(&alpsc_pe_info);
		return rc;
	}
#endif

#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	/*
	 * If there is reserved access to network performance counters,
	 * configure the appropriate access permission in the kernel.
	 */
	access = ALPSC_NET_PERF_CTR_NONE;
	select_g_select_jobinfo_get(job->msg->select_jobinfo,
				    SELECT_JOBDATA_NETWORK, &npc);
	CRAY_DEBUG("network performance counters SELECT_JOBDATA_NETWORK %s",
		   npc);
	if (xstrcasecmp(npc, "system") == 0) {
		access = ALPSC_NET_PERF_CTR_SYSTEM;
	} else if (xstrcasecmp(npc, "blade") == 0) {
		access = ALPSC_NET_PERF_CTR_BLADE;
	}
	if (access != ALPSC_NET_PERF_CTR_NONE) {
		rc = alpsc_set_perf_ctr_perms(&err_msg, job->cont_id, access);
		ALPSC_CN_DEBUG("alpsc_set_perf_ctr_perms");
		if (rc != 1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}

	/*
	 * Some of the input parameters for alpsc_write_placement_file do not
	 * apply for Slurm. These parameters will be given zero values.
	 * They are
	 * int control_nid
	 * struct sockaddr_in control_soc
	 * int num_branches
	 * alpsc_branchInfo_t alpsc_branch_info
	 */
	control_soc.sin_port = 0;
	control_soc.sin_addr.s_addr = 0;
	/* Just assigning control_soc because it's already zero. */
	alpsc_branch_info.tAddr = control_soc;
	alpsc_branch_info.tIndex = 0;
	alpsc_branch_info.tLen = 0;
	alpsc_branch_info.targ = 0;

	rc = alpsc_write_placement_file(&err_msg, sw_job->apid, cmd_index,
					&alpsc_pe_info, control_nid,
					control_soc, num_branches,
					&alpsc_branch_info);

	ALPSC_CN_DEBUG("alpsc_write_placement_file");
	if (rc != 1) {
		free_alpsc_pe_info(&alpsc_pe_info);
		return SLURM_ERROR;
	}

	/*
	 * Also write a placement file with the legacy apid to support old
	 * statically linked Cray PMI applications. We can't simply symlink
	 * the old format to the new because the apid is written to the file.
	 */
	if (sw_job->apid != SLURM_ID_HASH_LEGACY(sw_job->apid)) {
		rc = alpsc_write_placement_file(&err_msg,
			SLURM_ID_HASH_LEGACY(sw_job->apid),
			cmd_index, &alpsc_pe_info, control_nid, control_soc,
			num_branches, &alpsc_branch_info);
		ALPSC_CN_DEBUG("alpsc_write_placement_file");
		if (rc != 1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}
#endif
	/* Clean up alpsc_pe_info */
	free_alpsc_pe_info(&alpsc_pe_info);
	/*
	 * Write some environment variables used by LLI and PMI
	 */
	rc = set_job_env(job, sw_job);
	if (rc != SLURM_SUCCESS)
		return rc;

#ifdef HAVE_NATIVE_CRAY
	/*
	 * Query the generic resources to see if the GPU should be allocated
	 */

	rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
				GRES_STEP_DATA_COUNT, &gpu_cnt);
	CRAY_INFO("gres_cnt: %d %"PRIu64, rc, gpu_cnt);
	if (gpu_cnt > 0)
		setup_gpu(job);

	/*
	 * Set the Job's APID
	 */
	job_setapid(getpid(), sw_job->apid);
#endif

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif

	return SLURM_SUCCESS;
}
707
/* No check is made here; jobinfo is ignored. Always returns SLURM_SUCCESS. */
extern int switch_p_job_suspend_test(switch_jobinfo_t *jobinfo)
{
	(void) jobinfo;

	return SLURM_SUCCESS;
}
712
/* No suspend information is produced; *suspend_info is left untouched. */
extern void switch_p_job_suspend_info_get(switch_jobinfo_t *jobinfo,
					  void **suspend_info)
{
	(void) jobinfo;
	(void) suspend_info;
}
/* Nothing is written to buffer; all arguments are ignored. */
extern void switch_p_job_suspend_info_pack(void *suspend_info, Buf buffer,
					   uint16_t protocol_version)
{
	(void) suspend_info;
	(void) buffer;
	(void) protocol_version;
}
723
/*
 * Nothing is read from buffer and *suspend_info is left untouched.
 * Always returns SLURM_SUCCESS.
 */
extern int switch_p_job_suspend_info_unpack(void **suspend_info, Buf buffer,
					    uint16_t protocol_version)
{
	(void) suspend_info;
	(void) buffer;
	(void) protocol_version;

	return SLURM_SUCCESS;
}
729
/* Nothing is released here; suspend_info is ignored. */
extern void switch_p_job_suspend_info_free(void *suspend_info)
{
	(void) suspend_info;
}
734
switch_p_job_suspend(void * suspend_info,int max_wait)735 extern int switch_p_job_suspend(void *suspend_info, int max_wait)
736 {
737 return SLURM_SUCCESS;
738 }
739
switch_p_job_resume(void * suspend_info,int max_wait)740 extern int switch_p_job_resume(void *suspend_info, int max_wait)
741 {
742 return SLURM_SUCCESS;
743 }
744
/*
 * Per-step cleanup (Cray builds only; a no-op elsewhere): on native Cray
 * remove the spool files for the apid, then unlink the IAA file.
 *
 * jobinfo - step jobinfo; NULL or a "null magic" jobinfo is reported and
 *           treated as success
 *
 * Returns SLURM_SUCCESS, or the return code of remove_spool_files() when it
 * fails.
 */
extern int switch_p_job_fini(switch_jobinfo_t *jobinfo)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) jobinfo;
	DEF_TIMERS;

	START_TIMER;

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_ERR("jobinfo pointer was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

#ifdef HAVE_NATIVE_CRAY
	int spool_rc = remove_spool_files(job->apid);

	if (spool_rc != SLURM_SUCCESS)
		return spool_rc;
#endif

	/* Remove the IAA file */
	unlink_iaa_file(job);

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
779
/*
 * Final per-step cleanup (Cray builds only; a no-op elsewhere).
 *
 * Sends SIGKILL to the process group of job->jmgr_pid, resets the GPU on
 * native Cray when the step held one, and, unless "lustre_no_flush" was
 * requested, flushes the Lustre caches and drops the VM caches.
 *
 * job - step record
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the Lustre flush fails.
 */
extern int switch_p_job_postfini(stepd_step_rec_t *job)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	/*
	 * Keep the id in a signed pid_t: negating it must yield a negative
	 * value for kill(), and a negative or zero id must never be used.
	 */
	pid_t pgid = job->jmgr_pid;
#ifdef HAVE_NATIVE_CRAY
	uint64_t gpu_cnt = 0;
#endif
	DEF_TIMERS;

	START_TIMER;

	if (!job->switch_job) {
		CRAY_DEBUG("job->switch_job was NULL");
	}

	/*
	 * Kill all processes in the job's session
	 */
	if (pgid > 0) {
		CRAY_DEBUG("Sending SIGKILL to pgid %lu", (unsigned long) pgid);
		kill(-pgid, SIGKILL);
	} else
		CRAY_INFO("Job %u.%u: Bad pid value %ld",
			  job->jobid, job->stepid, (long) pgid);
	/*
	 * Clean-up
	 *
	 * 0. Reset GPU proxy
	 * 1. Flush Lustre caches
	 * 2. Flush virtual memory
	 * 3. Compact memory
	 */

#ifdef HAVE_NATIVE_CRAY
	/* Set the proxy back to the default state. */
	rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
				GRES_STEP_DATA_COUNT, &gpu_cnt);
	if (gpu_cnt > 0) {
		reset_gpu(job);
	}
#endif
	if (!lustre_no_flush) {
		/* Flush Lustre Cache */
		rc = alpsc_flush_lustre(&err_msg);
		ALPSC_CN_DEBUG("alpsc_flush_lustre");
		if (rc != 1) {
			return SLURM_ERROR;
		}

		/* Flush virtual memory */
		rc = system("echo 3 > /proc/sys/vm/drop_caches");
		if (rc != -1) {
			rc = WEXITSTATUS(rc);
		}
		if (rc) {
			CRAY_ERR("Flushing virtual memory failed. Return code: %d",
				 rc);
		}
	}

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
848
/* No attach-time work is done; every argument is ignored. Returns SLURM_SUCCESS. */
extern int switch_p_job_attach(switch_jobinfo_t *jobinfo, char ***env,
			       uint32_t nodeid, uint32_t procid,
			       uint32_t nnodes, uint32_t nprocs,
			       uint32_t rank)
{
	(void) jobinfo;
	(void) env;
	(void) nodeid;
	(void) procid;
	(void) nnodes;
	(void) nprocs;
	(void) rank;

	return SLURM_SUCCESS;
}
856
/*
 * No key is supported: sets the Slurm errno to EINVAL and returns
 * SLURM_ERROR. resulting_data is never written.
 */
extern int switch_p_get_jobinfo(switch_jobinfo_t *switch_job, int key,
				void *resulting_data)
{
	(void) switch_job;
	(void) key;
	(void) resulting_data;

	slurm_seterrno(EINVAL);

	return SLURM_ERROR;
}
863
864 /*
865 * node switch state monitoring functions
866 * required for IBM Federation switch
867 */
switch_p_clear_node_state(void)868 extern int switch_p_clear_node_state(void)
869 {
870 return SLURM_SUCCESS;
871 }
872
/* Nothing is allocated; *switch_node is left untouched. Returns SLURM_SUCCESS. */
extern int switch_p_alloc_node_info(switch_node_info_t **switch_node)
{
	(void) switch_node;

	return SLURM_SUCCESS;
}
877
/* Nothing is filled in; switch_node is ignored. Returns SLURM_SUCCESS. */
extern int switch_p_build_node_info(switch_node_info_t *switch_node)
{
	(void) switch_node;

	return SLURM_SUCCESS;
}
882
/* Nothing is written to buffer; all arguments are ignored. Always returns 0. */
extern int switch_p_pack_node_info(switch_node_info_t *switch_node, Buf buffer,
				   uint16_t protocol_version)
{
	(void) switch_node;
	(void) buffer;
	(void) protocol_version;

	return 0;
}
888
/*
 * Nothing is read from buffer and *switch_node is left untouched.
 * Always returns SLURM_SUCCESS.
 */
extern int switch_p_unpack_node_info(switch_node_info_t **switch_node,
				     Buf buffer, uint16_t protocol_version)
{
	(void) switch_node;
	(void) buffer;
	(void) protocol_version;

	return SLURM_SUCCESS;
}
894
/* Nothing is released; switch_node is ignored. Returns SLURM_SUCCESS. */
extern int switch_p_free_node_info(switch_node_info_t **switch_node)
{
	(void) switch_node;

	return SLURM_SUCCESS;
}
899
/*
 * Write an empty string into buf.
 *
 * Returns buf when it is non-NULL and size is non-zero, NULL otherwise.
 * switch_node is not examined.
 */
extern char *switch_p_sprintf_node_info(switch_node_info_t *switch_node,
					char *buf, size_t size)
{
	if (!buf || !size)
		return NULL;

	buf[0] = '\0';
	return buf;
}
910
/*
 * Called when a step completes: on Cray builds hand the step's cookies to
 * release_cookies(). A no-op on other builds.
 *
 * jobinfo  - step jobinfo; NULL or a "null magic" jobinfo is skipped
 * nodelist - not used
 *
 * Returns SLURM_SUCCESS, or the return code of release_cookies() when it
 * fails.
 */
extern int switch_p_job_step_complete(switch_jobinfo_t *jobinfo,
				      char *nodelist)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *cray_job = (slurm_cray_jobinfo_t *) jobinfo;
	int release_rc = 0;
	DEF_TIMERS;

	START_TIMER;

	if (!cray_job || (cray_job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("switch_p_job_step_complete");
	}

	/* Release the cookies */
	release_rc = release_cookies(cray_job);
	if (release_rc != SLURM_SUCCESS)
		return release_rc;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
941
/* No action is taken; both arguments are ignored. Returns SLURM_SUCCESS. */
extern int switch_p_job_step_part_comp(switch_jobinfo_t *jobinfo,
				       char *nodelist)
{
	(void) jobinfo;
	(void) nodelist;

	return SLURM_SUCCESS;
}
947
/* Always reports false. */
extern bool switch_p_part_comp(void)
{
	const bool supported = false;

	return supported;
}
952
/* No action is taken; both arguments are ignored. Returns SLURM_SUCCESS. */
extern int switch_p_job_step_allocated(switch_jobinfo_t *jobinfo,
				       char *nodelist)
{
	(void) jobinfo;
	(void) nodelist;

	return SLURM_SUCCESS;
}
958
switch_p_slurmctld_init(void)959 extern int switch_p_slurmctld_init(void)
960 {
961 return SLURM_SUCCESS;
962 }
963
switch_p_slurmd_init(void)964 extern int switch_p_slurmd_init(void)
965 {
966 return SLURM_SUCCESS;
967 }
968
switch_p_slurmd_step_init(void)969 extern int switch_p_slurmd_step_init(void)
970 {
971 return SLURM_SUCCESS;
972 }
973
974 /*
975 * Functions for suspend/resume support
976 */
/*
 * Called before a step is suspended. On native Cray builds without
 * HAVE_CRAY_NETWORK this passes the step's ptags to alpsc_pre_suspend();
 * on every other build it does nothing.
 *
 * job - step record; when it carries no switch jobinfo there are no ptags
 *       to hand over and the call is skipped
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when alpsc_pre_suspend() does not
 * return 1.
 */
extern int switch_p_job_step_pre_suspend(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_pre_suspend(job %u.%u)",
	     job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *jobinfo = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	char *err_msg = NULL;
	int rc;
	DEF_TIMERS;

	START_TIMER;

	/*
	 * Same guard as switch_p_job_init(): without a jobinfo the
	 * dereferences below would fault.
	 */
	if (!jobinfo) {
		CRAY_DEBUG("job->switch_job was NULL");
		return SLURM_SUCCESS;
	}

	rc = alpsc_pre_suspend(&err_msg, job->cont_id, jobinfo->ptags,
			       jobinfo->num_ptags, SUSPEND_TIMEOUT_MSEC);
	ALPSC_CN_DEBUG("alpsc_pre_suspend");
	if (rc != 1) {
		return SLURM_ERROR;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1004
/*
 * Called after a step has been suspended. On native Cray builds without
 * HAVE_CRAY_NETWORK this calls alpsc_post_suspend() for the step's
 * container id; on every other build it does nothing.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when alpsc_post_suspend() does not
 * return 1.
 */
extern int switch_p_job_step_post_suspend(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_post_suspend(job %u.%u)",
	     job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	DEF_TIMERS;

	START_TIMER;

	rc = alpsc_post_suspend(&err_msg, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_post_suspend");
	if (rc != 1)
		return SLURM_ERROR;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1029
/*
 * Called before a suspended step is resumed. On native Cray builds without
 * HAVE_CRAY_NETWORK this passes the step's ptags to alpsc_pre_resume(); on
 * every other build it does nothing.
 *
 * job - step record; when it carries no switch jobinfo there are no ptags
 *       to hand over and the call is skipped
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when alpsc_pre_resume() does not
 * return 1.
 */
extern int switch_p_job_step_pre_resume(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_pre_resume(job %u.%u)",
	     job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *jobinfo = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	char *err_msg = NULL;
	int rc;
	DEF_TIMERS;

	START_TIMER;

	/*
	 * Same guard as switch_p_job_init(): without a jobinfo the
	 * dereferences below would fault.
	 */
	if (!jobinfo) {
		CRAY_DEBUG("job->switch_job was NULL");
		return SLURM_SUCCESS;
	}

	rc = alpsc_pre_resume(&err_msg, job->cont_id, jobinfo->ptags,
			      jobinfo->num_ptags);
	ALPSC_CN_DEBUG("alpsc_pre_resume");
	if (rc != 1) {
		return SLURM_ERROR;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1057
/*
 * Called after a step has been resumed. On native Cray builds without
 * HAVE_CRAY_NETWORK this calls alpsc_post_resume() for the step's container
 * id; on every other build it does nothing.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when alpsc_post_resume() does not
 * return 1.
 */
extern int switch_p_job_step_post_resume(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_post_resume(job %u.%u)",
	     job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	DEF_TIMERS;

	START_TIMER;

	rc = alpsc_post_resume(&err_msg, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_post_resume");
	if (rc != 1)
		return SLURM_ERROR;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1082