1 /*****************************************************************************\
2  *  switch_cray_aries.c - Library for managing a switch on a Cray/Aries system.
3  *****************************************************************************
4  *  Copyright (C) 2013 SchedMD LLC
5  *  Copyright 2013 Cray Inc. All Rights Reserved.
6  *  Written by Danny Auble <da@schedmd.com>
7  *
8  *  This file is part of Slurm, a resource management program.
9  *  For details, see <https://slurm.schedmd.com/>.
10  *  Please also read the included file: DISCLAIMER.
11  *
12  *  Slurm is free software; you can redistribute it and/or modify it under
13  *  the terms of the GNU General Public License as published by the Free
14  *  Software Foundation; either version 2 of the License, or (at your option)
15  *  any later version.
16  *
17  *  In addition, as a special exception, the copyright holders give permission
18  *  to link the code of portions of this program with the OpenSSL library under
19  *  certain conditions as described in each individual source file, and
20  *  distribute linked combinations including the two. You must obey the GNU
21  *  General Public License in all respects for all of the code used other than
22  *  OpenSSL. If you modify file(s) with this exception, you may extend this
23  *  exception to your version of the file(s), but you are not obligated to do
24  *  so. If you do not wish to do so, delete this exception statement from your
25  *  version.  If you delete this exception statement from all source files in
26  *  the program, then also delete it here.
27  *
28  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
31  *  details.
32  *
33  *  You should have received a copy of the GNU General Public License along
34  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
35  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
36 \*****************************************************************************/
37 
38 #include "config.h"
39 
40 #define _GNU_SOURCE
41 
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <inttypes.h>
45 #include <limits.h>
46 #ifdef __linux__
47 #include <linux/limits.h>
48 #endif
49 #include <math.h>
50 #include <sched.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <sys/stat.h>
56 #include <unistd.h>
57 
58 #include "switch_cray_aries.h"
59 #include "slurm/slurm.h"
60 #include "slurm/slurm_errno.h"
61 #include "src/common/pack.h"
62 #include "src/common/gres.h"
63 
64 #ifdef HAVE_NATIVE_CRAY
65 #include <job.h> /* Cray's job module component */
66 #endif
67 
/* NOTE(review): PORT_CNT is not defined in this file -- confirm where it comes from. */
#define SWITCH_BUF_SIZE (PORT_CNT + 128)
#define SWITCH_CRAY_STATE_VERSION "PROTOCOL_VERSION"

/*
 * Cached copy of slurm_get_debug_flags(); refreshed by init() and
 * switch_p_reconfig() and consulted before optional logging.
 */
uint64_t debug_flags = 0;

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
/*
 * Set to true by switch_p_job_init() when the launch parameters contain
 * "lustre_no_flush"; when true, switch_p_job_postfini() skips the Lustre
 * and virtual-memory cache flush.
 */
static bool lustre_no_flush = false;
#endif
76 
77 /*
78  * These variables are required by the generic plugin interface.  If they
79  * are not found in the plugin, the plugin loader will ignore it.
80  *
81  * plugin_name - a string giving a human-readable description of the
82  * plugin.  There is no maximum length, but the symbol must refer to
83  * a valid string.
84  *
85  * plugin_type - a string suggesting the type of the plugin or its
86  * applicability to a particular form of data or method of data handling.
87  * If the low-level plugin API is used, the contents of this string are
88  * unimportant and may be anything.  Slurm uses the higher-level plugin
89  * interface which requires this string to be of the form
90  *
91  *      <application>/<method>
92  *
93  * where <application> is a description of the intended application of
94  * the plugin (e.g., "switch" for Slurm switch) and <method> is a description
95  * of how this plugin satisfies that application.  Slurm will only load
96  * a switch plugin if the plugin_type string has a prefix of "switch/".
97  *
98  * plugin_version - an unsigned 32-bit integer containing the Slurm version
99  * (major.minor.micro combined into a single number).
100  */
const char plugin_name[] = "switch Cray/Aries plugin";
const char plugin_type[] = "switch/cray_aries";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/* Numeric identifier of this plugin; not covered by the description above. */
const uint32_t plugin_id      = SWITCH_PLUGIN_CRAY;
105 
106 /*
107  * init() is called when the plugin is loaded, before any other functions
108  * are called.  Put global initialization here.
109  */
init(void)110 int init(void)
111 {
112 	debug("%s loaded.", plugin_name);
113 	debug_flags = slurm_get_debug_flags();
114 
115 #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
116 	start_lease_extender();
117 #endif
118 	return SLURM_SUCCESS;
119 }
120 
fini(void)121 int fini(void)
122 {
123 #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
124 	cleanup_lease_extender();
125 #endif
126 
127 	return SLURM_SUCCESS;
128 }
129 
switch_p_reconfig(void)130 extern int switch_p_reconfig(void)
131 {
132 	debug_flags = slurm_get_debug_flags();
133 	return SLURM_SUCCESS;
134 }
135 
136 /*
137  * switch functions for global state save/restore
138  */
switch_p_libstate_save(char * dir_name)139 extern int switch_p_libstate_save(char *dir_name)
140 {
141 	return SLURM_SUCCESS;
142 }
143 
switch_p_libstate_restore(char * dir_name,bool recover)144 extern int switch_p_libstate_restore(char *dir_name, bool recover)
145 {
146 #ifdef HAVE_NATIVE_CRAY
147 	char *file_name;
148 	struct stat st;
149 
150 	file_name = xstrdup(dir_name);
151 	xstrcat(file_name, "/switch_cray_state");
152 	if (stat(file_name, &st) == 0) {
153 		error("%s no longer used, please remove it, kill all running "
154 		      "jobs, and set MpiParams in slurm.conf", file_name);
155 		xfree(file_name);
156 		return SLURM_ERROR;
157 	}
158 	xfree(file_name);
159 #endif
160 	return SLURM_SUCCESS;
161 }
162 
switch_p_libstate_clear(void)163 extern int switch_p_libstate_clear(void)
164 {
165 	return SLURM_SUCCESS;
166 }
167 
168 /*
169  * switch functions for job step specific credential
170  */
/*
 * switch_p_alloc_jobinfo - allocate an empty Cray job-step record.
 *
 * The new record carries CRAY_JOBINFO_MAGIC, has no cookies, and has its
 * apid set to SLURM_ID_HASH(job_id, step_id).
 *
 * OUT switch_job - receives the new record (must not be NULL)
 * IN job_id      - job id used to derive the apid
 * IN step_id     - step id used to derive the apid
 * RET SLURM_SUCCESS always
 */
extern int switch_p_alloc_jobinfo(
	switch_jobinfo_t **switch_job, uint32_t job_id, uint32_t step_id)
{
	slurm_cray_jobinfo_t *jobinfo;

	xassert(switch_job);

	jobinfo = xmalloc(sizeof(*jobinfo));
	jobinfo->magic = CRAY_JOBINFO_MAGIC;
	jobinfo->apid = SLURM_ID_HASH(job_id, step_id);

	/* Start with an empty cookie set. */
	jobinfo->cookie_ids = NULL;
	jobinfo->cookies = NULL;
	jobinfo->num_cookies = 0;

	*switch_job = (switch_jobinfo_t *) jobinfo;
	return SLURM_SUCCESS;
}
186 
/*
 * switch_p_build_jobinfo - lease network cookies for a job step.
 *
 * On Cray builds, converts step_layout->node_list into an array of node ids
 * with list_str_to_array() and passes it to lease_cookies().  On other
 * builds this is a no-op.
 *
 * IN/OUT switch_job - Cray job-step record; NULL or a record carrying
 *                     CRAY_NULL_JOBINFO_MAGIC is accepted and ignored
 * IN step_layout    - supplies node_list and node_cnt
 * IN network        - ignored
 * RET SLURM_SUCCESS on success or when there is nothing to do,
 *     SLURM_ERROR if the node list cannot be converted,
 *     otherwise the return code of lease_cookies()
 */
extern int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
				  slurm_step_layout_t *step_layout,
				  char *network)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	int rc, cnt = 0;
	int32_t *nodes = NULL;
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;
	DEF_TIMERS;

	START_TIMER;

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

	// Get the list of nodes used for the cookie lease
	rc = list_str_to_array(step_layout->node_list, &cnt, &nodes);
	if (rc < 0) {
		CRAY_ERR("list_str_to_array failed");
		return SLURM_ERROR;
	}
	if (step_layout->node_cnt != cnt) {
		/*
		 * cnt is an int and is printed with %d; the expected count is
		 * printed with PRIu32 (the cast keeps specifier and argument
		 * in agreement).  The mismatch is only logged; the lease is
		 * still attempted below.
		 */
		CRAY_ERR("list_str_to_array returned count %d"
			 " does not match expected count %" PRIu32,
			 cnt, (uint32_t) step_layout->node_cnt);
	}

	// Get cookies for network configuration
	rc = lease_cookies(job, nodes, step_layout->node_cnt);

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	xfree(nodes);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}
#endif
	return SLURM_SUCCESS;
}
232 
/*
 * switch_p_duplicate_jobinfo - make a copy of a Cray job-step record.
 *
 * The record is copied byte for byte.  When the source has cookies, the
 * cookie id array, the cookie string array and each cookie string are
 * duplicated; when it has ptags, the ptag array is duplicated.  Arrays whose
 * count is zero are not duplicated, so the copy keeps the source's pointer
 * value for them.
 *
 * IN source - record to copy (must not be NULL)
 * OUT dest  - receives the copy
 * RET SLURM_SUCCESS always
 */
extern int switch_p_duplicate_jobinfo(switch_jobinfo_t *source,
				      switch_jobinfo_t **dest)
{
	slurm_cray_jobinfo_t *src = (slurm_cray_jobinfo_t *) source;
	slurm_cray_jobinfo_t *copy;

	xassert(src);

	copy = xmalloc(sizeof(*copy));
	memcpy(copy, src, sizeof(*copy));

	if (src->num_cookies) {
		int idx = 0;

		copy->cookie_ids = xcalloc(src->num_cookies, sizeof(uint32_t));
		memcpy(copy->cookie_ids, src->cookie_ids,
		       sizeof(uint32_t) * src->num_cookies);

		copy->cookies = xcalloc(src->num_cookies, sizeof(char *));
		while (idx < src->num_cookies) {
			copy->cookies[idx] = xstrdup(src->cookies[idx]);
			idx++;
		}
	}

	if (src->num_ptags) {
		copy->ptags = xcalloc(src->num_ptags, sizeof(int));
		memcpy(copy->ptags, src->ptags, sizeof(int) * src->num_ptags);
	}

	*dest = (switch_jobinfo_t *) copy;
	return SLURM_SUCCESS;
}
262 
263 /*
264  *
265  */
switch_p_free_jobinfo(switch_jobinfo_t * switch_job)266 extern void switch_p_free_jobinfo(switch_jobinfo_t *switch_job)
267 {
268 	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;
269 	int i;
270 	DEF_TIMERS;
271 
272 	START_TIMER;
273 
274 	if (!job) {
275 		CRAY_DEBUG("switch_job was NULL");
276 		return;
277 	}
278 
279 	if (job->magic == CRAY_NULL_JOBINFO_MAGIC) {
280 		CRAY_DEBUG("switch_job was NULL MAGIC");
281 		goto endit;
282 	} else if (job->magic != CRAY_JOBINFO_MAGIC) {
283 		CRAY_ERR("job is not a switch/cray slurm_cray_jobinfo_t");
284 		return;
285 	}
286 
287 	job->magic = 0;
288 
289 	/*
290 	 * Free the cookies and the cookie_ids.
291 	 */
292 	if (job->num_cookies != 0) {
293 		xfree(job->cookie_ids);
294 
295 		if (job->cookies) {
296 			// Free the individual cookie strings.
297 			for (i = 0; i < job->num_cookies; i++) {
298 				xfree(job->cookies[i]);
299 			}
300 			xfree(job->cookies);
301 		}
302 	}
303 	if (job->num_ptags)
304 		xfree(job->ptags);
305 endit:
306 	xfree(job);
307 	END_TIMER;
308 	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
309 		INFO_LINE("call took: %s", TIME_STR);
310 
311 	return;
312 }
313 
/*
 * switch_p_pack_jobinfo - serialize a Cray job-step record into a buffer.
 *
 * When there is no record (NULL, or one carrying CRAY_NULL_JOBINFO_MAGIC),
 * only CRAY_NULL_JOBINFO_MAGIC is packed so the unpack side knows nothing
 * follows.  Otherwise the magic, cookie count, cookie strings, cookie ids
 * and apid are packed, in that order.
 *
 * IN switch_job       - record to pack, may be NULL
 * IN/OUT buffer       - destination buffer (must not be NULL)
 * IN protocol_version - nothing is packed for a full record when this is
 *                       below SLURM_MIN_PROTOCOL_VERSION
 * RET 0 always
 */
extern int switch_p_pack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
				 uint16_t protocol_version)
{
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;

	xassert(buffer);

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		/* Nothing to pack: tell the receiver not to expect more. */
		pack32(CRAY_NULL_JOBINFO_MAGIC, buffer);
		return 0;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("switch_jobinfo_t contents:");
		print_jobinfo(job);
	}

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
		return 0;

	pack32(job->magic, buffer);
	pack32(job->num_cookies, buffer);
	packstr_array(job->cookies, job->num_cookies, buffer);
	pack32_array(job->cookie_ids, job->num_cookies, buffer);
	pack64(job->apid, buffer);

	return 0;
}
347 
/*
 * switch_p_unpack_jobinfo - rebuild a Cray job-step record from a buffer.
 *
 * Reads the fields in the order switch_p_pack_jobinfo() writes them: magic,
 * cookie count, cookie strings, cookie ids, apid.  A buffer that starts with
 * CRAY_NULL_JOBINFO_MAGIC yields a record holding only that magic.  On Cray
 * builds the unpacked cookies are handed to track_cookies().
 *
 * OUT switch_job      - receives the new record; set to NULL on error.  If
 *                       this pointer is itself NULL nothing is unpacked.
 * IN/OUT buffer       - source buffer (must not be NULL)
 * IN protocol_version - nothing is read when this is below
 *                       SLURM_MIN_PROTOCOL_VERSION
 * RET SLURM_SUCCESS, or SLURM_ERROR when the buffer is short, the magic is
 *     not recognised, or the array lengths disagree with the cookie count
 */
extern int switch_p_unpack_jobinfo(switch_jobinfo_t **switch_job, Buf buffer,
				   uint16_t protocol_version)
{
	uint32_t cookie_cnt = 0, cookie_id_cnt = 0, i;
	slurm_cray_jobinfo_t *job;

	if (!switch_job) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(buffer);

	job = xmalloc(sizeof(slurm_cray_jobinfo_t));
	*switch_job = (switch_jobinfo_t *)job;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack32(&job->magic, buffer);

		if (job->magic == CRAY_NULL_JOBINFO_MAGIC) {
			CRAY_DEBUG("Nothing to unpack");
			return SLURM_SUCCESS;
		}

		/*
		 * Reject an unknown magic explicitly rather than relying on
		 * an assertion: the rest of the buffer cannot be trusted.
		 */
		if (job->magic != CRAY_JOBINFO_MAGIC) {
			CRAY_ERR("job is not a switch/cray slurm_cray_jobinfo_t");
			goto unpack_error;
		}

		safe_unpack32(&(job->num_cookies), buffer);
		safe_unpackstr_array(&(job->cookies), &cookie_cnt, buffer);
		if (cookie_cnt != job->num_cookies) {
			CRAY_ERR("Wrong number of cookies received."
				 " Expected: %" PRIu32 " Received: %" PRIu32,
				 job->num_cookies, cookie_cnt);
			goto unpack_error;
		}
		safe_unpack32_array(&(job->cookie_ids), &cookie_id_cnt,
				    buffer);
		if (cookie_id_cnt != job->num_cookies) {
			CRAY_ERR("Wrong number of cookie IDs received."
				 " Expected: %" PRIu32 " Received: %" PRIu32,
				 job->num_cookies, cookie_id_cnt);
			goto unpack_error;
		}
		safe_unpack64(&job->apid, buffer);
	}

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	/*
	 * On recovery, we want to keep extending the life of
	 * cookies still in use. So let's track these cookies
	 * with the lease extender. Duplicate cookies are ignored.
	 */
	track_cookies(job);
#endif

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("Unpacked jobinfo");
		print_jobinfo(job);
	}

	return SLURM_SUCCESS;

unpack_error:

	CRAY_ERR("Unpacking error");
	/*
	 * Release the partial record here instead of through
	 * switch_p_free_jobinfo(): that routine does not free a record whose
	 * magic is invalid (e.g. the magic itself failed to unpack), and it
	 * walks job->num_cookies entries, which on a count mismatch is not
	 * the length of the array that was unpacked.
	 * NOTE(review): assumes the unpack helper reports the length of the
	 * array it produced in cookie_cnt -- confirm against its
	 * implementation.
	 */
	if (job->cookies) {
		for (i = 0; i < cookie_cnt; i++)
			xfree(job->cookies[i]);
		xfree(job->cookies);
	}
	xfree(job->cookie_ids);
	xfree(job);
	*switch_job = NULL;

	return SLURM_ERROR;
}
415 
/*
 * switch_p_print_jobinfo - no-op for this plugin.
 *
 * IN fp      - ignored
 * IN jobinfo - ignored
 */
extern void switch_p_print_jobinfo(FILE *fp, switch_jobinfo_t *jobinfo)
{
	/* Nothing is written to fp. */
}
420 
/*
 * switch_p_sprint_jobinfo - produce an empty description string.
 *
 * IN switch_jobinfo - ignored
 * OUT buf           - set to the empty string when usable
 * IN size           - size of buf
 * RET buf, or NULL when buf is NULL or size is zero
 */
extern char *switch_p_sprint_jobinfo(switch_jobinfo_t *switch_jobinfo,
				     char *buf, size_t size)
{
	if (!buf || !size)
		return NULL;

	*buf = '\0';
	return buf;
}
431 
432 /*
433  * switch functions for job initiation
434  */
switch_p_node_init(void)435 extern int switch_p_node_init(void)
436 {
437 	return SLURM_SUCCESS;
438 }
439 
switch_p_node_fini(void)440 extern int switch_p_node_fini(void)
441 {
442 	return SLURM_SUCCESS;
443 }
444 
/*
 * switch_p_job_preinit - no-op for this plugin.
 *
 * IN jobinfo - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_preinit(switch_jobinfo_t *jobinfo)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
449 
/*
 * switch_p_job_init - configure the Aries network for a job step.
 *
 * On builds without Cray support this is a no-op.  Otherwise, in order:
 *  1. (native Cray) attach to the cncu container and create the apid
 *     directory;
 *  2. fill in the alpsc_peInfo_t structure;
 *  3. read the launch parameters ("cray_net_exclusive", "lustre_no_flush")
 *     and, unless exclusive, compute CPU and memory scaling;
 *  4. configure the NIC with the step's cookies and store the returned
 *     ptags in the job-step record;
 *  5. write the IAA file;
 *  6. (native Cray without HAVE_CRAY_NETWORK) set network performance
 *     counter permissions and write the placement file(s);
 *  7. set the job environment;
 *  8. (native Cray) set up the GPU if the step has one and set the apid.
 *
 * IN job - step record; job->switch_job->data holds the Cray job-step
 *          record.  A missing record, or one carrying
 *          CRAY_NULL_JOBINFO_MAGIC, is accepted and ignored.
 * RET SLURM_SUCCESS on success or when there is nothing to do, otherwise
 *     SLURM_ERROR or the failing helper's return code
 */
extern int switch_p_job_init(stepd_step_rec_t *job)
{

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *sw_job = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	int rc, num_ptags = 0;
	char *launch_params;
	int exclusive = 0, mem_scaling = 100, cpu_scaling = 100;
	int *ptags = NULL;
	char *err_msg = NULL;
	uint64_t cont_id = job->cont_id;
	alpsc_peInfo_t alpsc_pe_info = {-1, -1, -1, -1, NULL, NULL, NULL};
	int cmd_index = 0;
#ifdef HAVE_NATIVE_CRAY
	uint64_t gpu_cnt = 0;
	int control_nid = 0, num_branches = 0;
	struct sockaddr_in control_soc;
	alpsc_branchInfo_t alpsc_branch_info;
	uint32_t jobid;
#endif

#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	char *npc = "none";
	int access = ALPSC_NET_PERF_CTR_NONE;
#endif
	DEF_TIMERS;

	START_TIMER;

#ifdef HAVE_CRAY_NETWORK
	/* No PAGG job containers; uid used instead to configure network */
	cont_id = (uint64_t)job->uid;
#endif

	if (!sw_job || (sw_job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("job->switch_job was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->msg);
	xassert(sw_job->magic == CRAY_JOBINFO_MAGIC);

#ifdef HAVE_NATIVE_CRAY
	// Attach to the cncu container
	if (job->het_job_id && (job->het_job_id != NO_VAL))
		jobid = job->het_job_id;
	else
		jobid = job->jobid;
	rc = alpsc_attach_cncu_container(&err_msg, jobid, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_attach_cncu_container");
	if (rc != 1) {
		return SLURM_ERROR;
	}

	// Create the apid directory
	rc = create_apid_dir(sw_job->apid, job->uid, job->gid);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}

	/*
	 * Not defined yet -- This one may be skipped because we may not need to
	 * find the PAGG JOB container based on the APID.  It is part of the
	 * stepd_step_rec_t struct in the cont_id member, so if we have access
	 * to the struct, then we have access to the JOB container.
	 */

	// alpsc_set_PAGG_apid()
#endif
	/*
	 * Fill in the alpsc_pe_info structure
	 */
	rc = build_alpsc_pe_info(job, &alpsc_pe_info, &cmd_index);
	if (rc != SLURM_SUCCESS) {
		return rc;
	}

	/*
	 * Configure the network
	 *
	 * Cray shmem still uses the network, even when it's using only one
	 * node, so we must always configure the network.
	 */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "cray_net_exclusive")) {
		/*
		 * Grant exclusive access and all aries resources to the job.
		 * Not recommended if you may run multiple steps within
		 * the job, and will cause problems if you suspend or allow
		 * nodes to be shared across multiple jobs.
		 */
		/*
		 * TODO: determine if this can be managed per-job, rather
		 * than globally across the cluster.
		 */
		exclusive = 1;
	}
	if (launch_params && strstr(launch_params, "lustre_no_flush")) {
		/* Lustre cache flush can cause job bus errors, see bug 4309 */
		lustre_no_flush = true;
	}
	xfree(launch_params);

	if (!exclusive) {
		/*
		 * Calculate percentages of cpu and mem to assign to
		 * non-exclusive jobs.  alpsc_pe_info is already populated at
		 * this point, so release it on failure like the later error
		 * paths do.
		 */

		cpu_scaling = get_cpu_scaling(job);
		if (cpu_scaling == -1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}

		mem_scaling = get_mem_scaling(job);
		if (mem_scaling == -1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("Network Scaling: Exclusive %d CPU %d Memory %d",
			  exclusive, cpu_scaling, mem_scaling);
	}

	rc = alpsc_configure_nic(&err_msg, exclusive, cpu_scaling, mem_scaling,
				 cont_id, sw_job->num_cookies,
				 (const char **) sw_job->cookies,
				 &num_ptags, &ptags, NULL);
	ALPSC_CN_DEBUG("alpsc_configure_nic");
	if (rc != 1) {
		free(ptags);
		free_alpsc_pe_info(&alpsc_pe_info);
		return SLURM_ERROR;
	}
	/*
	 * xmalloc the ptags and copy the ptag array to the xmalloced
	 * space, so they can be xfreed later
	 */
	if (num_ptags) {
		sw_job->ptags = xcalloc(num_ptags, sizeof(int));
		memcpy(sw_job->ptags, ptags, sizeof(int) * num_ptags);
		sw_job->num_ptags = num_ptags;
	}
	/* free(NULL) is a no-op, so release the array in every case. */
	free(ptags);

	// Write the IAA file
	rc = write_iaa_file(job, sw_job, sw_job->ptags, sw_job->num_ptags,
			    &alpsc_pe_info);
	if (rc != SLURM_SUCCESS) {
		free_alpsc_pe_info(&alpsc_pe_info);
		return rc;
	}

#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	/*
	 * If there is reserved access to network performance counters,
	 * configure the appropriate access permission in the kernel.
	 */
	access = ALPSC_NET_PERF_CTR_NONE;
	select_g_select_jobinfo_get(job->msg->select_jobinfo,
		SELECT_JOBDATA_NETWORK, &npc);
	CRAY_DEBUG("network performance counters SELECT_JOBDATA_NETWORK %s",
		npc);
	if (xstrcasecmp(npc, "system") == 0) {
		access = ALPSC_NET_PERF_CTR_SYSTEM;
	} else if (xstrcasecmp(npc, "blade") == 0) {
		access = ALPSC_NET_PERF_CTR_BLADE;
	}
	if (access != ALPSC_NET_PERF_CTR_NONE) {
		rc = alpsc_set_perf_ctr_perms(&err_msg, job->cont_id, access);
		ALPSC_CN_DEBUG("alpsc_set_perf_ctr_perms");
		if (rc != 1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}

	/*
	 * Some of the input parameters for alpsc_write_placement_file do not
	 * apply for Slurm.  These parameters will be given zero values.
	 * They are
	 *  int control_nid
	 *  struct sockaddr_in control_soc
	 *  int num_branches
	 *  alpsc_branchInfo_t alpsc_branch_info
	 *
	 * Both structures are cleared in full first so that no field is
	 * passed on with an indeterminate value.
	 */
	memset(&control_soc, 0, sizeof(control_soc));
	memset(&alpsc_branch_info, 0, sizeof(alpsc_branch_info));
	control_soc.sin_port = 0;
	control_soc.sin_addr.s_addr = 0;
	/* Just assigning control_soc because it's already zero. */
	alpsc_branch_info.tAddr = control_soc;
	alpsc_branch_info.tIndex = 0;
	alpsc_branch_info.tLen = 0;
	alpsc_branch_info.targ = 0;

	rc = alpsc_write_placement_file(&err_msg, sw_job->apid, cmd_index,
					&alpsc_pe_info, control_nid,
					control_soc, num_branches,
					&alpsc_branch_info);

	ALPSC_CN_DEBUG("alpsc_write_placement_file");
	if (rc != 1) {
		free_alpsc_pe_info(&alpsc_pe_info);
		return SLURM_ERROR;
	}

	/*
	 * Also write a placement file with the legacy apid to support old
	 * statically linked Cray PMI applications. We can't simply symlink
	 * the old format to the new because the apid is written to the file.
	 */
	if (sw_job->apid != SLURM_ID_HASH_LEGACY(sw_job->apid)) {
		rc = alpsc_write_placement_file(&err_msg,
			SLURM_ID_HASH_LEGACY(sw_job->apid),
			cmd_index, &alpsc_pe_info, control_nid,	control_soc,
			num_branches, &alpsc_branch_info);
		ALPSC_CN_DEBUG("alpsc_write_placement_file");
		if (rc != 1) {
			free_alpsc_pe_info(&alpsc_pe_info);
			return SLURM_ERROR;
		}
	}
#endif
	/* Clean up alpsc_pe_info */
	free_alpsc_pe_info(&alpsc_pe_info);
	/*
	 * Write some environment variables used by LLI and PMI
	 */
	rc = set_job_env(job, sw_job);
	if (rc != SLURM_SUCCESS)
		return rc;

#ifdef HAVE_NATIVE_CRAY
	/*
	 * Query the generic resources to see if the GPU should be allocated
	 */

	rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
				GRES_STEP_DATA_COUNT, &gpu_cnt);
	CRAY_INFO("gres_cnt: %d %"PRIu64, rc, gpu_cnt);
	if (gpu_cnt > 0)
		setup_gpu(job);

	/*
	 * Set the Job's APID
	 */
	job_setapid(getpid(), sw_job->apid);
#endif

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif

	return SLURM_SUCCESS;
}
707 
/*
 * switch_p_job_suspend_test - no-op for this plugin.
 *
 * IN jobinfo - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_suspend_test(switch_jobinfo_t *jobinfo)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
712 
/*
 * switch_p_job_suspend_info_get - no-op for this plugin.
 *
 * IN jobinfo      - ignored
 * IN suspend_info - ignored; left unmodified
 */
extern void switch_p_job_suspend_info_get(switch_jobinfo_t *jobinfo,
					  void **suspend_info)
{
	/* Nothing is stored in *suspend_info. */
}
/*
 * switch_p_job_suspend_info_pack - no-op for this plugin.
 *
 * IN suspend_info     - ignored
 * IN buffer           - ignored; nothing is packed
 * IN protocol_version - ignored
 */
extern void switch_p_job_suspend_info_pack(void *suspend_info, Buf buffer,
					   uint16_t protocol_version)
{
	/* Nothing is written to buffer. */
}
723 
/*
 * switch_p_job_suspend_info_unpack - no-op for this plugin.
 *
 * IN suspend_info     - ignored; left unmodified
 * IN buffer           - ignored; nothing is read
 * IN protocol_version - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_suspend_info_unpack(void **suspend_info, Buf buffer,
					    uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
729 
/*
 * switch_p_job_suspend_info_free - no-op for this plugin.
 *
 * IN suspend_info - ignored; nothing is freed
 */
extern void switch_p_job_suspend_info_free(void *suspend_info)
{
	/* Nothing to release. */
}
734 
switch_p_job_suspend(void * suspend_info,int max_wait)735 extern int switch_p_job_suspend(void *suspend_info, int max_wait)
736 {
737 	return SLURM_SUCCESS;
738 }
739 
switch_p_job_resume(void * suspend_info,int max_wait)740 extern int switch_p_job_resume(void *suspend_info, int max_wait)
741 {
742 	return SLURM_SUCCESS;
743 }
744 
/*
 * switch_p_job_fini - remove the per-step files written for a job step.
 *
 * On native Cray builds the spool files for the step's apid are removed
 * first; a failure there is returned immediately.  On all Cray builds the
 * IAA file is then unlinked.  On other builds this is a no-op.
 *
 * IN jobinfo - Cray job-step record; NULL or a record carrying
 *              CRAY_NULL_JOBINFO_MAGIC is reported and ignored
 * RET SLURM_SUCCESS, or the return code of remove_spool_files() on failure
 */
extern int switch_p_job_fini(switch_jobinfo_t *jobinfo)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) jobinfo;
#ifdef HAVE_NATIVE_CRAY
	int rc;
#endif
	DEF_TIMERS;

	START_TIMER;

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_ERR("jobinfo pointer was NULL");
		return SLURM_SUCCESS;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

#ifdef HAVE_NATIVE_CRAY
	rc = remove_spool_files(job->apid);
	if (rc != SLURM_SUCCESS)
		return rc;
#endif

	// Remove the IAA file
	unlink_iaa_file(job);

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
779 
/*
 * switch_p_job_postfini - clean up a node after a job step has finished.
 *
 * On Cray builds: sends SIGKILL to the process group identified by
 * job->jmgr_pid, resets the GPU proxy when the step had a GPU (native Cray
 * only) and, unless "lustre_no_flush" was seen in the launch parameters,
 * flushes the Lustre caches and drops the kernel VM caches.  On other
 * builds this is a no-op.
 *
 * IN job - step record
 * RET SLURM_SUCCESS, or SLURM_ERROR if the Lustre flush fails.  A failure
 *     to drop the VM caches is only logged.
 */
extern int switch_p_job_postfini(stepd_step_rec_t *job)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	/*
	 * A process id must be held in a signed pid_t: it is negated below
	 * to address the whole process group with kill().
	 */
	pid_t pgid = job->jmgr_pid;
#ifdef HAVE_NATIVE_CRAY
	uint64_t gpu_cnt = 0;
#endif
	DEF_TIMERS;

	START_TIMER;

	if (!job->switch_job) {
		CRAY_DEBUG("job->switch_job was NULL");
	}

	/*
	 *  Kill all processes in the job's session
	 */
	if (pgid > 0) {
		CRAY_DEBUG("Sending SIGKILL to pgid %lu", (unsigned long) pgid);
		kill(-pgid, SIGKILL);
	} else
		CRAY_INFO("Job %u.%u: Bad pid value %lu",
			  job->jobid, job->stepid, (unsigned long) pgid);
	/*
	 * Clean-up
	 *
	 * 0. Reset GPU proxy
	 * 1. Flush Lustre caches
	 * 2. Flush virtual memory
	 * 3. Compact memory
	 */

#ifdef HAVE_NATIVE_CRAY
	// Set the proxy back to the default state.
	rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
				GRES_STEP_DATA_COUNT, &gpu_cnt);
	if (gpu_cnt > 0) {
		reset_gpu(job);
	}
#endif
	if (!lustre_no_flush) {
		// Flush Lustre Cache
		rc = alpsc_flush_lustre(&err_msg);
		ALPSC_CN_DEBUG("alpsc_flush_lustre");
		if (rc != 1) {
			return SLURM_ERROR;
		}

		// Flush virtual memory
		rc = system("echo 3 > /proc/sys/vm/drop_caches");
		if (rc != -1) {
			rc = WEXITSTATUS(rc);
		}
		if (rc) {
			CRAY_ERR("Flushing virtual memory failed. Return code: %d",
				 rc);
		}
	}

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
848 
/*
 * switch_p_job_attach - no-op for this plugin.
 *
 * All arguments are ignored; env is left unmodified.
 *
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_attach(switch_jobinfo_t *jobinfo, char ***env,
			       uint32_t nodeid, uint32_t procid,
			       uint32_t nnodes, uint32_t nprocs,
			       uint32_t rank)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
856 
/*
 * switch_p_get_jobinfo - not supported by this plugin.
 *
 * Every request fails: the Slurm errno is set to EINVAL and nothing is
 * stored in resulting_data.
 *
 * IN switch_job     - ignored
 * IN key            - ignored
 * IN resulting_data - ignored; left unmodified
 * RET SLURM_ERROR always
 */
extern int switch_p_get_jobinfo(switch_jobinfo_t *switch_job, int key,
				void *resulting_data)
{
	int rc = SLURM_ERROR;

	slurm_seterrno(EINVAL);

	return rc;
}
863 
864 /*
865  * node switch state monitoring functions
866  * required for IBM Federation switch
867  */
switch_p_clear_node_state(void)868 extern int switch_p_clear_node_state(void)
869 {
870 	return SLURM_SUCCESS;
871 }
872 
/*
 * switch_p_alloc_node_info - no-op for this plugin.
 *
 * IN switch_node - ignored; nothing is allocated or stored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_alloc_node_info(switch_node_info_t **switch_node)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
877 
/*
 * switch_p_build_node_info - no-op for this plugin.
 *
 * IN switch_node - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_build_node_info(switch_node_info_t *switch_node)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
882 
/*
 * switch_p_pack_node_info - no-op for this plugin.
 *
 * IN switch_node      - ignored
 * IN buffer           - ignored; nothing is packed
 * IN protocol_version - ignored
 * RET 0 always
 */
extern int switch_p_pack_node_info(switch_node_info_t *switch_node, Buf buffer,
				   uint16_t protocol_version)
{
	int rc = 0;

	return rc;
}
888 
/*
 * switch_p_unpack_node_info - no-op for this plugin.
 *
 * IN switch_node      - ignored; left unmodified
 * IN buffer           - ignored; nothing is read
 * IN protocol_version - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_unpack_node_info(switch_node_info_t **switch_node,
				     Buf buffer, uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
894 
/*
 * switch_p_free_node_info - no-op for this plugin.
 *
 * IN switch_node - ignored; nothing is freed
 * RET SLURM_SUCCESS always
 */
extern int switch_p_free_node_info(switch_node_info_t **switch_node)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
899 
/*
 * switch_p_sprintf_node_info - produce an empty description string.
 *
 * IN switch_node - ignored
 * OUT buf        - set to the empty string when usable
 * IN size        - size of buf
 * RET buf, or NULL when buf is NULL or size is zero
 */
extern char *switch_p_sprintf_node_info(switch_node_info_t *switch_node,
					char *buf, size_t size)
{
	if (!buf || !size)
		return NULL;

	*buf = '\0';
	return buf;
}
910 
/*
 * switch_p_job_step_complete - release the cookies held by a job step.
 *
 * On Cray builds, hands the job-step record to release_cookies().  On other
 * builds this is a no-op.
 *
 * IN jobinfo  - Cray job-step record; NULL or a record carrying
 *               CRAY_NULL_JOBINFO_MAGIC is accepted and ignored
 * IN nodelist - ignored
 * RET SLURM_SUCCESS, or the return code of release_cookies() on failure
 */
extern int switch_p_job_step_complete(switch_jobinfo_t *jobinfo,
				      char *nodelist)
{
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *cray_job = (slurm_cray_jobinfo_t *) jobinfo;
	int rc = 0;
	DEF_TIMERS;

	START_TIMER;

	if (!cray_job || (cray_job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_DEBUG("switch_job was NULL");
		return SLURM_SUCCESS;
	}

	if (debug_flags & DEBUG_FLAG_SWITCH)
		CRAY_INFO("switch_p_job_step_complete");

	/* Release the cookies */
	rc = release_cookies(cray_job);
	if (rc != SLURM_SUCCESS)
		return rc;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
941 
/*
 * switch_p_job_step_part_comp - no-op for this plugin.
 *
 * IN jobinfo  - ignored
 * IN nodelist - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_step_part_comp(switch_jobinfo_t *jobinfo,
				       char *nodelist)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
947 
/*
 * switch_p_part_comp - always answers false for this plugin.
 *
 * RET false always
 */
extern bool switch_p_part_comp(void)
{
	bool supported = false;

	return supported;
}
952 
/*
 * switch_p_job_step_allocated - no-op for this plugin.
 *
 * IN jobinfo  - ignored
 * IN nodelist - ignored
 * RET SLURM_SUCCESS always
 */
extern int switch_p_job_step_allocated(switch_jobinfo_t *jobinfo,
				       char *nodelist)
{
	int rc = SLURM_SUCCESS;

	return rc;
}
958 
switch_p_slurmctld_init(void)959 extern int switch_p_slurmctld_init(void)
960 {
961 	return SLURM_SUCCESS;
962 }
963 
switch_p_slurmd_init(void)964 extern int switch_p_slurmd_init(void)
965 {
966 	return SLURM_SUCCESS;
967 }
968 
switch_p_slurmd_step_init(void)969 extern int switch_p_slurmd_step_init(void)
970 {
971 	return SLURM_SUCCESS;
972 }
973 
974 /*
975  * Functions for suspend/resume support
976  */
/*
 * switch_p_job_step_pre_suspend - notify ALPS before a step is suspended.
 *
 * Only does work on native Cray builds without HAVE_CRAY_NETWORK, where it
 * calls alpsc_pre_suspend() with the step's container id and ptags.
 *
 * IN job - step record; job->switch_job->data holds the Cray job-step
 *          record and may be absent
 * RET SLURM_SUCCESS, or SLURM_ERROR if alpsc_pre_suspend() does not return 1
 */
extern int switch_p_job_step_pre_suspend(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_pre_suspend(job %u.%u)",
		job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *jobinfo = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	char *err_msg = NULL;
	int rc;
	DEF_TIMERS;

	START_TIMER;

	if (jobinfo) {
		rc = alpsc_pre_suspend(&err_msg, job->cont_id, jobinfo->ptags,
				       jobinfo->num_ptags,
				       SUSPEND_TIMEOUT_MSEC);
	} else {
		/*
		 * No job-step record: do not dereference it.  Pass an empty
		 * ptag list so the call stays paired with
		 * switch_p_job_step_post_suspend(), which always calls
		 * alpsc_post_suspend().
		 * NOTE(review): assumes alpsc_pre_suspend() accepts
		 * (NULL, 0) -- confirm against the ALPS library.
		 */
		CRAY_DEBUG("job->switch_job was NULL");
		rc = alpsc_pre_suspend(&err_msg, job->cont_id, NULL, 0,
				       SUSPEND_TIMEOUT_MSEC);
	}
	ALPSC_CN_DEBUG("alpsc_pre_suspend");
	if (rc != 1) {
		return SLURM_ERROR;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1004 
/*
 * switch_p_job_step_post_suspend - notify ALPS after a step is suspended.
 *
 * Only does work on native Cray builds without HAVE_CRAY_NETWORK, where it
 * calls alpsc_post_suspend() with the step's container id.
 *
 * IN job - step record
 * RET SLURM_SUCCESS, or SLURM_ERROR if alpsc_post_suspend() does not return 1
 */
extern int switch_p_job_step_post_suspend(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_post_suspend(job %u.%u)",
		job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	DEF_TIMERS;

	START_TIMER;

	rc = alpsc_post_suspend(&err_msg, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_post_suspend");
	/* The ALPS call reports success with 1. */
	if (rc != 1)
		return SLURM_ERROR;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1029 
/*
 * switch_p_job_step_pre_resume - notify ALPS before a step is resumed.
 *
 * Only does work on native Cray builds without HAVE_CRAY_NETWORK, where it
 * calls alpsc_pre_resume() with the step's container id and ptags.
 *
 * IN job - step record; job->switch_job->data holds the Cray job-step
 *          record and may be absent
 * RET SLURM_SUCCESS, or SLURM_ERROR if alpsc_pre_resume() does not return 1
 */
extern int switch_p_job_step_pre_resume(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_pre_resume(job %u.%u)",
		job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	slurm_cray_jobinfo_t *jobinfo = job->switch_job ?
		(slurm_cray_jobinfo_t *)job->switch_job->data : NULL;
	char *err_msg = NULL;
	int rc;
	DEF_TIMERS;

	START_TIMER;

	if (jobinfo) {
		rc = alpsc_pre_resume(&err_msg, job->cont_id, jobinfo->ptags,
				      jobinfo->num_ptags);
	} else {
		/*
		 * No job-step record: do not dereference it.  Pass an empty
		 * ptag list so the call stays paired with
		 * switch_p_job_step_post_resume(), which always calls
		 * alpsc_post_resume().
		 * NOTE(review): assumes alpsc_pre_resume() accepts
		 * (NULL, 0) -- confirm against the ALPS library.
		 */
		CRAY_DEBUG("job->switch_job was NULL");
		rc = alpsc_pre_resume(&err_msg, job->cont_id, NULL, 0);
	}
	ALPSC_CN_DEBUG("alpsc_pre_resume");
	if (rc != 1) {
		return SLURM_ERROR;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1057 
/*
 * switch_p_job_step_post_resume - notify ALPS after a step is resumed.
 *
 * Only does work on native Cray builds without HAVE_CRAY_NETWORK, where it
 * calls alpsc_post_resume() with the step's container id.
 *
 * IN job - step record
 * RET SLURM_SUCCESS, or SLURM_ERROR if alpsc_post_resume() does not return 1
 */
extern int switch_p_job_step_post_resume(stepd_step_rec_t *job)
{
#if _DEBUG
	info("switch_p_job_step_post_resume(job %u.%u)",
		job->jobid, job->stepid);
#endif
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
	int rc;
	char *err_msg = NULL;
	DEF_TIMERS;

	START_TIMER;

	rc = alpsc_post_resume(&err_msg, job->cont_id);
	ALPSC_CN_DEBUG("alpsc_post_resume");
	/* The ALPS call reports success with 1. */
	if (rc != 1)
		return SLURM_ERROR;

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
1082