1 /*****************************************************************************\ 2 * step_launch.h - launch a parallel job step 3 ***************************************************************************** 4 * Copyright (C) 2006-2007 The Regents of the University of California. 5 * Copyright (C) 2008-2010 Lawrence Livermore National Security. 6 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 7 * Written by Christopher J. Morrone <morrone2@llnl.gov> 8 * CODE-OCEC-09-009. All rights reserved. 9 * 10 * This file is part of Slurm, a resource management program. 11 * For details, see <https://slurm.schedmd.com/>. 12 * Please also read the included file: DISCLAIMER. 13 * 14 * Slurm is free software; you can redistribute it and/or modify it under 15 * the terms of the GNU General Public License as published by the Free 16 * Software Foundation; either version 2 of the License, or (at your option) 17 * any later version. 18 * 19 * In addition, as a special exception, the copyright holders give permission 20 * to link the code of portions of this program with the OpenSSL library under 21 * certain conditions as described in each individual source file, and 22 * distribute linked combinations including the two. You must obey the GNU 23 * General Public License in all respects for all of the code used other than 24 * OpenSSL. If you modify file(s) with this exception, you may extend this 25 * exception to your version of the file(s), but you are not obligated to do 26 * so. If you do not wish to do so, delete this exception statement from your 27 * version. If you delete this exception statement from all source files in 28 * the program, then also delete it here. 29 * 30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 33 * details. 34 * 35 * You should have received a copy of the GNU General Public License along 36 * with Slurm; if not, write to the Free Software Foundation, Inc., 37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 38 \*****************************************************************************/ 39 #ifndef _STEP_LAUNCH_H 40 #define _STEP_LAUNCH_H 41 42 #include <pthread.h> 43 #include <stdint.h> 44 #include <unistd.h> 45 46 #include "slurm/slurm.h" 47 48 #include "src/common/bitstring.h" 49 #include "src/common/eio.h" 50 #include "src/common/slurm_mpi.h" 51 #include "src/common/slurm_step_layout.h" 52 53 #include "src/api/step_io.h" 54 55 typedef struct { 56 int connected; 57 int *sockets; /* array of socket file descriptors */ 58 } user_managed_io_t; 59 60 struct step_launch_state { 61 /* This lock protects tasks_started, tasks_exited, node_io_error, 62 io_deadline, abort, and abort_action_taken. The main thread 63 blocks on cond, waking when a tast starts or exits, or the abort 64 flag is set. */ 65 pthread_mutex_t lock; 66 pthread_cond_t cond; 67 int tasks_requested; 68 bitstr_t *tasks_started; /* or attempted to start, but failed */ 69 bitstr_t *tasks_exited; /* or never started correctly */ 70 bitstr_t *node_io_error; /* set after write or read error */ 71 pthread_t io_timeout_thread; 72 bool io_timeout_thread_created; 73 time_t *io_deadline; /* Holds the time by which a "connection okay" 74 message must be received. Each entry holds 75 NO_VAL unless the node is suspected to be 76 down and is being tested. */ 77 int io_timeout; /* num seconds between I/O tests */ 78 bool halt_io_test; /* set to true when I/O test thread should 79 shut down. */ 80 bool abort; 81 bool abort_action_taken; 82 83 /* message thread variables */ 84 eio_handle_t *msg_handle; 85 pthread_t msg_thread; 86 /* set to -1 if step launch message handler should not attempt 87 to handle */ 88 int slurmctld_socket_fd; 89 uint16_t num_resp_port; 90 uint16_t *resp_port; /* array of message response ports */ 91 92 /* io variables */ 93 bool user_managed_io; 94 union { 95 client_io_t *normal; 96 user_managed_io_t *user; 97 } io; 98 99 slurm_step_layout_t *layout; /* a pointer into the ctx 100 step_resp, do not free */ 101 mpi_plugin_client_info_t mpi_info[1]; 102 mpi_plugin_client_state_t *mpi_state; 103 int mpi_rc; 104 105 /* user registered callbacks */ 106 slurm_step_launch_callbacks_t callback; 107 }; 108 typedef struct step_launch_state step_launch_state_t; 109 110 111 /* 112 * Create a launch state structure for a specified step context, "ctx". 113 */ 114 struct step_launch_state * step_launch_state_create(slurm_step_ctx_t *ctx); 115 116 /* 117 * If a steps size has changed update the launch_state structure for a 118 * specified step context, "ctx". 119 */ 120 void step_launch_state_alter(slurm_step_ctx_t *ctx); 121 122 /* 123 * Free the memory associated with the a launch state structure. 124 */ 125 void step_launch_state_destroy(struct step_launch_state *sls); 126 127 /* 128 * Notify the step_launch_state that an I/O connection went bad. 129 * If the node is suspected to be down, abort the job. 130 */ 131 int step_launch_notify_io_failure(step_launch_state_t *sls, int node_id); 132 133 /* 134 * Just in case the node was marked questionable very early in the 135 * job step setup, clear this flag when the node makes its initial 136 * connection. 137 */ 138 int step_launch_clear_questionable_state(step_launch_state_t *sls, 139 int node_id); 140 141 142 #endif /* _STEP_LAUNCH_H */ 143