1 /*****************************************************************************\
2  * pdebug.c - ptrace functions for slurmstepd
3  *****************************************************************************
4  *  Copyright (C) 2002-2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7  *  Written by Mark Grondona <mgrondona@llnl.gov>.
8  *  CODE-OCEC-09-009. All rights reserved.
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 
40 #include "config.h"
41 
42 #include "pdebug.h"
43 
44 #include <fcntl.h>
45 #include <string.h>
46 #include <sys/stat.h>
47 #include <sys/types.h>
48 #include <signal.h>
49 
50 
51 /*
52  * Prepare task for parallel debugger attach
53  * Returns SLURM_SUCCESS or SLURM_ERROR.
54  */
55 int
pdebug_trace_process(stepd_step_rec_t * job,pid_t pid)56 pdebug_trace_process(stepd_step_rec_t *job, pid_t pid)
57 {
58 	/*  If task to be debugged, wait for it to stop via
59 	 *  child's ptrace(PTRACE_TRACEME), then SIGSTOP, and
60 	 *  ptrace(PTRACE_DETACH).
61 	 */
62 
63 	if (job->flags & LAUNCH_PARALLEL_DEBUG) {
64 		int status;
65 		waitpid(pid, &status, WUNTRACED);
66 		if (!WIFSTOPPED(status)) {
67 			int i;
68 			error("pdebug_trace_process WIFSTOPPED false"
69 			      " for pid %d", pid);
70 			if (WIFEXITED(status)) {
71 				error("Process %d exited \"normally\""
72 				      " with return code %d",
73 				      pid,
74 				      WEXITSTATUS(status));
75 			} else if (WIFSIGNALED(status)) {
76 				error("Process %d killed by signal %d",
77 				      pid, WTERMSIG(status));
78 			}
79 
80 			/*
81 			 * Mark this process as complete since it died
82 			 * prematurely.
83 			 */
84 			for (i = 0; i < job->node_tasks; i++) {
85 				if (job->task[i]->pid == pid) {
86 					job->task[i]->state =
87 						STEPD_STEP_TASK_COMPLETE;
88 				}
89 			}
90 
91 			return SLURM_ERROR;
92 		}
93 		if ((pid > (pid_t) 0) && (kill(pid, SIGSTOP) < 0)) {
94 			error("kill(%lu): %m", (unsigned long) pid);
95 			return SLURM_ERROR;
96 		}
97 
98 #ifdef BSD
99 		if (_PTRACE(PT_DETACH, pid, (caddr_t)1, 0)) {
100 #elif defined(PT_DETACH)
101 		if (_PTRACE(PT_DETACH, pid, NULL, 0)) {
102 #else
103 		if (_PTRACE(PTRACE_DETACH, pid, NULL, 0)) {
104 #endif
105 			error("ptrace(%lu): %m", (unsigned long) pid);
106 			return SLURM_ERROR;
107 		}
108 	}
109 	return SLURM_SUCCESS;
110 }
111 
112 /*
113  * Stop current task on exec() for connection from a parallel debugger
114  */
115 void
116 pdebug_stop_current(stepd_step_rec_t *job)
117 {
118 	/*
119 	 * Stop the task on exec for TotalView to connect
120 	 */
121 	if ((job->flags & LAUNCH_PARALLEL_DEBUG)
122 #ifdef BSD
123 	     && (_PTRACE(PT_TRACE_ME, 0, (caddr_t)0, 0) < 0) )
124 #elif defined(PT_TRACE_ME)
125 	     && (_PTRACE(PT_TRACE_ME, 0, NULL, 0) < 0) )
126 #else
127 	     && (_PTRACE(PTRACE_TRACEME, 0, NULL, 0) < 0) )
128 #endif
129 		error("ptrace: %m");
130 }
131 
/*
 * Report which process, if any, is tracing `pid`, based on the
 * "TracerPid:" field of /proc/<pid>/status.
 *
 * Returns the value of that field (0 means the process is not being
 * traced), or -1 if the file cannot be opened or read, or if the field is
 * missing, incomplete or not a number.
 */
static int _being_traced(pid_t pid)
{
	static const char field[] = "TracerPid:";
	const size_t field_len = sizeof(field) - 1;
	FILE *fp = NULL;
	size_t n = 0;
	int len, tracer_id = 0;
	char *line = NULL;
	char buf[2048];
	char sp[64];	/* "/proc/" + at most 20 digits + "/status" + NUL */

	len = snprintf(sp, sizeof(sp), "/proc/%lu/status",
		       (unsigned long) pid);
	if ((len < 0) || ((size_t) len >= sizeof(sp)))
		return -1;
	if ((fp = fopen(sp, "r")) == NULL)
		return -1;

	/*
	 * Only the head of the file is needed. A file larger than the
	 * buffer is not an error by itself: the field is looked up in
	 * whatever was read, and rejected below only if it was cut short.
	 */
	n = fread(buf, 1, sizeof(buf) - 1, fp);
	fclose(fp);
	if (n == 0)
		return -1;
	buf[n] = '\0';	/* Ensure string is terminated */

	/*
	 * Accept the field name only at the start of a line, so that the
	 * same text appearing inside another field's value is not matched.
	 */
	line = buf;
	while (line && strncmp(line, field, field_len)) {
		line = strchr(line, '\n');
		if (line)
			line++;
	}
	if (line == NULL)
		return -1;

	/* A line without its newline was truncated by the read above */
	if (strchr(line, '\n') == NULL)
		return -1;

	/* Anything but exactly one converted value is a parse failure */
	if (sscanf(line, "TracerPid:\t%d", &tracer_id) != 1)
		return -1;
	return tracer_id;
}
159 
/*
 * Decide whether `pid` should be sent a wake-up signal.
 *
 * Returns true when _being_traced() reports no tracer (0) or fails (-1),
 * and false when it reports a tracer.
 */
static bool _pid_to_wake(pid_t pid)
{
	int tracer = _being_traced(pid);

	if (tracer != -1)
		return (tracer == 0);

	/*
	 * The lookup failed (e.g., the /proc FS doesn't exist or the
	 * TracerPid field doesn't exist). It is better to wake up the
	 * target process -- at the expense of potential side effects on
	 * the debugger.
	 */
	debug("_pid_to_wake(%lu): %m\n", (unsigned long) pid);
	errno = 0;
	return true;
}
175 
176 /*
177  * Wake tasks currently stopped for parallel debugger attach
178  */
179 void pdebug_wake_process(stepd_step_rec_t *job, pid_t pid)
180 {
181 	if ((job->flags & LAUNCH_PARALLEL_DEBUG) && (pid > (pid_t) 0)) {
182 		if (_pid_to_wake(pid)) {
183 			if (kill(pid, SIGCONT) < 0)
184 				error("kill(%lu): %m", (unsigned long) pid);
185 			else
186 				debug("woke pid %lu", (unsigned long) pid);
187 		} else {
188 			debug("pid %lu not stopped or being traced",
189 			      (unsigned long) pid);
190 		}
191 	}
192 }
193