1 /*****************************************************************************\
2  *  src/plugins/task/affinity/affinity.c - task affinity plugin
3  *****************************************************************************
4  *  Copyright (C) 2005-2006 Hewlett-Packard Development Company, L.P.
5  *
6  *  This file is part of Slurm, a resource management program.
7  *  For details, see <https://slurm.schedmd.com/>.
8  *  Please also read the included file: DISCLAIMER.
9  *
10  *  Slurm is free software; you can redistribute it and/or modify it under
11  *  the terms of the GNU General Public License as published by the Free
12  *  Software Foundation; either version 2 of the License, or (at your option)
13  *  any later version.
14  *
15  *  In addition, as a special exception, the copyright holders give permission
16  *  to link the code of portions of this program with the OpenSSL library under
17  *  certain conditions as described in each individual source file, and
18  *  distribute linked combinations including the two. You must obey the GNU
19  *  General Public License in all respects for all of the code used other than
20  *  OpenSSL. If you modify file(s) with this exception, you may extend this
21  *  exception to your version of the file(s), but you are not obligated to do
22  *  so. If you do not wish to do so, delete this exception statement from your
23  *  version.  If you delete this exception statement from all source files in
24  *  the program, then also delete it here.
25  *
26  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
27  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
28  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
29  *  details.
30  *
31  *  You should have received a copy of the GNU General Public License along
32  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
33  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
34 \*****************************************************************************/
35 
36 #define _GNU_SOURCE
37 
38 #include "affinity.h"
39 
/* Older versions of sched.h (e.g. CentOS 5) don't provide CPU_OR. */
#ifndef CPU_OR

/*
 * Element-wise combination of two CPU sets: applies the binary operator
 * "op" to every mask word of srcset1 and srcset2, stores the result in
 * destset and evaluates to destset.
 *
 * The guard must test the name that is actually defined below
 * (__CPU_OP_S); otherwise a libc that already supplies __CPU_OP_S but
 * not CPU_OR would have its macro redefined here.
 */
#ifndef __CPU_OP_S
# define __CPU_OP_S(setsize, destset, srcset1, srcset2, op) \
  (__extension__      \
   ({ cpu_set_t *__dest = (destset);      \
     const __cpu_mask *__arr1 = (srcset1)->__bits;      \
     const __cpu_mask *__arr2 = (srcset2)->__bits;      \
     size_t __imax = (setsize) / sizeof (__cpu_mask);      \
     size_t __i;      \
     for (__i = 0; __i < __imax; ++__i)      \
       ((__cpu_mask *) __dest->__bits)[__i] = __arr1[__i] op __arr2[__i];    \
     __dest; }))
#endif

/* destset = srcset1 | srcset2, over a full cpu_set_t */
# define CPU_OR(destset, srcset1, srcset2) \
  __CPU_OP_S (sizeof (cpu_set_t), destset, srcset1, srcset2, |)
#endif
59 
/* Cached answer of the Power7 probe performed by _is_power_cpu():
 * -1 = not determined yet, 0 = not a Power7 CPU, 1 = Power7 CPU */
static int is_power = -1;
61 
62 /* If HAVE_NUMA, create mask for given ldom.
63  * Otherwise create mask for given socket
64  */
_bind_ldom(uint32_t ldom,cpu_set_t * mask)65 static int _bind_ldom(uint32_t ldom, cpu_set_t *mask)
66 {
67 #ifdef HAVE_NUMA
68 	int c, maxcpus, nnid = 0;
69 	int nmax = numa_max_node();
70 	if (nmax > 0)
71 		nnid = ldom % (nmax+1);
72 	debug3("task/affinity: binding to NUMA node %d", nnid);
73 	maxcpus = conf->sockets * conf->cores * conf->threads;
74 	for (c = 0; c < maxcpus; c++) {
75 		if (slurm_get_numa_node(c) == nnid)
76 			CPU_SET(c, mask);
77 	}
78 	return true;
79 #else
80 	uint16_t s, sid  = ldom % conf->sockets;
81 	uint16_t i, cpus = conf->cores * conf->threads;
82 	if (!conf->block_map)
83 		return false;
84 	for (s = sid * cpus; s < (sid+1) * cpus; s++) {
85 		i = s % conf->block_map_size;
86 		CPU_SET(conf->block_map[i], mask);
87 	}
88 	return true;
89 #endif
90 }
91 
/*
 * Compute the CPU affinity mask for the local task of a step.
 *
 * OUT mask - cleared first, then filled according to job->cpu_bind_type
 * IN job   - step record; reads cpu_bind_type, cpu_bind, cpus and
 *            envtp->localid
 * RET true if a mask was produced (it stays empty for CPU_BIND_NONE),
 *     false if no binding could be derived (no cpu_bind list, no entry
 *     for this task, unparsable mask, CPU id out of range, ...)
 *
 * For the list based binding types, job->cpu_bind is a comma separated
 * list; the entry used is the one at index (local task id modulo number
 * of entries).
 */
int get_cpuset(cpu_set_t *mask, stepd_step_rec_t *job)
{
	int nummasks, maskid, i, threads;
	char *curstr, *selstr;
	char mstr[1 + CPU_SETSIZE / 4];
	uint32_t local_id = job->envtp->localid;
	char buftype[1024];

	slurm_sprint_cpu_bind_type(buftype, job->cpu_bind_type);
	debug3("get_cpuset (%s[%d]) %s", buftype, job->cpu_bind_type,
		job->cpu_bind);
	CPU_ZERO(mask);

	if (job->cpu_bind_type & CPU_BIND_NONE) {
		return true;
	}

	if (job->cpu_bind_type & CPU_BIND_RANK) {
		threads = MAX(conf->threads, 1);
		CPU_SET(job->envtp->localid % (job->cpus*threads), mask);
		return true;
	}

	if (job->cpu_bind_type & CPU_BIND_LDRANK) {
		/* if HAVE_NUMA then bind this task ID to its corresponding
		 * locality domain ID. Otherwise, bind this task ID to its
		 * corresponding socket ID */
		return _bind_ldom(local_id, mask);
	}

	if (!job->cpu_bind)
		return false;

	nummasks = 1;
	selstr = NULL;

	/* count the entries in cpu_bind, stopping early as soon as the
	 * entry belonging to this task is reached */
	curstr = job->cpu_bind;
	while (*curstr) {
		if (nummasks == local_id+1) {
			selstr = curstr;
			break;
		}
		if (*curstr == ',')
			nummasks++;
		curstr++;
	}

	/* if we didn't already find the mask... */
	if (!selstr) {
		/* ...select mask string by wrapping task ID into list */
		maskid = local_id % nummasks;
		i = maskid;
		curstr = job->cpu_bind;
		while (*curstr && i) {
			if (*curstr == ',')
				i--;
			curstr++;
		}
		if (!*curstr) {
			return false;
		}
		selstr = curstr;
	}

	/* extract the selected entry from the list; at most
	 * CPU_SETSIZE/4 characters fit next to the terminating NUL */
	i = 0;
	curstr = mstr;
	while (*selstr && *selstr != ',' && i++ < (CPU_SETSIZE/4))
		*curstr++ = *selstr++;
	*curstr = '\0';

	if (job->cpu_bind_type & CPU_BIND_MASK) {
		/* convert mask string into cpu_set_t mask */
		if (task_str_to_cpuset(mask, mstr) < 0) {
			error("task_str_to_cpuset %s", mstr);
			return false;
		}
		return true;
	}

	if (job->cpu_bind_type & CPU_BIND_MAP) {
		/* Keep the full strtoul() width so that huge values are
		 * not silently truncated into a valid looking CPU id */
		unsigned long mycpu;
		if (xstrncmp(mstr, "0x", 2) == 0) {
			mycpu = strtoul (&(mstr[2]), NULL, 16);
		} else {
			mycpu = strtoul (mstr, NULL, 10);
		}
		/* CPU_SET() is not guaranteed to range check its index on
		 * every platform, and a silently ignored id would hand
		 * back an empty mask as "success" */
		if (mycpu >= CPU_SETSIZE) {
			error("task/affinity: map_cpu value %s is outside the supported range (0-%d)",
			      mstr, (int) (CPU_SETSIZE - 1));
			return false;
		}
		CPU_SET(mycpu, mask);
		return true;
	}

	if (job->cpu_bind_type & CPU_BIND_LDMASK) {
		/* if HAVE_NUMA bind this task to the locality domains
		 * identified in mstr. Otherwise bind this task to the
		 * sockets identified in mstr */
		int len = strlen(mstr);
		int first = 0, pos;
		uint32_t base = 0;

		/* skip 0x, it's all hex anyway */
		if (len > 1 && !memcmp(mstr, "0x", 2L))
			first = 2;
		/* Walk the hex digits from least to most significant;
		 * each digit describes four consecutive domains. Indexes
		 * are used so no pointer before mstr is ever formed. */
		for (pos = len - 1; pos >= first; pos--) {
			char val = slurm_char_to_hex(mstr[pos]);
			if (val == (char) -1)
				return false;
			if (val & 1)
				_bind_ldom(base, mask);
			if (val & 2)
				_bind_ldom(base + 1, mask);
			if (val & 4)
				_bind_ldom(base + 2, mask);
			if (val & 8)
				_bind_ldom(base + 3, mask);
			base += 4;
		}
		return true;
	}

	if (job->cpu_bind_type & CPU_BIND_LDMAP) {
		/* if HAVE_NUMA bind this task to the given locality
		 * domain. Otherwise bind this task to the given
		 * socket */
		uint32_t myldom = 0;
		if (xstrncmp(mstr, "0x", 2) == 0) {
			myldom = strtoul (&(mstr[2]), NULL, 16);
		} else {
			myldom = strtoul (mstr, NULL, 10);
		}
		return _bind_ldom(myldom, mask);
	}

	return false;
}
230 
231 #define	BUFFLEN	127
232 
233 /* Return true if Power7 processor */
_is_power_cpu(void)234 static bool _is_power_cpu(void)
235 {
236 	if (is_power == -1) {
237 #ifdef HAVE_SYSCTLBYNAME
238 
239 		char    buffer[BUFFLEN+1];
240 		size_t  len = BUFFLEN;
241 
242 		if ( sysctlbyname("hw.model", buffer, &len, NULL, 0) == 0 )
243 		    is_power = ( strstr(buffer, "POWER7") != NULL );
244 		else {
245 		    error("_get_is_power: sysctl could not retrieve hw.model");
246 		    return false;
247 		}
248 
249 #elif defined(__linux__)
250 
251 		FILE *cpu_info_file;
252 		char buffer[BUFFLEN+1];
253 		char* _cpuinfo_path = "/proc/cpuinfo";
254 		cpu_info_file = fopen(_cpuinfo_path, "r");
255 		if (cpu_info_file == NULL) {
256 			error("_get_is_power: error %d opening %s", errno,
257 			      _cpuinfo_path);
258 			return false;	/* assume not power processor */
259 		}
260 
261 		is_power = 0;
262 		while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) {
263 			if (strstr(buffer, "POWER7")) {
264 				is_power = 1;
265 				break;
266 			}
267 		}
268 		fclose(cpu_info_file);
269 
270 #else
271 
272 /* Assuming other platforms don't support sysctlbyname() or /proc/cpuinfo */
273 #warning	"Power7 check not implemented for this platform."
274 	is_power = 0;
275 
276 #endif
277 	}
278 
279 	if (is_power == 1)
280 		return true;
281 	return false;
282 }
283 
284 /* Translate global CPU index to local CPU index. This is needed for
285  * Power7 processors with multi-threading disabled. On those processors,
286  * the CPU mask has gaps for the unused threads (different from Intel
287  * processors) which need to be skipped over in the mask used in the
288  * set system call. */
reset_cpuset(cpu_set_t * new_mask,cpu_set_t * cur_mask)289 void reset_cpuset(cpu_set_t *new_mask, cpu_set_t *cur_mask)
290 {
291 	cpu_set_t full_mask, newer_mask;
292 	int cur_offset, new_offset = 0, last_set = -1;
293 
294 	if (!_is_power_cpu())
295 		return;
296 
297 	if (slurm_getaffinity(1, sizeof(full_mask), &full_mask)) {
298 		/* Try to get full CPU mask from process init */
299 		CPU_ZERO(&full_mask);
300 #if defined(__FreeBSD__) && !defined(CPU_ALLOC)
301 		CPU_OR(&full_mask, cur_mask);
302 #else
303 		CPU_OR(&full_mask, &full_mask, cur_mask);
304 #endif
305 	}
306 	CPU_ZERO(&newer_mask);
307 	for (cur_offset = 0; cur_offset < CPU_SETSIZE; cur_offset++) {
308 		if (!CPU_ISSET(cur_offset, &full_mask))
309 			continue;
310 		if (CPU_ISSET(new_offset, new_mask)) {
311 			CPU_SET(cur_offset, &newer_mask);
312 			last_set = cur_offset;
313 		}
314 		new_offset++;
315 	}
316 
317 	CPU_ZERO(new_mask);
318 	for (cur_offset = 0; cur_offset <= last_set; cur_offset++) {
319 		if (CPU_ISSET(cur_offset, &newer_mask))
320 			CPU_SET(cur_offset, new_mask);
321 	}
322 }
323 
slurm_setaffinity(pid_t pid,size_t size,const cpu_set_t * mask)324 int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask)
325 {
326 	int rval;
327 	char mstr[1 + CPU_SETSIZE / 4];
328 
329 #ifdef __FreeBSD__
330         rval = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID,
331 				pid, size, mask);
332 #else
333 	rval = sched_setaffinity(pid, size, mask);
334 #endif
335 	if (rval) {
336 		verbose("sched_setaffinity(%d,%zu,0x%s) failed: %m",
337 			pid, size, task_cpuset_to_str(mask, mstr));
338 	}
339 	return (rval);
340 }
341 
slurm_getaffinity(pid_t pid,size_t size,cpu_set_t * mask)342 int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask)
343 {
344 	int rval;
345 	char mstr[1 + CPU_SETSIZE / 4];
346 
347 	CPU_ZERO(mask);
348 
349 	/*
350 	 * The FreeBSD cpuset API is a superset of the Linux API.
351 	 * In addition to PIDs, it supports threads, interrupts,
352 	 * jails, and potentially other objects.  The first two arguments
353 	 * to cpuset_*etaffinity() below indicate that the third argument
354 	 * is a PID.  -1 indicates the PID of the calling process.
355 	 * Linux sched_*etaffinity() uses 0 for this.
356 	 */
357 #ifdef __FreeBSD__
358         rval = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID,
359 				pid, size, mask);
360 #else
361 	rval = sched_getaffinity(pid, size, mask);
362 #endif
363 	if (rval) {
364 		verbose("sched_getaffinity(%d,%zu,0x%s) failed with status %d",
365 			pid, size, task_cpuset_to_str(mask, mstr), rval);
366 	} else {
367 		debug3("sched_getaffinity(%d) = 0x%s",
368 		       pid, task_cpuset_to_str(mask, mstr));
369 	}
370 	return (rval);
371 }
372