1 
2 #define _GNU_SOURCE
3 #include <endian.h>
4 #include <errno.h>
5 #include <fcntl.h>
6 #include <grp.h>
7 #include <sched.h>
8 #include <setjmp.h>
9 #include <signal.h>
10 #include <stdarg.h>
11 #include <stdbool.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <stdbool.h>
16 #include <string.h>
17 #include <unistd.h>
18 
19 #include <sys/ioctl.h>
20 #include <sys/prctl.h>
21 #include <sys/socket.h>
22 #include <sys/types.h>
23 #include <sys/wait.h>
24 
25 #include <linux/limits.h>
26 #include <linux/netlink.h>
27 #include <linux/types.h>
28 
29 /* Get all of the CLONE_NEW* flags. */
30 #include "namespace.h"
31 
/*
 * Synchronisation values.
 *
 * Each value is sent as a whole `enum sync_t` over one of the socketpairs
 * that nsexec() creates to coordinate its parent, child and grandchild
 * stages.
 */
enum sync_t {
	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
	SYNC_USERMAP_ACK = 0x41,	/* Mapping finished by the parent. */
	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
};
41 
/*
 * Synchronisation value for cgroup namespace setup.
 * The same constant is defined in process_linux.go as "createCgroupns".
 * The final stage of nsexec() reads one byte from the init pipe and only
 * unshares the cgroup namespace if that byte equals this value.
 */
#define CREATECGROUPNS 0x80

/*
 * longjmp() arguments. These are the values setjmp() yields in nsexec() and
 * thereby select which stage runs. JUMP_PARENT is 0, which is what the
 * initial, direct call to setjmp() returns.
 */
#define JUMP_PARENT 0x00
#define JUMP_CHILD  0xA0
#define JUMP_INIT   0xA1
52 
/*
 * Argument block built by clone_parent() and handed to clone(): it provides
 * both the child's stack and the data child_func() needs to jump back into
 * nsexec().
 *
 * Assume the stack grows down, so arguments should be above it.
 */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	/* Zero-length marker located right after stack[]; passed to clone() as the stack pointer. */
	char stack_ptr[0];

	/* There's two children. This is used to execute the different code. */
	/* Jump buffer that child_func() longjmp()s to. */
	jmp_buf *env;
	/* Value handed to longjmp() (one of the JUMP_* constants). */
	int jmpval;
};
66 
/*
 * Bootstrap configuration decoded by nl_parse() from the netlink message
 * received on the init pipe.
 *
 * Every `char *` member other than `data` points into the `data` buffer, and
 * each `*_len` member holds the payload length of the corresponding netlink
 * attribute. Releasing `data` with nl_free() therefore invalidates all of
 * them.
 */
struct nlconfig_t {
	/* malloc()ed copy of the whole netlink payload; owned by this struct. */
	char *data;

	/* Process settings. */
	uint32_t cloneflags;
	char *oom_score_adj;
	size_t oom_score_adj_len;

	/* User namespace settings. */
	char *uidmap;
	size_t uidmap_len;
	char *gidmap;
	size_t gidmap_len;
	/* Comma-separated list of "type:path" entries, see join_namespaces(). */
	char *namespaces;
	size_t namespaces_len;
	uint8_t is_setgroup;

	/* Rootless container settings. */
	uint8_t is_rootless_euid;	/* boolean */
	/* Paths of the helper binaries used by try_mapping_tool(). */
	char *uidmappath;
	size_t uidmappath_len;
	char *gidmappath;
	size_t gidmappath_len;
};
91 
/* Level names emitted in the "level" field of each JSON log line. */
#define PANIC   "panic"
#define FATAL   "fatal"
#define ERROR   "error"
#define WARNING "warning"
#define INFO    "info"
#define DEBUG   "debug"

/*
 * File descriptor that log lines are written to. It stays negative (logging
 * disabled) until setup_logpipe() parses _LIBCONTAINER_LOGPIPE.
 */
static int logfd = -1;
100 
/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 *
 * INIT_MSG is the only accepted message type; the *_ATTR values are the
 * attribute types that nl_parse() maps onto members of struct nlconfig_t.
 */
#define INIT_MSG			62000
#define CLONE_FLAGS_ATTR	27281
#define NS_PATHS_ATTR		27282
#define UIDMAP_ATTR			27283
#define GIDMAP_ATTR			27284
#define SETGROUP_ATTR		27285
#define OOM_SCORE_ADJ_ATTR	27286
#define ROOTLESS_EUID_ATTR	27287
#define UIDMAPPATH_ATTR	    27288
#define GIDMAPPATH_ATTR	    27289
115 
/*
 * Use the raw syscall for versions of glibc which don't include a function
 * for it: this fallback is compiled only for glibc 2.x older than 2.14
 * (e.g. glibc 2.12). The build fails outright if no syscall number for
 * setns(2) can be found.
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
#	define _GNU_SOURCE
#	include "syscall.h"
#	if !defined(SYS_setns) && defined(__NR_setns)
#		define SYS_setns __NR_setns
#	endif

#ifndef SYS_setns
#	error "setns(2) syscall not supported by glibc version"
#endif

/* Thin wrapper that forwards its arguments to the setns syscall. */
int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif
136 
write_log_with_info(const char * level,const char * function,int line,const char * format,...)137 static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
138 {
139 	char message[1024] = {};
140 
141 	va_list args;
142 
143 	if (logfd < 0 || level == NULL)
144 		return;
145 
146 	va_start(args, format);
147 	if (vsnprintf(message, sizeof(message), format, args) < 0)
148 		goto done;
149 
150 	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
151 done:
152 	va_end(args);
153 }
154 
/*
 * Log a printf-style message at @level, tagged with the name of the calling
 * function and the source line of the call site.
 */
#define write_log(level, fmt, ...) \
	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)

/* XXX: This is ugly. */
/* Sync socket of the current stage; assigned by each stage of nsexec(). */
static int syncfd = -1;

/*
 * Log a FATAL message and terminate the process with exit status 1.
 *
 * @fmt must be a string literal: it is concatenated with an "nsenter: "
 * prefix and a ": %m" suffix, so the logged text ends with the description
 * of the current errno value.
 */
#define bail(fmt, ...)                                       \
	do {                                                       \
		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
		exit(1);                                                 \
	} while(0)
166 
/*
 * Write @data_len bytes of @data to an existing file.
 *
 * The path is built printf-style from @pathfmt and the following arguments.
 * The file is opened O_RDWR and is never created.
 *
 * Returns 0 on success and -1 on failure. On failure errno describes the
 * problem, which callers rely on (they inspect errno for ENOENT / EPERM):
 *   - ENAMETOOLONG if the formatted path does not fit into PATH_MAX bytes,
 *   - the error from open(2) or write(2),
 *   - EIO if write(2) wrote fewer bytes than requested.
 */
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
{
	int fd, len, saved_errno;
	ssize_t written;
	char path[PATH_MAX];
	va_list ap;

	va_start(ap, pathfmt);
	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
	va_end(ap);
	if (len < 0)
		return -1;
	if (len >= PATH_MAX) {
		/* Never operate on a silently truncated path. */
		errno = ENAMETOOLONG;
		return -1;
	}

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	written = write(fd, data, data_len);
	if (written < 0) {
		/* close() may overwrite errno; keep the error from write(). */
		saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return -1;
	}

	close(fd);
	if ((size_t)written != data_len) {
		/* A short write sets no errno; do not leave a stale value behind. */
		errno = EIO;
		return -1;
	}
	return 0;
}
194 
/* Value update_setgroups() should write to /proc/<pid>/setgroups, if any. */
enum policy_t {
	SETGROUPS_DEFAULT = 0,	/* Leave the file untouched. */
	SETGROUPS_ALLOW,	/* Write "allow". */
	SETGROUPS_DENY,	/* Write "deny". */
};
200 
201 /* This *must* be called before we touch gid_map. */
update_setgroups(int pid,enum policy_t setgroup)202 static void update_setgroups(int pid, enum policy_t setgroup)
203 {
204 	char *policy;
205 
206 	switch (setgroup) {
207 	case SETGROUPS_ALLOW:
208 		policy = "allow";
209 		break;
210 	case SETGROUPS_DENY:
211 		policy = "deny";
212 		break;
213 	case SETGROUPS_DEFAULT:
214 	default:
215 		/* Nothing to do. */
216 		return;
217 	}
218 
219 	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
220 		/*
221 		 * If the kernel is too old to support /proc/pid/setgroups,
222 		 * open(2) or write(2) will return ENOENT. This is fine.
223 		 */
224 		if (errno != ENOENT)
225 			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
226 	}
227 }
228 
/*
 * Run the external mapping helper @app (e.g. newuidmap/newgidmap) as
 * `app <pid> <map tokens...>` in a forked child and wait for it.
 *
 * @map is split in place on spaces and newlines into separate arguments, so
 * it must be a writable, NUL-terminated string. @map_len is currently unused.
 *
 * Returns the exit status of the helper if it exited normally, or -1 if it
 * was terminated by a signal. Callers treat any non-zero value as failure.
 * Bails if @app is NULL, if fork/exec/waitpid fail, or if the map has more
 * tokens than fit into the argument vector.
 */
static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
{
	int child;

	/*
	 * If @app is NULL, execve will segfault. Just check it here and bail (if
	 * we're in this path, the caller is already getting desperate and there
	 * isn't a backup to this failing). This usually would be a configuration
	 * or programming issue.
	 */
	if (!app)
		bail("mapping tool not present");

	child = fork();
	if (child < 0)
		bail("failed to fork");

	if (!child) {
#define MAX_ARGV 20
		char *argv[MAX_ARGV];
		char *envp[] = { NULL };
		char pid_fmt[16];
		int argc = 0;
		char *next;

		snprintf(pid_fmt, 16, "%d", pid);

		argv[argc++] = (char *)app;
		argv[argc++] = pid_fmt;

		/*
		 * Convert the map string into a list of argument that
		 * newuidmap/newgidmap can understand. One slot is always kept
		 * free for the terminating NULL that execve() requires.
		 */
		while (*map != '\0') {
			if (argc >= MAX_ARGV - 1)
				bail("too many mapping arguments for %s", app);
			argv[argc++] = map;
			next = strpbrk(map, "\n ");
			if (next == NULL)
				break;
			*next++ = '\0';
			map = next + strspn(next, "\n ");
		}
		argv[argc] = NULL;

		execve(app, argv, envp);
		bail("failed to execv");
	} else {
		int status;

		while (true) {
			if (waitpid(child, &status, 0) < 0) {
				if (errno == EINTR)
					continue;
				bail("failed to waitpid");
			}
			if (WIFEXITED(status))
				return WEXITSTATUS(status);
			/*
			 * WEXITSTATUS() is meaningless for a signalled child (and
			 * would typically read as 0, i.e. success), so report an
			 * explicit failure.
			 */
			if (WIFSIGNALED(status))
				return -1;
		}
	}

	return -1;
}
294 
/*
 * Install the uid mapping @map for process @pid.
 *
 * The mapping is first written directly to /proc/<pid>/uid_map. If that
 * fails with EPERM the helper binary at @path is tried instead; any other
 * failure, or a failing helper, is fatal. An absent or empty map is a no-op.
 */
static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
{
	if (!map || map_len == 0)
		return;

	if (write_file(map, map_len, "/proc/%d/uid_map", pid) >= 0)
		return;

	if (errno != EPERM)
		bail("failed to update /proc/%d/uid_map", pid);

	if (try_mapping_tool(path, pid, map, map_len) != 0)
		bail("failed to use newuid map on %d", pid);
}
307 
/*
 * Install the gid mapping @map for process @pid.
 *
 * The mapping is first written directly to /proc/<pid>/gid_map. If that
 * fails with EPERM the helper binary at @path is tried instead; any other
 * failure, or a failing helper, is fatal. An absent or empty map is a no-op.
 */
static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
{
	if (!map || map_len == 0)
		return;

	if (write_file(map, map_len, "/proc/%d/gid_map", pid) >= 0)
		return;

	if (errno != EPERM)
		bail("failed to update /proc/%d/gid_map", pid);

	if (try_mapping_tool(path, pid, map, map_len) != 0)
		bail("failed to use newgid map on %d", pid);
}
320 
/*
 * Write the @len bytes at @data to /proc/self/oom_score_adj. Does nothing
 * when no value was supplied; a failed write is fatal.
 */
static void update_oom_score_adj(char *data, size_t len)
{
	int rc;

	if (!data || len == 0)
		return;

	rc = write_file(data, len, "/proc/self/oom_score_adj");
	if (rc < 0)
		bail("failed to update /proc/self/oom_score_adj");
}
329 
330 /* A dummy function that just jumps to the given jumpval. */
331 static int child_func(void *arg) __attribute__ ((noinline));
child_func(void * arg)332 static int child_func(void *arg)
333 {
334 	struct clone_t *ca = (struct clone_t *)arg;
335 	longjmp(*ca->env, ca->jmpval);
336 }
337 
/*
 * Create a new process with clone(CLONE_PARENT | SIGCHLD) that starts in
 * child_func(), which immediately longjmp()s to @env with @jmpval.
 *
 * The clone_t argument block lives on this function's stack and also
 * supplies the child's initial stack (ca.stack_ptr, the end of ca.stack).
 *
 * Returns whatever clone() returns to the caller.
 */
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
	struct clone_t ca = {
		.env = env,
		.jmpval = jmpval,
	};

	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
348 
/*
 * Gets the init pipe fd from the environment, which is used to read the
 * bootstrap data and tell the parent what the new pid is after we finish
 * setting up the environment.
 *
 * Returns -1 if _LIBCONTAINER_INITPIPE is unset or empty. Bails if the value
 * is not a plain decimal number or cannot be a file descriptor (negative, or
 * too large to be represented as an int).
 */
static int initpipe(void)
{
	long value;
	int pipenum;
	char *env_value, *endptr;

	env_value = getenv("_LIBCONTAINER_INITPIPE");
	if (env_value == NULL || *env_value == '\0')
		return -1;

	errno = 0;
	value = strtol(env_value, &endptr, 10);
	if (endptr == env_value || *endptr != '\0')
		bail("unable to parse _LIBCONTAINER_INITPIPE");

	/* Reject values that would be silently truncated or are not a valid fd. */
	pipenum = (int)value;
	if (errno == ERANGE || value < 0 || (long)pipenum != value) {
		errno = EBADF;
		bail("_LIBCONTAINER_INITPIPE is not a valid file descriptor");
	}

	return pipenum;
}
369 
setup_logpipe(void)370 static void setup_logpipe(void)
371 {
372 	char *logpipe, *endptr;
373 
374 	logpipe = getenv("_LIBCONTAINER_LOGPIPE");
375 	if (logpipe == NULL || *logpipe == '\0') {
376 		return;
377 	}
378 
379 	logfd = strtol(logpipe, &endptr, 10);
380 	if (logpipe == endptr || *endptr != '\0') {
381 		fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe);
382 		/* It is too early to use bail */
383 		exit(1);
384 	}
385 }
386 
/*
 * Returns the clone(2) flag for a namespace, given the name of a namespace
 * ("cgroup", "ipc", "mnt", "net", "pid", "user" or "uts"). The comparison is
 * exact and case-sensitive; unrecognised names yield 0.
 */
static int nsflag(char *name)
{
	static const struct {
		const char *name;
		int flag;
	} known[] = {
		{ "cgroup", CLONE_NEWCGROUP },
		{ "ipc", CLONE_NEWIPC },
		{ "mnt", CLONE_NEWNS },
		{ "net", CLONE_NEWNET },
		{ "pid", CLONE_NEWPID },
		{ "user", CLONE_NEWUSER },
		{ "uts", CLONE_NEWUTS },
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(name, known[i].name) == 0)
			return known[i].flag;
	}

	/* If we don't recognise a name, fallback to 0. */
	return 0;
}
408 
/*
 * Read a native-endian uint32_t from the (possibly unaligned) bytes at @buf.
 *
 * memcpy() is used instead of a pointer cast: dereferencing a `char *` as
 * `uint32_t *` violates strict aliasing and is undefined for misaligned
 * addresses.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
413 
/* Return the first byte at @buf as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	return (uint8_t)buf[0];
}
418 
nl_parse(int fd,struct nlconfig_t * config)419 static void nl_parse(int fd, struct nlconfig_t *config)
420 {
421 	size_t len, size;
422 	struct nlmsghdr hdr;
423 	char *data, *current;
424 
425 	/* Retrieve the netlink header. */
426 	len = read(fd, &hdr, NLMSG_HDRLEN);
427 	if (len != NLMSG_HDRLEN)
428 		bail("invalid netlink header length %zu", len);
429 
430 	if (hdr.nlmsg_type == NLMSG_ERROR)
431 		bail("failed to read netlink message");
432 
433 	if (hdr.nlmsg_type != INIT_MSG)
434 		bail("unexpected msg type %d", hdr.nlmsg_type);
435 
436 	/* Retrieve data. */
437 	size = NLMSG_PAYLOAD(&hdr, 0);
438 	current = data = malloc(size);
439 	if (!data)
440 		bail("failed to allocate %zu bytes of memory for nl_payload", size);
441 
442 	len = read(fd, data, size);
443 	if (len != size)
444 		bail("failed to read netlink payload, %zu != %zu", len, size);
445 
446 	/* Parse the netlink payload. */
447 	config->data = data;
448 	while (current < data + size) {
449 		struct nlattr *nlattr = (struct nlattr *)current;
450 		size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
451 
452 		/* Advance to payload. */
453 		current += NLA_HDRLEN;
454 
455 		/* Handle payload. */
456 		switch (nlattr->nla_type) {
457 		case CLONE_FLAGS_ATTR:
458 			config->cloneflags = readint32(current);
459 			break;
460 		case ROOTLESS_EUID_ATTR:
461 			config->is_rootless_euid = readint8(current);	/* boolean */
462 			break;
463 		case OOM_SCORE_ADJ_ATTR:
464 			config->oom_score_adj = current;
465 			config->oom_score_adj_len = payload_len;
466 			break;
467 		case NS_PATHS_ATTR:
468 			config->namespaces = current;
469 			config->namespaces_len = payload_len;
470 			break;
471 		case UIDMAP_ATTR:
472 			config->uidmap = current;
473 			config->uidmap_len = payload_len;
474 			break;
475 		case GIDMAP_ATTR:
476 			config->gidmap = current;
477 			config->gidmap_len = payload_len;
478 			break;
479 		case UIDMAPPATH_ATTR:
480 			config->uidmappath = current;
481 			config->uidmappath_len = payload_len;
482 			break;
483 		case GIDMAPPATH_ATTR:
484 			config->gidmappath = current;
485 			config->gidmappath_len = payload_len;
486 			break;
487 		case SETGROUP_ATTR:
488 			config->is_setgroup = readint8(current);
489 			break;
490 		default:
491 			bail("unknown netlink message type %d", nlattr->nla_type);
492 		}
493 
494 		current += NLA_ALIGN(payload_len);
495 	}
496 }
497 
nl_free(struct nlconfig_t * config)498 void nl_free(struct nlconfig_t *config)
499 {
500 	free(config->data);
501 }
502 
/*
 * Join every namespace listed in @nslist, a comma-separated list of
 * "type:path" entries (e.g. "user:/proc/1/ns/user,mnt:/proc/1/ns/mnt").
 *
 * @nslist is split in place and must stay valid for the duration of the
 * call. All paths are opened before the first setns(2), and the namespaces
 * are then joined in the order given. Any failure is fatal (bail).
 */
void join_namespaces(char *nslist)
{
	int num = 0, i;
	char *saveptr = NULL;
	char *spec = strtok_r(nslist, ",", &saveptr);
	struct namespace_t {
		int fd;
		int ns;
		/* Points into nslist; only used for error messages. */
		const char *path;
	} *namespaces = NULL;

	if (!spec || !strlen(spec) || !strlen(nslist))
		bail("ns paths are empty");

	/*
	 * We have to open the file descriptors first, since after
	 * we join the mnt namespace we might no longer be able to
	 * access the paths.
	 */
	do {
		char *path;
		struct namespace_t *ns, *grown;

		/* Resize the namespace array. */
		grown = realloc(namespaces, (size_t)(num + 1) * sizeof(*namespaces));
		if (!grown)
			bail("failed to reallocate namespace array");
		namespaces = grown;
		ns = &namespaces[num++];

		/* Split 'ns:path'. */
		path = strchr(spec, ':');
		if (!path)
			bail("failed to parse %s", spec);
		*path++ = '\0';

		ns->fd = open(path, O_RDONLY);
		if (ns->fd < 0)
			bail("failed to open %s", path);

		ns->ns = nsflag(spec);
		ns->path = path;
	} while ((spec = strtok_r(NULL, ",", &saveptr)) != NULL);

	/*
	 * The ordering in which we join namespaces is important. We should
	 * always join the user namespace *first*. This is all guaranteed
	 * from the container_linux.go side of this, so we're just going to
	 * follow the order given to us.
	 */

	for (i = 0; i < num; i++) {
		const struct namespace_t *ns = &namespaces[i];

		if (setns(ns->fd, ns->ns) < 0)
			bail("failed to setns to %s", ns->path);

		close(ns->fd);
	}

	free(namespaces);
}
568 
569 /* Defined in cloned_binary.c. */
570 extern int ensure_cloned_binary(void);
571 
nsexec(void)572 void nsexec(void)
573 {
574 	int pipenum;
575 	jmp_buf env;
576 	int sync_child_pipe[2], sync_grandchild_pipe[2];
577 	struct nlconfig_t config = { 0 };
578 
579 	/*
580 	 * Setup a pipe to send logs to the parent. This should happen
581 	 * first, because bail will use that pipe.
582 	 */
583 	setup_logpipe();
584 
585 	/*
586 	 * If we don't have an init pipe, just return to the go routine.
587 	 * We'll only get an init pipe for start or exec.
588 	 */
589 	pipenum = initpipe();
590 	if (pipenum == -1)
591 		return;
592 
593 	/*
594 	 * We need to re-exec if we are not in a cloned binary. This is necessary
595 	 * to ensure that containers won't be able to access the host binary
596 	 * through /proc/self/exe. See CVE-2019-5736.
597 	 */
598 	if (ensure_cloned_binary() < 0)
599 		bail("could not ensure we are a cloned binary");
600 
601 	write_log(DEBUG, "nsexec started");
602 
603 	/* Parse all of the netlink configuration. */
604 	nl_parse(pipenum, &config);
605 
606 	/* Set oom_score_adj. This has to be done before !dumpable because
607 	 * /proc/self/oom_score_adj is not writeable unless you're an privileged
608 	 * user (if !dumpable is set). All children inherit their parent's
609 	 * oom_score_adj value on fork(2) so this will always be propagated
610 	 * properly.
611 	 */
612 	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
613 
614 	/*
615 	 * Make the process non-dumpable, to avoid various race conditions that
616 	 * could cause processes in namespaces we're joining to access host
617 	 * resources (or potentially execute code).
618 	 *
619 	 * However, if the number of namespaces we are joining is 0, we are not
620 	 * going to be switching to a different security context. Thus setting
621 	 * ourselves to be non-dumpable only breaks things (like rootless
622 	 * containers), which is the recommendation from the kernel folks.
623 	 */
624 	if (config.namespaces) {
625 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
626 			bail("failed to set process as non-dumpable");
627 	}
628 
629 	/* Pipe so we can tell the child when we've finished setting up. */
630 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
631 		bail("failed to setup sync pipe between parent and child");
632 
633 	/*
634 	 * We need a new socketpair to sync with grandchild so we don't have
635 	 * race condition with child.
636 	 */
637 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
638 		bail("failed to setup sync pipe between parent and grandchild");
639 
640 	/* TODO: Currently we aren't dealing with child deaths properly. */
641 
642 	/*
643 	 * Okay, so this is quite annoying.
644 	 *
645 	 * In order for this unsharing code to be more extensible we need to split
646 	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
647 	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
648 	 * separately, but because of SELinux issues we cannot really do that. But
649 	 * we cannot just dump the namespace flags into clone(...) because several
650 	 * usecases (such as rootless containers) require more granularity around
651 	 * the namespace setup. In addition, some older kernels had issues where
652 	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
653 	 * handle this while also dealing with SELinux so we choose SELinux support
654 	 * over broken kernel support).
655 	 *
656 	 * However, if we unshare(2) the user namespace *before* we clone(2), then
657 	 * all hell breaks loose.
658 	 *
659 	 * The parent no longer has permissions to do many things (unshare(2) drops
660 	 * all capabilities in your old namespace), and the container cannot be set
661 	 * up to have more than one {uid,gid} mapping. This is obviously less than
662 	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
663 	 *
664 	 * Unfortunately, it's not as simple as that. We have to fork to enter the
665 	 * PID namespace (the PID namespace only applies to children). Since we'll
666 	 * have to double-fork, this clone_parent() call won't be able to get the
667 	 * PID of the _actual_ init process (without doing more synchronisation than
668 	 * I can deal with at the moment). So we'll just get the parent to send it
669 	 * for us, the only job of this process is to update
670 	 * /proc/pid/{setgroups,uid_map,gid_map}.
671 	 *
672 	 * And as a result of the above, we also need to setns(2) in the first child
673 	 * because if we join a PID namespace in the topmost parent then our child
674 	 * will be in that namespace (and it will not be able to give us a PID value
675 	 * that makes sense without resorting to sending things with cmsg).
676 	 *
677 	 * This also deals with an older issue caused by dumping cloneflags into
678 	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
679 	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
680 	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
681 	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
682 	 * aware, the last mainline kernel which had this bug was Linux 3.12.
683 	 * However, we cannot comment on which kernels the broken patch was
684 	 * backported to.
685 	 *
686 	 * -- Aleksa "what has my life come to?" Sarai
687 	 */
688 
689 	switch (setjmp(env)) {
690 		/*
691 		 * Stage 0: We're in the parent. Our job is just to create a new child
692 		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
693 		 *          gid_map. That process will go on to create a new process, then
694 		 *          it will send us its PID which we will send to the bootstrap
695 		 *          process.
696 		 */
697 	case JUMP_PARENT:{
698 			int len;
699 			pid_t child, first_child = -1;
700 			bool ready = false;
701 
702 			/* For debugging. */
703 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
704 
705 			/* Start the process of getting a container. */
706 			child = clone_parent(&env, JUMP_CHILD);
707 			if (child < 0)
708 				bail("unable to fork: child_func");
709 
710 			/*
711 			 * State machine for synchronisation with the children.
712 			 *
713 			 * Father only return when both child and grandchild are
714 			 * ready, so we can receive all possible error codes
715 			 * generated by children.
716 			 */
717 			syncfd = sync_child_pipe[1];
718 			close(sync_child_pipe[0]);
719 
720 			while (!ready) {
721 				enum sync_t s;
722 
723 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
724 					bail("failed to sync with child: next state");
725 
726 				switch (s) {
727 				case SYNC_USERMAP_PLS:
728 					/*
729 					 * Enable setgroups(2) if we've been asked to. But we also
730 					 * have to explicitly disable setgroups(2) if we're
731 					 * creating a rootless container for single-entry mapping.
732 					 * i.e. config.is_setgroup == false.
733 					 * (this is required since Linux 3.19).
734 					 *
735 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
736 					 * newuidmap/newgidmap shall be used.
737 					 */
738 
739 					if (config.is_rootless_euid && !config.is_setgroup)
740 						update_setgroups(child, SETGROUPS_DENY);
741 
742 					/* Set up mappings. */
743 					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
744 					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
745 
746 					s = SYNC_USERMAP_ACK;
747 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
748 						kill(child, SIGKILL);
749 						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
750 					}
751 					break;
752 				case SYNC_RECVPID_PLS:{
753 						first_child = child;
754 
755 						/* Get the init_func pid. */
756 						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
757 							kill(first_child, SIGKILL);
758 							bail("failed to sync with child: read(childpid)");
759 						}
760 
761 						/* Send ACK. */
762 						s = SYNC_RECVPID_ACK;
763 						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
764 							kill(first_child, SIGKILL);
765 							kill(child, SIGKILL);
766 							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
767 						}
768 
769 						/* Send the init_func pid back to our parent.
770 						 *
771 						 * Send the init_func pid and the pid of the first child back to our parent.
772 						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
773 						 * It becomes the responsibility of our parent to reap the first child.
774 						 */
775 						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
776 						if (len < 0) {
777 							kill(child, SIGKILL);
778 							bail("unable to generate JSON for child pid");
779 						}
780 					}
781 					break;
782 				case SYNC_CHILD_READY:
783 					ready = true;
784 					break;
785 				default:
786 					bail("unexpected sync value: %u", s);
787 				}
788 			}
789 
790 			/* Now sync with grandchild. */
791 
792 			syncfd = sync_grandchild_pipe[1];
793 			close(sync_grandchild_pipe[0]);
794 
795 			ready = false;
796 			while (!ready) {
797 				enum sync_t s;
798 
799 				s = SYNC_GRANDCHILD;
800 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
801 					kill(child, SIGKILL);
802 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
803 				}
804 
805 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
806 					bail("failed to sync with child: next state");
807 
808 				switch (s) {
809 				case SYNC_CHILD_READY:
810 					ready = true;
811 					break;
812 				default:
813 					bail("unexpected sync value: %u", s);
814 				}
815 			}
816 			exit(0);
817 		}
818 
819 		/*
820 		 * Stage 1: We're in the first child process. Our job is to join any
821 		 *          provided namespaces in the netlink payload and unshare all
822 		 *          of the requested namespaces. If we've been asked to
823 		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
824 		 *          our user mappings for us. Then, we create a new child
825 		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
826 		 *          child's PID to our parent (stage 0).
827 		 */
828 	case JUMP_CHILD:{
829 			pid_t child;
830 			enum sync_t s;
831 
832 			/* We're in a child and thus need to tell the parent if we die. */
833 			syncfd = sync_child_pipe[0];
834 			close(sync_child_pipe[1]);
835 
836 			/* For debugging. */
837 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
838 
839 			/*
840 			 * We need to setns first. We cannot do this earlier (in stage 0)
841 			 * because of the fact that we forked to get here (the PID of
842 			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
843 			 * using cmsg(3) but that's just annoying.
844 			 */
845 			if (config.namespaces)
846 				join_namespaces(config.namespaces);
847 
848 			/*
849 			 * Deal with user namespaces first. They are quite special, as they
850 			 * affect our ability to unshare other namespaces and are used as
851 			 * context for privilege checks.
852 			 *
853 			 * We don't unshare all namespaces in one go. The reason for this
854 			 * is that, while the kernel documentation may claim otherwise,
855 			 * there are certain cases where unsharing all namespaces at once
856 			 * will result in namespace objects being owned incorrectly.
857 			 * Ideally we should just fix these kernel bugs, but it's better to
858 			 * be safe than sorry, and fix them separately.
859 			 *
860 			 * A specific case of this is that the SELinux label of the
861 			 * internal kern-mount that mqueue uses will be incorrect if the
862 			 * UTS namespace is cloned before the USER namespace is mapped.
863 			 * I've also heard of similar problems with the network namespace
864 			 * in some scenarios. This also mirrors how LXC deals with this
865 			 * problem.
866 			 */
867 			if (config.cloneflags & CLONE_NEWUSER) {
868 				if (unshare(CLONE_NEWUSER) < 0)
869 					bail("failed to unshare user namespace");
870 				config.cloneflags &= ~CLONE_NEWUSER;
871 
872 				/*
873 				 * We don't have the privileges to do any mapping here (see the
874 				 * clone_parent rant). So signal our parent to hook us up.
875 				 */
876 
877 				/* Switching is only necessary if we joined namespaces. */
878 				if (config.namespaces) {
879 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
880 						bail("failed to set process as dumpable");
881 				}
882 				s = SYNC_USERMAP_PLS;
883 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
884 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
885 
886 				/* ... wait for mapping ... */
887 
888 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
889 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
890 				if (s != SYNC_USERMAP_ACK)
891 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
892 				/* Switching is only necessary if we joined namespaces. */
893 				if (config.namespaces) {
894 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
895 						bail("failed to set process as dumpable");
896 				}
897 
898 				/* Become root in the namespace proper. */
899 				if (setresuid(0, 0, 0) < 0)
900 					bail("failed to become root in user namespace");
901 			}
902 			/*
903 			 * Unshare all of the namespaces. Now, it should be noted that this
904 			 * ordering might break in the future (especially with rootless
905 			 * containers). But for now, it's not possible to split this into
906 			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
907 			 *
908 			 * Note that we don't merge this with clone() because there were
909 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
910 			 * was broken, so we'll just do it the long way anyway.
911 			 */
912 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
913 				bail("failed to unshare namespaces");
914 
915 			/*
916 			 * TODO: What about non-namespace clone flags that we're dropping here?
917 			 *
918 			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
919 			 * change the PID namespace of the calling process, because doing so
920 			 * would change the caller's idea of its own PID (as reported by getpid()),
921 			 * which would break many applications and libraries, so we must fork
922 			 * to actually enter the new PID namespace.
923 			 */
924 			child = clone_parent(&env, JUMP_INIT);
925 			if (child < 0)
926 				bail("unable to fork: init_func");
927 
928 			/* Send the child to our parent, which knows what it's doing. */
929 			s = SYNC_RECVPID_PLS;
930 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
931 				kill(child, SIGKILL);
932 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
933 			}
934 			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
935 				kill(child, SIGKILL);
936 				bail("failed to sync with parent: write(childpid)");
937 			}
938 
939 			/* ... wait for parent to get the pid ... */
940 
941 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
942 				kill(child, SIGKILL);
943 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
944 			}
945 			if (s != SYNC_RECVPID_ACK) {
946 				kill(child, SIGKILL);
947 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
948 			}
949 
950 			s = SYNC_CHILD_READY;
951 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
952 				kill(child, SIGKILL);
953 				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
954 			}
955 
956 			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
957 			exit(0);
958 		}
959 
960 		/*
961 		 * Stage 2: We're the final child process, and the only process that will
962 		 *          actually return to the Go runtime. Our job is to just do the
963 		 *          final cleanup steps and then return to the Go runtime to allow
964 		 *          init_linux.go to run.
965 		 */
966 	case JUMP_INIT:{
967 			/*
968 			 * We're inside the child now, having jumped from the
969 			 * start_child() code after forking in the parent.
970 			 */
971 			enum sync_t s;
972 
973 			/* We're in a child and thus need to tell the parent if we die. */
974 			syncfd = sync_grandchild_pipe[0];
975 			close(sync_grandchild_pipe[1]);
976 			close(sync_child_pipe[0]);
977 			close(sync_child_pipe[1]);
978 
979 			/* For debugging. */
980 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
981 
982 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
983 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
984 			if (s != SYNC_GRANDCHILD)
985 				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
986 
987 			if (setsid() < 0)
988 				bail("setsid failed");
989 
990 			if (setuid(0) < 0)
991 				bail("setuid failed");
992 
993 			if (setgid(0) < 0)
994 				bail("setgid failed");
995 
996 			if (!config.is_rootless_euid && config.is_setgroup) {
997 				if (setgroups(0, NULL) < 0)
998 					bail("setgroups failed");
999 			}
1000 
1001 			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
1002 			if (config.cloneflags & CLONE_NEWCGROUP) {
1003 				uint8_t value;
1004 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
1005 					bail("read synchronisation value failed");
1006 				if (value == CREATECGROUPNS) {
1007 					if (unshare(CLONE_NEWCGROUP) < 0)
1008 						bail("failed to unshare cgroup namespace");
1009 				} else
1010 					bail("received unknown synchronisation value");
1011 			}
1012 
1013 			s = SYNC_CHILD_READY;
1014 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
1015 				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
1016 
1017 			/* Close sync pipes. */
1018 			close(sync_grandchild_pipe[0]);
1019 
1020 			/* Free netlink data. */
1021 			nl_free(&config);
1022 
1023 			/* Finish executing, let the Go runtime take over. */
1024 			return;
1025 		}
1026 	default:
1027 		bail("unexpected jump value");
1028 	}
1029 
1030 	/* Should never be reached. */
1031 	bail("should never be reached");
1032 }
1033