1 /*
2  * Copyright (c) 2019-2021 Joris Vink <joris@coders.se>
3  *
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include <sys/param.h>
18 #include <sys/mman.h>
19 #include <sys/epoll.h>
20 #include <sys/ptrace.h>
21 #include <sys/prctl.h>
22 #include <sys/user.h>
23 #include <sys/syscall.h>
24 
25 #include <linux/ptrace.h>
26 #include <linux/seccomp.h>
27 #include <linux/filter.h>
28 #include <linux/audit.h>
29 
30 #include <stddef.h>
31 #include <sched.h>
32 
33 #include "kore.h"
34 #include "seccomp.h"
35 #include "platform.h"
36 
37 #if defined(KORE_USE_PYTHON)
38 #include "python_api.h"
39 #endif
40 
41 #if !defined(SECCOMP_KILL_POLICY)
42 #define SECCOMP_KILL_POLICY		SECCOMP_RET_KILL
43 #endif
44 
/*
 * The bare minimum to be able to run kore. These are added last and can
 * be overridden by a filter program that is added beforehand.
 */
static struct sock_filter filter_kore[] = {
	/* Deny these, but with EACCES instead of dying. */
	KORE_SYSCALL_DENY(ioctl, EACCES),

	/* File related. */
#if defined(SYS_open)
	KORE_SYSCALL_ALLOW(open),
#endif
	KORE_SYSCALL_ALLOW(read),
#if defined(SYS_stat)
	KORE_SYSCALL_ALLOW(stat),
#endif
#if defined(SYS_stat64)
	KORE_SYSCALL_ALLOW(stat64),
#endif
#if defined(SYS_lstat)
	KORE_SYSCALL_ALLOW(lstat),
#endif
	KORE_SYSCALL_ALLOW(fstat),
#if defined(SYS_fstat64)
	KORE_SYSCALL_ALLOW(fstat64),
#endif
	KORE_SYSCALL_ALLOW(write),
	KORE_SYSCALL_ALLOW(fcntl),
#if defined(SYS_fcntl64)
	KORE_SYSCALL_ALLOW(fcntl64),
#endif
	KORE_SYSCALL_ALLOW(lseek),
#if defined(SYS__llseek)
	KORE_SYSCALL_ALLOW(_llseek),
#endif
	KORE_SYSCALL_ALLOW(close),
	KORE_SYSCALL_ALLOW(openat),
#if defined(SYS_access)
	KORE_SYSCALL_ALLOW(access),
#endif
	KORE_SYSCALL_ALLOW(writev),
	KORE_SYSCALL_ALLOW(getcwd),
#if defined(SYS_unlink)
	KORE_SYSCALL_ALLOW(unlink),
#endif
#if defined(SYS_readlink)
	KORE_SYSCALL_ALLOW(readlink),
#endif

	/* Process related. */
	KORE_SYSCALL_ALLOW(exit),
	KORE_SYSCALL_ALLOW(kill),
	KORE_SYSCALL_ALLOW(getpid),
	KORE_SYSCALL_ALLOW(getuid),
	KORE_SYSCALL_ALLOW(geteuid),
	KORE_SYSCALL_ALLOW(exit_group),
	KORE_SYSCALL_ALLOW(nanosleep),
	KORE_SYSCALL_ALLOW(clock_nanosleep),
#if defined(SYS_sigreturn)
	KORE_SYSCALL_ALLOW(sigreturn),
#endif

	/* Memory related. */
	KORE_SYSCALL_ALLOW(brk),
	KORE_SYSCALL_ALLOW(munmap),

	/*
	 * Deny mmap calls carrying PROT_EXEC | PROT_WRITE and mprotect calls
	 * carrying PROT_EXEC, answering with EINVAL. These rules sit ahead of
	 * the plain allow rules for the same system calls further down so
	 * that they are evaluated first.
	 */
#if defined(SYS_mmap)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
	KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),

#if defined(SYS_mmap)
	KORE_SYSCALL_ALLOW(mmap),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_ALLOW(mmap2),
#endif
	KORE_SYSCALL_ALLOW(madvise),
	KORE_SYSCALL_ALLOW(mprotect),

	/* Net related. */
#if defined(SYS_poll)
	KORE_SYSCALL_ALLOW(poll),
#endif
	KORE_SYSCALL_ALLOW(ppoll),
#if defined(SYS_send)
	KORE_SYSCALL_ALLOW(send),
#endif
	KORE_SYSCALL_ALLOW(sendto),
	KORE_SYSCALL_ALLOW(accept),
	KORE_SYSCALL_ALLOW(sendfile),
#if defined(SYS_recv)
	KORE_SYSCALL_ALLOW(recv),
#endif
	KORE_SYSCALL_ALLOW(recvfrom),
	KORE_SYSCALL_ALLOW(epoll_ctl),
	KORE_SYSCALL_ALLOW(setsockopt),
#if defined(SYS_epoll_wait)
	KORE_SYSCALL_ALLOW(epoll_wait),
#endif
	KORE_SYSCALL_ALLOW(epoll_pwait),

	/* Signal related. */
	KORE_SYSCALL_ALLOW(sigaltstack),
	KORE_SYSCALL_ALLOW(rt_sigreturn),
	KORE_SYSCALL_ALLOW(rt_sigaction),
	KORE_SYSCALL_ALLOW(rt_sigprocmask),

	/* "Other" without clear category. */
	KORE_SYSCALL_ALLOW(futex),
	KORE_SYSCALL_ALLOW(clock_gettime),

	/* Only present when the build headers know about getrandom. */
#if defined(__NR_getrandom)
	KORE_SYSCALL_ALLOW(getrandom),
#endif
};
165 
/*
 * BPF program prologue. Copied to the very start of the program built by
 * kore_seccomp_enable(): it checks the architecture and leaves the system
 * call number in the accumulator for the filters that follow.
 */
static struct sock_filter filter_prologue[] = {
	/* Load arch member into accumulator (A) (arch is __u32). */
	KORE_BPF_LOAD(arch, 0),

	/*
	 * Compare the accumulator against the expected audit architecture;
	 * the instruction after the compare kills on an unexpected arch.
	 * NOTE(review): presumably KORE_BPF_CMP(k, jt, jf) is a jump-if-equal,
	 * so a match skips the kill (jt = 1) -- confirm against seccomp.h.
	 */
	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
	KORE_BPF_RET(SECCOMP_RET_KILL),

	/* Load the system call number into the accumulator. */
	KORE_BPF_LOAD(nr, 0),
};
178 
/*
 * BPF program epilogue. Copied to the very end of the program built by
 * kore_seccomp_enable().
 */
static struct sock_filter filter_epilogue[] = {
	/*
	 * Default verdict when none of the preceding filters matched the
	 * system call. kore_seccomp_enable() rewrites the k field of this
	 * instruction to SECCOMP_RET_TRACE when kore_seccomp_tracing is set.
	 */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
};
184 
/* Duplicates a filter template and patches in the resolved syscall number. */
static struct sock_filter	*seccomp_filter_update(struct sock_filter *,
				    const char *, size_t);

/* Number of instructions in the fixed prologue and epilogue programs. */
#define filter_prologue_len	KORE_FILTER_LEN(filter_prologue)
#define filter_epilogue_len	KORE_FILTER_LEN(filter_epilogue)

/* Logs the system call a traced process was stopped on. */
static void	seccomp_register_violation(pid_t);
192 
/* A named BPF fragment queued for inclusion in the final seccomp program. */
struct filter {
	/* Copy of the name made with kore_strdup(), released on drop. */
	char			*name;
	/* Caller supplied instructions; stored by pointer, never copied. */
	struct sock_filter	*prog;
	/* Number of instructions prog points at. */
	size_t			instructions;
	TAILQ_ENTRY(filter)	list;
};

/* All registered filters, in the order they end up in the program. */
static TAILQ_HEAD(, filter)	filters;

/*
 * Insertion point used while the application hooks run: when non-NULL,
 * kore_seccomp_filter() places new filters in front of this entry instead
 * of appending them to the list.
 */
static struct filter		*ufilter = NULL;
202 
/*
 * If enabled, instructs the parent process to ptrace its children and
 * log every system call that hits a SECCOMP_RET_TRACE rule. Enabling it
 * also turns the default policy of the filter program into
 * SECCOMP_RET_TRACE (see kore_seccomp_enable()).
 */
int	kore_seccomp_tracing = 0;
208 
/*
 * Set up the (initially empty) list of seccomp filters.
 */
void
kore_seccomp_init(void)
{
	TAILQ_INIT(&filters);
}
214 
215 void
kore_seccomp_drop(void)216 kore_seccomp_drop(void)
217 {
218 	struct filter		*filter;
219 
220 	while ((filter = TAILQ_FIRST(&filters)) != NULL) {
221 		if (!kore_quiet) {
222 			kore_log(LOG_INFO,
223 			    "seccomp filter '%s' dropped", filter->name);
224 		}
225 		TAILQ_REMOVE(&filters, filter, list);
226 		kore_free(filter->name);
227 		kore_free(filter);
228 	}
229 
230 	TAILQ_INIT(&filters);
231 }
232 
233 void
kore_seccomp_enable(void)234 kore_seccomp_enable(void)
235 {
236 	struct sock_filter		*sf;
237 	struct sock_fprog		prog;
238 	struct kore_runtime_call	*rcall;
239 	struct filter			*filter;
240 	size_t				prog_len, off, i;
241 
242 	/*
243 	 * If kore_seccomp_tracing is turned on, set the default policy to
244 	 * SECCOMP_RET_TRACE so we can log the system calls.
245 	 */
246 	if (kore_seccomp_tracing) {
247 		filter_epilogue[0].k = SECCOMP_RET_TRACE;
248 		kore_log(LOG_NOTICE, "seccomp tracing enabled");
249 	}
250 
251 #if defined(KORE_USE_PYTHON)
252 	ufilter = TAILQ_FIRST(&filters);
253 	kore_python_seccomp_hook("koreapp.seccomp");
254 	ufilter = NULL;
255 #endif
256 
257 	/* Allow application to add its own filters. */
258 	if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
259 		ufilter = TAILQ_FIRST(&filters);
260 		kore_runtime_execute(rcall);
261 		kore_free(rcall);
262 		ufilter = NULL;
263 	}
264 
265 	if (worker->id != KORE_WORKER_KEYMGR) {
266 		/* Add worker required syscalls. */
267 		kore_seccomp_filter("worker", filter_kore,
268 		    KORE_FILTER_LEN(filter_kore));
269 	}
270 
271 	/* Start with the prologue. */
272 	prog_len = filter_prologue_len;
273 
274 	/* Now account for all enabled filters. */
275 	TAILQ_FOREACH(filter, &filters, list)
276 		prog_len += filter->instructions;
277 
278 	/* Finally add the epilogue. */
279 	prog_len += filter_epilogue_len;
280 
281 	/* Build the entire bpf program now. */
282 	if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
283 		fatalx("calloc");
284 
285 	off = 0;
286 	for (i = 0; i < filter_prologue_len; i++)
287 		sf[off++] = filter_prologue[i];
288 
289 	TAILQ_FOREACH(filter, &filters, list) {
290 		for (i = 0; i < filter->instructions; i++)
291 			sf[off++] = filter->prog[i];
292 	}
293 
294 	for (i = 0; i < filter_epilogue_len; i++)
295 		sf[off++] = filter_epilogue[i];
296 
297 	/* Lock and load it. */
298 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
299 		fatalx("prctl: %s", errno_s);
300 
301 	prog.filter = sf;
302 	prog.len = prog_len;
303 
304 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
305 		fatalx("prctl: %s", errno_s);
306 
307 #if defined(KORE_USE_PYTHON)
308 	kore_python_seccomp_cleanup();
309 #endif
310 }
311 
312 int
kore_seccomp_filter(const char * name,void * prog,size_t len)313 kore_seccomp_filter(const char *name, void *prog, size_t len)
314 {
315 	struct filter		*filter;
316 
317 	TAILQ_FOREACH(filter, &filters, list) {
318 		if (!strcmp(filter->name, name))
319 			return (KORE_RESULT_ERROR);
320 	}
321 
322 	filter = kore_calloc(1, sizeof(*filter));
323 
324 	filter->prog = prog;
325 	filter->instructions = len;
326 	filter->name = kore_strdup(name);
327 
328 	if (ufilter) {
329 		TAILQ_INSERT_BEFORE(ufilter, filter, list);
330 	} else {
331 		TAILQ_INSERT_TAIL(&filters, filter, list);
332 	}
333 
334 	return (KORE_RESULT_OK);
335 }
336 
337 void
kore_seccomp_traceme(void)338 kore_seccomp_traceme(void)
339 {
340 	if (kore_seccomp_tracing == 0)
341 		return;
342 
343 	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
344 		fatalx("ptrace: %s", errno_s);
345 	if (kill(worker->pid, SIGSTOP) == -1)
346 		fatalx("kill: %s", errno_s);
347 }
348 
349 int
kore_seccomp_trace(pid_t pid,int status)350 kore_seccomp_trace(pid_t pid, int status)
351 {
352 	int	evt;
353 
354 	if (kore_seccomp_tracing == 0)
355 		return (KORE_RESULT_ERROR);
356 
357 	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
358 		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
359 		    PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
360 		    PTRACE_O_TRACEFORK) == -1)
361 			fatal("ptrace: %s", errno_s);
362 		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
363 			fatal("ptrace: %s", errno_s);
364 		return (KORE_RESULT_OK);
365 	}
366 
367 	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
368 		evt = status >> 8;
369 		if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
370 			seccomp_register_violation(pid);
371 		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
372 			fatal("ptrace: %s", errno_s);
373 		return (KORE_RESULT_OK);
374 	}
375 
376 	if (WIFSTOPPED(status)) {
377 		if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
378 			fatal("ptrace: %s", errno_s);
379 		return (KORE_RESULT_OK);
380 	}
381 
382 	return (KORE_RESULT_ERROR);
383 }
384 
385 int
kore_seccomp_syscall_resolve(const char * name)386 kore_seccomp_syscall_resolve(const char *name)
387 {
388 	int		i;
389 
390 	for (i = 0; kore_syscall_map[i].name != NULL; i++) {
391 		if (!strcmp(name, kore_syscall_map[i].name))
392 			return (kore_syscall_map[i].nr);
393 	}
394 
395 	return (-1);
396 }
397 
398 const char *
kore_seccomp_syscall_name(long sysnr)399 kore_seccomp_syscall_name(long sysnr)
400 {
401 	int		i;
402 
403 	for (i = 0; kore_syscall_map[i].name != NULL; i++) {
404 		if (kore_syscall_map[i].nr == sysnr)
405 			return (kore_syscall_map[i].name);
406 	}
407 
408 	return ("unknown");
409 }
410 
411 struct sock_filter *
kore_seccomp_syscall_filter(const char * name,int action)412 kore_seccomp_syscall_filter(const char *name, int action)
413 {
414 	struct sock_filter	filter[] = {
415 		KORE_SYSCALL_FILTER(exit, action),
416 		KORE_BPF_GUARD
417 	};
418 
419 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
420 }
421 
422 struct sock_filter *
kore_seccomp_syscall_arg(const char * name,int action,int arg,int value)423 kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
424 {
425 	struct sock_filter	filter[] = {
426 		KORE_SYSCALL_ARG(exit, arg, value, action),
427 		KORE_BPF_GUARD
428 	};
429 
430 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
431 }
432 
433 struct sock_filter *
kore_seccomp_syscall_mask(const char * name,int action,int arg,int value)434 kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
435 {
436 	struct sock_filter	filter[] = {
437 		KORE_SYSCALL_MASK(exit, arg, value, action),
438 		KORE_BPF_GUARD
439 	};
440 
441 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
442 }
443 
444 struct sock_filter *
kore_seccomp_syscall_flag(const char * name,int action,int arg,int value)445 kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
446 {
447 	struct sock_filter	filter[] = {
448 		KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
449 		KORE_BPF_GUARD
450 	};
451 
452 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
453 }
454 
455 static void
seccomp_register_violation(pid_t pid)456 seccomp_register_violation(pid_t pid)
457 {
458 	int				idx;
459 	struct kore_worker		*kw;
460 	struct iovec			iov;
461 #if defined(__arm__)
462 	struct pt_regs			regs;
463 #else
464 	struct user_regs_struct		regs;
465 #endif
466 	long				sysnr;
467 	const char			*name;
468 
469 	iov.iov_base = &regs;
470 	iov.iov_len = sizeof(regs);
471 
472 	if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
473 		fatal("ptrace: %s", errno_s);
474 
475 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
476 	sysnr = regs.orig_rax;
477 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
478 	sysnr = regs.regs[8];
479 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
480 	sysnr = regs.uregs[7];
481 #else
482 #error "platform not supported"
483 #endif
484 
485 	name = NULL;
486 	for (idx = 0; idx < worker_count; idx++) {
487 		kw = kore_worker_data(idx);
488 		if (kw->pid == pid) {
489 			name = kore_worker_name(kw->id);
490 			break;
491 		}
492 	}
493 
494 	if (name == NULL)
495 		name = "<child>";
496 
497 	kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
498 	    name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
499 }
500 
/*
 * Duplicate the elm instructions of the template filter and point the
 * copy at the system call called name.
 *
 * The k field of the first instruction of the copy is overwritten with
 * the resolved system call number; the template itself is not modified.
 *
 * Returns the newly allocated copy, or NULL if name cannot be resolved
 * (in which case nothing is allocated).
 */
static struct sock_filter *
seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
{
	int			sysnr;
	struct sock_filter	*copy;

	sysnr = kore_seccomp_syscall_resolve(name);
	if (sysnr == -1)
		return (NULL);

	copy = kore_calloc(elm, sizeof(*copy));
	memcpy(copy, filter, elm * sizeof(*copy));

	/* Update the syscall number to the one specified. */
	copy[0].k = sysnr;

	return (copy);
}
518