1 /*
2 * Copyright (c) 2019-2021 Joris Vink <joris@coders.se>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <sys/param.h>
18 #include <sys/mman.h>
19 #include <sys/epoll.h>
20 #include <sys/ptrace.h>
21 #include <sys/prctl.h>
22 #include <sys/user.h>
23 #include <sys/syscall.h>
24
25 #include <linux/ptrace.h>
26 #include <linux/seccomp.h>
27 #include <linux/filter.h>
28 #include <linux/audit.h>
29
30 #include <stddef.h>
31 #include <sched.h>
32
33 #include "kore.h"
34 #include "seccomp.h"
35 #include "platform.h"
36
37 #if defined(KORE_USE_PYTHON)
38 #include "python_api.h"
39 #endif
40
41 #if !defined(SECCOMP_KILL_POLICY)
42 #define SECCOMP_KILL_POLICY SECCOMP_RET_KILL
43 #endif
44
/*
 * The bare minimum to be able to run kore. These are added last and can
 * be overwritten by a filter program that is added before hand.
 *
 * NOTE: ordering inside this table matters, seccomp BPF rules are
 * evaluated top to bottom and the first match wins.
 */
static struct sock_filter filter_kore[] = {
	/* Deny ioctl, but with EACCES instead of dying. */
	KORE_SYSCALL_DENY(ioctl, EACCES),

	/* File related. */
#if defined(SYS_open)
	KORE_SYSCALL_ALLOW(open),
#endif
	KORE_SYSCALL_ALLOW(read),
#if defined(SYS_stat)
	KORE_SYSCALL_ALLOW(stat),
#endif
#if defined(SYS_stat64)
	KORE_SYSCALL_ALLOW(stat64),
#endif
#if defined(SYS_lstat)
	KORE_SYSCALL_ALLOW(lstat),
#endif
	KORE_SYSCALL_ALLOW(fstat),
#if defined(SYS_fstat64)
	KORE_SYSCALL_ALLOW(fstat64),
#endif
	KORE_SYSCALL_ALLOW(write),
	KORE_SYSCALL_ALLOW(fcntl),
#if defined(SYS_fcntl64)
	KORE_SYSCALL_ALLOW(fcntl64),
#endif
	KORE_SYSCALL_ALLOW(lseek),
#if defined(SYS__llseek)
	KORE_SYSCALL_ALLOW(_llseek),
#endif
	KORE_SYSCALL_ALLOW(close),
	KORE_SYSCALL_ALLOW(openat),
#if defined(SYS_access)
	KORE_SYSCALL_ALLOW(access),
#endif
	KORE_SYSCALL_ALLOW(writev),
	KORE_SYSCALL_ALLOW(getcwd),
#if defined(SYS_unlink)
	KORE_SYSCALL_ALLOW(unlink),
#endif
#if defined(SYS_readlink)
	KORE_SYSCALL_ALLOW(readlink),
#endif

	/* Process related. */
	KORE_SYSCALL_ALLOW(exit),
	KORE_SYSCALL_ALLOW(kill),
	KORE_SYSCALL_ALLOW(getpid),
	KORE_SYSCALL_ALLOW(getuid),
	KORE_SYSCALL_ALLOW(geteuid),
	KORE_SYSCALL_ALLOW(exit_group),
	KORE_SYSCALL_ALLOW(nanosleep),
	KORE_SYSCALL_ALLOW(clock_nanosleep),
#if defined(SYS_sigreturn)
	KORE_SYSCALL_ALLOW(sigreturn),
#endif

	/* Memory related. */
	KORE_SYSCALL_ALLOW(brk),
	KORE_SYSCALL_ALLOW(munmap),

	/*
	 * Deny mmap/mprotect calls with PROT_EXEC/PROT_WRITE protection.
	 * These argument-checking rules must come before the plain allows
	 * for the same syscalls below, since the first match wins.
	 */
#if defined(SYS_mmap)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
	KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),

#if defined(SYS_mmap)
	KORE_SYSCALL_ALLOW(mmap),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_ALLOW(mmap2),
#endif
	KORE_SYSCALL_ALLOW(madvise),
	KORE_SYSCALL_ALLOW(mprotect),

	/* Net related. */
#if defined(SYS_poll)
	KORE_SYSCALL_ALLOW(poll),
#endif
	KORE_SYSCALL_ALLOW(ppoll),
#if defined(SYS_send)
	KORE_SYSCALL_ALLOW(send),
#endif
	KORE_SYSCALL_ALLOW(sendto),
	KORE_SYSCALL_ALLOW(accept),
	KORE_SYSCALL_ALLOW(sendfile),
#if defined(SYS_recv)
	KORE_SYSCALL_ALLOW(recv),
#endif
	KORE_SYSCALL_ALLOW(recvfrom),
	KORE_SYSCALL_ALLOW(epoll_ctl),
	KORE_SYSCALL_ALLOW(setsockopt),
#if defined(SYS_epoll_wait)
	KORE_SYSCALL_ALLOW(epoll_wait),
#endif
	KORE_SYSCALL_ALLOW(epoll_pwait),

	/* Signal related. */
	KORE_SYSCALL_ALLOW(sigaltstack),
	KORE_SYSCALL_ALLOW(rt_sigreturn),
	KORE_SYSCALL_ALLOW(rt_sigaction),
	KORE_SYSCALL_ALLOW(rt_sigprocmask),

	/* "Other" without clear category. */
	KORE_SYSCALL_ALLOW(futex),
	KORE_SYSCALL_ALLOW(clock_gettime),

#if defined(__NR_getrandom)
	KORE_SYSCALL_ALLOW(getrandom),
#endif
};
165
/* bpf program prologue, executed before any registered filter. */
static struct sock_filter filter_prologue[] = {
	/* Load arch member into accumulator (A) (arch is __u32). */
	KORE_BPF_LOAD(arch, 0),

	/*
	 * Compare accumulator against constant, if false jump over kill.
	 * Killing on arch mismatch prevents foreign-architecture syscall
	 * numbering from bypassing the rules that follow.
	 */
	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
	KORE_BPF_RET(SECCOMP_RET_KILL),

	/* Load the system call number into the accumulator. */
	KORE_BPF_LOAD(nr, 0),
};
178
/* bpf program epilogue. */
static struct sock_filter filter_epilogue[] = {
	/*
	 * Return hit if no system calls matched our list. The verdict is
	 * SECCOMP_KILL_POLICY by default; kore_seccomp_enable() rewrites
	 * it to SECCOMP_RET_TRACE when seccomp tracing is enabled.
	 */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
};
184
185 static struct sock_filter *seccomp_filter_update(struct sock_filter *,
186 const char *, size_t);
187
188 #define filter_prologue_len KORE_FILTER_LEN(filter_prologue)
189 #define filter_epilogue_len KORE_FILTER_LEN(filter_epilogue)
190
191 static void seccomp_register_violation(pid_t);
192
/* A named seccomp filter program registered for installation. */
struct filter {
	char			*name;		/* unique name, kore_strdup()'d (owned) */
	struct sock_filter	*prog;		/* BPF instructions (not freed on drop) */
	size_t			instructions;	/* number of entries in prog */
	TAILQ_ENTRY(filter)	list;
};

/* All registered filters, installed in list order. */
static TAILQ_HEAD(, filter)	filters;

/* Insertion point for filters added while an application hook runs. */
static struct filter		*ufilter = NULL;

/*
 * If enabled will instruct the parent process to ptrace its children and
 * log any seccomp SECCOMP_RET_TRACE rule.
 */
int	kore_seccomp_tracing = 0;
208
/* Initialize the (empty) list of seccomp filters. */
void
kore_seccomp_init(void)
{
	TAILQ_INIT(&filters);
}
214
215 void
kore_seccomp_drop(void)216 kore_seccomp_drop(void)
217 {
218 struct filter *filter;
219
220 while ((filter = TAILQ_FIRST(&filters)) != NULL) {
221 if (!kore_quiet) {
222 kore_log(LOG_INFO,
223 "seccomp filter '%s' dropped", filter->name);
224 }
225 TAILQ_REMOVE(&filters, filter, list);
226 kore_free(filter->name);
227 kore_free(filter);
228 }
229
230 TAILQ_INIT(&filters);
231 }
232
/*
 * Build the final BPF program (prologue + all registered filters +
 * epilogue), then install it via PR_SET_SECCOMP and lock it in with
 * PR_SET_NO_NEW_PRIVS. Application hooks get a chance to register
 * their own filters first.
 */
void
kore_seccomp_enable(void)
{
	struct sock_filter		*sf;
	struct sock_fprog		prog;
	struct kore_runtime_call	*rcall;
	struct filter			*filter;
	size_t				prog_len, off, i;

	/*
	 * If kore_seccomp_tracing is turned on, set the default policy to
	 * SECCOMP_RET_TRACE so we can log the system calls.
	 */
	if (kore_seccomp_tracing) {
		filter_epilogue[0].k = SECCOMP_RET_TRACE;
		kore_log(LOG_NOTICE, "seccomp tracing enabled");
	}

#if defined(KORE_USE_PYTHON)
	/*
	 * While the hook runs, ufilter marks the insertion point so that
	 * application filters land before the already-registered ones.
	 */
	ufilter = TAILQ_FIRST(&filters);
	kore_python_seccomp_hook("koreapp.seccomp");
	ufilter = NULL;
#endif

	/* Allow application to add its own filters. */
	if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
		ufilter = TAILQ_FIRST(&filters);
		kore_runtime_execute(rcall);
		kore_free(rcall);
		ufilter = NULL;
	}

	if (worker->id != KORE_WORKER_KEYMGR) {
		/* Add worker required syscalls. */
		kore_seccomp_filter("worker", filter_kore,
		    KORE_FILTER_LEN(filter_kore));
	}

	/* Start with the prologue. */
	prog_len = filter_prologue_len;

	/* Now account for all enabled filters. */
	TAILQ_FOREACH(filter, &filters, list)
		prog_len += filter->instructions;

	/* Finally add the epilogue. */
	prog_len += filter_epilogue_len;

	/* Build the entire bpf program now. */
	if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
		fatalx("calloc");

	off = 0;
	for (i = 0; i < filter_prologue_len; i++)
		sf[off++] = filter_prologue[i];

	TAILQ_FOREACH(filter, &filters, list) {
		for (i = 0; i < filter->instructions; i++)
			sf[off++] = filter->prog[i];
	}

	for (i = 0; i < filter_epilogue_len; i++)
		sf[off++] = filter_epilogue[i];

	/*
	 * Lock and load it. NO_NEW_PRIVS must be set before an
	 * unprivileged process may install a seccomp filter.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
		fatalx("prctl: %s", errno_s);

	prog.filter = sf;
	prog.len = prog_len;

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
		fatalx("prctl: %s", errno_s);

#if defined(KORE_USE_PYTHON)
	kore_python_seccomp_cleanup();
#endif
}
311
312 int
kore_seccomp_filter(const char * name,void * prog,size_t len)313 kore_seccomp_filter(const char *name, void *prog, size_t len)
314 {
315 struct filter *filter;
316
317 TAILQ_FOREACH(filter, &filters, list) {
318 if (!strcmp(filter->name, name))
319 return (KORE_RESULT_ERROR);
320 }
321
322 filter = kore_calloc(1, sizeof(*filter));
323
324 filter->prog = prog;
325 filter->instructions = len;
326 filter->name = kore_strdup(name);
327
328 if (ufilter) {
329 TAILQ_INSERT_BEFORE(ufilter, filter, list);
330 } else {
331 TAILQ_INSERT_TAIL(&filters, filter, list);
332 }
333
334 return (KORE_RESULT_OK);
335 }
336
/*
 * Called in a child process when seccomp tracing is enabled: request to
 * be traced by the parent and raise SIGSTOP so the parent can attach its
 * ptrace options (see kore_seccomp_trace()) before we continue.
 */
void
kore_seccomp_traceme(void)
{
	if (kore_seccomp_tracing == 0)
		return;

	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
		fatalx("ptrace: %s", errno_s);
	if (kill(worker->pid, SIGSTOP) == -1)
		fatalx("kill: %s", errno_s);
}
348
/*
 * Handle a wait() status for a traced child while seccomp tracing is
 * enabled. Logs seccomp violations and resumes the child. Returns
 * KORE_RESULT_OK if the status was consumed, KORE_RESULT_ERROR if
 * tracing is off or the status was not a stop.
 */
int
kore_seccomp_trace(pid_t pid, int status)
{
	int	evt;

	if (kore_seccomp_tracing == 0)
		return (KORE_RESULT_ERROR);

	/*
	 * The initial SIGSTOP raised by kore_seccomp_traceme(): set the
	 * ptrace options (follow seccomp events, clones and forks) and
	 * let the child run.
	 */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		    PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
		    PTRACE_O_TRACEFORK) == -1)
			fatal("ptrace: %s", errno_s);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/*
	 * A SIGTRAP stop may be a PTRACE_EVENT_SECCOMP event (encoded in
	 * the upper status bits); log it as a violation if so.
	 */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
		evt = status >> 8;
		if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
			seccomp_register_violation(pid);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/* Any other stop: forward the original signal to the child. */
	if (WIFSTOPPED(status)) {
		if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	return (KORE_RESULT_ERROR);
}
384
385 int
kore_seccomp_syscall_resolve(const char * name)386 kore_seccomp_syscall_resolve(const char *name)
387 {
388 int i;
389
390 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
391 if (!strcmp(name, kore_syscall_map[i].name))
392 return (kore_syscall_map[i].nr);
393 }
394
395 return (-1);
396 }
397
398 const char *
kore_seccomp_syscall_name(long sysnr)399 kore_seccomp_syscall_name(long sysnr)
400 {
401 int i;
402
403 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
404 if (kore_syscall_map[i].nr == sysnr)
405 return (kore_syscall_map[i].name);
406 }
407
408 return ("unknown");
409 }
410
411 struct sock_filter *
kore_seccomp_syscall_filter(const char * name,int action)412 kore_seccomp_syscall_filter(const char *name, int action)
413 {
414 struct sock_filter filter[] = {
415 KORE_SYSCALL_FILTER(exit, action),
416 KORE_BPF_GUARD
417 };
418
419 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
420 }
421
422 struct sock_filter *
kore_seccomp_syscall_arg(const char * name,int action,int arg,int value)423 kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
424 {
425 struct sock_filter filter[] = {
426 KORE_SYSCALL_ARG(exit, arg, value, action),
427 KORE_BPF_GUARD
428 };
429
430 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
431 }
432
433 struct sock_filter *
kore_seccomp_syscall_mask(const char * name,int action,int arg,int value)434 kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
435 {
436 struct sock_filter filter[] = {
437 KORE_SYSCALL_MASK(exit, arg, value, action),
438 KORE_BPF_GUARD
439 };
440
441 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
442 }
443
444 struct sock_filter *
kore_seccomp_syscall_flag(const char * name,int action,int arg,int value)445 kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
446 {
447 struct sock_filter filter[] = {
448 KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
449 KORE_BPF_GUARD
450 };
451
452 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
453 }
454
455 static void
seccomp_register_violation(pid_t pid)456 seccomp_register_violation(pid_t pid)
457 {
458 int idx;
459 struct kore_worker *kw;
460 struct iovec iov;
461 #if defined(__arm__)
462 struct pt_regs regs;
463 #else
464 struct user_regs_struct regs;
465 #endif
466 long sysnr;
467 const char *name;
468
469 iov.iov_base = ®s;
470 iov.iov_len = sizeof(regs);
471
472 if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
473 fatal("ptrace: %s", errno_s);
474
475 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
476 sysnr = regs.orig_rax;
477 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
478 sysnr = regs.regs[8];
479 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
480 sysnr = regs.uregs[7];
481 #else
482 #error "platform not supported"
483 #endif
484
485 name = NULL;
486 for (idx = 0; idx < worker_count; idx++) {
487 kw = kore_worker_data(idx);
488 if (kw->pid == pid) {
489 name = kore_worker_name(kw->id);
490 break;
491 }
492 }
493
494 if (name == NULL)
495 name = "<child>";
496
497 kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
498 name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
499 }
500
501 static struct sock_filter *
seccomp_filter_update(struct sock_filter * filter,const char * name,size_t elm)502 seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
503 {
504 int nr;
505 struct sock_filter *result;
506
507 if ((nr = kore_seccomp_syscall_resolve(name)) == -1)
508 return (NULL);
509
510 result = kore_calloc(elm, sizeof(struct sock_filter));
511 memcpy(result, filter, elm * sizeof(struct sock_filter));
512
513 /* Update the syscall number to the one specified. */
514 result[0].k = nr;
515
516 return (result);
517 }
518