1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Functions to manage eBPF programs attached to cgroups 4 * 5 * Copyright (c) 2016 Daniel Mack 6 */ 7 8 #include <linux/kernel.h> 9 #include <linux/atomic.h> 10 #include <linux/cgroup.h> 11 #include <linux/filter.h> 12 #include <linux/slab.h> 13 #include <linux/sysctl.h> 14 #include <linux/string.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <net/sock.h> 18 19 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 20 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 21 22 /** 23 * cgroup_bpf_put() - put references of all bpf programs 24 * @cgrp: the cgroup to modify 25 */ 26 void cgroup_bpf_put(struct cgroup *cgrp) 27 { 28 enum bpf_cgroup_storage_type stype; 29 unsigned int type; 30 31 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 32 struct list_head *progs = &cgrp->bpf.progs[type]; 33 struct bpf_prog_list *pl, *tmp; 34 35 list_for_each_entry_safe(pl, tmp, progs, node) { 36 list_del(&pl->node); 37 bpf_prog_put(pl->prog); 38 for_each_cgroup_storage_type(stype) { 39 bpf_cgroup_storage_unlink(pl->storage[stype]); 40 bpf_cgroup_storage_free(pl->storage[stype]); 41 } 42 kfree(pl); 43 static_branch_dec(&cgroup_bpf_enabled_key); 44 } 45 bpf_prog_array_free(cgrp->bpf.effective[type]); 46 } 47 } 48 49 /* count number of elements in the list. 50 * it's slow but the list cannot be long 51 */ 52 static u32 prog_list_length(struct list_head *head) 53 { 54 struct bpf_prog_list *pl; 55 u32 cnt = 0; 56 57 list_for_each_entry(pl, head, node) { 58 if (!pl->prog) 59 continue; 60 cnt++; 61 } 62 return cnt; 63 } 64 65 /* if parent has non-overridable prog attached, 66 * disallow attaching new programs to the descendent cgroup. 67 * if parent has overridable or multi-prog, allow attaching 68 */ 69 static bool hierarchy_allows_attach(struct cgroup *cgrp, 70 enum bpf_attach_type type, 71 u32 new_flags) 72 { 73 struct cgroup *p; 74 75 p = cgroup_parent(cgrp); 76 if (!p) 77 return true; 78 do { 79 u32 flags = p->bpf.flags[type]; 80 u32 cnt; 81 82 if (flags & BPF_F_ALLOW_MULTI) 83 return true; 84 cnt = prog_list_length(&p->bpf.progs[type]); 85 WARN_ON_ONCE(cnt > 1); 86 if (cnt == 1) 87 return !!(flags & BPF_F_ALLOW_OVERRIDE); 88 p = cgroup_parent(p); 89 } while (p); 90 return true; 91 } 92 93 /* compute a chain of effective programs for a given cgroup: 94 * start from the list of programs in this cgroup and add 95 * all parent programs. 96 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 97 * to programs in this cgroup 98 */ 99 static int compute_effective_progs(struct cgroup *cgrp, 100 enum bpf_attach_type type, 101 struct bpf_prog_array __rcu **array) 102 { 103 enum bpf_cgroup_storage_type stype; 104 struct bpf_prog_array *progs; 105 struct bpf_prog_list *pl; 106 struct cgroup *p = cgrp; 107 int cnt = 0; 108 109 /* count number of effective programs by walking parents */ 110 do { 111 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 112 cnt += prog_list_length(&p->bpf.progs[type]); 113 p = cgroup_parent(p); 114 } while (p); 115 116 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 117 if (!progs) 118 return -ENOMEM; 119 120 /* populate the array with effective progs */ 121 cnt = 0; 122 p = cgrp; 123 do { 124 if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 125 continue; 126 127 list_for_each_entry(pl, &p->bpf.progs[type], node) { 128 if (!pl->prog) 129 continue; 130 131 progs->items[cnt].prog = pl->prog; 132 for_each_cgroup_storage_type(stype) 133 progs->items[cnt].cgroup_storage[stype] = 134 pl->storage[stype]; 135 cnt++; 136 } 137 } while ((p = cgroup_parent(p))); 138 139 rcu_assign_pointer(*array, progs); 140 return 0; 141 } 142 143 static void activate_effective_progs(struct cgroup *cgrp, 144 enum bpf_attach_type type, 145 struct bpf_prog_array __rcu *array) 146 { 147 struct bpf_prog_array __rcu *old_array; 148 149 old_array = xchg(&cgrp->bpf.effective[type], array); 150 /* free prog array after grace period, since __cgroup_bpf_run_*() 151 * might be still walking the array 152 */ 153 bpf_prog_array_free(old_array); 154 } 155 156 /** 157 * cgroup_bpf_inherit() - inherit effective programs from parent 158 * @cgrp: the cgroup to modify 159 */ 160 int cgroup_bpf_inherit(struct cgroup *cgrp) 161 { 162 /* has to use marco instead of const int, since compiler thinks 163 * that array below is variable length 164 */ 165 #define NR ARRAY_SIZE(cgrp->bpf.effective) 166 struct bpf_prog_array __rcu *arrays[NR] = {}; 167 int i; 168 169 for (i = 0; i < NR; i++) 170 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 171 172 for (i = 0; i < NR; i++) 173 if (compute_effective_progs(cgrp, i, &arrays[i])) 174 goto cleanup; 175 176 for (i = 0; i < NR; i++) 177 activate_effective_progs(cgrp, i, arrays[i]); 178 179 return 0; 180 cleanup: 181 for (i = 0; i < NR; i++) 182 bpf_prog_array_free(arrays[i]); 183 return -ENOMEM; 184 } 185 186 static int update_effective_progs(struct cgroup *cgrp, 187 enum bpf_attach_type type) 188 { 189 struct cgroup_subsys_state *css; 190 int err; 191 192 /* allocate and recompute effective prog arrays */ 193 css_for_each_descendant_pre(css, &cgrp->self) { 194 struct cgroup *desc = container_of(css, struct cgroup, self); 195 196 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 197 if (err) 198 goto cleanup; 199 } 200 201 /* all allocations were successful. Activate all prog arrays */ 202 css_for_each_descendant_pre(css, &cgrp->self) { 203 struct cgroup *desc = container_of(css, struct cgroup, self); 204 205 activate_effective_progs(desc, type, desc->bpf.inactive); 206 desc->bpf.inactive = NULL; 207 } 208 209 return 0; 210 211 cleanup: 212 /* oom while computing effective. Free all computed effective arrays 213 * since they were not activated 214 */ 215 css_for_each_descendant_pre(css, &cgrp->self) { 216 struct cgroup *desc = container_of(css, struct cgroup, self); 217 218 bpf_prog_array_free(desc->bpf.inactive); 219 desc->bpf.inactive = NULL; 220 } 221 222 return err; 223 } 224 225 #define BPF_CGROUP_MAX_PROGS 64 226 227 /** 228 * __cgroup_bpf_attach() - Attach the program to a cgroup, and 229 * propagate the change to descendants 230 * @cgrp: The cgroup which descendants to traverse 231 * @prog: A program to attach 232 * @type: Type of attach operation 233 * @flags: Option flags 234 * 235 * Must be called with cgroup_mutex held. 236 */ 237 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 238 enum bpf_attach_type type, u32 flags) 239 { 240 struct list_head *progs = &cgrp->bpf.progs[type]; 241 struct bpf_prog *old_prog = NULL; 242 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], 243 *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; 244 enum bpf_cgroup_storage_type stype; 245 struct bpf_prog_list *pl; 246 bool pl_was_allocated; 247 int err; 248 249 if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) 250 /* invalid combination */ 251 return -EINVAL; 252 253 if (!hierarchy_allows_attach(cgrp, type, flags)) 254 return -EPERM; 255 256 if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) 257 /* Disallow attaching non-overridable on top 258 * of existing overridable in this cgroup. 259 * Disallow attaching multi-prog if overridable or none 260 */ 261 return -EPERM; 262 263 if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) 264 return -E2BIG; 265 266 for_each_cgroup_storage_type(stype) { 267 storage[stype] = bpf_cgroup_storage_alloc(prog, stype); 268 if (IS_ERR(storage[stype])) { 269 storage[stype] = NULL; 270 for_each_cgroup_storage_type(stype) 271 bpf_cgroup_storage_free(storage[stype]); 272 return -ENOMEM; 273 } 274 } 275 276 if (flags & BPF_F_ALLOW_MULTI) { 277 list_for_each_entry(pl, progs, node) { 278 if (pl->prog == prog) { 279 /* disallow attaching the same prog twice */ 280 for_each_cgroup_storage_type(stype) 281 bpf_cgroup_storage_free(storage[stype]); 282 return -EINVAL; 283 } 284 } 285 286 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 287 if (!pl) { 288 for_each_cgroup_storage_type(stype) 289 bpf_cgroup_storage_free(storage[stype]); 290 return -ENOMEM; 291 } 292 293 pl_was_allocated = true; 294 pl->prog = prog; 295 for_each_cgroup_storage_type(stype) 296 pl->storage[stype] = storage[stype]; 297 list_add_tail(&pl->node, progs); 298 } else { 299 if (list_empty(progs)) { 300 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 301 if (!pl) { 302 for_each_cgroup_storage_type(stype) 303 bpf_cgroup_storage_free(storage[stype]); 304 return -ENOMEM; 305 } 306 pl_was_allocated = true; 307 list_add_tail(&pl->node, progs); 308 } else { 309 pl = list_first_entry(progs, typeof(*pl), node); 310 old_prog = pl->prog; 311 for_each_cgroup_storage_type(stype) { 312 old_storage[stype] = pl->storage[stype]; 313 bpf_cgroup_storage_unlink(old_storage[stype]); 314 } 315 pl_was_allocated = false; 316 } 317 pl->prog = prog; 318 for_each_cgroup_storage_type(stype) 319 pl->storage[stype] = storage[stype]; 320 } 321 322 cgrp->bpf.flags[type] = flags; 323 324 err = update_effective_progs(cgrp, type); 325 if (err) 326 goto cleanup; 327 328 static_branch_inc(&cgroup_bpf_enabled_key); 329 for_each_cgroup_storage_type(stype) { 330 if (!old_storage[stype]) 331 continue; 332 bpf_cgroup_storage_free(old_storage[stype]); 333 } 334 if (old_prog) { 335 bpf_prog_put(old_prog); 336 static_branch_dec(&cgroup_bpf_enabled_key); 337 } 338 for_each_cgroup_storage_type(stype) 339 bpf_cgroup_storage_link(storage[stype], cgrp, type); 340 return 0; 341 342 cleanup: 343 /* and cleanup the prog list */ 344 pl->prog = old_prog; 345 for_each_cgroup_storage_type(stype) { 346 bpf_cgroup_storage_free(pl->storage[stype]); 347 pl->storage[stype] = old_storage[stype]; 348 bpf_cgroup_storage_link(old_storage[stype], cgrp, type); 349 } 350 if (pl_was_allocated) { 351 list_del(&pl->node); 352 kfree(pl); 353 } 354 return err; 355 } 356 357 /** 358 * __cgroup_bpf_detach() - Detach the program from a cgroup, and 359 * propagate the change to descendants 360 * @cgrp: The cgroup which descendants to traverse 361 * @prog: A program to detach or NULL 362 * @type: Type of detach operation 363 * 364 * Must be called with cgroup_mutex held. 365 */ 366 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 367 enum bpf_attach_type type) 368 { 369 struct list_head *progs = &cgrp->bpf.progs[type]; 370 enum bpf_cgroup_storage_type stype; 371 u32 flags = cgrp->bpf.flags[type]; 372 struct bpf_prog *old_prog = NULL; 373 struct bpf_prog_list *pl; 374 int err; 375 376 if (flags & BPF_F_ALLOW_MULTI) { 377 if (!prog) 378 /* to detach MULTI prog the user has to specify valid FD 379 * of the program to be detached 380 */ 381 return -EINVAL; 382 } else { 383 if (list_empty(progs)) 384 /* report error when trying to detach and nothing is attached */ 385 return -ENOENT; 386 } 387 388 if (flags & BPF_F_ALLOW_MULTI) { 389 /* find the prog and detach it */ 390 list_for_each_entry(pl, progs, node) { 391 if (pl->prog != prog) 392 continue; 393 old_prog = prog; 394 /* mark it deleted, so it's ignored while 395 * recomputing effective 396 */ 397 pl->prog = NULL; 398 break; 399 } 400 if (!old_prog) 401 return -ENOENT; 402 } else { 403 /* to maintain backward compatibility NONE and OVERRIDE cgroups 404 * allow detaching with invalid FD (prog==NULL) 405 */ 406 pl = list_first_entry(progs, typeof(*pl), node); 407 old_prog = pl->prog; 408 pl->prog = NULL; 409 } 410 411 err = update_effective_progs(cgrp, type); 412 if (err) 413 goto cleanup; 414 415 /* now can actually delete it from this cgroup list */ 416 list_del(&pl->node); 417 for_each_cgroup_storage_type(stype) { 418 bpf_cgroup_storage_unlink(pl->storage[stype]); 419 bpf_cgroup_storage_free(pl->storage[stype]); 420 } 421 kfree(pl); 422 if (list_empty(progs)) 423 /* last program was detached, reset flags to zero */ 424 cgrp->bpf.flags[type] = 0; 425 426 bpf_prog_put(old_prog); 427 static_branch_dec(&cgroup_bpf_enabled_key); 428 return 0; 429 430 cleanup: 431 /* and restore back old_prog */ 432 pl->prog = old_prog; 433 return err; 434 } 435 436 /* Must be called with cgroup_mutex held to avoid races. */ 437 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 438 union bpf_attr __user *uattr) 439 { 440 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 441 enum bpf_attach_type type = attr->query.attach_type; 442 struct list_head *progs = &cgrp->bpf.progs[type]; 443 u32 flags = cgrp->bpf.flags[type]; 444 int cnt, ret = 0, i; 445 446 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 447 cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 448 else 449 cnt = prog_list_length(progs); 450 451 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 452 return -EFAULT; 453 if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) 454 return -EFAULT; 455 if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) 456 /* return early if user requested only program count + flags */ 457 return 0; 458 if (attr->query.prog_cnt < cnt) { 459 cnt = attr->query.prog_cnt; 460 ret = -ENOSPC; 461 } 462 463 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 464 return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 465 prog_ids, cnt); 466 } else { 467 struct bpf_prog_list *pl; 468 u32 id; 469 470 i = 0; 471 list_for_each_entry(pl, progs, node) { 472 id = pl->prog->aux->id; 473 if (copy_to_user(prog_ids + i, &id, sizeof(id))) 474 return -EFAULT; 475 if (++i == cnt) 476 break; 477 } 478 } 479 return ret; 480 } 481 482 int cgroup_bpf_prog_attach(const union bpf_attr *attr, 483 enum bpf_prog_type ptype, struct bpf_prog *prog) 484 { 485 struct cgroup *cgrp; 486 int ret; 487 488 cgrp = cgroup_get_from_fd(attr->target_fd); 489 if (IS_ERR(cgrp)) 490 return PTR_ERR(cgrp); 491 492 ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, 493 attr->attach_flags); 494 cgroup_put(cgrp); 495 return ret; 496 } 497 498 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) 499 { 500 struct bpf_prog *prog; 501 struct cgroup *cgrp; 502 int ret; 503 504 cgrp = cgroup_get_from_fd(attr->target_fd); 505 if (IS_ERR(cgrp)) 506 return PTR_ERR(cgrp); 507 508 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 509 if (IS_ERR(prog)) 510 prog = NULL; 511 512 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); 513 if (prog) 514 bpf_prog_put(prog); 515 516 cgroup_put(cgrp); 517 return ret; 518 } 519 520 int cgroup_bpf_prog_query(const union bpf_attr *attr, 521 union bpf_attr __user *uattr) 522 { 523 struct cgroup *cgrp; 524 int ret; 525 526 cgrp = cgroup_get_from_fd(attr->query.target_fd); 527 if (IS_ERR(cgrp)) 528 return PTR_ERR(cgrp); 529 530 ret = cgroup_bpf_query(cgrp, attr, uattr); 531 532 cgroup_put(cgrp); 533 return ret; 534 } 535 536 /** 537 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 538 * @sk: The socket sending or receiving traffic 539 * @skb: The skb that is being sent or received 540 * @type: The type of program to be exectuted 541 * 542 * If no socket is passed, or the socket is not of type INET or INET6, 543 * this function does nothing and returns 0. 544 * 545 * The program type passed in via @type must be suitable for network 546 * filtering. No further check is performed to assert that. 547 * 548 * This function will return %-EPERM if any if an attached program was found 549 * and if it returned != 1 during execution. In all other cases, 0 is returned. 550 */ 551 int __cgroup_bpf_run_filter_skb(struct sock *sk, 552 struct sk_buff *skb, 553 enum bpf_attach_type type) 554 { 555 unsigned int offset = skb->data - skb_network_header(skb); 556 struct sock *save_sk; 557 void *saved_data_end; 558 struct cgroup *cgrp; 559 int ret; 560 561 if (!sk || !sk_fullsock(sk)) 562 return 0; 563 564 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 565 return 0; 566 567 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 568 save_sk = skb->sk; 569 skb->sk = sk; 570 __skb_push(skb, offset); 571 572 /* compute pointers for the bpf prog */ 573 bpf_compute_and_save_data_end(skb, &saved_data_end); 574 575 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 576 __bpf_prog_run_save_cb); 577 bpf_restore_data_end(skb, saved_data_end); 578 __skb_pull(skb, offset); 579 skb->sk = save_sk; 580 return ret == 1 ? 0 : -EPERM; 581 } 582 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 583 584 /** 585 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 586 * @sk: sock structure to manipulate 587 * @type: The type of program to be exectuted 588 * 589 * socket is passed is expected to be of type INET or INET6. 590 * 591 * The program type passed in via @type must be suitable for sock 592 * filtering. No further check is performed to assert that. 593 * 594 * This function will return %-EPERM if any if an attached program was found 595 * and if it returned != 1 during execution. In all other cases, 0 is returned. 596 */ 597 int __cgroup_bpf_run_filter_sk(struct sock *sk, 598 enum bpf_attach_type type) 599 { 600 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 601 int ret; 602 603 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); 604 return ret == 1 ? 0 : -EPERM; 605 } 606 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 607 608 /** 609 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and 610 * provided by user sockaddr 611 * @sk: sock struct that will use sockaddr 612 * @uaddr: sockaddr struct provided by user 613 * @type: The type of program to be exectuted 614 * @t_ctx: Pointer to attach type specific context 615 * 616 * socket is expected to be of type INET or INET6. 617 * 618 * This function will return %-EPERM if an attached program is found and 619 * returned value != 1 during execution. In all other cases, 0 is returned. 620 */ 621 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, 622 struct sockaddr *uaddr, 623 enum bpf_attach_type type, 624 void *t_ctx) 625 { 626 struct bpf_sock_addr_kern ctx = { 627 .sk = sk, 628 .uaddr = uaddr, 629 .t_ctx = t_ctx, 630 }; 631 struct sockaddr_storage unspec; 632 struct cgroup *cgrp; 633 int ret; 634 635 /* Check socket family since not all sockets represent network 636 * endpoint (e.g. AF_UNIX). 637 */ 638 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 639 return 0; 640 641 if (!ctx.uaddr) { 642 memset(&unspec, 0, sizeof(unspec)); 643 ctx.uaddr = (struct sockaddr *)&unspec; 644 } 645 646 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 647 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); 648 649 return ret == 1 ? 0 : -EPERM; 650 } 651 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); 652 653 /** 654 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 655 * @sk: socket to get cgroup from 656 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 657 * sk with connection information (IP addresses, etc.) May not contain 658 * cgroup info if it is a req sock. 659 * @type: The type of program to be exectuted 660 * 661 * socket passed is expected to be of type INET or INET6. 662 * 663 * The program type passed in via @type must be suitable for sock_ops 664 * filtering. No further check is performed to assert that. 665 * 666 * This function will return %-EPERM if any if an attached program was found 667 * and if it returned != 1 during execution. In all other cases, 0 is returned. 668 */ 669 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 670 struct bpf_sock_ops_kern *sock_ops, 671 enum bpf_attach_type type) 672 { 673 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 674 int ret; 675 676 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, 677 BPF_PROG_RUN); 678 return ret == 1 ? 0 : -EPERM; 679 } 680 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 681 682 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 683 short access, enum bpf_attach_type type) 684 { 685 struct cgroup *cgrp; 686 struct bpf_cgroup_dev_ctx ctx = { 687 .access_type = (access << 16) | dev_type, 688 .major = major, 689 .minor = minor, 690 }; 691 int allow = 1; 692 693 rcu_read_lock(); 694 cgrp = task_dfl_cgroup(current); 695 allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, 696 BPF_PROG_RUN); 697 rcu_read_unlock(); 698 699 return !allow; 700 } 701 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); 702 703 static const struct bpf_func_proto * 704 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 705 { 706 switch (func_id) { 707 case BPF_FUNC_map_lookup_elem: 708 return &bpf_map_lookup_elem_proto; 709 case BPF_FUNC_map_update_elem: 710 return &bpf_map_update_elem_proto; 711 case BPF_FUNC_map_delete_elem: 712 return &bpf_map_delete_elem_proto; 713 case BPF_FUNC_map_push_elem: 714 return &bpf_map_push_elem_proto; 715 case BPF_FUNC_map_pop_elem: 716 return &bpf_map_pop_elem_proto; 717 case BPF_FUNC_map_peek_elem: 718 return &bpf_map_peek_elem_proto; 719 case BPF_FUNC_get_current_uid_gid: 720 return &bpf_get_current_uid_gid_proto; 721 case BPF_FUNC_get_local_storage: 722 return &bpf_get_local_storage_proto; 723 case BPF_FUNC_get_current_cgroup_id: 724 return &bpf_get_current_cgroup_id_proto; 725 case BPF_FUNC_trace_printk: 726 if (capable(CAP_SYS_ADMIN)) 727 return bpf_get_trace_printk_proto(); 728 /* fall through */ 729 default: 730 return NULL; 731 } 732 } 733 734 static const struct bpf_func_proto * 735 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 736 { 737 return cgroup_base_func_proto(func_id, prog); 738 } 739 740 static bool cgroup_dev_is_valid_access(int off, int size, 741 enum bpf_access_type type, 742 const struct bpf_prog *prog, 743 struct bpf_insn_access_aux *info) 744 { 745 const int size_default = sizeof(__u32); 746 747 if (type == BPF_WRITE) 748 return false; 749 750 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) 751 return false; 752 /* The verifier guarantees that size > 0. */ 753 if (off % size != 0) 754 return false; 755 756 switch (off) { 757 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): 758 bpf_ctx_record_field_size(info, size_default); 759 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 760 return false; 761 break; 762 default: 763 if (size != size_default) 764 return false; 765 } 766 767 return true; 768 } 769 770 const struct bpf_prog_ops cg_dev_prog_ops = { 771 }; 772 773 const struct bpf_verifier_ops cg_dev_verifier_ops = { 774 .get_func_proto = cgroup_dev_func_proto, 775 .is_valid_access = cgroup_dev_is_valid_access, 776 }; 777 778 /** 779 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl 780 * 781 * @head: sysctl table header 782 * @table: sysctl table 783 * @write: sysctl is being read (= 0) or written (= 1) 784 * @buf: pointer to buffer passed by user space 785 * @pcount: value-result argument: value is size of buffer pointed to by @buf, 786 * result is size of @new_buf if program set new value, initial value 787 * otherwise 788 * @ppos: value-result argument: value is position at which read from or write 789 * to sysctl is happening, result is new position if program overrode it, 790 * initial value otherwise 791 * @new_buf: pointer to pointer to new buffer that will be allocated if program 792 * overrides new value provided by user space on sysctl write 793 * NOTE: it's caller responsibility to free *new_buf if it was set 794 * @type: type of program to be executed 795 * 796 * Program is run when sysctl is being accessed, either read or written, and 797 * can allow or deny such access. 798 * 799 * This function will return %-EPERM if an attached program is found and 800 * returned value != 1 during execution. In all other cases 0 is returned. 801 */ 802 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 803 struct ctl_table *table, int write, 804 void __user *buf, size_t *pcount, 805 loff_t *ppos, void **new_buf, 806 enum bpf_attach_type type) 807 { 808 struct bpf_sysctl_kern ctx = { 809 .head = head, 810 .table = table, 811 .write = write, 812 .ppos = ppos, 813 .cur_val = NULL, 814 .cur_len = PAGE_SIZE, 815 .new_val = NULL, 816 .new_len = 0, 817 .new_updated = 0, 818 }; 819 struct cgroup *cgrp; 820 int ret; 821 822 ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); 823 if (ctx.cur_val) { 824 mm_segment_t old_fs; 825 loff_t pos = 0; 826 827 old_fs = get_fs(); 828 set_fs(KERNEL_DS); 829 if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, 830 &ctx.cur_len, &pos)) { 831 /* Let BPF program decide how to proceed. */ 832 ctx.cur_len = 0; 833 } 834 set_fs(old_fs); 835 } else { 836 /* Let BPF program decide how to proceed. */ 837 ctx.cur_len = 0; 838 } 839 840 if (write && buf && *pcount) { 841 /* BPF program should be able to override new value with a 842 * buffer bigger than provided by user. 843 */ 844 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); 845 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); 846 if (!ctx.new_val || 847 copy_from_user(ctx.new_val, buf, ctx.new_len)) 848 /* Let BPF program decide how to proceed. */ 849 ctx.new_len = 0; 850 } 851 852 rcu_read_lock(); 853 cgrp = task_dfl_cgroup(current); 854 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); 855 rcu_read_unlock(); 856 857 kfree(ctx.cur_val); 858 859 if (ret == 1 && ctx.new_updated) { 860 *new_buf = ctx.new_val; 861 *pcount = ctx.new_len; 862 } else { 863 kfree(ctx.new_val); 864 } 865 866 return ret == 1 ? 0 : -EPERM; 867 } 868 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); 869 870 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 871 size_t *lenp) 872 { 873 ssize_t tmp_ret = 0, ret; 874 875 if (dir->header.parent) { 876 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); 877 if (tmp_ret < 0) 878 return tmp_ret; 879 } 880 881 ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); 882 if (ret < 0) 883 return ret; 884 *bufp += ret; 885 *lenp -= ret; 886 ret += tmp_ret; 887 888 /* Avoid leading slash. */ 889 if (!ret) 890 return ret; 891 892 tmp_ret = strscpy(*bufp, "/", *lenp); 893 if (tmp_ret < 0) 894 return tmp_ret; 895 *bufp += tmp_ret; 896 *lenp -= tmp_ret; 897 898 return ret + tmp_ret; 899 } 900 901 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, 902 size_t, buf_len, u64, flags) 903 { 904 ssize_t tmp_ret = 0, ret; 905 906 if (!buf) 907 return -EINVAL; 908 909 if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { 910 if (!ctx->head) 911 return -EINVAL; 912 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); 913 if (tmp_ret < 0) 914 return tmp_ret; 915 } 916 917 ret = strscpy(buf, ctx->table->procname, buf_len); 918 919 return ret < 0 ? ret : tmp_ret + ret; 920 } 921 922 static const struct bpf_func_proto bpf_sysctl_get_name_proto = { 923 .func = bpf_sysctl_get_name, 924 .gpl_only = false, 925 .ret_type = RET_INTEGER, 926 .arg1_type = ARG_PTR_TO_CTX, 927 .arg2_type = ARG_PTR_TO_MEM, 928 .arg3_type = ARG_CONST_SIZE, 929 .arg4_type = ARG_ANYTHING, 930 }; 931 932 static int copy_sysctl_value(char *dst, size_t dst_len, char *src, 933 size_t src_len) 934 { 935 if (!dst) 936 return -EINVAL; 937 938 if (!dst_len) 939 return -E2BIG; 940 941 if (!src || !src_len) { 942 memset(dst, 0, dst_len); 943 return -EINVAL; 944 } 945 946 memcpy(dst, src, min(dst_len, src_len)); 947 948 if (dst_len > src_len) { 949 memset(dst + src_len, '\0', dst_len - src_len); 950 return src_len; 951 } 952 953 dst[dst_len - 1] = '\0'; 954 955 return -E2BIG; 956 } 957 958 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, 959 char *, buf, size_t, buf_len) 960 { 961 return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); 962 } 963 964 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { 965 .func = bpf_sysctl_get_current_value, 966 .gpl_only = false, 967 .ret_type = RET_INTEGER, 968 .arg1_type = ARG_PTR_TO_CTX, 969 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 970 .arg3_type = ARG_CONST_SIZE, 971 }; 972 973 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, 974 size_t, buf_len) 975 { 976 if (!ctx->write) { 977 if (buf && buf_len) 978 memset(buf, '\0', buf_len); 979 return -EINVAL; 980 } 981 return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); 982 } 983 984 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { 985 .func = bpf_sysctl_get_new_value, 986 .gpl_only = false, 987 .ret_type = RET_INTEGER, 988 .arg1_type = ARG_PTR_TO_CTX, 989 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 990 .arg3_type = ARG_CONST_SIZE, 991 }; 992 993 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, 994 const char *, buf, size_t, buf_len) 995 { 996 if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) 997 return -EINVAL; 998 999 if (buf_len > PAGE_SIZE - 1) 1000 return -E2BIG; 1001 1002 memcpy(ctx->new_val, buf, buf_len); 1003 ctx->new_len = buf_len; 1004 ctx->new_updated = 1; 1005 1006 return 0; 1007 } 1008 1009 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { 1010 .func = bpf_sysctl_set_new_value, 1011 .gpl_only = false, 1012 .ret_type = RET_INTEGER, 1013 .arg1_type = ARG_PTR_TO_CTX, 1014 .arg2_type = ARG_PTR_TO_MEM, 1015 .arg3_type = ARG_CONST_SIZE, 1016 }; 1017 1018 static const struct bpf_func_proto * 1019 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 1020 { 1021 switch (func_id) { 1022 case BPF_FUNC_strtol: 1023 return &bpf_strtol_proto; 1024 case BPF_FUNC_strtoul: 1025 return &bpf_strtoul_proto; 1026 case BPF_FUNC_sysctl_get_name: 1027 return &bpf_sysctl_get_name_proto; 1028 case BPF_FUNC_sysctl_get_current_value: 1029 return &bpf_sysctl_get_current_value_proto; 1030 case BPF_FUNC_sysctl_get_new_value: 1031 return &bpf_sysctl_get_new_value_proto; 1032 case BPF_FUNC_sysctl_set_new_value: 1033 return &bpf_sysctl_set_new_value_proto; 1034 default: 1035 return cgroup_base_func_proto(func_id, prog); 1036 } 1037 } 1038 1039 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, 1040 const struct bpf_prog *prog, 1041 struct bpf_insn_access_aux *info) 1042 { 1043 const int size_default = sizeof(__u32); 1044 1045 if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) 1046 return false; 1047 1048 switch (off) { 1049 case offsetof(struct bpf_sysctl, write): 1050 if (type != BPF_READ) 1051 return false; 1052 bpf_ctx_record_field_size(info, size_default); 1053 return bpf_ctx_narrow_access_ok(off, size, size_default); 1054 case offsetof(struct bpf_sysctl, file_pos): 1055 if (type == BPF_READ) { 1056 bpf_ctx_record_field_size(info, size_default); 1057 return bpf_ctx_narrow_access_ok(off, size, size_default); 1058 } else { 1059 return size == size_default; 1060 } 1061 default: 1062 return false; 1063 } 1064 } 1065 1066 static u32 sysctl_convert_ctx_access(enum bpf_access_type type, 1067 const struct bpf_insn *si, 1068 struct bpf_insn *insn_buf, 1069 struct bpf_prog *prog, u32 *target_size) 1070 { 1071 struct bpf_insn *insn = insn_buf; 1072 1073 switch (si->off) { 1074 case offsetof(struct bpf_sysctl, write): 1075 *insn++ = BPF_LDX_MEM( 1076 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 1077 bpf_target_off(struct bpf_sysctl_kern, write, 1078 FIELD_SIZEOF(struct bpf_sysctl_kern, 1079 write), 1080 target_size)); 1081 break; 1082 case offsetof(struct bpf_sysctl, file_pos): 1083 /* ppos is a pointer so it should be accessed via indirect 1084 * loads and stores. Also for stores additional temporary 1085 * register is used since neither src_reg nor dst_reg can be 1086 * overridden. 1087 */ 1088 if (type == BPF_WRITE) { 1089 int treg = BPF_REG_9; 1090 1091 if (si->src_reg == treg || si->dst_reg == treg) 1092 --treg; 1093 if (si->src_reg == treg || si->dst_reg == treg) 1094 --treg; 1095 *insn++ = BPF_STX_MEM( 1096 BPF_DW, si->dst_reg, treg, 1097 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1098 *insn++ = BPF_LDX_MEM( 1099 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1100 treg, si->dst_reg, 1101 offsetof(struct bpf_sysctl_kern, ppos)); 1102 *insn++ = BPF_STX_MEM( 1103 BPF_SIZEOF(u32), treg, si->src_reg, 0); 1104 *insn++ = BPF_LDX_MEM( 1105 BPF_DW, treg, si->dst_reg, 1106 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1107 } else { 1108 *insn++ = BPF_LDX_MEM( 1109 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1110 si->dst_reg, si->src_reg, 1111 offsetof(struct bpf_sysctl_kern, ppos)); 1112 *insn++ = BPF_LDX_MEM( 1113 BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); 1114 } 1115 *target_size = sizeof(u32); 1116 break; 1117 } 1118 1119 return insn - insn_buf; 1120 } 1121 1122 const struct bpf_verifier_ops cg_sysctl_verifier_ops = { 1123 .get_func_proto = sysctl_func_proto, 1124 .is_valid_access = sysctl_is_valid_access, 1125 .convert_ctx_access = sysctl_convert_ctx_access, 1126 }; 1127 1128 const struct bpf_prog_ops cg_sysctl_prog_ops = { 1129 }; 1130