// SPDX-License-Identifier: GPL-2.0-only
/* Unstable Conntrack Helpers for XDP and TC-BPF hook
 *
 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
 * allowed to break compatibility for these functions since the interface they
 * are exposed through to BPF programs is explicitly unstable.
 */

#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>
#include <net/xdp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>

/* bpf_ct_opts - Options for CT lookup helpers
 *
 * Members:
 * @netns_id   - Specify the network namespace for lookup
 *               Values:
 *                 BPF_F_CURRENT_NETNS (-1)
 *                   Use namespace associated with ctx (xdp_md, __sk_buff)
 *                 [0, S32_MAX]
 *                   Network Namespace ID
 * @error      - Out parameter, set for any errors encountered
 *               Values:
 *                 -EINVAL - Passed NULL for bpf_tuple pointer
 *                 -EINVAL - opts->reserved is not 0
 *                 -EINVAL - netns_id is less than -1
 *                 -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
 *                 -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
 *                 -ENONET - No network namespace found for netns_id
 *                 -ENOENT - Conntrack lookup could not find entry for tuple
 *                 -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
 *                                 or sizeof(tuple->ipv6)
 * @l4proto    - Layer 4 protocol
 *               Values:
 *                 IPPROTO_TCP, IPPROTO_UDP
 * @dir        - Out parameter, connection tracking tuple direction as
 *               reported by the lookup
 * @reserved   - Reserved member, will be reused for more options in future
 *               Values:
 *                 0
 */
struct bpf_ct_opts {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u8 reserved[2];
};

enum {
	NF_BPF_CT_OPTS_SZ = 12,
};
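
/* Usage sketch (illustrative only, not compiled as part of this file): a BPF
 * program typically fills bpf_ct_opts on its stack and passes the struct size
 * explicitly, e.g. for a TCP lookup in the caller's own netns via the
 * bpf_xdp_ct_lookup() kfunc defined below ("ctx" and "tup" are assumed to
 * exist in the caller):
 *
 *	struct bpf_ct_opts opts = {
 *		.netns_id = BPF_F_CURRENT_NETNS,
 *		.l4proto  = IPPROTO_TCP,
 *	};
 *	struct nf_conn *ct;
 *
 *	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
 *	if (!ct)
 *		return XDP_PASS;	// opts.error holds the reason, e.g. -ENOENT
 *	bpf_ct_release(ct);
 *
 * sizeof(opts) must equal NF_BPF_CT_OPTS_SZ (12); any other opts__sz is
 * rejected with -EINVAL.
 */
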
/* Convert a bpf_sock_tuple into an nf_conntrack_tuple. For IP_CT_DIR_REPLY,
 * source and destination are swapped so that the resulting tuple describes
 * the reply direction of the same flow.
 */
static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
				 u32 tuple_len, u8 protonum, u8 dir,
				 struct nf_conntrack_tuple *tuple)
{
	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
						  : &tuple->src.u;
	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
						  : (void *)&tuple->dst.u;

	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
		return -EPROTO;

	memset(tuple, 0, sizeof(*tuple));

	switch (tuple_len) {
	case sizeof(bpf_tuple->ipv4):
		tuple->src.l3num = AF_INET;
		src->ip = bpf_tuple->ipv4.saddr;
		sport->tcp.port = bpf_tuple->ipv4.sport;
		dst->ip = bpf_tuple->ipv4.daddr;
		dport->tcp.port = bpf_tuple->ipv4.dport;
		break;
	case sizeof(bpf_tuple->ipv6):
		tuple->src.l3num = AF_INET6;
		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
		sport->tcp.port = bpf_tuple->ipv6.sport;
		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
		dport->tcp.port = bpf_tuple->ipv6.dport;
		break;
	default:
		return -EAFNOSUPPORT;
	}
	tuple->dst.protonum = protonum;
	tuple->dst.dir = dir;

	return 0;
}

static struct nf_conn *
__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
			u32 timeout)
{
	struct nf_conntrack_tuple otuple, rtuple;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
	    opts_len != NF_BPF_CT_OPTS_SZ)
		return ERR_PTR(-EINVAL);

	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &otuple);
	if (err < 0)
		return ERR_PTR(err);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_REPLY, &rtuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
				GFP_ATOMIC);
	if (IS_ERR(ct))
		goto out;

	memset(&ct->proto, 0, sizeof(ct->proto));
	__nf_ct_set_timeout(ct, timeout * HZ);

out:
	if (opts->netns_id >= 0)
		put_net(net);

	return ct;
}

static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
					  struct bpf_sock_tuple *bpf_tuple,
					  u32 tuple_len, struct bpf_ct_opts *opts,
					  u32 opts_len)
{
	struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple tuple;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
	    opts_len != NF_BPF_CT_OPTS_SZ)
		return ERR_PTR(-EINVAL);
	if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
		return ERR_PTR(-EPROTO);
	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &tuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
	if (opts->netns_id >= 0)
		put_net(net);
	if (!hash)
		return ERR_PTR(-ENOENT);

	ct = nf_ct_tuplehash_to_ctrack(hash);
	opts->dir = NF_CT_DIRECTION(hash);

	return ct;
}
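
/* Tuple-building sketch (illustrative only): the bpf_sock_tuple passed to the
 * kfuncs below is filled by the BPF program from parsed packet headers, and
 * tuple__sz selects the address family in bpf_nf_ct_tuple_parse(). Assuming
 * "iph" and "tcph" point at already-validated IPv4/TCP headers:
 *
 *	struct bpf_sock_tuple tup = {};
 *
 *	tup.ipv4.saddr = iph->saddr;	// addresses and ports stay in
 *	tup.ipv4.daddr = iph->daddr;	// network byte order
 *	tup.ipv4.sport = tcph->source;
 *	tup.ipv4.dport = tcph->dest;
 *
 * Passing sizeof(tup.ipv4) as tuple__sz then selects AF_INET; any size other
 * than sizeof(tup.ipv4) or sizeof(tup.ipv6) is rejected with -EAFNOSUPPORT.
 */
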
BTF_ID_LIST(btf_nf_conn_ids)
BTF_ID(struct, nf_conn)
BTF_ID(struct, nf_conn___init)

/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
					   const struct bpf_reg_state *reg,
					   int off, int size)
{
	const struct btf_type *ncit, *nct, *t;
	size_t end;

	ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
	nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
	t = btf_type_by_id(reg->btf, reg->btf_id);
	if (t != nct && t != ncit) {
		bpf_log(log, "only read is supported\n");
		return -EACCES;
	}

	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
	 * so we are safe to simply merge offset checks here
	 */
	switch (off) {
#if defined(CONFIG_NF_CONNTRACK_MARK)
	case offsetof(struct nf_conn, mark):
		end = offsetofend(struct nf_conn, mark);
		break;
#endif
	default:
		bpf_log(log, "no write support to nf_conn at off %d\n", off);
		return -EACCES;
	}

	if (off + size > end) {
		bpf_log(log,
			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
			off, size, end);
		return -EACCES;
	}

	return 0;
}

__bpf_kfunc_start_defs();

/* bpf_xdp_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @xdp_ctx    - Pointer to ctx (xdp_md) in XDP program
 *               Cannot be NULL
 * @bpf_tuple  - Pointer to memory representing the tuple to look up
 *               Cannot be NULL
 * @tuple__sz  - Length of the tuple structure
 *               Must be one of sizeof(bpf_tuple->ipv4) or
 *               sizeof(bpf_tuple->ipv6)
 * @opts       - Additional options for allocation (documented above)
 *               Cannot be NULL
 * @opts__sz   - Length of the bpf_ct_opts structure
 *               Must be NF_BPF_CT_OPTS_SZ (12)
 */
__bpf_kfunc struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct nf_conn *nfct;

	/* new entries start out with a default 10 second timeout */
	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
				       opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}

/* bpf_xdp_ct_lookup - Look up the CT entry for the given tuple and acquire a
 *                     reference to it
 *
 * Parameters:
 * @xdp_ctx    - Pointer to ctx (xdp_md) in XDP program
 *               Cannot be NULL
 * @bpf_tuple  - Pointer to memory representing the tuple to look up
 *               Cannot be NULL
 * @tuple__sz  - Length of the tuple structure
 *               Must be one of sizeof(bpf_tuple->ipv4) or
 *               sizeof(bpf_tuple->ipv6)
 * @opts       - Additional options for lookup (documented above)
 *               Cannot be NULL
 * @opts__sz   - Length of the bpf_ct_opts structure
 *               Must be NF_BPF_CT_OPTS_SZ (12)
 */
__bpf_kfunc struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = dev_net(ctx->rxq->dev);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}
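
/* Write-access sketch (illustrative only): a looked-up entry is returned as a
 * referenced PTR_TO_BTF_ID and must be released. When CONFIG_NF_CONNTRACK_MARK
 * is enabled, _nf_conntrack_btf_struct_access() above additionally lets the
 * program store to ct->mark directly:
 *
 *	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
 *	if (ct) {
 *		ct->mark = 0xcafe;	// the only writable nf_conn member
 *		bpf_ct_release(ct);
 *	}
 */
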
/* bpf_skb_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
 *               Cannot be NULL
 * @bpf_tuple  - Pointer to memory representing the tuple to look up
 *               Cannot be NULL
 * @tuple__sz  - Length of the tuple structure
 *               Must be one of sizeof(bpf_tuple->ipv4) or
 *               sizeof(bpf_tuple->ipv6)
 * @opts       - Additional options for allocation (documented above)
 *               Cannot be NULL
 * @opts__sz   - Length of the bpf_ct_opts structure
 *               Must be NF_BPF_CT_OPTS_SZ (12)
 */
__bpf_kfunc struct nf_conn___init *
bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct nf_conn *nfct;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}

/* bpf_skb_ct_lookup - Look up the CT entry for the given tuple and acquire a
 *                     reference to it
 *
 * Parameters:
 * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
 *               Cannot be NULL
 * @bpf_tuple  - Pointer to memory representing the tuple to look up
 *               Cannot be NULL
 * @tuple__sz  - Length of the tuple structure
 *               Must be one of sizeof(bpf_tuple->ipv4) or
 *               sizeof(bpf_tuple->ipv6)
 * @opts       - Additional options for lookup (documented above)
 *               Cannot be NULL
 * @opts__sz   - Length of the bpf_ct_opts structure
 *               Must be NF_BPF_CT_OPTS_SZ (12)
 */
__bpf_kfunc struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}

/* bpf_ct_insert_entry - Add the provided entry into a CT map
 *
 * This must be invoked for a referenced PTR_TO_BTF_ID; the entry is consumed
 * by the insertion.
 *
 * Parameters:
 * @nfct_i     - Pointer to referenced nf_conn___init object, obtained
 *               using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 */
__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
{
	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
	int err;

	if (!nf_ct_is_confirmed(nfct))
		nfct->timeout += nfct_time_stamp;
	nfct->status |= IPS_CONFIRMED;
	err = nf_conntrack_hash_check_insert(nfct);
	if (err < 0) {
		nf_conntrack_free(nfct);
		return NULL;
	}
	return nfct;
}

/* bpf_ct_release - Release acquired nf_conn object
 *
 * This must be invoked for a referenced PTR_TO_BTF_ID; the verifier rejects
 * the program if any acquired reference is still live at exit in any of the
 * explored states.
 *
 * Parameters:
 * @nfct       - Pointer to referenced nf_conn object, obtained using
 *               bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 */
__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
	nf_ct_put(nfct);
}
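
/* Allocation-path sketch (illustrative only): bpf_xdp_ct_alloc() and
 * bpf_skb_ct_alloc() return an unconfirmed nf_conn___init; the entry only
 * becomes visible to conntrack once bpf_ct_insert_entry() consumes it.
 * IPS_SEEN_REPLY is used here as one example of a status bit that may be set
 * before insertion:
 *
 *	struct nf_conn___init *ct_i;
 *	struct nf_conn *ct;
 *
 *	ct_i = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
 *	if (ct_i) {
 *		bpf_ct_set_timeout(ct_i, 30000);	// msecs, see below
 *		bpf_ct_set_status(ct_i, IPS_SEEN_REPLY);
 *		ct = bpf_ct_insert_entry(ct_i);		// consumes ct_i
 *		if (ct)
 *			bpf_ct_release(ct);
 *	}
 */
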
/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
 *
 * Sets the default timeout of a newly allocated nf_conn before insertion.
 * This helper must be invoked for a referenced pointer to nf_conn___init.
 *
 * Parameters:
 * @nfct       - Pointer to referenced nf_conn___init object, obtained using
 *               bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @timeout    - Timeout in msecs.
 */
__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
{
	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
}

/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
 *
 * Change the timeout associated with the inserted or looked-up nf_conn.
 * This helper must be invoked for a referenced pointer to nf_conn.
 *
 * Parameters:
 * @nfct       - Pointer to referenced nf_conn object, obtained using
 *               bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 * @timeout    - New timeout in msecs.
 */
__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
{
	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
}

/* bpf_ct_set_status - Set status field of allocated nf_conn
 *
 * Set the status field of a newly allocated nf_conn before insertion.
 * This must be invoked for a referenced PTR_TO_BTF_ID to nf_conn___init.
 *
 * Parameters:
 * @nfct       - Pointer to referenced nf_conn___init object, obtained using
 *               bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @status     - New status value.
 */
__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
{
	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
}

/* bpf_ct_change_status - Change status of inserted nf_conn
 *
 * Change the status field of the provided connection tracking entry.
 * This must be invoked for a referenced PTR_TO_BTF_ID to nf_conn.
 *
 * Parameters:
 * @nfct       - Pointer to referenced nf_conn object, obtained using
 *               bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 * @status     - New status value.
 */
__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
	return nf_ct_change_status_common(nfct, status);
}

__bpf_kfunc_end_defs();

BTF_SET8_START(nf_ct_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
BTF_SET8_END(nf_ct_kfunc_set)

static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &nf_ct_kfunc_set,
};

int register_nf_conntrack_bpf(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
	if (!ret) {
		mutex_lock(&nf_conn_btf_access_lock);
		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
		mutex_unlock(&nf_conn_btf_access_lock);
	}

	return ret;
}

void cleanup_nf_conntrack_bpf(void)
{
	mutex_lock(&nf_conn_btf_access_lock);
	nfct_btf_struct_access = NULL;
	mutex_unlock(&nf_conn_btf_access_lock);
}
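
/* Registration sketch (illustrative only): register_nf_conntrack_bpf() is
 * expected to be called once during conntrack initialization; the actual call
 * site lives outside this file. A caller would do roughly:
 *
 *	err = register_nf_conntrack_bpf();
 *	if (err < 0)
 *		return err;
 *
 * Note that cleanup_nf_conntrack_bpf() only detaches the struct_access hook;
 * the kfunc ID sets registered above are not unregistered by it.
 */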