1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook */
3 
4 #include <linux/bpf.h>
5 #include <netinet/in.h>
6 #include <stdbool.h>
7 
8 #include <bpf/bpf_helpers.h>
9 #include <bpf/bpf_endian.h>
10 #include "bpf_tcp_helpers.h"
11 
/* Slots of linum_map, one per BPF program in this file. Each program
 * sets linum_idx to its own slot before it can call RET_LOG().
 */
enum bpf_linum_array_idx {
	EGRESS_LINUM_IDX,		/* egress_read_sock_fields */
	INGRESS_LINUM_IDX,		/* ingress_read_sock_fields */
	READ_SK_DST_PORT_LINUM_IDX,	/* read_sk_dst_port */
	__NR_BPF_LINUM_ARRAY_IDX,	/* number of slots; sizes linum_map */
};
18 
/* Array map indexed by enum bpf_linum_array_idx. RET_LOG() stores the
 * source line of the bail-out point in the slot of the calling program.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, __NR_BPF_LINUM_ARRAY_IDX);
	__type(key, __u32);
	__type(value, __u32);
} linum_map SEC(".maps");
25 
/* Value type of the sk_storage maps in this file: a counter together
 * with a bpf_spin_lock that can be taken around updates of the counter.
 */
struct bpf_spinlock_cnt {
	struct bpf_spin_lock lock;
	__u32 cnt;
};
30 
/* Per-socket storage. egress_read_sock_fields adds 1 to ->cnt on each
 * run that reaches the counters, deliberately without taking ->lock.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt SEC(".maps");
37 
/* Per-socket storage. egress_read_sock_fields adds 10 to ->cnt on each
 * run that reaches the counters, while holding ->lock.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt10 SEC(".maps");
44 
/* Global inputs and snapshots.
 *
 * The *_sk and *_tp variables are filled by skcpy()/tpcpy(): listen_* by
 * the ingress program, srv_* and cli_* by the egress program.
 *
 * srv_sa6 is only read in this file (its sin6_port selects the traffic
 * under test), so it is presumably set by the userspace side of the test
 * before traffic flows -- TODO confirm against the test runner.
 *
 * parent_cg_id, child_cg_id and lsndtime are written by the egress
 * program when it handles the server socket.
 */
struct bpf_tcp_sock listen_tp = {};
struct sockaddr_in6 srv_sa6 = {};
struct bpf_tcp_sock cli_tp = {};
struct bpf_tcp_sock srv_tp = {};
struct bpf_sock listen_sk = {};
struct bpf_sock srv_sk = {};
struct bpf_sock cli_sk = {};
__u64 parent_cg_id = 0;
__u64 child_cg_id = 0;
__u64 lsndtime = 0;
55 
56 static bool is_loopback6(__u32 *a6)
57 {
58 	return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1);
59 }
60 
61 static void skcpy(struct bpf_sock *dst,
62 		  const struct bpf_sock *src)
63 {
64 	dst->bound_dev_if = src->bound_dev_if;
65 	dst->family = src->family;
66 	dst->type = src->type;
67 	dst->protocol = src->protocol;
68 	dst->mark = src->mark;
69 	dst->priority = src->priority;
70 	dst->src_ip4 = src->src_ip4;
71 	dst->src_ip6[0] = src->src_ip6[0];
72 	dst->src_ip6[1] = src->src_ip6[1];
73 	dst->src_ip6[2] = src->src_ip6[2];
74 	dst->src_ip6[3] = src->src_ip6[3];
75 	dst->src_port = src->src_port;
76 	dst->dst_ip4 = src->dst_ip4;
77 	dst->dst_ip6[0] = src->dst_ip6[0];
78 	dst->dst_ip6[1] = src->dst_ip6[1];
79 	dst->dst_ip6[2] = src->dst_ip6[2];
80 	dst->dst_ip6[3] = src->dst_ip6[3];
81 	dst->dst_port = src->dst_port;
82 	dst->state = src->state;
83 }
84 
85 static void tpcpy(struct bpf_tcp_sock *dst,
86 		  const struct bpf_tcp_sock *src)
87 {
88 	dst->snd_cwnd = src->snd_cwnd;
89 	dst->srtt_us = src->srtt_us;
90 	dst->rtt_min = src->rtt_min;
91 	dst->snd_ssthresh = src->snd_ssthresh;
92 	dst->rcv_nxt = src->rcv_nxt;
93 	dst->snd_nxt = src->snd_nxt;
94 	dst->snd_una = src->snd_una;
95 	dst->mss_cache = src->mss_cache;
96 	dst->ecn_flags = src->ecn_flags;
97 	dst->rate_delivered = src->rate_delivered;
98 	dst->rate_interval_us = src->rate_interval_us;
99 	dst->packets_out = src->packets_out;
100 	dst->retrans_out = src->retrans_out;
101 	dst->total_retrans = src->total_retrans;
102 	dst->segs_in = src->segs_in;
103 	dst->data_segs_in = src->data_segs_in;
104 	dst->segs_out = src->segs_out;
105 	dst->data_segs_out = src->data_segs_out;
106 	dst->lost_out = src->lost_out;
107 	dst->sacked_out = src->sacked_out;
108 	dst->bytes_received = src->bytes_received;
109 	dst->bytes_acked = src->bytes_acked;
110 }
111 
/* Always return CG_OK so that no pkt will be filtered out */
#define CG_OK 1

/* Record the current source line in linum_map[linum_idx] and return
 * CG_OK from the enclosing function. The caller must have locals named
 * linum and linum_idx in scope, with linum_idx already set to its slot.
 * The result of the map update is not checked.
 */
#define RET_LOG() ({						\
	linum = __LINE__;					\
	bpf_map_update_elem(&linum_map, &linum_idx, &linum, BPF_ANY);	\
	return CG_OK;						\
})
120 
/* cgroup_skb/egress program: for IPv6 TCP traffic sent from a loopback
 * address by the server or client socket under test, snapshot the
 * socket's bpf_sock and bpf_tcp_sock fields into the srv_* or cli_*
 * globals and bump the per-socket counters kept in sk_storage.
 *
 * Unexpected failures go through RET_LOG(), which records the source
 * line in linum_map[EGRESS_LINUM_IDX]. Every path returns CG_OK.
 */
SEC("cgroup_skb/egress")
int egress_read_sock_fields(struct __sk_buff *skb)
{
	/* Initial value used when the client's storage is created below */
	struct bpf_spinlock_cnt cli_cnt_init = { .lock = {}, .cnt = 0xeB9F };
	struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10;
	struct bpf_tcp_sock *tp, *tp_ret;
	struct bpf_sock *sk, *sk_ret;
	__u32 linum, linum_idx;
	struct tcp_sock *ktp;

	/* Slot used by RET_LOG() in this program */
	linum_idx = EGRESS_LINUM_IDX;

	sk = skb->sk;
	if (!sk)
		RET_LOG();

	/* Skip anything that is not IPv6 with a loopback source address,
	 * and skip the listening socket, which is covered by the
	 * cgroup_skb/ingress test program.
	 */
	if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
	    sk->state == BPF_TCP_LISTEN)
		return CG_OK;

	/* Pick the snapshot destination. src_port is compared against the
	 * ntohs()-converted sin6_port, dst_port against sin6_port as stored.
	 */
	if (sk->src_port == bpf_ntohs(srv_sa6.sin6_port)) {
		/* Server socket */
		sk_ret = &srv_sk;
		tp_ret = &srv_tp;
	} else if (sk->dst_port == srv_sa6.sin6_port) {
		/* Client socket */
		sk_ret = &cli_sk;
		tp_ret = &cli_tp;
	} else {
		/* Not the testing egress traffic */
		return CG_OK;
	}

	/* It must be a fullsock for cgroup_skb/egress prog */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		RET_LOG();

	/* Not the testing egress traffic */
	if (sk->protocol != IPPROTO_TCP)
		return CG_OK;

	tp = bpf_tcp_sock(sk);
	if (!tp)
		RET_LOG();

	skcpy(sk_ret, sk);
	tpcpy(tp_ret, tp);

	if (sk_ret == &srv_sk) {
		/* Server socket: also exercise the struct tcp_sock pointer
		 * returned by bpf_skc_to_tcp_sock() with the cgroup-id and
		 * sk_storage helpers.
		 */
		ktp = bpf_skc_to_tcp_sock(sk);

		if (!ktp)
			RET_LOG();

		lsndtime = ktp->lsndtime;

		child_cg_id = bpf_sk_cgroup_id(ktp);
		if (!child_cg_id)
			RET_LOG();

		parent_cg_id = bpf_sk_ancestor_cgroup_id(ktp, 2);
		if (!parent_cg_id)
			RET_LOG();

		/* The userspace has created it for srv sk, so no initial
		 * value and no create flag are passed here.
		 */
		pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, ktp, 0, 0);
		pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, ktp,
						   0, 0);
	} else {
		/* Client socket: create the storage on first use, starting
		 * from cli_cnt_init.
		 */
		pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk,
						 &cli_cnt_init,
						 BPF_SK_STORAGE_GET_F_CREATE);
		pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10,
						   sk, &cli_cnt_init,
						   BPF_SK_STORAGE_GET_F_CREATE);
	}

	if (!pkt_out_cnt || !pkt_out_cnt10)
		RET_LOG();

	/* Even though both cnt and cnt10 have a lock defined in their BTF,
	 * intentionally only one of them takes the lock while the other
	 * does not, as a test for the spinlock support in
	 * BPF_MAP_TYPE_SK_STORAGE.
	 */
	pkt_out_cnt->cnt += 1;
	bpf_spin_lock(&pkt_out_cnt10->lock);
	pkt_out_cnt10->cnt += 10;
	bpf_spin_unlock(&pkt_out_cnt10->lock);

	return CG_OK;
}
216 
/* cgroup_skb/ingress program: when a packet arrives for the IPv6
 * loopback listening socket bound to the server port, snapshot that
 * socket's bpf_sock and bpf_tcp_sock fields into listen_sk / listen_tp.
 *
 * Unexpected failures go through RET_LOG(), which records the source
 * line in linum_map[INGRESS_LINUM_IDX]. Every path returns CG_OK.
 */
SEC("cgroup_skb/ingress")
int ingress_read_sock_fields(struct __sk_buff *skb)
{
	struct bpf_tcp_sock *tp;
	__u32 linum, linum_idx;
	struct bpf_sock *sk;

	/* Slot used by RET_LOG() in this program */
	linum_idx = INGRESS_LINUM_IDX;

	sk = skb->sk;
	if (!sk)
		RET_LOG();

	/* Not the ingress traffic to the server that is under test */
	if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
	    sk->src_port != bpf_ntohs(srv_sa6.sin6_port))
		return CG_OK;

	/* Only interested in the listening socket */
	if (sk->state != BPF_TCP_LISTEN)
		return CG_OK;

	/* It must be a fullsock for cgroup_skb/ingress prog */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		RET_LOG();

	tp = bpf_tcp_sock(sk);
	if (!tp)
		RET_LOG();

	skcpy(&listen_sk, sk);
	tpcpy(&listen_tp, tp);

	return CG_OK;
}
253 
/*
 * Return true if the value read through a __u32 pointer at the dst_port
 * field of @sk equals bpf_htons(0xcafe).
 *
 * NOTE: 4-byte load from bpf_sock at dst_port offset is quirky. It
 * gets rewritten by the access converter to a 2-byte load for
 * backward compatibility. Treating the load result as a be16 value
 * makes the code portable across little- and big-endian platforms.
 *
 * NOTE(review): __noinline presumably keeps this access as its own
 * function so the load width is not merged with the other checks --
 * confirm.
 */
static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
{
	__u32 *word = (__u32 *)&sk->dst_port;
	return word[0] == bpf_htons(0xcafe);
}
265 
/* Return true if the value read through a __u16 pointer at the dst_port
 * field of @sk equals bpf_htons(0xcafe).
 */
static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
{
	__u16 *half = (__u16 *)&sk->dst_port;
	return half[0] == bpf_htons(0xcafe);
}
271 
/* Return true if the first two bytes at the dst_port field of @sk, read
 * one byte at a time, are 0xca followed by 0xfe.
 */
static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
{
	__u8 *byte = (__u8 *)&sk->dst_port;
	return byte[0] == 0xca && byte[1] == 0xfe;
}
277 
/* cgroup_skb/egress program: for packets whose socket is in SYN_SENT
 * state, check that sk->dst_port reads as port 0xcafe through word,
 * half-word and byte sized accesses.
 *
 * The first check that fails goes through RET_LOG(), which records the
 * source line in linum_map[READ_SK_DST_PORT_LINUM_IDX]. Every path
 * returns CG_OK.
 * NOTE(review): presumably the userspace test connects to port 0xcafe
 * -- confirm against the test runner.
 */
SEC("cgroup_skb/egress")
int read_sk_dst_port(struct __sk_buff *skb)
{
	__u32 linum, linum_idx;
	struct bpf_sock *sk;

	/* Slot used by RET_LOG() in this program */
	linum_idx = READ_SK_DST_PORT_LINUM_IDX;

	sk = skb->sk;
	if (!sk)
		RET_LOG();

	/* Ignore everything but the SYN from the client socket */
	if (sk->state != BPF_TCP_SYN_SENT)
		return CG_OK;

	/* Each RET_LOG() below sits on its own line, so the recorded line
	 * number tells which access width failed.
	 */
	if (!sk_dst_port__load_word(sk))
		RET_LOG();
	if (!sk_dst_port__load_half(sk))
		RET_LOG();
	if (!sk_dst_port__load_byte(sk))
		RET_LOG();

	return CG_OK;
}
303 
/* License string emitted into the object's "license" section */
char _license[] SEC("license") = "GPL";
305