196c3490dSMiao Xu // SPDX-License-Identifier: GPL-2.0-only
296c3490dSMiao Xu 
/* Highlights:
 * 1. The major difference between this bpf program and tcp_cubic.c
 *    is that this bpf program relies on `cong_control` rather than
 *    `cong_avoid` in the struct tcp_congestion_ops.
 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
 *    tcp_update_pacing_rate is bypassed when `cong_control` is
 *    defined, so that logic is moved into `cong_control` here.
 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
 *    The main purpose is to show use cases of the arguments in
 *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
 *    kernel functions.
 */
1596c3490dSMiao Xu 
16cbaec46dSMartin KaFai Lau #include "bpf_tracing_net.h"
1796c3490dSMiao Xu #include <bpf/bpf_helpers.h>
1896c3490dSMiao Xu #include <bpf/bpf_tracing.h>
1996c3490dSMiao Xu 
/* Number of microseconds in one second; scales the pacing-rate computation. */
#define USEC_PER_SEC 1000000UL
/* Pacing rate, in percent of the current rate, used while in slow start. */
#define TCP_PACING_SS_RATIO 200
/* Pacing rate, in percent of the current rate, used otherwise. */
#define TCP_PACING_CA_RATIO 120
/* tp->reordering above this value makes tcp_may_raise_cwnd() test
 * FLAG_FORWARD_PROGRESS instead of FLAG_DATA_ACKED.
 */
#define TCP_REORDERING 12

/* NOTE: min()/max() evaluate each argument twice and compare after the
 * usual arithmetic conversions, so mixing signed and unsigned operands
 * yields an unsigned comparison.
 */
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
/* after(a, b) is before(b, a). */
#define after(seq2, seq1) before(seq1, seq2)
2896c3490dSMiao Xu 
2996c3490dSMiao Xu extern void cubictcp_init(struct sock *sk) __ksym;
3096c3490dSMiao Xu extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
3196c3490dSMiao Xu extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
3296c3490dSMiao Xu extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
3396c3490dSMiao Xu extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
3496c3490dSMiao Xu extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
3596c3490dSMiao Xu extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
3696c3490dSMiao Xu 
/* Return true when seq1 precedes seq2, judged by the sign of their
 * 32-bit difference so that the comparison survives wrap-around.
 */
static bool before(__u32 seq1, __u32 seq2)
{
	__s32 distance = (__s32)(seq1 - seq2);

	return distance < 0;
}
4196c3490dSMiao Xu 
/* Unsigned 64-bit division: returns dividend / divisor.
 * The divisor is not checked here; callers must pass a non-zero value.
 */
static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	__u64 quotient = dividend / divisor;

	return quotient;
}
4696c3490dSMiao Xu 
tcp_update_pacing_rate(struct sock * sk)4796c3490dSMiao Xu static void tcp_update_pacing_rate(struct sock *sk)
4896c3490dSMiao Xu {
4996c3490dSMiao Xu 	const struct tcp_sock *tp = tcp_sk(sk);
5096c3490dSMiao Xu 	__u64 rate;
5196c3490dSMiao Xu 
5296c3490dSMiao Xu 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
5396c3490dSMiao Xu 	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
5496c3490dSMiao Xu 
5596c3490dSMiao Xu 	/* current rate is (cwnd * mss) / srtt
5696c3490dSMiao Xu 	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
5796c3490dSMiao Xu 	 * In Congestion Avoidance phase, set it to 120 % the current rate.
5896c3490dSMiao Xu 	 *
5996c3490dSMiao Xu 	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
6096c3490dSMiao Xu 	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
6196c3490dSMiao Xu 	 *	 end of slow start and should slow down.
6296c3490dSMiao Xu 	 */
6396c3490dSMiao Xu 	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
6496c3490dSMiao Xu 		rate *= TCP_PACING_SS_RATIO;
6596c3490dSMiao Xu 	else
6696c3490dSMiao Xu 		rate *= TCP_PACING_CA_RATIO;
6796c3490dSMiao Xu 
6896c3490dSMiao Xu 	rate *= max(tp->snd_cwnd, tp->packets_out);
6996c3490dSMiao Xu 
7096c3490dSMiao Xu 	if (tp->srtt_us)
7196c3490dSMiao Xu 		rate = div64_u64(rate, (__u64)tp->srtt_us);
7296c3490dSMiao Xu 
7396c3490dSMiao Xu 	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
7496c3490dSMiao Xu }
7596c3490dSMiao Xu 
tcp_cwnd_reduction(struct sock * sk,int newly_acked_sacked,int newly_lost,int flag)7696c3490dSMiao Xu static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
7796c3490dSMiao Xu 			       int newly_lost, int flag)
7896c3490dSMiao Xu {
7996c3490dSMiao Xu 	struct tcp_sock *tp = tcp_sk(sk);
8096c3490dSMiao Xu 	int sndcnt = 0;
8196c3490dSMiao Xu 	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
8296c3490dSMiao Xu 	int delta = tp->snd_ssthresh - pkts_in_flight;
8396c3490dSMiao Xu 
8496c3490dSMiao Xu 	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
8596c3490dSMiao Xu 		return;
8696c3490dSMiao Xu 
8796c3490dSMiao Xu 	__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;
8896c3490dSMiao Xu 
8996c3490dSMiao Xu 	if (delta < 0) {
9096c3490dSMiao Xu 		__u64 dividend =
9196c3490dSMiao Xu 			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
9296c3490dSMiao Xu 		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
9396c3490dSMiao Xu 	} else {
9496c3490dSMiao Xu 		sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
9596c3490dSMiao Xu 		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
9696c3490dSMiao Xu 			sndcnt++;
9796c3490dSMiao Xu 		sndcnt = min(delta, sndcnt);
9896c3490dSMiao Xu 	}
9996c3490dSMiao Xu 	/* Force a fast retransmit upon entering fast recovery */
10096c3490dSMiao Xu 	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
10196c3490dSMiao Xu 	tp->snd_cwnd = pkts_in_flight + sndcnt;
10296c3490dSMiao Xu }
10396c3490dSMiao Xu 
10496c3490dSMiao Xu /* Decide wheather to run the increase function of congestion control. */
tcp_may_raise_cwnd(const struct sock * sk,const int flag)10596c3490dSMiao Xu static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
10696c3490dSMiao Xu {
10796c3490dSMiao Xu 	if (tcp_sk(sk)->reordering > TCP_REORDERING)
10896c3490dSMiao Xu 		return flag & FLAG_FORWARD_PROGRESS;
10996c3490dSMiao Xu 
11096c3490dSMiao Xu 	return flag & FLAG_DATA_ACKED;
11196c3490dSMiao Xu }
11296c3490dSMiao Xu 
113*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_init,struct sock * sk)114*7d3851a3SMartin KaFai Lau void BPF_PROG(bpf_cubic_init, struct sock *sk)
11596c3490dSMiao Xu {
11696c3490dSMiao Xu 	cubictcp_init(sk);
11796c3490dSMiao Xu }
11896c3490dSMiao Xu 
119*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_cwnd_event,struct sock * sk,enum tcp_ca_event event)120*7d3851a3SMartin KaFai Lau void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
12196c3490dSMiao Xu {
12296c3490dSMiao Xu 	cubictcp_cwnd_event(sk, event);
12396c3490dSMiao Xu }
12496c3490dSMiao Xu 
125*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_cong_control,struct sock * sk,__u32 ack,int flag,const struct rate_sample * rs)126*7d3851a3SMartin KaFai Lau void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
12796c3490dSMiao Xu 	      const struct rate_sample *rs)
12896c3490dSMiao Xu {
12996c3490dSMiao Xu 	struct tcp_sock *tp = tcp_sk(sk);
13096c3490dSMiao Xu 
13196c3490dSMiao Xu 	if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) &
13296c3490dSMiao Xu 			(1 << inet_csk(sk)->icsk_ca_state)) {
13396c3490dSMiao Xu 		/* Reduce cwnd if state mandates */
13496c3490dSMiao Xu 		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);
13596c3490dSMiao Xu 
13696c3490dSMiao Xu 		if (!before(tp->snd_una, tp->high_seq)) {
13796c3490dSMiao Xu 			/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
13896c3490dSMiao Xu 			if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
13996c3490dSMiao Xu 					inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
14096c3490dSMiao Xu 				tp->snd_cwnd = tp->snd_ssthresh;
14196c3490dSMiao Xu 				tp->snd_cwnd_stamp = tcp_jiffies32;
14296c3490dSMiao Xu 			}
14396c3490dSMiao Xu 		}
14496c3490dSMiao Xu 	} else if (tcp_may_raise_cwnd(sk, flag)) {
14596c3490dSMiao Xu 		/* Advance cwnd if state allows */
14696c3490dSMiao Xu 		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
14796c3490dSMiao Xu 		tp->snd_cwnd_stamp = tcp_jiffies32;
14896c3490dSMiao Xu 	}
14996c3490dSMiao Xu 
15096c3490dSMiao Xu 	tcp_update_pacing_rate(sk);
15196c3490dSMiao Xu }
15296c3490dSMiao Xu 
153*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_recalc_ssthresh,struct sock * sk)154*7d3851a3SMartin KaFai Lau __u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk)
15596c3490dSMiao Xu {
15696c3490dSMiao Xu 	return cubictcp_recalc_ssthresh(sk);
15796c3490dSMiao Xu }
15896c3490dSMiao Xu 
159*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_state,struct sock * sk,__u8 new_state)160*7d3851a3SMartin KaFai Lau void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state)
16196c3490dSMiao Xu {
16296c3490dSMiao Xu 	cubictcp_state(sk, new_state);
16396c3490dSMiao Xu }
16496c3490dSMiao Xu 
165*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_acked,struct sock * sk,const struct ack_sample * sample)166*7d3851a3SMartin KaFai Lau void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample)
16796c3490dSMiao Xu {
16896c3490dSMiao Xu 	cubictcp_acked(sk, sample);
16996c3490dSMiao Xu }
17096c3490dSMiao Xu 
171*7d3851a3SMartin KaFai Lau SEC("struct_ops")
BPF_PROG(bpf_cubic_undo_cwnd,struct sock * sk)172*7d3851a3SMartin KaFai Lau __u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk)
17396c3490dSMiao Xu {
17496c3490dSMiao Xu 	return tcp_reno_undo_cwnd(sk);
17596c3490dSMiao Xu }
17696c3490dSMiao Xu 
17796c3490dSMiao Xu SEC(".struct_ops")
17896c3490dSMiao Xu struct tcp_congestion_ops cc_cubic = {
17996c3490dSMiao Xu 	.init		= (void *)bpf_cubic_init,
18096c3490dSMiao Xu 	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
18196c3490dSMiao Xu 	.cong_control	= (void *)bpf_cubic_cong_control,
18296c3490dSMiao Xu 	.set_state	= (void *)bpf_cubic_state,
18396c3490dSMiao Xu 	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
18496c3490dSMiao Xu 	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
18596c3490dSMiao Xu 	.pkts_acked     = (void *)bpf_cubic_acked,
18696c3490dSMiao Xu 	.name		= "bpf_cc_cubic",
18796c3490dSMiao Xu };
18896c3490dSMiao Xu 
18996c3490dSMiao Xu char _license[] SEC("license") = "GPL";
190