xref: /linux/net/ipv4/tcp_ipv4.c (revision f4dca95f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85 
86 #include <trace/events/tcp.h>
87 
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92 
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95 
96 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
114 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
115 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
116 	struct tcp_sock *tp = tcp_sk(sk);
117 
118 	if (reuse == 2) {
119 		/* Still does not detect *everything* that goes through
120 		 * lo, since we require a loopback src or dst address
121 		 * or direct binding to 'lo' interface.
122 		 */
123 		bool loopback = false;
124 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 			loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 		if (tw->tw_family == AF_INET6) {
128 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
130 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
131 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 				loopback = true;
133 		} else
134 #endif
135 		{
136 			if (ipv4_is_loopback(tw->tw_daddr) ||
137 			    ipv4_is_loopback(tw->tw_rcv_saddr))
138 				loopback = true;
139 		}
140 		if (!loopback)
141 			reuse = 0;
142 	}
143 
144 	/* With PAWS, it is safe from the viewpoint
145 	   of data integrity. Even without PAWS it is safe provided sequence
146 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
147 
148 	   Actually, the idea is close to VJ's one, only timestamp cache is
149 	   held not per host, but per port pair and TW bucket is used as state
150 	   holder.
151 
152 	   If TW bucket has been already destroyed we fall back to VJ's scheme
153 	   and use initial timestamp retrieved from peer table.
154 	 */
155 	if (tcptw->tw_ts_recent_stamp &&
156 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
157 					    tcptw->tw_ts_recent_stamp)))) {
158 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
159 		 * and releasing the bucket lock.
160 		 */
161 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
162 			return 0;
163 
164 		/* In case of repair and re-using TIME-WAIT sockets we still
165 		 * want to be sure that it is safe as above but honor the
166 		 * sequence numbers and time stamps set as part of the repair
167 		 * process.
168 		 *
169 		 * Without this check re-using a TIME-WAIT socket with TCP
170 		 * repair would accumulate a -1 on the repair assigned
171 		 * sequence number. The first time it is reused the sequence
172 		 * is -1, the second time -2, etc. This fixes that issue
173 		 * without appearing to create any others.
174 		 */
175 		if (likely(!tp->repair)) {
176 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
177 
178 			if (!seq)
179 				seq = 1;
180 			WRITE_ONCE(tp->write_seq, seq);
181 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
182 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 		}
184 
185 		return 1;
186 	}
187 
188 	return 0;
189 }
190 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
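/*
 * Summary of the reuse conditions above, roughly: the TIME-WAIT socket
 * must have a recorded peer timestamp, and on the normal connect() path
 * (twp != NULL) net.ipv4.tcp_tw_reuse must be enabled (1, or 2 for
 * loopback traffic only, which is what the reuse == 2 branch checks),
 * with at least one second elapsed since that timestamp was last
 * updated.  The new connection then starts its send sequence at
 * tw_snd_nxt + 65535 + 2, past anything the old incarnation could have
 * sent.
 */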
191 
192 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
193 			      int addr_len)
194 {
195 	/* This check is replicated from tcp_v4_connect() and intended to
196 	 * prevent BPF program called below from accessing bytes that are out
197 	 * of the bound specified by user in addr_len.
198 	 */
199 	if (addr_len < sizeof(struct sockaddr_in))
200 		return -EINVAL;
201 
202 	sock_owned_by_me(sk);
203 
204 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
205 }
206 
207 /* This will initiate an outgoing connection. */
208 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209 {
210 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
211 	struct inet_timewait_death_row *tcp_death_row;
212 	struct inet_sock *inet = inet_sk(sk);
213 	struct tcp_sock *tp = tcp_sk(sk);
214 	struct ip_options_rcu *inet_opt;
215 	struct net *net = sock_net(sk);
216 	__be16 orig_sport, orig_dport;
217 	__be32 daddr, nexthop;
218 	struct flowi4 *fl4;
219 	struct rtable *rt;
220 	int err;
221 
222 	if (addr_len < sizeof(struct sockaddr_in))
223 		return -EINVAL;
224 
225 	if (usin->sin_family != AF_INET)
226 		return -EAFNOSUPPORT;
227 
228 	nexthop = daddr = usin->sin_addr.s_addr;
229 	inet_opt = rcu_dereference_protected(inet->inet_opt,
230 					     lockdep_sock_is_held(sk));
231 	if (inet_opt && inet_opt->opt.srr) {
232 		if (!daddr)
233 			return -EINVAL;
234 		nexthop = inet_opt->opt.faddr;
235 	}
236 
237 	orig_sport = inet->inet_sport;
238 	orig_dport = usin->sin_port;
239 	fl4 = &inet->cork.fl.u.ip4;
240 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
241 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
242 			      orig_dport, sk);
243 	if (IS_ERR(rt)) {
244 		err = PTR_ERR(rt);
245 		if (err == -ENETUNREACH)
246 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
247 		return err;
248 	}
249 
250 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
251 		ip_rt_put(rt);
252 		return -ENETUNREACH;
253 	}
254 
255 	if (!inet_opt || !inet_opt->opt.srr)
256 		daddr = fl4->daddr;
257 
258 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
259 
260 	if (!inet->inet_saddr) {
261 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
262 		if (err) {
263 			ip_rt_put(rt);
264 			return err;
265 		}
266 	} else {
267 		sk_rcv_saddr_set(sk, inet->inet_saddr);
268 	}
269 
270 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
271 		/* Reset inherited state */
272 		tp->rx_opt.ts_recent	   = 0;
273 		tp->rx_opt.ts_recent_stamp = 0;
274 		if (likely(!tp->repair))
275 			WRITE_ONCE(tp->write_seq, 0);
276 	}
277 
278 	inet->inet_dport = usin->sin_port;
279 	sk_daddr_set(sk, daddr);
280 
281 	inet_csk(sk)->icsk_ext_hdr_len = 0;
282 	if (inet_opt)
283 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
284 
285 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
286 
287 	/* Socket identity is still unknown (sport may be zero).
288 	 * However, we set the state to SYN-SENT and, without releasing the
289 	 * socket lock, select a source port, enter ourselves into the hash
290 	 * tables, and complete initialization after this.
291 	 */
292 	tcp_set_state(sk, TCP_SYN_SENT);
293 	err = inet_hash_connect(tcp_death_row, sk);
294 	if (err)
295 		goto failure;
296 
297 	sk_set_txhash(sk);
298 
299 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
300 			       inet->inet_sport, inet->inet_dport, sk);
301 	if (IS_ERR(rt)) {
302 		err = PTR_ERR(rt);
303 		rt = NULL;
304 		goto failure;
305 	}
306 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
307 	/* OK, now commit destination to socket.  */
308 	sk->sk_gso_type = SKB_GSO_TCPV4;
309 	sk_setup_caps(sk, &rt->dst);
310 	rt = NULL;
311 
312 	if (likely(!tp->repair)) {
313 		if (!tp->write_seq)
314 			WRITE_ONCE(tp->write_seq,
315 				   secure_tcp_seq(inet->inet_saddr,
316 						  inet->inet_daddr,
317 						  inet->inet_sport,
318 						  usin->sin_port));
319 		WRITE_ONCE(tp->tsoffset,
320 			   secure_tcp_ts_off(net, inet->inet_saddr,
321 					     inet->inet_daddr));
322 	}
323 
324 	atomic_set(&inet->inet_id, get_random_u16());
325 
326 	if (tcp_fastopen_defer_connect(sk, &err))
327 		return err;
328 	if (err)
329 		goto failure;
330 
331 	err = tcp_connect(sk);
332 
333 	if (err)
334 		goto failure;
335 
336 	return 0;
337 
338 failure:
339 	/*
340 	 * This unhashes the socket and releases the local port,
341 	 * if necessary.
342 	 */
343 	tcp_set_state(sk, TCP_CLOSE);
344 	inet_bhash2_reset_saddr(sk);
345 	ip_rt_put(rt);
346 	sk->sk_route_caps = 0;
347 	inet->inet_dport = 0;
348 	return err;
349 }
350 EXPORT_SYMBOL(tcp_v4_connect);
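/*
 * The flow above, in short: resolve a route to the destination, bind a
 * source address (and update the bhash2 entry) if none is set, move to
 * SYN-SENT, let inet_hash_connect() pick an ephemeral port and insert
 * the socket into the hash tables, re-validate the route with the final
 * ports, derive the ISN from the 4-tuple and the timestamp offset from
 * the address pair (unless the socket is being repaired), and finally
 * either defer the SYN for TCP Fast Open or transmit it via
 * tcp_connect().  Any failure after SYN-SENT unwinds through the
 * failure: label, which drops the socket back to CLOSE and releases the
 * port binding.
 */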
351 
352 /*
353  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
354  * It can be called through tcp_release_cb() if socket was owned by user
355  * at the time tcp_v4_err() was called to handle ICMP message.
356  */
357 void tcp_v4_mtu_reduced(struct sock *sk)
358 {
359 	struct inet_sock *inet = inet_sk(sk);
360 	struct dst_entry *dst;
361 	u32 mtu;
362 
363 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
364 		return;
365 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
366 	dst = inet_csk_update_pmtu(sk, mtu);
367 	if (!dst)
368 		return;
369 
370 	/* Something is about to go wrong... Remember the soft error
371 	 * in case this connection will not be able to recover.
372 	 */
373 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
375 
376 	mtu = dst_mtu(dst);
377 
378 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379 	    ip_sk_accept_pmtu(sk) &&
380 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
381 		tcp_sync_mss(sk, mtu);
382 
383 		/* Resend the TCP packet because it's
384 		 * clear that the old packet has been
385 		 * dropped. This is the new "fast" path mtu
386 		 * discovery.
387 		 */
388 		tcp_simple_retransmit(sk);
389 	} /* else let the usual retransmit timer handle it */
390 }
391 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
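/*
 * tcp_v4_mtu_reduced() runs immediately from tcp_v4_err() only when the
 * socket is not owned by user context; otherwise tcp_v4_err() stores the
 * new MTU in tp->mtu_info, sets TCP_MTU_REDUCED_DEFERRED and takes a
 * reference so tcp_release_cb() can call this function once the socket
 * lock is released (see the ICMP_FRAG_NEEDED handling below).
 */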
392 
393 static void do_redirect(struct sk_buff *skb, struct sock *sk)
394 {
395 	struct dst_entry *dst = __sk_dst_check(sk, 0);
396 
397 	if (dst)
398 		dst->ops->redirect(dst, sk, skb);
399 }
400 
401 
402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
403 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
404 {
405 	struct request_sock *req = inet_reqsk(sk);
406 	struct net *net = sock_net(sk);
407 
408 	/* ICMPs are not backlogged, hence we cannot get
409 	 * an established socket here.
410 	 */
411 	if (seq != tcp_rsk(req)->snt_isn) {
412 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
413 	} else if (abort) {
414 		/*
415 		 * Still in SYN_RECV, just remove it silently.
416 		 * There is no good way to pass the error to the newly
417 		 * created socket, and POSIX does not want network
418 		 * errors returned from accept().
419 		 */
420 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
421 		tcp_listendrop(req->rsk_listener);
422 	}
423 	reqsk_put(req);
424 }
425 EXPORT_SYMBOL(tcp_req_err);
426 
427 /* TCP-LD (RFC 6069) logic */
428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
429 {
430 	struct inet_connection_sock *icsk = inet_csk(sk);
431 	struct tcp_sock *tp = tcp_sk(sk);
432 	struct sk_buff *skb;
433 	s32 remaining;
434 	u32 delta_us;
435 
436 	if (sock_owned_by_user(sk))
437 		return;
438 
439 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
440 	    !icsk->icsk_backoff)
441 		return;
442 
443 	skb = tcp_rtx_queue_head(sk);
444 	if (WARN_ON_ONCE(!skb))
445 		return;
446 
447 	icsk->icsk_backoff--;
448 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
449 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
450 
451 	tcp_mstamp_refresh(tp);
452 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
453 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
454 
455 	if (remaining > 0) {
456 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
457 					  remaining, TCP_RTO_MAX);
458 	} else {
459 		/* RTO revert clocked out retransmission.
460 		 * Will retransmit now.
461 		 */
462 		tcp_retransmit_timer(sk);
463 	}
464 }
465 EXPORT_SYMBOL(tcp_ld_RTO_revert);
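/*
 * RFC 6069 ("TCP-LD") in the function above: a net/host unreachable ICMP
 * that matches the earliest unacknowledged sequence is taken as evidence
 * of a routing failure rather than congestion, so one step of exponential
 * backoff is undone, the RTO is recomputed from the current srtt, and the
 * retransmit timer is re-armed with whatever time remains; if that time
 * has already elapsed, the head of the retransmit queue is resent
 * immediately.
 */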
466 
467 /*
468  * This routine is called by the ICMP module when it gets some
469  * sort of error condition.  If err < 0 then the socket should
470  * be closed and the error returned to the user.  If err > 0
471  * it's just the icmp type << 8 | icmp code.  After adjustment
472  * header points to the first 8 bytes of the tcp header.  We need
473  * to find the appropriate port.
474  *
475  * The locking strategy used here is very "optimistic". When
476  * someone else accesses the socket the ICMP is just dropped
477  * and for some paths there is no check at all.
478  * A more general error queue to queue errors for later handling
479  * is probably better.
480  *
481  */
482 
483 int tcp_v4_err(struct sk_buff *skb, u32 info)
484 {
485 	const struct iphdr *iph = (const struct iphdr *)skb->data;
486 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
487 	struct tcp_sock *tp;
488 	const int type = icmp_hdr(skb)->type;
489 	const int code = icmp_hdr(skb)->code;
490 	struct sock *sk;
491 	struct request_sock *fastopen;
492 	u32 seq, snd_una;
493 	int err;
494 	struct net *net = dev_net(skb->dev);
495 
496 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
497 				       iph->daddr, th->dest, iph->saddr,
498 				       ntohs(th->source), inet_iif(skb), 0);
499 	if (!sk) {
500 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
501 		return -ENOENT;
502 	}
503 	if (sk->sk_state == TCP_TIME_WAIT) {
504 		/* To increase the counter of ignored icmps for TCP-AO */
505 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
506 		inet_twsk_put(inet_twsk(sk));
507 		return 0;
508 	}
509 	seq = ntohl(th->seq);
510 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
511 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
512 				     type == ICMP_TIME_EXCEEDED ||
513 				     (type == ICMP_DEST_UNREACH &&
514 				      (code == ICMP_NET_UNREACH ||
515 				       code == ICMP_HOST_UNREACH)));
516 		return 0;
517 	}
518 
519 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
520 		sock_put(sk);
521 		return 0;
522 	}
523 
524 	bh_lock_sock(sk);
525 	/* If too many ICMPs get dropped on busy
526 	 * servers this needs to be solved differently.
527 	 * We do take care of the PMTU discovery (RFC1191) special case:
528 	 * we can receive locally generated ICMP messages while socket is held.
529 	 */
530 	if (sock_owned_by_user(sk)) {
531 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
532 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
533 	}
534 	if (sk->sk_state == TCP_CLOSE)
535 		goto out;
536 
537 	if (static_branch_unlikely(&ip4_min_ttl)) {
538 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
539 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
540 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
541 			goto out;
542 		}
543 	}
544 
545 	tp = tcp_sk(sk);
546 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
547 	fastopen = rcu_dereference(tp->fastopen_rsk);
548 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
549 	if (sk->sk_state != TCP_LISTEN &&
550 	    !between(seq, snd_una, tp->snd_nxt)) {
551 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
552 		goto out;
553 	}
554 
555 	switch (type) {
556 	case ICMP_REDIRECT:
557 		if (!sock_owned_by_user(sk))
558 			do_redirect(skb, sk);
559 		goto out;
560 	case ICMP_SOURCE_QUENCH:
561 		/* Just silently ignore these. */
562 		goto out;
563 	case ICMP_PARAMETERPROB:
564 		err = EPROTO;
565 		break;
566 	case ICMP_DEST_UNREACH:
567 		if (code > NR_ICMP_UNREACH)
568 			goto out;
569 
570 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
571 			/* We are not interested in TCP_LISTEN and open_requests
572 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
573 			 * they should go through unfragmented).
574 			 */
575 			if (sk->sk_state == TCP_LISTEN)
576 				goto out;
577 
578 			WRITE_ONCE(tp->mtu_info, info);
579 			if (!sock_owned_by_user(sk)) {
580 				tcp_v4_mtu_reduced(sk);
581 			} else {
582 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
583 					sock_hold(sk);
584 			}
585 			goto out;
586 		}
587 
588 		err = icmp_err_convert[code].errno;
589 		/* check if this ICMP message allows revert of backoff.
590 		 * (see RFC 6069)
591 		 */
592 		if (!fastopen &&
593 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
594 			tcp_ld_RTO_revert(sk, seq);
595 		break;
596 	case ICMP_TIME_EXCEEDED:
597 		err = EHOSTUNREACH;
598 		break;
599 	default:
600 		goto out;
601 	}
602 
603 	switch (sk->sk_state) {
604 	case TCP_SYN_SENT:
605 	case TCP_SYN_RECV:
606 		/* Only in fast or simultaneous open. If a fast open socket is
607 		 * already accepted it is treated as a connected one below.
608 		 */
609 		if (fastopen && !fastopen->sk)
610 			break;
611 
612 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
613 
614 		if (!sock_owned_by_user(sk)) {
615 			WRITE_ONCE(sk->sk_err, err);
616 
617 			sk_error_report(sk);
618 
619 			tcp_done(sk);
620 		} else {
621 			WRITE_ONCE(sk->sk_err_soft, err);
622 		}
623 		goto out;
624 	}
625 
626 	/* If we've already connected we will keep trying
627 	 * until we time out, or the user gives up.
628 	 *
629 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
630 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
631 	 * but it is obsoleted by pmtu discovery).
632 	 *
633 	 * Note that in the modern internet, where routing is unreliable
634 	 * and broken firewalls sit in every dark corner, sending random
635 	 * errors ordered by their masters, even these two messages finally
636 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
637 	 *
638 	 * Now we are in compliance with RFCs.
639 	 *							--ANK (980905)
640 	 */
641 
642 	if (!sock_owned_by_user(sk) &&
643 	    inet_test_bit(RECVERR, sk)) {
644 		WRITE_ONCE(sk->sk_err, err);
645 		sk_error_report(sk);
646 	} else	{ /* Only an error on timeout */
647 		WRITE_ONCE(sk->sk_err_soft, err);
648 	}
649 
650 out:
651 	bh_unlock_sock(sk);
652 	sock_put(sk);
653 	return 0;
654 }
655 
656 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
657 {
658 	struct tcphdr *th = tcp_hdr(skb);
659 
660 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
661 	skb->csum_start = skb_transport_header(skb) - skb->head;
662 	skb->csum_offset = offsetof(struct tcphdr, check);
663 }
664 
665 /* This routine computes an IPv4 TCP checksum. */
666 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
667 {
668 	const struct inet_sock *inet = inet_sk(sk);
669 
670 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
671 }
672 EXPORT_SYMBOL(tcp_v4_send_check);
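/*
 * Both helpers above only fill in the pseudo-header part of the checksum
 * and record csum_start/csum_offset in the skb; the sum over the TCP
 * header and payload is completed later, typically by the NIC when
 * CHECKSUM_PARTIAL offload is available, or by the software fallback
 * otherwise.
 */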
673 
674 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
675 
676 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
677 				 const struct tcp_ao_hdr *aoh,
678 				 struct ip_reply_arg *arg, struct tcphdr *reply,
679 				 __be32 reply_options[REPLY_OPTIONS_LEN])
680 {
681 #ifdef CONFIG_TCP_AO
682 	int sdif = tcp_v4_sdif(skb);
683 	int dif = inet_iif(skb);
684 	int l3index = sdif ? dif : 0;
685 	bool allocated_traffic_key;
686 	struct tcp_ao_key *key;
687 	char *traffic_key;
688 	bool drop = true;
689 	u32 ao_sne = 0;
690 	u8 keyid;
691 
692 	rcu_read_lock();
693 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
694 				 &key, &traffic_key, &allocated_traffic_key,
695 				 &keyid, &ao_sne))
696 		goto out;
697 
698 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
699 				 (aoh->rnext_keyid << 8) | keyid);
700 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
701 	reply->doff = arg->iov[0].iov_len / 4;
702 
703 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
704 			    key, traffic_key,
705 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
706 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
707 			    reply, ao_sne))
708 		goto out;
709 	drop = false;
710 out:
711 	rcu_read_unlock();
712 	if (allocated_traffic_key)
713 		kfree(traffic_key);
714 	return drop;
715 #else
716 	return true;
717 #endif
718 }
719 
720 /*
721  *	This routine will send an RST to the other tcp.
722  *
723  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
724  *		      for the reset?
725  *	Answer: if a packet caused an RST, it is not for a socket
726  *		existing in our system; if it is matched to a socket,
727  *		it is just a duplicate segment or a bug in the other side's TCP.
728  *		So we build the reply based only on the parameters that
729  *		arrived with the segment.
730  *	Exception: precedence violation. We do not implement it in any case.
731  */
732 
733 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
734 			      enum sk_rst_reason reason)
735 {
736 	const struct tcphdr *th = tcp_hdr(skb);
737 	struct {
738 		struct tcphdr th;
739 		__be32 opt[REPLY_OPTIONS_LEN];
740 	} rep;
741 	const __u8 *md5_hash_location = NULL;
742 	const struct tcp_ao_hdr *aoh;
743 	struct ip_reply_arg arg;
744 #ifdef CONFIG_TCP_MD5SIG
745 	struct tcp_md5sig_key *key = NULL;
746 	unsigned char newhash[16];
747 	struct sock *sk1 = NULL;
748 	int genhash;
749 #endif
750 	u64 transmit_time = 0;
751 	struct sock *ctl_sk;
752 	struct net *net;
753 	u32 txhash = 0;
754 
755 	/* Never send a reset in response to a reset. */
756 	if (th->rst)
757 		return;
758 
759 	/* If sk is not NULL, it means we did a successful lookup and the
760 	 * incoming route had to be correct. prequeue might have dropped our dst.
761 	 */
762 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
763 		return;
764 
765 	/* Swap the send and the receive. */
766 	memset(&rep, 0, sizeof(rep));
767 	rep.th.dest   = th->source;
768 	rep.th.source = th->dest;
769 	rep.th.doff   = sizeof(struct tcphdr) / 4;
770 	rep.th.rst    = 1;
771 
772 	if (th->ack) {
773 		rep.th.seq = th->ack_seq;
774 	} else {
775 		rep.th.ack = 1;
776 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
777 				       skb->len - (th->doff << 2));
778 	}
779 
780 	memset(&arg, 0, sizeof(arg));
781 	arg.iov[0].iov_base = (unsigned char *)&rep;
782 	arg.iov[0].iov_len  = sizeof(rep.th);
783 
784 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
785 
786 	/* Invalid TCP option size or twice included auth */
787 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
788 		return;
789 
790 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
791 		return;
792 
793 #ifdef CONFIG_TCP_MD5SIG
794 	rcu_read_lock();
795 	if (sk && sk_fullsock(sk)) {
796 		const union tcp_md5_addr *addr;
797 		int l3index;
798 
799 		/* sdif set, means packet ingressed via a device
800 		 * in an L3 domain and inet_iif is set to it.
801 		 */
802 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
803 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
804 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
805 	} else if (md5_hash_location) {
806 		const union tcp_md5_addr *addr;
807 		int sdif = tcp_v4_sdif(skb);
808 		int dif = inet_iif(skb);
809 		int l3index;
810 
811 		/*
812 		 * The active side is lost. Try to find the listening socket through
813 		 * the source port, and then find the md5 key through that socket.
814 		 * We do not lose security here:
815 		 * the incoming packet is checked with the md5 hash of the found key,
816 		 * and no RST is generated if the md5 hash doesn't match.
817 		 */
818 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
819 					     NULL, 0, ip_hdr(skb)->saddr,
820 					     th->source, ip_hdr(skb)->daddr,
821 					     ntohs(th->source), dif, sdif);
822 		/* don't send rst if it can't find key */
823 		if (!sk1)
824 			goto out;
825 
826 		/* sdif set, means packet ingressed via a device
827 		 * in an L3 domain and dif is set to it.
828 		 */
829 		l3index = sdif ? dif : 0;
830 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
831 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
832 		if (!key)
833 			goto out;
834 
835 
836 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
837 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
838 			goto out;
839 
840 	}
841 
842 	if (key) {
843 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
844 				   (TCPOPT_NOP << 16) |
845 				   (TCPOPT_MD5SIG << 8) |
846 				   TCPOLEN_MD5SIG);
847 		/* Update length and the length the header thinks exists */
848 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849 		rep.th.doff = arg.iov[0].iov_len / 4;
850 
851 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
852 				     key, ip_hdr(skb)->saddr,
853 				     ip_hdr(skb)->daddr, &rep.th);
854 	}
855 #endif
856 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
857 	if (rep.opt[0] == 0) {
858 		__be32 mrst = mptcp_reset_option(skb);
859 
860 		if (mrst) {
861 			rep.opt[0] = mrst;
862 			arg.iov[0].iov_len += sizeof(mrst);
863 			rep.th.doff = arg.iov[0].iov_len / 4;
864 		}
865 	}
866 
867 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
868 				      ip_hdr(skb)->saddr, /* XXX */
869 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
870 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
871 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
872 
873 	/* When the socket is gone, all binding information is lost and
874 	 * routing might fail in this case. No choice here: if we choose to force
875 	 * the input interface, we will misroute in case of an asymmetric route.
876 	 */
877 	if (sk)
878 		arg.bound_dev_if = sk->sk_bound_dev_if;
879 
880 	trace_tcp_send_reset(sk, skb, reason);
881 
882 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
883 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
884 
885 	arg.tos = ip_hdr(skb)->tos;
886 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
887 	local_bh_disable();
888 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
889 	sock_net_set(ctl_sk, net);
890 	if (sk) {
891 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
892 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
893 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
894 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
895 		transmit_time = tcp_transmit_time(sk);
896 		xfrm_sk_clone_policy(ctl_sk, sk);
897 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
898 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
899 	} else {
900 		ctl_sk->sk_mark = 0;
901 		ctl_sk->sk_priority = 0;
902 	}
903 	ip_send_unicast_reply(ctl_sk,
904 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
905 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
906 			      &arg, arg.iov[0].iov_len,
907 			      transmit_time, txhash);
908 
909 	xfrm_sk_free_policy(ctl_sk);
910 	sock_net_set(ctl_sk, &init_net);
911 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
912 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
913 	local_bh_enable();
914 
915 #ifdef CONFIG_TCP_MD5SIG
916 out:
917 	rcu_read_unlock();
918 #endif
919 }
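/*
 * RST generation above follows RFC 793: if the offending segment carried
 * an ACK, the reset reuses that ack_seq as its own sequence number;
 * otherwise the reset acknowledges exactly the data, SYN and FIN of the
 * incoming segment.  The reply is built on the per-CPU ipv4_tcp_sk
 * control socket with ip_send_unicast_reply(), so no full socket is
 * needed, and an MD5 or AO option is appended when a matching key exists.
 */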
920 
921 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
922    outside socket context, is certainly ugly. What can I do?
923  */
924 
925 static void tcp_v4_send_ack(const struct sock *sk,
926 			    struct sk_buff *skb, u32 seq, u32 ack,
927 			    u32 win, u32 tsval, u32 tsecr, int oif,
928 			    struct tcp_key *key,
929 			    int reply_flags, u8 tos, u32 txhash)
930 {
931 	const struct tcphdr *th = tcp_hdr(skb);
932 	struct {
933 		struct tcphdr th;
934 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
935 	} rep;
936 	struct net *net = sock_net(sk);
937 	struct ip_reply_arg arg;
938 	struct sock *ctl_sk;
939 	u64 transmit_time;
940 
941 	memset(&rep.th, 0, sizeof(struct tcphdr));
942 	memset(&arg, 0, sizeof(arg));
943 
944 	arg.iov[0].iov_base = (unsigned char *)&rep;
945 	arg.iov[0].iov_len  = sizeof(rep.th);
946 	if (tsecr) {
947 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
948 				   (TCPOPT_TIMESTAMP << 8) |
949 				   TCPOLEN_TIMESTAMP);
950 		rep.opt[1] = htonl(tsval);
951 		rep.opt[2] = htonl(tsecr);
952 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
953 	}
954 
955 	/* Swap the send and the receive. */
956 	rep.th.dest    = th->source;
957 	rep.th.source  = th->dest;
958 	rep.th.doff    = arg.iov[0].iov_len / 4;
959 	rep.th.seq     = htonl(seq);
960 	rep.th.ack_seq = htonl(ack);
961 	rep.th.ack     = 1;
962 	rep.th.window  = htons(win);
963 
964 #ifdef CONFIG_TCP_MD5SIG
965 	if (tcp_key_is_md5(key)) {
966 		int offset = (tsecr) ? 3 : 0;
967 
968 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
969 					  (TCPOPT_NOP << 16) |
970 					  (TCPOPT_MD5SIG << 8) |
971 					  TCPOLEN_MD5SIG);
972 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
973 		rep.th.doff = arg.iov[0].iov_len/4;
974 
975 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
976 				    key->md5_key, ip_hdr(skb)->saddr,
977 				    ip_hdr(skb)->daddr, &rep.th);
978 	}
979 #endif
980 #ifdef CONFIG_TCP_AO
981 	if (tcp_key_is_ao(key)) {
982 		int offset = (tsecr) ? 3 : 0;
983 
984 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
985 					  (tcp_ao_len(key->ao_key) << 16) |
986 					  (key->ao_key->sndid << 8) |
987 					  key->rcv_next);
988 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
989 		rep.th.doff = arg.iov[0].iov_len / 4;
990 
991 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
992 				key->ao_key, key->traffic_key,
993 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
994 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
995 				&rep.th, key->sne);
996 	}
997 #endif
998 	arg.flags = reply_flags;
999 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1000 				      ip_hdr(skb)->saddr, /* XXX */
1001 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1002 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1003 	if (oif)
1004 		arg.bound_dev_if = oif;
1005 	arg.tos = tos;
1006 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1007 	local_bh_disable();
1008 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
1009 	sock_net_set(ctl_sk, net);
1010 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1011 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1012 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1013 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1014 	transmit_time = tcp_transmit_time(sk);
1015 	ip_send_unicast_reply(ctl_sk,
1016 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1017 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1018 			      &arg, arg.iov[0].iov_len,
1019 			      transmit_time, txhash);
1020 
1021 	sock_net_set(ctl_sk, &init_net);
1022 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1023 	local_bh_enable();
1024 }
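/*
 * Option layout in the ACK built above: when tsecr is non-zero, opt[0..2]
 * carry two NOPs plus the TIMESTAMP option, and any MD5 or AO option is
 * appended at offset 3; without timestamps the signature option starts at
 * offset 0.  rep.th.doff is updated each time so the header length always
 * covers the options actually emitted.
 */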
1025 
1026 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1027 {
1028 	struct inet_timewait_sock *tw = inet_twsk(sk);
1029 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1030 	struct tcp_key key = {};
1031 #ifdef CONFIG_TCP_AO
1032 	struct tcp_ao_info *ao_info;
1033 
1034 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1035 		/* FIXME: the segment to-be-acked is not verified yet */
1036 		ao_info = rcu_dereference(tcptw->ao_info);
1037 		if (ao_info) {
1038 			const struct tcp_ao_hdr *aoh;
1039 
1040 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1041 				inet_twsk_put(tw);
1042 				return;
1043 			}
1044 
1045 			if (aoh)
1046 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1047 		}
1048 	}
1049 	if (key.ao_key) {
1050 		struct tcp_ao_key *rnext_key;
1051 
1052 		key.traffic_key = snd_other_key(key.ao_key);
1053 		key.sne = READ_ONCE(ao_info->snd_sne);
1054 		rnext_key = READ_ONCE(ao_info->rnext_key);
1055 		key.rcv_next = rnext_key->rcvid;
1056 		key.type = TCP_KEY_AO;
1057 #else
1058 	if (0) {
1059 #endif
1060 #ifdef CONFIG_TCP_MD5SIG
1061 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1062 		key.md5_key = tcp_twsk_md5_key(tcptw);
1063 		if (key.md5_key)
1064 			key.type = TCP_KEY_MD5;
1065 #endif
1066 	}
1067 
1068 	tcp_v4_send_ack(sk, skb,
1069 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1070 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1071 			tcp_tw_tsval(tcptw),
1072 			tcptw->tw_ts_recent,
1073 			tw->tw_bound_dev_if, &key,
1074 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1075 			tw->tw_tos,
1076 			tw->tw_txhash);
1077 
1078 	inet_twsk_put(tw);
1079 }
1080 
1081 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1082 				  struct request_sock *req)
1083 {
1084 	struct tcp_key key = {};
1085 
1086 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1087 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1088 	 */
1089 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1090 					     tcp_sk(sk)->snd_nxt;
1091 
1092 #ifdef CONFIG_TCP_AO
1093 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1094 	    tcp_rsk_used_ao(req)) {
1095 		const union tcp_md5_addr *addr;
1096 		const struct tcp_ao_hdr *aoh;
1097 		int l3index;
1098 
1099 		/* Invalid TCP option size or twice included auth */
1100 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1101 			return;
1102 		if (!aoh)
1103 			return;
1104 
1105 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1106 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1107 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1108 					      aoh->rnext_keyid, -1);
1109 		if (unlikely(!key.ao_key)) {
1110 			/* Send ACK with any matching MKT for the peer */
1111 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1112 			/* Matching key disappeared (user removed the key?)
1113 			 * let the handshake timeout.
1114 			 * let the handshake time out.
1115 			if (!key.ao_key) {
1116 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1117 						     addr,
1118 						     ntohs(tcp_hdr(skb)->source),
1119 						     &ip_hdr(skb)->daddr,
1120 						     ntohs(tcp_hdr(skb)->dest));
1121 				return;
1122 			}
1123 		}
1124 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1125 		if (!key.traffic_key)
1126 			return;
1127 
1128 		key.type = TCP_KEY_AO;
1129 		key.rcv_next = aoh->keyid;
1130 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1131 #else
1132 	if (0) {
1133 #endif
1134 #ifdef CONFIG_TCP_MD5SIG
1135 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1136 		const union tcp_md5_addr *addr;
1137 		int l3index;
1138 
1139 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1140 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1141 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 		if (key.md5_key)
1143 			key.type = TCP_KEY_MD5;
1144 #endif
1145 	}
1146 
1147 	tcp_v4_send_ack(sk, skb, seq,
1148 			tcp_rsk(req)->rcv_nxt,
1149 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1150 			tcp_rsk_tsval(tcp_rsk(req)),
1151 			READ_ONCE(req->ts_recent),
1152 			0, &key,
1153 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1154 			ip_hdr(skb)->tos,
1155 			READ_ONCE(tcp_rsk(req)->txhash));
1156 	if (tcp_key_is_ao(&key))
1157 		kfree(key.traffic_key);
1158 }
1159 
1160 /*
1161  *	Send a SYN-ACK after having received a SYN.
1162  *	This still operates on a request_sock only, not on a big
1163  *	socket.
1164  */
1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1166 			      struct flowi *fl,
1167 			      struct request_sock *req,
1168 			      struct tcp_fastopen_cookie *foc,
1169 			      enum tcp_synack_type synack_type,
1170 			      struct sk_buff *syn_skb)
1171 {
1172 	const struct inet_request_sock *ireq = inet_rsk(req);
1173 	struct flowi4 fl4;
1174 	int err = -1;
1175 	struct sk_buff *skb;
1176 	u8 tos;
1177 
1178 	/* First, grab a route. */
1179 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1180 		return -1;
1181 
1182 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1183 
1184 	if (skb) {
1185 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1186 
1187 		tos = READ_ONCE(inet_sk(sk)->tos);
1188 
1189 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1190 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1191 			      (tos & INET_ECN_MASK);
1192 
1193 		if (!INET_ECN_is_capable(tos) &&
1194 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1195 			tos |= INET_ECN_ECT_0;
1196 
1197 		rcu_read_lock();
1198 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1199 					    ireq->ir_rmt_addr,
1200 					    rcu_dereference(ireq->ireq_opt),
1201 					    tos);
1202 		rcu_read_unlock();
1203 		err = net_xmit_eval(err);
1204 	}
1205 
1206 	return err;
1207 }
1208 
1209 /*
1210  *	IPv4 request_sock destructor.
1211  */
1212 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1213 {
1214 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1215 }
1216 
1217 #ifdef CONFIG_TCP_MD5SIG
1218 /*
1219  * RFC2385 MD5 checksumming requires a mapping of
1220  * IP address->MD5 Key.
1221  * We need to maintain these in the sk structure.
1222  */
1223 
1224 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1225 EXPORT_SYMBOL(tcp_md5_needed);
1226 
1227 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1228 {
1229 	if (!old)
1230 		return true;
1231 
1232 	/* l3index always overrides non-l3index */
1233 	if (old->l3index && new->l3index == 0)
1234 		return false;
1235 	if (old->l3index == 0 && new->l3index)
1236 		return true;
1237 
1238 	return old->prefixlen < new->prefixlen;
1239 }
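/*
 * Key selection above: a key bound to an L3 master device (non-zero
 * l3index) always wins over an unbound one; among keys with the same
 * binding, the one with the longer prefix (more specific address match)
 * is preferred.
 */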
1240 
1241 /* Find the Key structure for an address.  */
1242 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1243 					   const union tcp_md5_addr *addr,
1244 					   int family, bool any_l3index)
1245 {
1246 	const struct tcp_sock *tp = tcp_sk(sk);
1247 	struct tcp_md5sig_key *key;
1248 	const struct tcp_md5sig_info *md5sig;
1249 	__be32 mask;
1250 	struct tcp_md5sig_key *best_match = NULL;
1251 	bool match;
1252 
1253 	/* caller either holds rcu_read_lock() or socket lock */
1254 	md5sig = rcu_dereference_check(tp->md5sig_info,
1255 				       lockdep_sock_is_held(sk));
1256 	if (!md5sig)
1257 		return NULL;
1258 
1259 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1260 				 lockdep_sock_is_held(sk)) {
1261 		if (key->family != family)
1262 			continue;
1263 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1264 		    key->l3index != l3index)
1265 			continue;
1266 		if (family == AF_INET) {
1267 			mask = inet_make_mask(key->prefixlen);
1268 			match = (key->addr.a4.s_addr & mask) ==
1269 				(addr->a4.s_addr & mask);
1270 #if IS_ENABLED(CONFIG_IPV6)
1271 		} else if (family == AF_INET6) {
1272 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1273 						  key->prefixlen);
1274 #endif
1275 		} else {
1276 			match = false;
1277 		}
1278 
1279 		if (match && better_md5_match(best_match, key))
1280 			best_match = key;
1281 	}
1282 	return best_match;
1283 }
1284 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1285 
1286 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1287 						      const union tcp_md5_addr *addr,
1288 						      int family, u8 prefixlen,
1289 						      int l3index, u8 flags)
1290 {
1291 	const struct tcp_sock *tp = tcp_sk(sk);
1292 	struct tcp_md5sig_key *key;
1293 	unsigned int size = sizeof(struct in_addr);
1294 	const struct tcp_md5sig_info *md5sig;
1295 
1296 	/* caller either holds rcu_read_lock() or socket lock */
1297 	md5sig = rcu_dereference_check(tp->md5sig_info,
1298 				       lockdep_sock_is_held(sk));
1299 	if (!md5sig)
1300 		return NULL;
1301 #if IS_ENABLED(CONFIG_IPV6)
1302 	if (family == AF_INET6)
1303 		size = sizeof(struct in6_addr);
1304 #endif
1305 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1306 				 lockdep_sock_is_held(sk)) {
1307 		if (key->family != family)
1308 			continue;
1309 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1310 			continue;
1311 		if (key->l3index != l3index)
1312 			continue;
1313 		if (!memcmp(&key->addr, addr, size) &&
1314 		    key->prefixlen == prefixlen)
1315 			return key;
1316 	}
1317 	return NULL;
1318 }
1319 
1320 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1321 					 const struct sock *addr_sk)
1322 {
1323 	const union tcp_md5_addr *addr;
1324 	int l3index;
1325 
1326 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1327 						 addr_sk->sk_bound_dev_if);
1328 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1329 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1330 }
1331 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1332 
1333 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1334 {
1335 	struct tcp_sock *tp = tcp_sk(sk);
1336 	struct tcp_md5sig_info *md5sig;
1337 
1338 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1339 	if (!md5sig)
1340 		return -ENOMEM;
1341 
1342 	sk_gso_disable(sk);
1343 	INIT_HLIST_HEAD(&md5sig->head);
1344 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1345 	return 0;
1346 }
1347 
1348 /* This can be called on a newly created socket, from other files */
1349 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1350 			    int family, u8 prefixlen, int l3index, u8 flags,
1351 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1352 {
1353 	/* Add Key to the list */
1354 	struct tcp_md5sig_key *key;
1355 	struct tcp_sock *tp = tcp_sk(sk);
1356 	struct tcp_md5sig_info *md5sig;
1357 
1358 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1359 	if (key) {
1360 		/* Pre-existing entry - just update that one.
1361 		 * Note that the key might be used concurrently.
1362 		 * data_race() is telling kcsan that we do not care about
1363 		 * key mismatches, since changing MD5 key on live flows
1364 		 * can lead to packet drops.
1365 		 */
1366 		data_race(memcpy(key->key, newkey, newkeylen));
1367 
1368 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1369 		 * Also note that a reader could catch new key->keylen value
1370 		 * but old key->key[], this is the reason we use __GFP_ZERO
1371 		 * at sock_kmalloc() time below these lines.
1372 		 */
1373 		WRITE_ONCE(key->keylen, newkeylen);
1374 
1375 		return 0;
1376 	}
1377 
1378 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1379 					   lockdep_sock_is_held(sk));
1380 
1381 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1382 	if (!key)
1383 		return -ENOMEM;
1384 
1385 	memcpy(key->key, newkey, newkeylen);
1386 	key->keylen = newkeylen;
1387 	key->family = family;
1388 	key->prefixlen = prefixlen;
1389 	key->l3index = l3index;
1390 	key->flags = flags;
1391 	memcpy(&key->addr, addr,
1392 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1393 								 sizeof(struct in_addr));
1394 	hlist_add_head_rcu(&key->node, &md5sig->head);
1395 	return 0;
1396 }
1397 
1398 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1399 		   int family, u8 prefixlen, int l3index, u8 flags,
1400 		   const u8 *newkey, u8 newkeylen)
1401 {
1402 	struct tcp_sock *tp = tcp_sk(sk);
1403 
1404 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1405 		if (tcp_md5_alloc_sigpool())
1406 			return -ENOMEM;
1407 
1408 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1409 			tcp_md5_release_sigpool();
1410 			return -ENOMEM;
1411 		}
1412 
1413 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1414 			struct tcp_md5sig_info *md5sig;
1415 
1416 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1417 			rcu_assign_pointer(tp->md5sig_info, NULL);
1418 			kfree_rcu(md5sig, rcu);
1419 			tcp_md5_release_sigpool();
1420 			return -EUSERS;
1421 		}
1422 	}
1423 
1424 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1425 				newkey, newkeylen, GFP_KERNEL);
1426 }
1427 EXPORT_SYMBOL(tcp_md5_do_add);
1428 
1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1430 		     int family, u8 prefixlen, int l3index,
1431 		     struct tcp_md5sig_key *key)
1432 {
1433 	struct tcp_sock *tp = tcp_sk(sk);
1434 
1435 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1436 		tcp_md5_add_sigpool();
1437 
1438 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1439 			tcp_md5_release_sigpool();
1440 			return -ENOMEM;
1441 		}
1442 
1443 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1444 			struct tcp_md5sig_info *md5sig;
1445 
1446 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1447 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1448 			rcu_assign_pointer(tp->md5sig_info, NULL);
1449 			kfree_rcu(md5sig, rcu);
1450 			tcp_md5_release_sigpool();
1451 			return -EUSERS;
1452 		}
1453 	}
1454 
1455 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1456 				key->flags, key->key, key->keylen,
1457 				sk_gfp_mask(sk, GFP_ATOMIC));
1458 }
1459 EXPORT_SYMBOL(tcp_md5_key_copy);
1460 
1461 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1462 		   u8 prefixlen, int l3index, u8 flags)
1463 {
1464 	struct tcp_md5sig_key *key;
1465 
1466 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1467 	if (!key)
1468 		return -ENOENT;
1469 	hlist_del_rcu(&key->node);
1470 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1471 	kfree_rcu(key, rcu);
1472 	return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_md5_do_del);
1475 
1476 void tcp_clear_md5_list(struct sock *sk)
1477 {
1478 	struct tcp_sock *tp = tcp_sk(sk);
1479 	struct tcp_md5sig_key *key;
1480 	struct hlist_node *n;
1481 	struct tcp_md5sig_info *md5sig;
1482 
1483 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1484 
1485 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1486 		hlist_del_rcu(&key->node);
1487 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1488 		kfree_rcu(key, rcu);
1489 	}
1490 }
1491 
1492 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1493 				 sockptr_t optval, int optlen)
1494 {
1495 	struct tcp_md5sig cmd;
1496 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1497 	const union tcp_md5_addr *addr;
1498 	u8 prefixlen = 32;
1499 	int l3index = 0;
1500 	bool l3flag;
1501 	u8 flags;
1502 
1503 	if (optlen < sizeof(cmd))
1504 		return -EINVAL;
1505 
1506 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1507 		return -EFAULT;
1508 
1509 	if (sin->sin_family != AF_INET)
1510 		return -EINVAL;
1511 
1512 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1513 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1514 
1515 	if (optname == TCP_MD5SIG_EXT &&
1516 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1517 		prefixlen = cmd.tcpm_prefixlen;
1518 		if (prefixlen > 32)
1519 			return -EINVAL;
1520 	}
1521 
1522 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1523 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1524 		struct net_device *dev;
1525 
1526 		rcu_read_lock();
1527 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1528 		if (dev && netif_is_l3_master(dev))
1529 			l3index = dev->ifindex;
1530 
1531 		rcu_read_unlock();
1532 
1533 		/* ok to reference set/not set outside of rcu;
1534 		 * right now device MUST be an L3 master
1535 		 */
1536 		if (!dev || !l3index)
1537 			return -EINVAL;
1538 	}
1539 
1540 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1541 
1542 	if (!cmd.tcpm_keylen)
1543 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1544 
1545 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1546 		return -EINVAL;
1547 
1548 	/* Don't allow keys for peers that have a matching TCP-AO key.
1549 	 * See the comment in tcp_ao_add_cmd()
1550 	 */
1551 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1552 		return -EKEYREJECTED;
1553 
1554 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1555 			      cmd.tcpm_key, cmd.tcpm_keylen);
1556 }
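/*
 * Illustrative user-space sketch of the setsockopt() path handled above,
 * on an already created TCP socket fd (error handling trimmed); passing
 * tcpm_keylen == 0 deletes the key instead:
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */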
1557 
1558 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1559 				   __be32 daddr, __be32 saddr,
1560 				   const struct tcphdr *th, int nbytes)
1561 {
1562 	struct tcp4_pseudohdr *bp;
1563 	struct scatterlist sg;
1564 	struct tcphdr *_th;
1565 
1566 	bp = hp->scratch;
1567 	bp->saddr = saddr;
1568 	bp->daddr = daddr;
1569 	bp->pad = 0;
1570 	bp->protocol = IPPROTO_TCP;
1571 	bp->len = cpu_to_be16(nbytes);
1572 
1573 	_th = (struct tcphdr *)(bp + 1);
1574 	memcpy(_th, th, sizeof(*th));
1575 	_th->check = 0;
1576 
1577 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1578 	ahash_request_set_crypt(hp->req, &sg, NULL,
1579 				sizeof(*bp) + sizeof(*th));
1580 	return crypto_ahash_update(hp->req);
1581 }
1582 
1583 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1584 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1585 {
1586 	struct tcp_sigpool hp;
1587 
1588 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1589 		goto clear_hash_nostart;
1590 
1591 	if (crypto_ahash_init(hp.req))
1592 		goto clear_hash;
1593 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1594 		goto clear_hash;
1595 	if (tcp_md5_hash_key(&hp, key))
1596 		goto clear_hash;
1597 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1598 	if (crypto_ahash_final(hp.req))
1599 		goto clear_hash;
1600 
1601 	tcp_sigpool_end(&hp);
1602 	return 0;
1603 
1604 clear_hash:
1605 	tcp_sigpool_end(&hp);
1606 clear_hash_nostart:
1607 	memset(md5_hash, 0, 16);
1608 	return 1;
1609 }
1610 
1611 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1612 			const struct sock *sk,
1613 			const struct sk_buff *skb)
1614 {
1615 	const struct tcphdr *th = tcp_hdr(skb);
1616 	struct tcp_sigpool hp;
1617 	__be32 saddr, daddr;
1618 
1619 	if (sk) { /* valid for establish/request sockets */
1620 		saddr = sk->sk_rcv_saddr;
1621 		daddr = sk->sk_daddr;
1622 	} else {
1623 		const struct iphdr *iph = ip_hdr(skb);
1624 		saddr = iph->saddr;
1625 		daddr = iph->daddr;
1626 	}
1627 
1628 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1629 		goto clear_hash_nostart;
1630 
1631 	if (crypto_ahash_init(hp.req))
1632 		goto clear_hash;
1633 
1634 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1635 		goto clear_hash;
1636 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1637 		goto clear_hash;
1638 	if (tcp_md5_hash_key(&hp, key))
1639 		goto clear_hash;
1640 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1641 	if (crypto_ahash_final(hp.req))
1642 		goto clear_hash;
1643 
1644 	tcp_sigpool_end(&hp);
1645 	return 0;
1646 
1647 clear_hash:
1648 	tcp_sigpool_end(&hp);
1649 clear_hash_nostart:
1650 	memset(md5_hash, 0, 16);
1651 	return 1;
1652 }
1653 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
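/*
 * Per RFC 2385, the digests computed above cover, in order: the IPv4
 * pseudo-header (addresses, zero pad, protocol, TCP length), the TCP
 * header with its checksum field zeroed, the segment payload (for the
 * skb variant), and finally the key itself.
 */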
1654 
1655 #endif
1656 
1657 static void tcp_v4_init_req(struct request_sock *req,
1658 			    const struct sock *sk_listener,
1659 			    struct sk_buff *skb)
1660 {
1661 	struct inet_request_sock *ireq = inet_rsk(req);
1662 	struct net *net = sock_net(sk_listener);
1663 
1664 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1665 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1666 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1667 }
1668 
1669 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1670 					  struct sk_buff *skb,
1671 					  struct flowi *fl,
1672 					  struct request_sock *req,
1673 					  u32 tw_isn)
1674 {
1675 	tcp_v4_init_req(req, sk, skb);
1676 
1677 	if (security_inet_conn_request(sk, skb, req))
1678 		return NULL;
1679 
1680 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1681 }
1682 
1683 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1684 	.family		=	PF_INET,
1685 	.obj_size	=	sizeof(struct tcp_request_sock),
1686 	.rtx_syn_ack	=	tcp_rtx_synack,
1687 	.send_ack	=	tcp_v4_reqsk_send_ack,
1688 	.destructor	=	tcp_v4_reqsk_destructor,
1689 	.send_reset	=	tcp_v4_send_reset,
1690 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1691 };
1692 
1693 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1694 	.mss_clamp	=	TCP_MSS_DEFAULT,
1695 #ifdef CONFIG_TCP_MD5SIG
1696 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1697 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1698 #endif
1699 #ifdef CONFIG_TCP_AO
1700 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1701 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1702 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1703 #endif
1704 #ifdef CONFIG_SYN_COOKIES
1705 	.cookie_init_seq =	cookie_v4_init_sequence,
1706 #endif
1707 	.route_req	=	tcp_v4_route_req,
1708 	.init_seq	=	tcp_v4_init_seq,
1709 	.init_ts_off	=	tcp_v4_init_ts_off,
1710 	.send_synack	=	tcp_v4_send_synack,
1711 };
1712 
1713 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1714 {
1715 	/* Never answer SYNs sent to broadcast or multicast */
1716 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1717 		goto drop;
1718 
1719 	return tcp_conn_request(&tcp_request_sock_ops,
1720 				&tcp_request_sock_ipv4_ops, sk, skb);
1721 
1722 drop:
1723 	tcp_listendrop(sk);
1724 	return 0;
1725 }
1726 EXPORT_SYMBOL(tcp_v4_conn_request);
1727 
1728 
1729 /*
1730  * The three way handshake has completed - we got a valid synack -
1731  * now create the new socket.
1732  */
1733 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1734 				  struct request_sock *req,
1735 				  struct dst_entry *dst,
1736 				  struct request_sock *req_unhash,
1737 				  bool *own_req)
1738 {
1739 	struct inet_request_sock *ireq;
1740 	bool found_dup_sk = false;
1741 	struct inet_sock *newinet;
1742 	struct tcp_sock *newtp;
1743 	struct sock *newsk;
1744 #ifdef CONFIG_TCP_MD5SIG
1745 	const union tcp_md5_addr *addr;
1746 	struct tcp_md5sig_key *key;
1747 	int l3index;
1748 #endif
1749 	struct ip_options_rcu *inet_opt;
1750 
1751 	if (sk_acceptq_is_full(sk))
1752 		goto exit_overflow;
1753 
1754 	newsk = tcp_create_openreq_child(sk, req, skb);
1755 	if (!newsk)
1756 		goto exit_nonewsk;
1757 
1758 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1759 	inet_sk_rx_dst_set(newsk, skb);
1760 
1761 	newtp		      = tcp_sk(newsk);
1762 	newinet		      = inet_sk(newsk);
1763 	ireq		      = inet_rsk(req);
1764 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1765 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1766 	newsk->sk_bound_dev_if = ireq->ir_iif;
1767 	newinet->inet_saddr   = ireq->ir_loc_addr;
1768 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1769 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1770 	newinet->mc_index     = inet_iif(skb);
1771 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1772 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1773 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1774 	if (inet_opt)
1775 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1776 	atomic_set(&newinet->inet_id, get_random_u16());
1777 
1778 	/* Set ToS of the new socket based on the value of the incoming SYN.
1779 	 * ECT bits are set later in tcp_init_transfer().
1780 	 */
1781 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1782 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1783 
1784 	if (!dst) {
1785 		dst = inet_csk_route_child_sock(sk, newsk, req);
1786 		if (!dst)
1787 			goto put_and_exit;
1788 	} else {
1789 		/* syncookie case : see end of cookie_v4_check() */
1790 	}
1791 	sk_setup_caps(newsk, dst);
1792 
1793 	tcp_ca_openreq_child(newsk, dst);
1794 
1795 	tcp_sync_mss(newsk, dst_mtu(dst));
1796 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1797 
1798 	tcp_initialize_rcv_mss(newsk);
1799 
1800 #ifdef CONFIG_TCP_MD5SIG
1801 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1802 	/* Copy over the MD5 key from the original socket */
1803 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1804 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1805 	if (key && !tcp_rsk_used_ao(req)) {
1806 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1807 			goto put_and_exit;
1808 		sk_gso_disable(newsk);
1809 	}
1810 #endif
1811 #ifdef CONFIG_TCP_AO
1812 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1813 		goto put_and_exit; /* OOM, release back memory */
1814 #endif
1815 
1816 	if (__inet_inherit_port(sk, newsk) < 0)
1817 		goto put_and_exit;
1818 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1819 				       &found_dup_sk);
1820 	if (likely(*own_req)) {
1821 		tcp_move_syn(newtp, req);
1822 		ireq->ireq_opt = NULL;
1823 	} else {
1824 		newinet->inet_opt = NULL;
1825 
1826 		if (!req_unhash && found_dup_sk) {
1827 			/* This code path should only be executed in the
1828 			 * syncookie case
1829 			 */
1830 			bh_unlock_sock(newsk);
1831 			sock_put(newsk);
1832 			newsk = NULL;
1833 		}
1834 	}
1835 	return newsk;
1836 
1837 exit_overflow:
1838 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1839 exit_nonewsk:
1840 	dst_release(dst);
1841 exit:
1842 	tcp_listendrop(sk);
1843 	return NULL;
1844 put_and_exit:
1845 	newinet->inet_opt = NULL;
1846 	inet_csk_prepare_forced_close(newsk);
1847 	tcp_done(newsk);
1848 	goto exit;
1849 }
1850 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1851 
1852 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1853 {
1854 #ifdef CONFIG_SYN_COOKIES
1855 	const struct tcphdr *th = tcp_hdr(skb);
1856 
1857 	if (!th->syn)
1858 		sk = cookie_v4_check(sk, skb);
1859 #endif
1860 	return sk;
1861 }
1862 
1863 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1864 			 struct tcphdr *th, u32 *cookie)
1865 {
1866 	u16 mss = 0;
1867 #ifdef CONFIG_SYN_COOKIES
1868 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1869 				    &tcp_request_sock_ipv4_ops, sk, th);
1870 	if (mss) {
1871 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1872 		tcp_synq_overflow(sk);
1873 	}
1874 #endif
1875 	return mss;
1876 }
1877 
1878 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1879 							   u32));
1880 /* The socket must have its spinlock held when we get
1881  * here, unless it is a TCP_LISTEN socket.
1882  *
1883  * We have a potential double-lock case here, so even when
1884  * doing backlog processing we use the BH locking scheme.
1885  * This is because we cannot sleep with the original spinlock
1886  * held.
1887  */
1888 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1889 {
1890 	enum skb_drop_reason reason;
1891 	struct sock *rsk;
1892 
1893 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1894 		struct dst_entry *dst;
1895 
1896 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1897 						lockdep_sock_is_held(sk));
1898 
1899 		sock_rps_save_rxhash(sk, skb);
1900 		sk_mark_napi_id(sk, skb);
1901 		if (dst) {
1902 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1903 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1904 					     dst, 0)) {
1905 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1906 				dst_release(dst);
1907 			}
1908 		}
1909 		tcp_rcv_established(sk, skb);
1910 		return 0;
1911 	}
1912 
1913 	if (tcp_checksum_complete(skb))
1914 		goto csum_err;
1915 
1916 	if (sk->sk_state == TCP_LISTEN) {
1917 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1918 
1919 		if (!nsk)
1920 			return 0;
1921 		if (nsk != sk) {
1922 			reason = tcp_child_process(sk, nsk, skb);
1923 			if (reason) {
1924 				rsk = nsk;
1925 				goto reset;
1926 			}
1927 			return 0;
1928 		}
1929 	} else
1930 		sock_rps_save_rxhash(sk, skb);
1931 
1932 	reason = tcp_rcv_state_process(sk, skb);
1933 	if (reason) {
1934 		rsk = sk;
1935 		goto reset;
1936 	}
1937 	return 0;
1938 
1939 reset:
1940 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1941 discard:
1942 	kfree_skb_reason(skb, reason);
1943 	/* Be careful here. If this function gets more complicated and
1944 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1945 	 * might be destroyed here. This current version compiles correctly,
1946 	 * but you have been warned.
1947 	 */
1948 	return 0;
1949 
1950 csum_err:
1951 	reason = SKB_DROP_REASON_TCP_CSUM;
1952 	trace_tcp_bad_csum(skb);
1953 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1954 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1955 	goto discard;
1956 }
1957 EXPORT_SYMBOL(tcp_v4_do_rcv);
1958 
1959 int tcp_v4_early_demux(struct sk_buff *skb)
1960 {
1961 	struct net *net = dev_net(skb->dev);
1962 	const struct iphdr *iph;
1963 	const struct tcphdr *th;
1964 	struct sock *sk;
1965 
1966 	if (skb->pkt_type != PACKET_HOST)
1967 		return 0;
1968 
1969 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1970 		return 0;
1971 
1972 	iph = ip_hdr(skb);
1973 	th = tcp_hdr(skb);
1974 
1975 	if (th->doff < sizeof(struct tcphdr) / 4)
1976 		return 0;
1977 
1978 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1979 				       iph->saddr, th->source,
1980 				       iph->daddr, ntohs(th->dest),
1981 				       skb->skb_iif, inet_sdif(skb));
1982 	if (sk) {
1983 		skb->sk = sk;
1984 		skb->destructor = sock_edemux;
1985 		if (sk_fullsock(sk)) {
1986 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1987 
1988 			if (dst)
1989 				dst = dst_check(dst, 0);
1990 			if (dst &&
1991 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1992 				skb_dst_set_noref(skb, dst);
1993 		}
1994 	}
1995 	return 0;
1996 }
1997 
1998 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1999 		     enum skb_drop_reason *reason)
2000 {
2001 	u32 tail_gso_size, tail_gso_segs;
2002 	struct skb_shared_info *shinfo;
2003 	const struct tcphdr *th;
2004 	struct tcphdr *thtail;
2005 	struct sk_buff *tail;
2006 	unsigned int hdrlen;
2007 	bool fragstolen;
2008 	u32 gso_segs;
2009 	u32 gso_size;
2010 	u64 limit;
2011 	int delta;
2012 
2013 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2014 	 * we can fix skb->truesize to its real value to avoid future drops.
2015 	 * This is valid because skb is not yet charged to the socket.
2016 	 * It has been noticed that pure SACK packets were sometimes dropped
2017 	 * (if cooked by drivers without the copybreak feature).
2018 	 */
2019 	skb_condense(skb);
2020 
2021 	skb_dst_drop(skb);
2022 
2023 	if (unlikely(tcp_checksum_complete(skb))) {
2024 		bh_unlock_sock(sk);
2025 		trace_tcp_bad_csum(skb);
2026 		*reason = SKB_DROP_REASON_TCP_CSUM;
2027 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2028 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2029 		return true;
2030 	}
2031 
2032 	/* Attempt coalescing to last skb in backlog, even if we are
2033 	 * above the limits.
2034 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2035 	 */
2036 	th = (const struct tcphdr *)skb->data;
2037 	hdrlen = th->doff * 4;
2038 
2039 	tail = sk->sk_backlog.tail;
2040 	if (!tail)
2041 		goto no_coalesce;
2042 	thtail = (struct tcphdr *)tail->data;
2043 
2044 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2045 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2046 	    ((TCP_SKB_CB(tail)->tcp_flags |
2047 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2048 	    !((TCP_SKB_CB(tail)->tcp_flags &
2049 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2050 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2051 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2052 	    !mptcp_skb_can_collapse(tail, skb) ||
2053 	    skb_cmp_decrypted(tail, skb) ||
2054 	    thtail->doff != th->doff ||
2055 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2056 		goto no_coalesce;
2057 
2058 	__skb_pull(skb, hdrlen);
2059 
2060 	shinfo = skb_shinfo(skb);
2061 	gso_size = shinfo->gso_size ?: skb->len;
2062 	gso_segs = shinfo->gso_segs ?: 1;
2063 
2064 	shinfo = skb_shinfo(tail);
2065 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2066 	tail_gso_segs = shinfo->gso_segs ?: 1;
2067 
2068 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2069 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2070 
2071 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2072 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2073 			thtail->window = th->window;
2074 		}
2075 
2076 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2077 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2078 		 * is not entered if we append a packet with a FIN.
2079 		 * SYN, RST, URG are not present.
2080 		 * ACK is set on both packets.
2081 		 * PSH : the TCP stack does not really care,
2082 		 *       at least for 'GRO' packets.
2083 		 */
2084 		thtail->fin |= th->fin;
2085 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2086 
2087 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2088 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2089 			tail->tstamp = skb->tstamp;
2090 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2091 		}
2092 
2093 		/* Not as strict as GRO. We only need to carry the max mss value */
2094 		shinfo->gso_size = max(gso_size, tail_gso_size);
2095 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2096 
2097 		sk->sk_backlog.len += delta;
2098 		__NET_INC_STATS(sock_net(sk),
2099 				LINUX_MIB_TCPBACKLOGCOALESCE);
2100 		kfree_skb_partial(skb, fragstolen);
2101 		return false;
2102 	}
2103 	__skb_push(skb, hdrlen);
2104 
2105 no_coalesce:
2106 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2107 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2108 	 * sk_rcvbuf in normal conditions.
2109 	 */
2110 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2111 
2112 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2113 
2114 	/* Only the socket owner can try to collapse/prune rx queues
2115 	 * to reduce memory overhead, so add a little headroom here.
2116 	 * Only a few socket backlogs are likely to be non-empty at the same time.
2117 	 */
2118 	limit += 64 * 1024;
2119 
2120 	limit = min_t(u64, limit, UINT_MAX);
2121 
2122 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2123 		bh_unlock_sock(sk);
2124 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2125 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2126 		return true;
2127 	}
2128 	return false;
2129 }
2130 EXPORT_SYMBOL(tcp_add_backlog);
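/*
 * Editor's illustration (not compiled): the backlog budget computed above,
 * evaluated for the initial buffer defaults tcp_rmem[1] = 131072 and
 * tcp_wmem[1] = 16384 (the real sk_rcvbuf/sk_sndbuf grow with autotuning).
 */
#if 0
static unsigned long long backlog_limit(unsigned int rcvbuf, unsigned int sndbuf)
{
	unsigned long long limit = ((unsigned long long)rcvbuf << 1) +
				   (sndbuf >> 1) + 64 * 1024;

	return limit < 0xFFFFFFFFULL ? limit : 0xFFFFFFFFULL;	/* min(limit, UINT_MAX) */
}
/* backlog_limit(131072, 16384) == 2 * 131072 + 8192 + 65536 == 335872 bytes */
#endif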
2131 
2132 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2133 {
2134 	struct tcphdr *th = (struct tcphdr *)skb->data;
2135 
2136 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2137 }
2138 EXPORT_SYMBOL(tcp_filter);
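/*
 * Editor's sketch (userspace, illustrative): tcp_filter() above runs the
 * classic BPF program attached with SO_ATTACH_FILTER.  A return value of 0
 * drops the segment; any short non-zero return can only trim payload,
 * because the cap of th->doff * 4 preserves the TCP header.  The filter
 * below (a single "ret #1") is a hypothetical policy used for illustration.
 */
#if 0
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_trim_filter(int fd)
{
	struct sock_filter code[] = {
		{ BPF_RET | BPF_K, 0, 0, 1 },	/* accept only 1 byte */
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
#endif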
2139 
2140 static void tcp_v4_restore_cb(struct sk_buff *skb)
2141 {
2142 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2143 		sizeof(struct inet_skb_parm));
2144 }
2145 
2146 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2147 			   const struct tcphdr *th)
2148 {
2149 	/* This is tricky : we move IPCB to its correct location inside TCP_SKB_CB().
2150 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2151 	 */
2152 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2153 		sizeof(struct inet_skb_parm));
2154 	barrier();
2155 
2156 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2157 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2158 				    skb->len - th->doff * 4);
2159 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2160 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2161 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2162 	TCP_SKB_CB(skb)->sacked	 = 0;
2163 	TCP_SKB_CB(skb)->has_rxtstamp =
2164 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2165 }
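/*
 * Editor's note (illustrative): end_seq computed above counts SYN and FIN
 * as one virtual sequence byte each on top of the payload length.
 */
#if 0
static unsigned int end_seq_example(void)
{
	unsigned int seq = 5000, syn = 0, fin = 1, payload = 100;

	return seq + syn + fin + payload;	/* 5101 */
}
#endif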
2166 
2167 /*
2168  *	From tcp_input.c
2169  */
2170 
2171 int tcp_v4_rcv(struct sk_buff *skb)
2172 {
2173 	struct net *net = dev_net(skb->dev);
2174 	enum skb_drop_reason drop_reason;
2175 	int sdif = inet_sdif(skb);
2176 	int dif = inet_iif(skb);
2177 	const struct iphdr *iph;
2178 	const struct tcphdr *th;
2179 	bool refcounted;
2180 	struct sock *sk;
2181 	int ret;
2182 	u32 isn;
2183 
2184 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2185 	if (skb->pkt_type != PACKET_HOST)
2186 		goto discard_it;
2187 
2188 	/* Count it even if it's bad */
2189 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2190 
2191 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2192 		goto discard_it;
2193 
2194 	th = (const struct tcphdr *)skb->data;
2195 
2196 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2197 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2198 		goto bad_packet;
2199 	}
2200 	if (!pskb_may_pull(skb, th->doff * 4))
2201 		goto discard_it;
2202 
2203 	/* An explanation is required here, I think.
2204 	 * Packet length and doff are validated by header prediction,
2205 	 * provided the case of th->doff == 0 is eliminated.
2206 	 * So, we defer the checks. */
2207 
2208 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2209 		goto csum_error;
2210 
2211 	th = (const struct tcphdr *)skb->data;
2212 	iph = ip_hdr(skb);
2213 lookup:
2214 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2215 			       skb, __tcp_hdrlen(th), th->source,
2216 			       th->dest, sdif, &refcounted);
2217 	if (!sk)
2218 		goto no_tcp_socket;
2219 
2220 	if (sk->sk_state == TCP_TIME_WAIT)
2221 		goto do_time_wait;
2222 
2223 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2224 		struct request_sock *req = inet_reqsk(sk);
2225 		bool req_stolen = false;
2226 		struct sock *nsk;
2227 
2228 		sk = req->rsk_listener;
2229 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2230 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2231 		else
2232 			drop_reason = tcp_inbound_hash(sk, req, skb,
2233 						       &iph->saddr, &iph->daddr,
2234 						       AF_INET, dif, sdif);
2235 		if (unlikely(drop_reason)) {
2236 			sk_drops_add(sk, skb);
2237 			reqsk_put(req);
2238 			goto discard_it;
2239 		}
2240 		if (tcp_checksum_complete(skb)) {
2241 			reqsk_put(req);
2242 			goto csum_error;
2243 		}
2244 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2245 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2246 			if (!nsk) {
2247 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2248 				goto lookup;
2249 			}
2250 			sk = nsk;
2251 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2252 			 * before returning.
2253 			 */
2254 		} else {
2255 			/* We own a reference on the listener, increase it again
2256 			 * as we might lose it too soon.
2257 			 */
2258 			sock_hold(sk);
2259 		}
2260 		refcounted = true;
2261 		nsk = NULL;
2262 		if (!tcp_filter(sk, skb)) {
2263 			th = (const struct tcphdr *)skb->data;
2264 			iph = ip_hdr(skb);
2265 			tcp_v4_fill_cb(skb, iph, th);
2266 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2267 		} else {
2268 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2269 		}
2270 		if (!nsk) {
2271 			reqsk_put(req);
2272 			if (req_stolen) {
2273 				/* Another cpu got exclusive access to req
2274 				 * and created a full-blown socket.
2275 				 * Try to feed this packet to this socket
2276 				 * instead of discarding it.
2277 				 */
2278 				tcp_v4_restore_cb(skb);
2279 				sock_put(sk);
2280 				goto lookup;
2281 			}
2282 			goto discard_and_relse;
2283 		}
2284 		nf_reset_ct(skb);
2285 		if (nsk == sk) {
2286 			reqsk_put(req);
2287 			tcp_v4_restore_cb(skb);
2288 		} else {
2289 			drop_reason = tcp_child_process(sk, nsk, skb);
2290 			if (drop_reason) {
2291 				enum sk_rst_reason rst_reason;
2292 
2293 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2294 				tcp_v4_send_reset(nsk, skb, rst_reason);
2295 				goto discard_and_relse;
2296 			}
2297 			sock_put(sk);
2298 			return 0;
2299 		}
2300 	}
2301 
2302 process:
2303 	if (static_branch_unlikely(&ip4_min_ttl)) {
2304 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2305 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2306 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2307 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2308 			goto discard_and_relse;
2309 		}
2310 	}
2311 
2312 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2313 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2314 		goto discard_and_relse;
2315 	}
2316 
2317 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2318 				       AF_INET, dif, sdif);
2319 	if (drop_reason)
2320 		goto discard_and_relse;
2321 
2322 	nf_reset_ct(skb);
2323 
2324 	if (tcp_filter(sk, skb)) {
2325 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2326 		goto discard_and_relse;
2327 	}
2328 	th = (const struct tcphdr *)skb->data;
2329 	iph = ip_hdr(skb);
2330 	tcp_v4_fill_cb(skb, iph, th);
2331 
2332 	skb->dev = NULL;
2333 
2334 	if (sk->sk_state == TCP_LISTEN) {
2335 		ret = tcp_v4_do_rcv(sk, skb);
2336 		goto put_and_return;
2337 	}
2338 
2339 	sk_incoming_cpu_update(sk);
2340 
2341 	bh_lock_sock_nested(sk);
2342 	tcp_segs_in(tcp_sk(sk), skb);
2343 	ret = 0;
2344 	if (!sock_owned_by_user(sk)) {
2345 		ret = tcp_v4_do_rcv(sk, skb);
2346 	} else {
2347 		if (tcp_add_backlog(sk, skb, &drop_reason))
2348 			goto discard_and_relse;
2349 	}
2350 	bh_unlock_sock(sk);
2351 
2352 put_and_return:
2353 	if (refcounted)
2354 		sock_put(sk);
2355 
2356 	return ret;
2357 
2358 no_tcp_socket:
2359 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2360 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2361 		goto discard_it;
2362 
2363 	tcp_v4_fill_cb(skb, iph, th);
2364 
2365 	if (tcp_checksum_complete(skb)) {
2366 csum_error:
2367 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2368 		trace_tcp_bad_csum(skb);
2369 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2370 bad_packet:
2371 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2372 	} else {
2373 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2374 	}
2375 
2376 discard_it:
2377 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2378 	/* Discard frame. */
2379 	kfree_skb_reason(skb, drop_reason);
2380 	return 0;
2381 
2382 discard_and_relse:
2383 	sk_drops_add(sk, skb);
2384 	if (refcounted)
2385 		sock_put(sk);
2386 	goto discard_it;
2387 
2388 do_time_wait:
2389 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2390 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2391 		inet_twsk_put(inet_twsk(sk));
2392 		goto discard_it;
2393 	}
2394 
2395 	tcp_v4_fill_cb(skb, iph, th);
2396 
2397 	if (tcp_checksum_complete(skb)) {
2398 		inet_twsk_put(inet_twsk(sk));
2399 		goto csum_error;
2400 	}
2401 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2402 	case TCP_TW_SYN: {
2403 		struct sock *sk2 = inet_lookup_listener(net,
2404 							net->ipv4.tcp_death_row.hashinfo,
2405 							skb, __tcp_hdrlen(th),
2406 							iph->saddr, th->source,
2407 							iph->daddr, th->dest,
2408 							inet_iif(skb),
2409 							sdif);
2410 		if (sk2) {
2411 			inet_twsk_deschedule_put(inet_twsk(sk));
2412 			sk = sk2;
2413 			tcp_v4_restore_cb(skb);
2414 			refcounted = false;
2415 			__this_cpu_write(tcp_tw_isn, isn);
2416 			goto process;
2417 		}
2418 	}
2419 		/* to ACK */
2420 		fallthrough;
2421 	case TCP_TW_ACK:
2422 		tcp_v4_timewait_ack(sk, skb);
2423 		break;
2424 	case TCP_TW_RST:
2425 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2426 		inet_twsk_deschedule_put(inet_twsk(sk));
2427 		goto discard_it;
2428 	case TCP_TW_SUCCESS:;
2429 	}
2430 	goto discard_it;
2431 }
2432 
2433 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2434 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2435 	.twsk_destructor= tcp_twsk_destructor,
2436 };
2437 
2438 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2439 {
2440 	struct dst_entry *dst = skb_dst(skb);
2441 
2442 	if (dst && dst_hold_safe(dst)) {
2443 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2444 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2445 	}
2446 }
2447 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2448 
2449 const struct inet_connection_sock_af_ops ipv4_specific = {
2450 	.queue_xmit	   = ip_queue_xmit,
2451 	.send_check	   = tcp_v4_send_check,
2452 	.rebuild_header	   = inet_sk_rebuild_header,
2453 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2454 	.conn_request	   = tcp_v4_conn_request,
2455 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2456 	.net_header_len	   = sizeof(struct iphdr),
2457 	.setsockopt	   = ip_setsockopt,
2458 	.getsockopt	   = ip_getsockopt,
2459 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2460 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2461 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2462 };
2463 EXPORT_SYMBOL(ipv4_specific);
2464 
2465 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2466 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2467 #ifdef CONFIG_TCP_MD5SIG
2468 	.md5_lookup		= tcp_v4_md5_lookup,
2469 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2470 	.md5_parse		= tcp_v4_parse_md5_keys,
2471 #endif
2472 #ifdef CONFIG_TCP_AO
2473 	.ao_lookup		= tcp_v4_ao_lookup,
2474 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2475 	.ao_parse		= tcp_v4_parse_ao,
2476 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2477 #endif
2478 };
2479 #endif
2480 
2481 /* NOTE: A lot of fields are explicitly zeroed by the call to
2482  *       sk_alloc(), so they need not be initialized here.
2483  */
2484 static int tcp_v4_init_sock(struct sock *sk)
2485 {
2486 	struct inet_connection_sock *icsk = inet_csk(sk);
2487 
2488 	tcp_init_sock(sk);
2489 
2490 	icsk->icsk_af_ops = &ipv4_specific;
2491 
2492 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2493 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2494 #endif
2495 
2496 	return 0;
2497 }
2498 
2499 #ifdef CONFIG_TCP_MD5SIG
2500 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2501 {
2502 	struct tcp_md5sig_info *md5sig;
2503 
2504 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2505 	kfree(md5sig);
2506 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2507 	tcp_md5_release_sigpool();
2508 }
2509 #endif
2510 
2511 void tcp_v4_destroy_sock(struct sock *sk)
2512 {
2513 	struct tcp_sock *tp = tcp_sk(sk);
2514 
2515 	trace_tcp_destroy_sock(sk);
2516 
2517 	tcp_clear_xmit_timers(sk);
2518 
2519 	tcp_cleanup_congestion_control(sk);
2520 
2521 	tcp_cleanup_ulp(sk);
2522 
2523 	/* Clean up the write buffer. */
2524 	tcp_write_queue_purge(sk);
2525 
2526 	/* Check if we want to disable active TFO */
2527 	tcp_fastopen_active_disable_ofo_check(sk);
2528 
2529 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2530 	skb_rbtree_purge(&tp->out_of_order_queue);
2531 
2532 #ifdef CONFIG_TCP_MD5SIG
2533 	/* Clean up the MD5 key list, if any */
2534 	if (tp->md5sig_info) {
2535 		struct tcp_md5sig_info *md5sig;
2536 
2537 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2538 		tcp_clear_md5_list(sk);
2539 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2540 		rcu_assign_pointer(tp->md5sig_info, NULL);
2541 	}
2542 #endif
2543 	tcp_ao_destroy_sock(sk, false);
2544 
2545 	/* Clean up a referenced TCP bind bucket. */
2546 	if (inet_csk(sk)->icsk_bind_hash)
2547 		inet_put_port(sk);
2548 
2549 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2550 
2551 	/* If socket is aborted during connect operation */
2552 	tcp_free_fastopen_req(tp);
2553 	tcp_fastopen_destroy_cipher(sk);
2554 	tcp_saved_syn_free(tp);
2555 
2556 	sk_sockets_allocated_dec(sk);
2557 }
2558 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2559 
2560 #ifdef CONFIG_PROC_FS
2561 /* Proc filesystem TCP sock list dumping. */
2562 
2563 static unsigned short seq_file_family(const struct seq_file *seq);
2564 
2565 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2566 {
2567 	unsigned short family = seq_file_family(seq);
2568 
2569 	/* AF_UNSPEC is used as a match all */
2570 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2571 		net_eq(sock_net(sk), seq_file_net(seq)));
2572 }
2573 
2574 /* Find a non-empty bucket (starting from st->bucket)
2575  * and return the first sk from it.
2576  */
2577 static void *listening_get_first(struct seq_file *seq)
2578 {
2579 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2580 	struct tcp_iter_state *st = seq->private;
2581 
2582 	st->offset = 0;
2583 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2584 		struct inet_listen_hashbucket *ilb2;
2585 		struct hlist_nulls_node *node;
2586 		struct sock *sk;
2587 
2588 		ilb2 = &hinfo->lhash2[st->bucket];
2589 		if (hlist_nulls_empty(&ilb2->nulls_head))
2590 			continue;
2591 
2592 		spin_lock(&ilb2->lock);
2593 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2594 			if (seq_sk_match(seq, sk))
2595 				return sk;
2596 		}
2597 		spin_unlock(&ilb2->lock);
2598 	}
2599 
2600 	return NULL;
2601 }
2602 
2603 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2604  * If "cur" is the last one in st->bucket,
2605  * call listening_get_first() to return the first sk of the next
2606  * non-empty bucket.
2607  */
2608 static void *listening_get_next(struct seq_file *seq, void *cur)
2609 {
2610 	struct tcp_iter_state *st = seq->private;
2611 	struct inet_listen_hashbucket *ilb2;
2612 	struct hlist_nulls_node *node;
2613 	struct inet_hashinfo *hinfo;
2614 	struct sock *sk = cur;
2615 
2616 	++st->num;
2617 	++st->offset;
2618 
2619 	sk = sk_nulls_next(sk);
2620 	sk_nulls_for_each_from(sk, node) {
2621 		if (seq_sk_match(seq, sk))
2622 			return sk;
2623 	}
2624 
2625 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2626 	ilb2 = &hinfo->lhash2[st->bucket];
2627 	spin_unlock(&ilb2->lock);
2628 	++st->bucket;
2629 	return listening_get_first(seq);
2630 }
2631 
2632 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2633 {
2634 	struct tcp_iter_state *st = seq->private;
2635 	void *rc;
2636 
2637 	st->bucket = 0;
2638 	st->offset = 0;
2639 	rc = listening_get_first(seq);
2640 
2641 	while (rc && *pos) {
2642 		rc = listening_get_next(seq, rc);
2643 		--*pos;
2644 	}
2645 	return rc;
2646 }
2647 
2648 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2649 				const struct tcp_iter_state *st)
2650 {
2651 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2652 }
2653 
2654 /*
2655  * Get first established socket starting from bucket given in st->bucket.
2656  * If st->bucket is zero, the very first socket in the hash is returned.
2657  */
2658 static void *established_get_first(struct seq_file *seq)
2659 {
2660 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2661 	struct tcp_iter_state *st = seq->private;
2662 
2663 	st->offset = 0;
2664 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2665 		struct sock *sk;
2666 		struct hlist_nulls_node *node;
2667 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2668 
2669 		cond_resched();
2670 
2671 		/* Lockless fast path for the common case of empty buckets */
2672 		if (empty_bucket(hinfo, st))
2673 			continue;
2674 
2675 		spin_lock_bh(lock);
2676 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2677 			if (seq_sk_match(seq, sk))
2678 				return sk;
2679 		}
2680 		spin_unlock_bh(lock);
2681 	}
2682 
2683 	return NULL;
2684 }
2685 
2686 static void *established_get_next(struct seq_file *seq, void *cur)
2687 {
2688 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2689 	struct tcp_iter_state *st = seq->private;
2690 	struct hlist_nulls_node *node;
2691 	struct sock *sk = cur;
2692 
2693 	++st->num;
2694 	++st->offset;
2695 
2696 	sk = sk_nulls_next(sk);
2697 
2698 	sk_nulls_for_each_from(sk, node) {
2699 		if (seq_sk_match(seq, sk))
2700 			return sk;
2701 	}
2702 
2703 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2704 	++st->bucket;
2705 	return established_get_first(seq);
2706 }
2707 
2708 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2709 {
2710 	struct tcp_iter_state *st = seq->private;
2711 	void *rc;
2712 
2713 	st->bucket = 0;
2714 	rc = established_get_first(seq);
2715 
2716 	while (rc && pos) {
2717 		rc = established_get_next(seq, rc);
2718 		--pos;
2719 	}
2720 	return rc;
2721 }
2722 
2723 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2724 {
2725 	void *rc;
2726 	struct tcp_iter_state *st = seq->private;
2727 
2728 	st->state = TCP_SEQ_STATE_LISTENING;
2729 	rc	  = listening_get_idx(seq, &pos);
2730 
2731 	if (!rc) {
2732 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2733 		rc	  = established_get_idx(seq, pos);
2734 	}
2735 
2736 	return rc;
2737 }
2738 
2739 static void *tcp_seek_last_pos(struct seq_file *seq)
2740 {
2741 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2742 	struct tcp_iter_state *st = seq->private;
2743 	int bucket = st->bucket;
2744 	int offset = st->offset;
2745 	int orig_num = st->num;
2746 	void *rc = NULL;
2747 
2748 	switch (st->state) {
2749 	case TCP_SEQ_STATE_LISTENING:
2750 		if (st->bucket > hinfo->lhash2_mask)
2751 			break;
2752 		rc = listening_get_first(seq);
2753 		while (offset-- && rc && bucket == st->bucket)
2754 			rc = listening_get_next(seq, rc);
2755 		if (rc)
2756 			break;
2757 		st->bucket = 0;
2758 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2759 		fallthrough;
2760 	case TCP_SEQ_STATE_ESTABLISHED:
2761 		if (st->bucket > hinfo->ehash_mask)
2762 			break;
2763 		rc = established_get_first(seq);
2764 		while (offset-- && rc && bucket == st->bucket)
2765 			rc = established_get_next(seq, rc);
2766 	}
2767 
2768 	st->num = orig_num;
2769 
2770 	return rc;
2771 }
2772 
2773 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2774 {
2775 	struct tcp_iter_state *st = seq->private;
2776 	void *rc;
2777 
2778 	if (*pos && *pos == st->last_pos) {
2779 		rc = tcp_seek_last_pos(seq);
2780 		if (rc)
2781 			goto out;
2782 	}
2783 
2784 	st->state = TCP_SEQ_STATE_LISTENING;
2785 	st->num = 0;
2786 	st->bucket = 0;
2787 	st->offset = 0;
2788 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2789 
2790 out:
2791 	st->last_pos = *pos;
2792 	return rc;
2793 }
2794 EXPORT_SYMBOL(tcp_seq_start);
2795 
2796 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2797 {
2798 	struct tcp_iter_state *st = seq->private;
2799 	void *rc = NULL;
2800 
2801 	if (v == SEQ_START_TOKEN) {
2802 		rc = tcp_get_idx(seq, 0);
2803 		goto out;
2804 	}
2805 
2806 	switch (st->state) {
2807 	case TCP_SEQ_STATE_LISTENING:
2808 		rc = listening_get_next(seq, v);
2809 		if (!rc) {
2810 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2811 			st->bucket = 0;
2812 			st->offset = 0;
2813 			rc	  = established_get_first(seq);
2814 		}
2815 		break;
2816 	case TCP_SEQ_STATE_ESTABLISHED:
2817 		rc = established_get_next(seq, v);
2818 		break;
2819 	}
2820 out:
2821 	++*pos;
2822 	st->last_pos = *pos;
2823 	return rc;
2824 }
2825 EXPORT_SYMBOL(tcp_seq_next);
2826 
2827 void tcp_seq_stop(struct seq_file *seq, void *v)
2828 {
2829 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2830 	struct tcp_iter_state *st = seq->private;
2831 
2832 	switch (st->state) {
2833 	case TCP_SEQ_STATE_LISTENING:
2834 		if (v != SEQ_START_TOKEN)
2835 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2836 		break;
2837 	case TCP_SEQ_STATE_ESTABLISHED:
2838 		if (v)
2839 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2840 		break;
2841 	}
2842 }
2843 EXPORT_SYMBOL(tcp_seq_stop);
2844 
2845 static void get_openreq4(const struct request_sock *req,
2846 			 struct seq_file *f, int i)
2847 {
2848 	const struct inet_request_sock *ireq = inet_rsk(req);
2849 	long delta = req->rsk_timer.expires - jiffies;
2850 
2851 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2852 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2853 		i,
2854 		ireq->ir_loc_addr,
2855 		ireq->ir_num,
2856 		ireq->ir_rmt_addr,
2857 		ntohs(ireq->ir_rmt_port),
2858 		TCP_SYN_RECV,
2859 		0, 0, /* could print option size, but that is af dependent. */
2860 		1,    /* timers active (only the expire timer) */
2861 		jiffies_delta_to_clock_t(delta),
2862 		req->num_timeout,
2863 		from_kuid_munged(seq_user_ns(f),
2864 				 sock_i_uid(req->rsk_listener)),
2865 		0,  /* non standard timer */
2866 		0, /* open_requests have no inode */
2867 		0,
2868 		req);
2869 }
2870 
2871 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2872 {
2873 	int timer_active;
2874 	unsigned long timer_expires;
2875 	const struct tcp_sock *tp = tcp_sk(sk);
2876 	const struct inet_connection_sock *icsk = inet_csk(sk);
2877 	const struct inet_sock *inet = inet_sk(sk);
2878 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2879 	__be32 dest = inet->inet_daddr;
2880 	__be32 src = inet->inet_rcv_saddr;
2881 	__u16 destp = ntohs(inet->inet_dport);
2882 	__u16 srcp = ntohs(inet->inet_sport);
2883 	int rx_queue;
2884 	int state;
2885 
2886 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2887 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2888 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2889 		timer_active	= 1;
2890 		timer_expires	= icsk->icsk_timeout;
2891 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2892 		timer_active	= 4;
2893 		timer_expires	= icsk->icsk_timeout;
2894 	} else if (timer_pending(&sk->sk_timer)) {
2895 		timer_active	= 2;
2896 		timer_expires	= sk->sk_timer.expires;
2897 	} else {
2898 		timer_active	= 0;
2899 		timer_expires = jiffies;
2900 	}
2901 
2902 	state = inet_sk_state_load(sk);
2903 	if (state == TCP_LISTEN)
2904 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2905 	else
2906 		/* Because we don't lock the socket,
2907 		 * we might find a transient negative value.
2908 		 */
2909 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2910 				      READ_ONCE(tp->copied_seq), 0);
2911 
2912 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2913 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2914 		i, src, srcp, dest, destp, state,
2915 		READ_ONCE(tp->write_seq) - tp->snd_una,
2916 		rx_queue,
2917 		timer_active,
2918 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2919 		icsk->icsk_retransmits,
2920 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2921 		icsk->icsk_probes_out,
2922 		sock_i_ino(sk),
2923 		refcount_read(&sk->sk_refcnt), sk,
2924 		jiffies_to_clock_t(icsk->icsk_rto),
2925 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2926 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2927 		tcp_snd_cwnd(tp),
2928 		state == TCP_LISTEN ?
2929 		    fastopenq->max_qlen :
2930 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2931 }
2932 
2933 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2934 			       struct seq_file *f, int i)
2935 {
2936 	long delta = tw->tw_timer.expires - jiffies;
2937 	__be32 dest, src;
2938 	__u16 destp, srcp;
2939 
2940 	dest  = tw->tw_daddr;
2941 	src   = tw->tw_rcv_saddr;
2942 	destp = ntohs(tw->tw_dport);
2943 	srcp  = ntohs(tw->tw_sport);
2944 
2945 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2946 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2947 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2948 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2949 		refcount_read(&tw->tw_refcnt), tw);
2950 }
2951 
2952 #define TMPSZ 150
2953 
2954 static int tcp4_seq_show(struct seq_file *seq, void *v)
2955 {
2956 	struct tcp_iter_state *st;
2957 	struct sock *sk = v;
2958 
2959 	seq_setwidth(seq, TMPSZ - 1);
2960 	if (v == SEQ_START_TOKEN) {
2961 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2962 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2963 			   "inode");
2964 		goto out;
2965 	}
2966 	st = seq->private;
2967 
2968 	if (sk->sk_state == TCP_TIME_WAIT)
2969 		get_timewait4_sock(v, seq, st->num);
2970 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2971 		get_openreq4(v, seq, st->num);
2972 	else
2973 		get_tcp4_sock(v, seq, st->num);
2974 out:
2975 	seq_pad(seq, '\n');
2976 	return 0;
2977 }
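/*
 * Editor's sketch (userspace, illustrative): parsing one data line that
 * tcp4_seq_show()/get_tcp4_sock() above emit into /proc/net/tcp.  The
 * addresses are __be32 values printed with %08X, so scanning them back into
 * a 32-bit word on the same host recreates the network-order bytes in
 * memory, and inet_ntop() can be used directly.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static int parse_tcp4_line(const char *line)
{
	unsigned int laddr, lport, raddr, rport, state;
	char ip[INET_ADDRSTRLEN];

	if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
		   &laddr, &lport, &raddr, &rport, &state) != 5)
		return -1;

	/* e.g. "0100007F:0016" is 127.0.0.1:22 on a little-endian host */
	inet_ntop(AF_INET, &laddr, ip, sizeof(ip));
	printf("%s:%u state %#x\n", ip, lport, state);
	return 0;
}
#endif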
2978 
2979 #ifdef CONFIG_BPF_SYSCALL
2980 struct bpf_tcp_iter_state {
2981 	struct tcp_iter_state state;
2982 	unsigned int cur_sk;
2983 	unsigned int end_sk;
2984 	unsigned int max_sk;
2985 	struct sock **batch;
2986 	bool st_bucket_done;
2987 };
2988 
2989 struct bpf_iter__tcp {
2990 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2991 	__bpf_md_ptr(struct sock_common *, sk_common);
2992 	uid_t uid __aligned(8);
2993 };
2994 
2995 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2996 			     struct sock_common *sk_common, uid_t uid)
2997 {
2998 	struct bpf_iter__tcp ctx;
2999 
3000 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3001 	ctx.meta = meta;
3002 	ctx.sk_common = sk_common;
3003 	ctx.uid = uid;
3004 	return bpf_iter_run_prog(prog, &ctx);
3005 }
3006 
3007 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3008 {
3009 	while (iter->cur_sk < iter->end_sk)
3010 		sock_gen_put(iter->batch[iter->cur_sk++]);
3011 }
3012 
3013 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3014 				      unsigned int new_batch_sz)
3015 {
3016 	struct sock **new_batch;
3017 
3018 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3019 			     GFP_USER | __GFP_NOWARN);
3020 	if (!new_batch)
3021 		return -ENOMEM;
3022 
3023 	bpf_iter_tcp_put_batch(iter);
3024 	kvfree(iter->batch);
3025 	iter->batch = new_batch;
3026 	iter->max_sk = new_batch_sz;
3027 
3028 	return 0;
3029 }
3030 
3031 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3032 						 struct sock *start_sk)
3033 {
3034 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3035 	struct bpf_tcp_iter_state *iter = seq->private;
3036 	struct tcp_iter_state *st = &iter->state;
3037 	struct hlist_nulls_node *node;
3038 	unsigned int expected = 1;
3039 	struct sock *sk;
3040 
3041 	sock_hold(start_sk);
3042 	iter->batch[iter->end_sk++] = start_sk;
3043 
3044 	sk = sk_nulls_next(start_sk);
3045 	sk_nulls_for_each_from(sk, node) {
3046 		if (seq_sk_match(seq, sk)) {
3047 			if (iter->end_sk < iter->max_sk) {
3048 				sock_hold(sk);
3049 				iter->batch[iter->end_sk++] = sk;
3050 			}
3051 			expected++;
3052 		}
3053 	}
3054 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3055 
3056 	return expected;
3057 }
3058 
3059 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3060 						   struct sock *start_sk)
3061 {
3062 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3063 	struct bpf_tcp_iter_state *iter = seq->private;
3064 	struct tcp_iter_state *st = &iter->state;
3065 	struct hlist_nulls_node *node;
3066 	unsigned int expected = 1;
3067 	struct sock *sk;
3068 
3069 	sock_hold(start_sk);
3070 	iter->batch[iter->end_sk++] = start_sk;
3071 
3072 	sk = sk_nulls_next(start_sk);
3073 	sk_nulls_for_each_from(sk, node) {
3074 		if (seq_sk_match(seq, sk)) {
3075 			if (iter->end_sk < iter->max_sk) {
3076 				sock_hold(sk);
3077 				iter->batch[iter->end_sk++] = sk;
3078 			}
3079 			expected++;
3080 		}
3081 	}
3082 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3083 
3084 	return expected;
3085 }
3086 
3087 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3088 {
3089 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3090 	struct bpf_tcp_iter_state *iter = seq->private;
3091 	struct tcp_iter_state *st = &iter->state;
3092 	unsigned int expected;
3093 	bool resized = false;
3094 	struct sock *sk;
3095 
3096 	/* The st->bucket is done.  Directly advance to the next
3097 	 * bucket instead of having tcp_seek_last_pos() skip sockets
3098 	 * one by one in the current bucket, only to find out that
3099 	 * it has to advance to the next bucket.
3100 	 */
3101 	if (iter->st_bucket_done) {
3102 		st->offset = 0;
3103 		st->bucket++;
3104 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3105 		    st->bucket > hinfo->lhash2_mask) {
3106 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3107 			st->bucket = 0;
3108 		}
3109 	}
3110 
3111 again:
3112 	/* Get a new batch */
3113 	iter->cur_sk = 0;
3114 	iter->end_sk = 0;
3115 	iter->st_bucket_done = false;
3116 
3117 	sk = tcp_seek_last_pos(seq);
3118 	if (!sk)
3119 		return NULL; /* Done */
3120 
3121 	if (st->state == TCP_SEQ_STATE_LISTENING)
3122 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3123 	else
3124 		expected = bpf_iter_tcp_established_batch(seq, sk);
3125 
3126 	if (iter->end_sk == expected) {
3127 		iter->st_bucket_done = true;
3128 		return sk;
3129 	}
3130 
3131 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3132 		resized = true;
3133 		goto again;
3134 	}
3135 
3136 	return sk;
3137 }
3138 
3139 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3140 {
3141 	/* bpf iter does not support lseek, so it always
3142 	 * continues from where it was stop()-ped.
3143 	 */
3144 	if (*pos)
3145 		return bpf_iter_tcp_batch(seq);
3146 
3147 	return SEQ_START_TOKEN;
3148 }
3149 
3150 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3151 {
3152 	struct bpf_tcp_iter_state *iter = seq->private;
3153 	struct tcp_iter_state *st = &iter->state;
3154 	struct sock *sk;
3155 
3156 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3157 	 * done with seq_show(), so advance to the next sk in
3158 	 * the batch.
3159 	 */
3160 	if (iter->cur_sk < iter->end_sk) {
3161 		/* Keeping st->num consistent in tcp_iter_state.
3162 		 * bpf_iter_tcp does not use st->num.
3163 		 * meta.seq_num is used instead.
3164 		 */
3165 		st->num++;
3166 		/* Move st->offset to the next sk in the bucket such that
3167 		 * the future start() will resume at st->offset in
3168 		 * st->bucket.  See tcp_seek_last_pos().
3169 		 */
3170 		st->offset++;
3171 		sock_gen_put(iter->batch[iter->cur_sk++]);
3172 	}
3173 
3174 	if (iter->cur_sk < iter->end_sk)
3175 		sk = iter->batch[iter->cur_sk];
3176 	else
3177 		sk = bpf_iter_tcp_batch(seq);
3178 
3179 	++*pos;
3180 	/* Keeping st->last_pos consistent in tcp_iter_state.
3181 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3182 	 */
3183 	st->last_pos = *pos;
3184 	return sk;
3185 }
3186 
3187 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3188 {
3189 	struct bpf_iter_meta meta;
3190 	struct bpf_prog *prog;
3191 	struct sock *sk = v;
3192 	uid_t uid;
3193 	int ret;
3194 
3195 	if (v == SEQ_START_TOKEN)
3196 		return 0;
3197 
3198 	if (sk_fullsock(sk))
3199 		lock_sock(sk);
3200 
3201 	if (unlikely(sk_unhashed(sk))) {
3202 		ret = SEQ_SKIP;
3203 		goto unlock;
3204 	}
3205 
3206 	if (sk->sk_state == TCP_TIME_WAIT) {
3207 		uid = 0;
3208 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3209 		const struct request_sock *req = v;
3210 
3211 		uid = from_kuid_munged(seq_user_ns(seq),
3212 				       sock_i_uid(req->rsk_listener));
3213 	} else {
3214 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3215 	}
3216 
3217 	meta.seq = seq;
3218 	prog = bpf_iter_get_info(&meta, false);
3219 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3220 
3221 unlock:
3222 	if (sk_fullsock(sk))
3223 		release_sock(sk);
3224 	return ret;
3225 
3226 }
3227 
3228 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3229 {
3230 	struct bpf_tcp_iter_state *iter = seq->private;
3231 	struct bpf_iter_meta meta;
3232 	struct bpf_prog *prog;
3233 
3234 	if (!v) {
3235 		meta.seq = seq;
3236 		prog = bpf_iter_get_info(&meta, true);
3237 		if (prog)
3238 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3239 	}
3240 
3241 	if (iter->cur_sk < iter->end_sk) {
3242 		bpf_iter_tcp_put_batch(iter);
3243 		iter->st_bucket_done = false;
3244 	}
3245 }
3246 
3247 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3248 	.show		= bpf_iter_tcp_seq_show,
3249 	.start		= bpf_iter_tcp_seq_start,
3250 	.next		= bpf_iter_tcp_seq_next,
3251 	.stop		= bpf_iter_tcp_seq_stop,
3252 };
3253 #endif
3254 static unsigned short seq_file_family(const struct seq_file *seq)
3255 {
3256 	const struct tcp_seq_afinfo *afinfo;
3257 
3258 #ifdef CONFIG_BPF_SYSCALL
3259 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3260 	if (seq->op == &bpf_iter_tcp_seq_ops)
3261 		return AF_UNSPEC;
3262 #endif
3263 
3264 	/* Iterated from proc fs */
3265 	afinfo = pde_data(file_inode(seq->file));
3266 	return afinfo->family;
3267 }
3268 
3269 static const struct seq_operations tcp4_seq_ops = {
3270 	.show		= tcp4_seq_show,
3271 	.start		= tcp_seq_start,
3272 	.next		= tcp_seq_next,
3273 	.stop		= tcp_seq_stop,
3274 };
3275 
3276 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3277 	.family		= AF_INET,
3278 };
3279 
3280 static int __net_init tcp4_proc_init_net(struct net *net)
3281 {
3282 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3283 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3284 		return -ENOMEM;
3285 	return 0;
3286 }
3287 
3288 static void __net_exit tcp4_proc_exit_net(struct net *net)
3289 {
3290 	remove_proc_entry("tcp", net->proc_net);
3291 }
3292 
3293 static struct pernet_operations tcp4_net_ops = {
3294 	.init = tcp4_proc_init_net,
3295 	.exit = tcp4_proc_exit_net,
3296 };
3297 
3298 int __init tcp4_proc_init(void)
3299 {
3300 	return register_pernet_subsys(&tcp4_net_ops);
3301 }
3302 
3303 void tcp4_proc_exit(void)
3304 {
3305 	unregister_pernet_subsys(&tcp4_net_ops);
3306 }
3307 #endif /* CONFIG_PROC_FS */
3308 
3309 /* @wake is one when sk_stream_write_space() calls us.
3310  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3311  * This mimics the strategy used in sock_def_write_space().
3312  */
3313 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3314 {
3315 	const struct tcp_sock *tp = tcp_sk(sk);
3316 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3317 			    READ_ONCE(tp->snd_nxt);
3318 
3319 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3320 }
3321 EXPORT_SYMBOL(tcp_stream_memory_free);
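/*
 * Editor's illustration (not compiled): with the default notsent_lowat of
 * UINT_MAX the test above is effectively always true; with e.g.
 * TCP_NOTSENT_LOWAT set to 128 KB, a wakeup from sk_stream_write_space()
 * (wake == 1) only signals writability once less than 64 KB is unsent.
 */
#if 0
static int would_signal_epollout(unsigned int write_seq, unsigned int snd_nxt,
				 unsigned int lowat, int wake)
{
	unsigned int notsent = write_seq - snd_nxt;

	return (notsent << wake) < lowat;	/* same test as above */
}
/* would_signal_epollout(170000, 100000, 128 * 1024, 1) == 0: 140000 >= 131072 */
/* would_signal_epollout(160000, 100000, 128 * 1024, 1) == 1: 120000 <  131072 */
#endif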
3322 
3323 struct proto tcp_prot = {
3324 	.name			= "TCP",
3325 	.owner			= THIS_MODULE,
3326 	.close			= tcp_close,
3327 	.pre_connect		= tcp_v4_pre_connect,
3328 	.connect		= tcp_v4_connect,
3329 	.disconnect		= tcp_disconnect,
3330 	.accept			= inet_csk_accept,
3331 	.ioctl			= tcp_ioctl,
3332 	.init			= tcp_v4_init_sock,
3333 	.destroy		= tcp_v4_destroy_sock,
3334 	.shutdown		= tcp_shutdown,
3335 	.setsockopt		= tcp_setsockopt,
3336 	.getsockopt		= tcp_getsockopt,
3337 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3338 	.keepalive		= tcp_set_keepalive,
3339 	.recvmsg		= tcp_recvmsg,
3340 	.sendmsg		= tcp_sendmsg,
3341 	.splice_eof		= tcp_splice_eof,
3342 	.backlog_rcv		= tcp_v4_do_rcv,
3343 	.release_cb		= tcp_release_cb,
3344 	.hash			= inet_hash,
3345 	.unhash			= inet_unhash,
3346 	.get_port		= inet_csk_get_port,
3347 	.put_port		= inet_put_port,
3348 #ifdef CONFIG_BPF_SYSCALL
3349 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3350 #endif
3351 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3352 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3353 	.stream_memory_free	= tcp_stream_memory_free,
3354 	.sockets_allocated	= &tcp_sockets_allocated,
3355 	.orphan_count		= &tcp_orphan_count,
3356 
3357 	.memory_allocated	= &tcp_memory_allocated,
3358 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3359 
3360 	.memory_pressure	= &tcp_memory_pressure,
3361 	.sysctl_mem		= sysctl_tcp_mem,
3362 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3363 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3364 	.max_header		= MAX_TCP_HEADER,
3365 	.obj_size		= sizeof(struct tcp_sock),
3366 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3367 	.twsk_prot		= &tcp_timewait_sock_ops,
3368 	.rsk_prot		= &tcp_request_sock_ops,
3369 	.h.hashinfo		= NULL,
3370 	.no_autobind		= true,
3371 	.diag_destroy		= tcp_abort,
3372 };
3373 EXPORT_SYMBOL(tcp_prot);
3374 
3375 static void __net_exit tcp_sk_exit(struct net *net)
3376 {
3377 	if (net->ipv4.tcp_congestion_control)
3378 		bpf_module_put(net->ipv4.tcp_congestion_control,
3379 			       net->ipv4.tcp_congestion_control->owner);
3380 }
3381 
3382 static void __net_init tcp_set_hashinfo(struct net *net)
3383 {
3384 	struct inet_hashinfo *hinfo;
3385 	unsigned int ehash_entries;
3386 	struct net *old_net;
3387 
3388 	if (net_eq(net, &init_net))
3389 		goto fallback;
3390 
3391 	old_net = current->nsproxy->net_ns;
3392 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3393 	if (!ehash_entries)
3394 		goto fallback;
3395 
3396 	ehash_entries = roundup_pow_of_two(ehash_entries);
3397 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3398 	if (!hinfo) {
3399 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3400 			"for a netns, fallback to the global one\n",
3401 			ehash_entries);
3402 fallback:
3403 		hinfo = &tcp_hashinfo;
3404 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3405 	}
3406 
3407 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3408 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3409 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3410 }
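/*
 * Editor's sketch (userspace, illustrative): tcp_set_hashinfo() above sizes
 * the child ehash from the *creating* netns, so the sysctl has to be set
 * before unshare().  The proc path, the chosen value and the required
 * privileges (CAP_NET_ADMIN/CAP_SYS_ADMIN) are assumptions of this sketch.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <unistd.h>

static int new_netns_with_small_ehash(void)
{
	int fd = open("/proc/sys/net/ipv4/tcp_child_ehash_entries", O_WRONLY);

	if (fd < 0)
		return -1;
	write(fd, "2048", 4);	/* rounded up to a power of two above */
	close(fd);

	/* The new netns gets its own 2048-entry ehash instead of the global one. */
	return unshare(CLONE_NEWNET);
}
#endif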
3411 
3412 static int __net_init tcp_sk_init(struct net *net)
3413 {
3414 	net->ipv4.sysctl_tcp_ecn = 2;
3415 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3416 
3417 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3418 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3419 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3420 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3421 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3422 
3423 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3424 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3425 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3426 
3427 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3428 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3429 	net->ipv4.sysctl_tcp_syncookies = 1;
3430 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3431 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3432 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3433 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3434 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3435 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3436 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3437 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3438 
3439 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3440 	tcp_set_hashinfo(net);
3441 
3442 	net->ipv4.sysctl_tcp_sack = 1;
3443 	net->ipv4.sysctl_tcp_window_scaling = 1;
3444 	net->ipv4.sysctl_tcp_timestamps = 1;
3445 	net->ipv4.sysctl_tcp_early_retrans = 3;
3446 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3447 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3448 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3449 	net->ipv4.sysctl_tcp_max_reordering = 300;
3450 	net->ipv4.sysctl_tcp_dsack = 1;
3451 	net->ipv4.sysctl_tcp_app_win = 31;
3452 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3453 	net->ipv4.sysctl_tcp_frto = 2;
3454 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3455 	/* This limits the percentage of the congestion window which we
3456 	 * will allow a single TSO frame to consume.  Building TSO frames
3457 	 * which are too large can cause TCP streams to be bursty.
3458 	 */
3459 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3460 	/* Default TSQ limit of 16 TSO segments */
3461 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3462 
3463 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3464 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3465 
3466 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3467 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3468 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3469 	net->ipv4.sysctl_tcp_autocorking = 1;
3470 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3471 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3472 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3473 	if (net != &init_net) {
3474 		memcpy(net->ipv4.sysctl_tcp_rmem,
3475 		       init_net.ipv4.sysctl_tcp_rmem,
3476 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3477 		memcpy(net->ipv4.sysctl_tcp_wmem,
3478 		       init_net.ipv4.sysctl_tcp_wmem,
3479 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3480 	}
3481 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3482 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3483 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3484 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3485 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3486 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3487 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3488 
3489 	/* Set default values for PLB */
3490 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3491 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3492 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3493 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3494 	/* Default congestion threshold for PLB to mark a round is 50% */
3495 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3496 
3497 	/* Reno is always built in */
3498 	if (!net_eq(net, &init_net) &&
3499 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3500 			       init_net.ipv4.tcp_congestion_control->owner))
3501 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3502 	else
3503 		net->ipv4.tcp_congestion_control = &tcp_reno;
3504 
3505 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3506 	net->ipv4.sysctl_tcp_shrink_window = 0;
3507 
3508 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3509 
3510 	return 0;
3511 }
3512 
3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3514 {
3515 	struct net *net;
3516 
3517 	tcp_twsk_purge(net_exit_list);
3518 
3519 	list_for_each_entry(net, net_exit_list, exit_list) {
3520 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3521 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3522 		tcp_fastopen_ctx_destroy(net);
3523 	}
3524 }
3525 
3526 static struct pernet_operations __net_initdata tcp_sk_ops = {
3527        .init	   = tcp_sk_init,
3528        .exit	   = tcp_sk_exit,
3529        .exit_batch = tcp_sk_exit_batch,
3530 };
3531 
3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3534 		     struct sock_common *sk_common, uid_t uid)
3535 
3536 #define INIT_BATCH_SZ 16
3537 
3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3539 {
3540 	struct bpf_tcp_iter_state *iter = priv_data;
3541 	int err;
3542 
3543 	err = bpf_iter_init_seq_net(priv_data, aux);
3544 	if (err)
3545 		return err;
3546 
3547 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3548 	if (err) {
3549 		bpf_iter_fini_seq_net(priv_data);
3550 		return err;
3551 	}
3552 
3553 	return 0;
3554 }
3555 
3556 static void bpf_iter_fini_tcp(void *priv_data)
3557 {
3558 	struct bpf_tcp_iter_state *iter = priv_data;
3559 
3560 	bpf_iter_fini_seq_net(priv_data);
3561 	kvfree(iter->batch);
3562 }
3563 
3564 static const struct bpf_iter_seq_info tcp_seq_info = {
3565 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3566 	.init_seq_private	= bpf_iter_init_tcp,
3567 	.fini_seq_private	= bpf_iter_fini_tcp,
3568 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3569 };
3570 
3571 static const struct bpf_func_proto *
3572 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3573 			    const struct bpf_prog *prog)
3574 {
3575 	switch (func_id) {
3576 	case BPF_FUNC_setsockopt:
3577 		return &bpf_sk_setsockopt_proto;
3578 	case BPF_FUNC_getsockopt:
3579 		return &bpf_sk_getsockopt_proto;
3580 	default:
3581 		return NULL;
3582 	}
3583 }
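/*
 * Editor's sketch (BPF-side C, illustrative): the setsockopt/getsockopt
 * protos exposed above let an iter/tcp program retune sockets while it
 * walks the hash tables.  Modeled loosely on the kernel's bpf_iter
 * selftests; vmlinux.h availability, the helper usage and a built-in
 * "cubic" congestion control are assumptions of this sketch.
 */
#if 0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6
#define TCP_CONGESTION	13

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int set_cubic_everywhere(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	char cubic[] = "cubic";
	struct tcp_sock *tp;

	if (!sk_common)
		return 0;

	tp = bpf_skc_to_tcp_sock(sk_common);
	if (!tp)
		return 0;

	/* Full sockets are locked by bpf_iter_tcp_seq_show() above. */
	bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cubic, sizeof(cubic));
	return 0;
}
#endif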
3584 
3585 static struct bpf_iter_reg tcp_reg_info = {
3586 	.target			= "tcp",
3587 	.ctx_arg_info_size	= 1,
3588 	.ctx_arg_info		= {
3589 		{ offsetof(struct bpf_iter__tcp, sk_common),
3590 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3591 	},
3592 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3593 	.seq_info		= &tcp_seq_info,
3594 };
3595 
3596 static void __init bpf_iter_register(void)
3597 {
3598 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3599 	if (bpf_iter_reg_target(&tcp_reg_info))
3600 		pr_warn("Warning: could not register bpf iterator tcp\n");
3601 }
3602 
3603 #endif
3604 
3605 void __init tcp_v4_init(void)
3606 {
3607 	int cpu, res;
3608 
3609 	for_each_possible_cpu(cpu) {
3610 		struct sock *sk;
3611 
3612 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3613 					   IPPROTO_TCP, &init_net);
3614 		if (res)
3615 			panic("Failed to create the TCP control socket.\n");
3616 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3617 
3618 		/* Please enforce IP_DF and IPID==0 for RST and
3619 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3620 		 */
3621 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3622 
3623 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3624 	}
3625 	if (register_pernet_subsys(&tcp_sk_ops))
3626 		panic("Failed to create the TCP control socket.\n");
3627 
3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3629 	bpf_iter_register();
3630 #endif
3631 }
3632