xref: /linux/net/ipv6/ip6_output.c (revision 252442f2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
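/* Final transmit step for a routed packet: loop multicast back to local
 * listeners when required, honour lightweight-tunnel output, resolve (or
 * create) the neighbour entry for the route's nexthop and hand the skb to
 * neigh_output().  Headroom is grown first if the device needs more than
 * the skb currently has.
 */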
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
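/* Slow path for GSO packets whose segments do not fit the egress MTU:
 * segment in software, then send each segment through ip6_finish_output2(),
 * fragmenting only the segments that are still larger than the MTU.
 */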
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		/* Last GSO segment can be smaller than gso_size (and MTU).
168 		 * Adding a fragment header would produce an "atomic fragment",
169 		 * which is considered harmful (RFC-8021). Avoid that.
170 		 */
171 		err = segs->len > mtu ?
172 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
173 			ip6_finish_output2(net, sk, segs);
174 		if (err && ret == 0)
175 			ret = err;
176 	}
177 
178 	return ret;
179 }
180 
181 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
182 				 struct sk_buff *skb, unsigned int mtu)
183 {
184 	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
185 	    !skb_gso_validate_network_len(skb, mtu))
186 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
187 
188 	return ip6_finish_output2(net, sk, skb);
189 }
190 
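/* Decide how the packet leaves: re-run dst_output() if a policy lookup after
 * SNAT attached an xfrm state, handle GSO packets separately, and fragment
 * anything that exceeds the path MTU (or the largest fragment size recorded
 * by conntrack defrag) before calling ip6_finish_output2().
 */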
191 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
192 {
193 	unsigned int mtu;
194 
195 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
196 	/* Policy lookup after SNAT yielded a new policy */
197 	if (skb_dst(skb)->xfrm) {
198 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
199 		return dst_output(net, sk, skb);
200 	}
201 #endif
202 
203 	mtu = ip6_skb_dst_mtu(skb);
204 	if (skb_is_gso(skb))
205 		return ip6_finish_output_gso(net, sk, skb, mtu);
206 
207 	if (skb->len > mtu ||
208 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
209 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
210 
211 	return ip6_finish_output2(net, sk, skb);
212 }
213 
214 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	int ret;
217 
218 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
219 	switch (ret) {
220 	case NET_XMIT_SUCCESS:
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
225 		return ret;
226 	}
227 }
228 
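/* dst_output() entry point for locally generated IPv6 packets.  Drops the
 * packet if IPv6 is administratively disabled on the egress device, then runs
 * the NF_INET_POST_ROUTING hook unless the skb was already rerouted
 * (IP6SKB_REROUTED), in which case ip6_finish_output() is called directly.
 */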
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
249 
250 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
251 {
252 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
253 		return ip6_default_np_autolabel(net);
254 	return inet6_test_bit(AUTOFLOWLABEL, sk);
255 }
256 
257 /*
258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
259  * Note : socket lock is not held for SYNACK packets, but might be modified
260  * by calls to skb_set_owner_w() and ipv6_local_error(),
261  * which are using proper atomic operations or spinlocks.
262  */
263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
264 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 {
266 	struct net *net = sock_net(sk);
267 	const struct ipv6_pinfo *np = inet6_sk(sk);
268 	struct in6_addr *first_hop = &fl6->daddr;
269 	struct dst_entry *dst = skb_dst(skb);
270 	struct net_device *dev = dst->dev;
271 	struct inet6_dev *idev = ip6_dst_idev(dst);
272 	struct hop_jumbo_hdr *hop_jumbo;
273 	int hoplen = sizeof(*hop_jumbo);
274 	unsigned int head_room;
275 	struct ipv6hdr *hdr;
276 	u8  proto = fl6->flowi6_proto;
277 	int seg_len = skb->len;
278 	int hlimit = -1;
279 	u32 mtu;
280 
281 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
282 	if (opt)
283 		head_room += opt->opt_nflen + opt->opt_flen;
284 
285 	if (unlikely(head_room > skb_headroom(skb))) {
286 		skb = skb_expand_head(skb, head_room);
287 		if (!skb) {
288 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
289 			return -ENOBUFS;
290 		}
291 	}
292 
293 	if (opt) {
294 		seg_len += opt->opt_nflen + opt->opt_flen;
295 
296 		if (opt->opt_flen)
297 			ipv6_push_frag_opts(skb, opt, &proto);
298 
299 		if (opt->opt_nflen)
300 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
301 					     &fl6->saddr);
302 	}
303 
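	/* Payloads above IPV6_MAXPLEN (65535) do not fit in the 16-bit
	 * payload_len, so a hop-by-hop jumbo option carrying the real length
	 * is pushed and payload_len stays 0 (seg_len is reset to 0).
	 * IP6SKB_FAKEJUMBO records this so ip6_finish_output_gso() skips the
	 * MTU validation for such oversized GSO packets (used by BIG TCP).
	 */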
304 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
305 		hop_jumbo = skb_push(skb, hoplen);
306 
307 		hop_jumbo->nexthdr = proto;
308 		hop_jumbo->hdrlen = 0;
309 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
310 		hop_jumbo->tlv_len = 4;
311 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
312 
313 		proto = IPPROTO_HOPOPTS;
314 		seg_len = 0;
315 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
316 	}
317 
318 	skb_push(skb, sizeof(struct ipv6hdr));
319 	skb_reset_network_header(skb);
320 	hdr = ipv6_hdr(skb);
321 
322 	/*
323 	 *	Fill in the IPv6 header
324 	 */
325 	if (np)
326 		hlimit = READ_ONCE(np->hop_limit);
327 	if (hlimit < 0)
328 		hlimit = ip6_dst_hoplimit(dst);
329 
330 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
331 				ip6_autoflowlabel(net, sk), fl6));
332 
333 	hdr->payload_len = htons(seg_len);
334 	hdr->nexthdr = proto;
335 	hdr->hop_limit = hlimit;
336 
337 	hdr->saddr = fl6->saddr;
338 	hdr->daddr = *first_hop;
339 
340 	skb->protocol = htons(ETH_P_IPV6);
341 	skb->priority = priority;
342 	skb->mark = mark;
343 
344 	mtu = dst_mtu(dst);
345 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
346 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
347 
348 		/* if egress device is enslaved to an L3 master device pass the
349 		 * skb to its handler for processing
350 		 */
351 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
352 		if (unlikely(!skb))
353 			return 0;
354 
355 		/* hooks should never assume socket lock is held.
356 		 * we promote our socket to non const
357 		 */
358 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
359 			       net, (struct sock *)sk, skb, NULL, dev,
360 			       dst_output);
361 	}
362 
363 	skb->dev = dev;
364 	/* ipv6_local_error() does not require socket lock,
365 	 * we promote our socket to non const
366 	 */
367 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
368 
369 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
370 	kfree_skb(skb);
371 	return -EMSGSIZE;
372 }
373 EXPORT_SYMBOL(ip6_xmit);
374 
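/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain that asked for this alert value and is either unbound or
 * bound to the receiving device.  Returns 1 if at least one socket consumed
 * the skb, 0 if the caller still owns it.
 */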
375 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
376 {
377 	struct ip6_ra_chain *ra;
378 	struct sock *last = NULL;
379 
380 	read_lock(&ip6_ra_lock);
381 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
382 		struct sock *sk = ra->sk;
383 		if (sk && ra->sel == sel &&
384 		    (!sk->sk_bound_dev_if ||
385 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
386 
387 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
388 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
389 				continue;
390 			}
391 			if (last) {
392 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
393 				if (skb2)
394 					rawv6_rcv(last, skb2);
395 			}
396 			last = sk;
397 		}
398 	}
399 
400 	if (last) {
401 		rawv6_rcv(last, skb);
402 		read_unlock(&ip6_ra_lock);
403 		return 1;
404 	}
405 	read_unlock(&ip6_ra_lock);
406 	return 0;
407 }
408 
409 static int ip6_forward_proxy_check(struct sk_buff *skb)
410 {
411 	struct ipv6hdr *hdr = ipv6_hdr(skb);
412 	u8 nexthdr = hdr->nexthdr;
413 	__be16 frag_off;
414 	int offset;
415 
416 	if (ipv6_ext_hdr(nexthdr)) {
417 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
418 		if (offset < 0)
419 			return 0;
420 	} else
421 		offset = sizeof(struct ipv6hdr);
422 
423 	if (nexthdr == IPPROTO_ICMPV6) {
424 		struct icmp6hdr *icmp6;
425 
426 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
427 					 offset + 1 - skb->data)))
428 			return 0;
429 
430 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
431 
432 		switch (icmp6->icmp6_type) {
433 		case NDISC_ROUTER_SOLICITATION:
434 		case NDISC_ROUTER_ADVERTISEMENT:
435 		case NDISC_NEIGHBOUR_SOLICITATION:
436 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
437 		case NDISC_REDIRECT:
438 			/* For unicast neighbor discovery messages destined to
439 			 * the proxied address, pass the packet to the input
440 			 * function.
441 			 */
442 			return 1;
443 		default:
444 			break;
445 		}
446 	}
447 
448 	/*
449 	 * The proxying router can't forward traffic sent to a link-local
450 	 * address, so signal the sender and discard the packet. This
451 	 * behavior is clarified by the MIPv6 specification.
452 	 */
453 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
454 		dst_link_failure(skb);
455 		return -1;
456 	}
457 
458 	return 0;
459 }
460 
461 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
462 				     struct sk_buff *skb)
463 {
464 #ifdef CONFIG_NET_SWITCHDEV
465 	if (skb->offload_l3_fwd_mark) {
466 		consume_skb(skb);
467 		return 0;
468 	}
469 #endif
470 
471 	skb_clear_tstamp(skb);
472 	return dst_output(net, sk, skb);
473 }
474 
475 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
476 {
477 	if (skb->len <= mtu)
478 		return false;
479 
480 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
481 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
482 		return true;
483 
484 	if (skb->ignore_df)
485 		return false;
486 
487 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
488 		return false;
489 
490 	return true;
491 }
492 
493 int ip6_forward(struct sk_buff *skb)
494 {
495 	struct dst_entry *dst = skb_dst(skb);
496 	struct ipv6hdr *hdr = ipv6_hdr(skb);
497 	struct inet6_skb_parm *opt = IP6CB(skb);
498 	struct net *net = dev_net(dst->dev);
499 	struct inet6_dev *idev;
500 	SKB_DR(reason);
501 	u32 mtu;
502 
503 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
504 	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
505 		goto error;
506 
507 	if (skb->pkt_type != PACKET_HOST)
508 		goto drop;
509 
510 	if (unlikely(skb->sk))
511 		goto drop;
512 
513 	if (skb_warn_if_lro(skb))
514 		goto drop;
515 
516 	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
517 	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
518 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
519 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
520 		goto drop;
521 	}
522 
523 	skb_forward_csum(skb);
524 
525 	/*
526 	 *	We DO NOT make any processing on
527 	 *	RA packets, pushing them to user level AS IS
528 	 *	without any WARRANTY that application will be able
529 	 *	to interpret them. The reason is that we
530 	 *	cannot make anything clever here.
531 	 *
532 	 *	We are not end-node, so that if packet contains
533 	 *	AH/ESP, we cannot make anything.
534 	 *	Defragmentation also would be a mistake, RA packets
535 	 *	cannot be fragmented, because there is no warranty
536 	 *	that different fragments will go along one path. --ANK
537 	 */
538 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
539 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
540 			return 0;
541 	}
542 
543 	/*
544 	 *	check and decrement ttl
545 	 */
546 	if (hdr->hop_limit <= 1) {
547 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
548 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
549 
550 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
551 		return -ETIMEDOUT;
552 	}
553 
554 	/* XXX: idev->cnf.proxy_ndp? */
555 	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
556 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
557 		int proxied = ip6_forward_proxy_check(skb);
558 		if (proxied > 0) {
559 			/* It's tempting to decrease the hop limit
560 			 * here by 1, as we do at the end of the
561 			 * function too.
562 			 *
563 			 * But that would be incorrect, as proxying is
564 			 * not forwarding.  The ip6_input function
565 			 * will handle this packet locally, and it
566 			 * depends on the hop limit being unchanged.
567 			 *
568 			 * One example is the NDP hop limit, that
569 			 * always has to stay 255, but other would be
570 			 * similar checks around RA packets, where the
571 			 * user can even change the desired limit.
572 			 */
573 			return ip6_input(skb);
574 		} else if (proxied < 0) {
575 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
576 			goto drop;
577 		}
578 	}
579 
580 	if (!xfrm6_route_forward(skb)) {
581 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
582 		SKB_DR_SET(reason, XFRM_POLICY);
583 		goto drop;
584 	}
585 	dst = skb_dst(skb);
586 
587 	/* IPv6 specs say nothing about it, but it is clear that we cannot
588 	   send redirects to source routed frames.
589 	   We don't send redirects to frames decapsulated from IPsec.
590 	 */
591 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
592 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
593 		struct in6_addr *target = NULL;
594 		struct inet_peer *peer;
595 		struct rt6_info *rt;
596 
597 		/*
598 		 *	incoming and outgoing devices are the same;
599 		 *	send a redirect.
600 		 */
601 
602 		rt = dst_rt6_info(dst);
603 		if (rt->rt6i_flags & RTF_GATEWAY)
604 			target = &rt->rt6i_gateway;
605 		else
606 			target = &hdr->daddr;
607 
608 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
609 
610 		/* Limit redirects both by destination (here)
611 		   and by source (inside ndisc_send_redirect)
612 		 */
613 		if (inet_peer_xrlim_allow(peer, 1*HZ))
614 			ndisc_send_redirect(skb, target);
615 		if (peer)
616 			inet_putpeer(peer);
617 	} else {
618 		int addrtype = ipv6_addr_type(&hdr->saddr);
619 
620 		/* This check is security critical. */
621 		if (addrtype == IPV6_ADDR_ANY ||
622 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
623 			goto error;
624 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
625 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
626 				    ICMPV6_NOT_NEIGHBOUR, 0);
627 			goto error;
628 		}
629 	}
630 
631 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
632 
633 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
634 	if (mtu < IPV6_MIN_MTU)
635 		mtu = IPV6_MIN_MTU;
636 
637 	if (ip6_pkt_too_big(skb, mtu)) {
638 		/* Again, force OUTPUT device used as source address */
639 		skb->dev = dst->dev;
640 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
641 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
642 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
643 				IPSTATS_MIB_FRAGFAILS);
644 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
645 		return -EMSGSIZE;
646 	}
647 
648 	if (skb_cow(skb, dst->dev->hard_header_len)) {
649 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
650 				IPSTATS_MIB_OUTDISCARDS);
651 		goto drop;
652 	}
653 
654 	hdr = ipv6_hdr(skb);
655 
656 	/* Mangling hops number delayed to point after skb COW */
657 
658 	hdr->hop_limit--;
659 
660 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
661 		       net, NULL, skb, skb->dev, dst->dev,
662 		       ip6_forward_finish);
663 
664 error:
665 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
666 	SKB_DR_SET(reason, IP_INADDRERRORS);
667 drop:
668 	kfree_skb_reason(skb, reason);
669 	return -EINVAL;
670 }
671 
672 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
673 {
674 	to->pkt_type = from->pkt_type;
675 	to->priority = from->priority;
676 	to->protocol = from->protocol;
677 	skb_dst_drop(to);
678 	skb_dst_set(to, dst_clone(skb_dst(from)));
679 	to->dev = from->dev;
680 	to->mark = from->mark;
681 
682 	skb_copy_hash(to, from);
683 
684 #ifdef CONFIG_NET_SCHED
685 	to->tc_index = from->tc_index;
686 #endif
687 	nf_copy(to, from);
688 	skb_ext_copy(to, from);
689 	skb_copy_secmark(to, from);
690 }
691 
692 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
693 		      u8 nexthdr, __be32 frag_id,
694 		      struct ip6_fraglist_iter *iter)
695 {
696 	unsigned int first_len;
697 	struct frag_hdr *fh;
698 
699 	/* BUILD HEADER */
700 	*prevhdr = NEXTHDR_FRAGMENT;
701 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
702 	if (!iter->tmp_hdr)
703 		return -ENOMEM;
704 
705 	iter->frag = skb_shinfo(skb)->frag_list;
706 	skb_frag_list_init(skb);
707 
708 	iter->offset = 0;
709 	iter->hlen = hlen;
710 	iter->frag_id = frag_id;
711 	iter->nexthdr = nexthdr;
712 
713 	__skb_pull(skb, hlen);
714 	fh = __skb_push(skb, sizeof(struct frag_hdr));
715 	__skb_push(skb, hlen);
716 	skb_reset_network_header(skb);
717 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
718 
719 	fh->nexthdr = nexthdr;
720 	fh->reserved = 0;
721 	fh->frag_off = htons(IP6_MF);
722 	fh->identification = frag_id;
723 
724 	first_len = skb_pagelen(skb);
725 	skb->data_len = first_len - skb_headlen(skb);
726 	skb->len = first_len;
727 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
728 
729 	return 0;
730 }
731 EXPORT_SYMBOL(ip6_fraglist_init);
732 
733 void ip6_fraglist_prepare(struct sk_buff *skb,
734 			  struct ip6_fraglist_iter *iter)
735 {
736 	struct sk_buff *frag = iter->frag;
737 	unsigned int hlen = iter->hlen;
738 	struct frag_hdr *fh;
739 
740 	frag->ip_summed = CHECKSUM_NONE;
741 	skb_reset_transport_header(frag);
742 	fh = __skb_push(frag, sizeof(struct frag_hdr));
743 	__skb_push(frag, hlen);
744 	skb_reset_network_header(frag);
745 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
746 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
747 	fh->nexthdr = iter->nexthdr;
748 	fh->reserved = 0;
749 	fh->frag_off = htons(iter->offset);
750 	if (frag->next)
751 		fh->frag_off |= htons(IP6_MF);
752 	fh->identification = iter->frag_id;
753 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
754 	ip6_copy_metadata(frag, skb);
755 }
756 EXPORT_SYMBOL(ip6_fraglist_prepare);
757 
758 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
759 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
760 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
761 {
762 	state->prevhdr = prevhdr;
763 	state->nexthdr = nexthdr;
764 	state->frag_id = frag_id;
765 
766 	state->hlen = hlen;
767 	state->mtu = mtu;
768 
769 	state->left = skb->len - hlen;	/* Space per frame */
770 	state->ptr = hlen;		/* Where to start from */
771 
772 	state->hroom = hdr_room;
773 	state->troom = needed_tailroom;
774 
775 	state->offset = 0;
776 }
777 EXPORT_SYMBOL(ip6_frag_init);
778 
779 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
780 {
781 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
782 	struct sk_buff *frag;
783 	struct frag_hdr *fh;
784 	unsigned int len;
785 
786 	len = state->left;
787 	/* IF: it doesn't fit, use 'mtu' - the data space left */
788 	if (len > state->mtu)
789 		len = state->mtu;
790 	/* IF: we are not sending up to and including the packet end
791 	   then align the next start on an eight byte boundary */
792 	if (len < state->left)
793 		len &= ~7;
794 
795 	/* Allocate buffer */
796 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
797 			 state->hroom + state->troom, GFP_ATOMIC);
798 	if (!frag)
799 		return ERR_PTR(-ENOMEM);
800 
801 	/*
802 	 *	Set up data on packet
803 	 */
804 
805 	ip6_copy_metadata(frag, skb);
806 	skb_reserve(frag, state->hroom);
807 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
808 	skb_reset_network_header(frag);
809 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
810 	frag->transport_header = (frag->network_header + state->hlen +
811 				  sizeof(struct frag_hdr));
812 
813 	/*
814 	 *	Charge the memory for the fragment to any owner
815 	 *	it might possess
816 	 */
817 	if (skb->sk)
818 		skb_set_owner_w(frag, skb->sk);
819 
820 	/*
821 	 *	Copy the packet header into the new buffer.
822 	 */
823 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
824 
825 	fragnexthdr_offset = skb_network_header(frag);
826 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
827 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
828 
829 	/*
830 	 *	Build fragment header.
831 	 */
832 	fh->nexthdr = state->nexthdr;
833 	fh->reserved = 0;
834 	fh->identification = state->frag_id;
835 
836 	/*
837 	 *	Copy a block of the IP datagram.
838 	 */
839 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
840 			     len));
841 	state->left -= len;
842 
843 	fh->frag_off = htons(state->offset);
844 	if (state->left > 0)
845 		fh->frag_off |= htons(IP6_MF);
846 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
847 
848 	state->ptr += len;
849 	state->offset += len;
850 
851 	return frag;
852 }
853 EXPORT_SYMBOL(ip6_frag_next);
854 
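/* Fragment an IPv6 packet.  If the skb carries a well-formed frag_list
 * (every fragment fits the MTU, is 8-byte aligned and has enough headroom),
 * the fragments are reused in place via ip6_fraglist_init/prepare(); any
 * other layout falls back to the slow path, which allocates a fresh skb per
 * fragment with ip6_frag_next() and copies the data into it.
 */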
855 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
856 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
857 {
858 	struct sk_buff *frag;
859 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
860 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
861 				inet6_sk(skb->sk) : NULL;
862 	u8 tstamp_type = skb->tstamp_type;
863 	struct ip6_frag_state state;
864 	unsigned int mtu, hlen, nexthdr_offset;
865 	ktime_t tstamp = skb->tstamp;
866 	int hroom, err = 0;
867 	__be32 frag_id;
868 	u8 *prevhdr, nexthdr = 0;
869 
870 	err = ip6_find_1stfragopt(skb, &prevhdr);
871 	if (err < 0)
872 		goto fail;
873 	hlen = err;
874 	nexthdr = *prevhdr;
875 	nexthdr_offset = prevhdr - skb_network_header(skb);
876 
877 	mtu = ip6_skb_dst_mtu(skb);
878 
879 	/* We must not fragment if the socket is set to force MTU discovery
880 	 * or if the skb is not generated by a local socket.
881 	 */
882 	if (unlikely(!skb->ignore_df && skb->len > mtu))
883 		goto fail_toobig;
884 
885 	if (IP6CB(skb)->frag_max_size) {
886 		if (IP6CB(skb)->frag_max_size > mtu)
887 			goto fail_toobig;
888 
889 		/* don't send fragments larger than what we received */
890 		mtu = IP6CB(skb)->frag_max_size;
891 		if (mtu < IPV6_MIN_MTU)
892 			mtu = IPV6_MIN_MTU;
893 	}
894 
895 	if (np) {
896 		u32 frag_size = READ_ONCE(np->frag_size);
897 
898 		if (frag_size && frag_size < mtu)
899 			mtu = frag_size;
900 	}
901 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
902 		goto fail_toobig;
903 	mtu -= hlen + sizeof(struct frag_hdr);
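	/* 'mtu' is now the per-fragment payload budget after the unfragmentable
	 * part and the 8-byte fragment header.  E.g. a 1500-byte link MTU with a
	 * plain 40-byte IPv6 header leaves 1500 - 40 - 8 = 1452 bytes; non-final
	 * fragments are then trimmed to a multiple of 8 (1448) in ip6_frag_next().
	 */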
904 
905 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
906 				    &ipv6_hdr(skb)->saddr);
907 
908 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
909 	    (err = skb_checksum_help(skb)))
910 		goto fail;
911 
912 	prevhdr = skb_network_header(skb) + nexthdr_offset;
913 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
914 	if (skb_has_frag_list(skb)) {
915 		unsigned int first_len = skb_pagelen(skb);
916 		struct ip6_fraglist_iter iter;
917 		struct sk_buff *frag2;
918 
919 		if (first_len - hlen > mtu ||
920 		    ((first_len - hlen) & 7) ||
921 		    skb_cloned(skb) ||
922 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
923 			goto slow_path;
924 
925 		skb_walk_frags(skb, frag) {
926 			/* Correct geometry. */
927 			if (frag->len > mtu ||
928 			    ((frag->len & 7) && frag->next) ||
929 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
930 				goto slow_path_clean;
931 
932 			/* Partially cloned skb? */
933 			if (skb_shared(frag))
934 				goto slow_path_clean;
935 
936 			BUG_ON(frag->sk);
937 			if (skb->sk) {
938 				frag->sk = skb->sk;
939 				frag->destructor = sock_wfree;
940 			}
941 			skb->truesize -= frag->truesize;
942 		}
943 
944 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
945 					&iter);
946 		if (err < 0)
947 			goto fail;
948 
949 		/* We prevent @rt from being freed. */
950 		rcu_read_lock();
951 
952 		for (;;) {
953 			/* Prepare header of the next frame,
954 			 * before the previous one goes down. */
955 			if (iter.frag)
956 				ip6_fraglist_prepare(skb, &iter);
957 
958 			skb_set_delivery_time(skb, tstamp, tstamp_type);
959 			err = output(net, sk, skb);
960 			if (!err)
961 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
962 					      IPSTATS_MIB_FRAGCREATES);
963 
964 			if (err || !iter.frag)
965 				break;
966 
967 			skb = ip6_fraglist_next(&iter);
968 		}
969 
970 		kfree(iter.tmp_hdr);
971 
972 		if (err == 0) {
973 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
974 				      IPSTATS_MIB_FRAGOKS);
975 			rcu_read_unlock();
976 			return 0;
977 		}
978 
979 		kfree_skb_list(iter.frag);
980 
981 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
982 			      IPSTATS_MIB_FRAGFAILS);
983 		rcu_read_unlock();
984 		return err;
985 
986 slow_path_clean:
987 		skb_walk_frags(skb, frag2) {
988 			if (frag2 == frag)
989 				break;
990 			frag2->sk = NULL;
991 			frag2->destructor = NULL;
992 			skb->truesize += frag2->truesize;
993 		}
994 	}
995 
996 slow_path:
997 	/*
998 	 *	Fragment the datagram.
999 	 */
1000 
1001 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1002 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1003 		      &state);
1004 
1005 	/*
1006 	 *	Keep copying data until we run out.
1007 	 */
1008 
1009 	while (state.left > 0) {
1010 		frag = ip6_frag_next(skb, &state);
1011 		if (IS_ERR(frag)) {
1012 			err = PTR_ERR(frag);
1013 			goto fail;
1014 		}
1015 
1016 		/*
1017 		 *	Put this fragment into the sending queue.
1018 		 */
1019 		skb_set_delivery_time(frag, tstamp, tstamp_type);
1020 		err = output(net, sk, frag);
1021 		if (err)
1022 			goto fail;
1023 
1024 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1025 			      IPSTATS_MIB_FRAGCREATES);
1026 	}
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGOKS);
1029 	consume_skb(skb);
1030 	return err;
1031 
1032 fail_toobig:
1033 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1034 	err = -EMSGSIZE;
1035 
1036 fail:
1037 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1038 		      IPSTATS_MIB_FRAGFAILS);
1039 	kfree_skb(skb);
1040 	return err;
1041 }
1042 
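/* Helper for ip6_sk_dst_check(): nonzero means the cached route cannot be
 * validated for this flow - it is neither a /128 host route to exactly this
 * address nor covered by the socket's cached destination/source address.
 */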
1043 static inline int ip6_rt_check(const struct rt6key *rt_key,
1044 			       const struct in6_addr *fl_addr,
1045 			       const struct in6_addr *addr_cache)
1046 {
1047 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1048 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1049 }
1050 
1051 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1052 					  struct dst_entry *dst,
1053 					  const struct flowi6 *fl6)
1054 {
1055 	struct ipv6_pinfo *np = inet6_sk(sk);
1056 	struct rt6_info *rt;
1057 
1058 	if (!dst)
1059 		goto out;
1060 
1061 	if (dst->ops->family != AF_INET6) {
1062 		dst_release(dst);
1063 		return NULL;
1064 	}
1065 
1066 	rt = dst_rt6_info(dst);
1067 	/* Yes, checking route validity in the not-connected
1068 	 * case is not very simple. Take into account
1069 	 * that we do not support routing by source, TOS,
1070 	 * and MSG_DONTROUTE		--ANK (980726)
1071 	 *
1072 	 * 1. ip6_rt_check(): If route was host route,
1073 	 *    check that cached destination is current.
1074 	 *    If it is network route, we still may
1075 	 *    check its validity using saved pointer
1076 	 *    to the last used address: daddr_cache.
1077 	 *    We do not want to save whole address now,
1078 	 *    (because main consumer of this service
1079 	 *    is tcp, which does not have this problem),
1080 	 *    so that the last trick works only on connected
1081 	 *    sockets.
1082 	 * 2. oif also should be the same.
1083 	 */
1084 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1085 #ifdef CONFIG_IPV6_SUBTREES
1086 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1087 #endif
1088 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1089 		dst_release(dst);
1090 		dst = NULL;
1091 	}
1092 
1093 out:
1094 	return dst;
1095 }
1096 
1097 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1098 			       struct dst_entry **dst, struct flowi6 *fl6)
1099 {
1100 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1101 	struct neighbour *n;
1102 	struct rt6_info *rt;
1103 #endif
1104 	int err;
1105 	int flags = 0;
1106 
1107 	/* The correct way to handle this would be to do
1108 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1109 	 * the route-specific preferred source forces the
1110 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1111 	 *
1112 	 * In source specific routing (no src=any default route),
1113 	 * ip6_route_output will fail given src=any saddr, though, so
1114 	 * that's why we try it again later.
1115 	 */
1116 	if (ipv6_addr_any(&fl6->saddr)) {
1117 		struct fib6_info *from;
1118 		struct rt6_info *rt;
1119 
1120 		*dst = ip6_route_output(net, sk, fl6);
1121 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1122 
1123 		rcu_read_lock();
1124 		from = rt ? rcu_dereference(rt->from) : NULL;
1125 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1126 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1127 					  fl6->flowi6_l3mdev,
1128 					  &fl6->saddr);
1129 		rcu_read_unlock();
1130 
1131 		if (err)
1132 			goto out_err_release;
1133 
1134 		/* If we had an erroneous initial result, pretend it
1135 		 * never existed and let the SA-enabled version take
1136 		 * over.
1137 		 */
1138 		if ((*dst)->error) {
1139 			dst_release(*dst);
1140 			*dst = NULL;
1141 		}
1142 
1143 		if (fl6->flowi6_oif)
1144 			flags |= RT6_LOOKUP_F_IFACE;
1145 	}
1146 
1147 	if (!*dst)
1148 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1149 
1150 	err = (*dst)->error;
1151 	if (err)
1152 		goto out_err_release;
1153 
1154 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1155 	/*
1156 	 * Here if the dst entry we've looked up
1157 	 * has a neighbour entry that is in the INCOMPLETE
1158 	 * state and the src address from the flow is
1159 	 * marked as OPTIMISTIC, we release the found
1160 	 * dst entry and replace it instead with the
1161 	 * dst entry of the nexthop router
1162 	 */
1163 	rt = dst_rt6_info(*dst);
1164 	rcu_read_lock();
1165 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1166 				      rt6_nexthop(rt, &fl6->daddr));
1167 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1168 	rcu_read_unlock();
1169 
1170 	if (err) {
1171 		struct inet6_ifaddr *ifp;
1172 		struct flowi6 fl_gw6;
1173 		int redirect;
1174 
1175 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1176 				      (*dst)->dev, 1);
1177 
1178 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1179 		if (ifp)
1180 			in6_ifa_put(ifp);
1181 
1182 		if (redirect) {
1183 			/*
1184 			 * We need to get the dst entry for the
1185 			 * default router instead
1186 			 */
1187 			dst_release(*dst);
1188 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1189 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1190 			*dst = ip6_route_output(net, sk, &fl_gw6);
1191 			err = (*dst)->error;
1192 			if (err)
1193 				goto out_err_release;
1194 		}
1195 	}
1196 #endif
1197 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1198 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1199 		err = -EAFNOSUPPORT;
1200 		goto out_err_release;
1201 	}
1202 
1203 	return 0;
1204 
1205 out_err_release:
1206 	dst_release(*dst);
1207 	*dst = NULL;
1208 
1209 	if (err == -ENETUNREACH)
1210 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1211 	return err;
1212 }
1213 
1214 /**
1215  *	ip6_dst_lookup - perform route lookup on flow
1216  *	@net: Network namespace to perform lookup in
1217  *	@sk: socket which provides route info
1218  *	@dst: pointer to dst_entry * for result
1219  *	@fl6: flow to lookup
1220  *
1221  *	This function performs a route lookup on the given flow.
1222  *
1223  *	It returns zero on success, or a standard errno code on error.
1224  */
1225 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1226 		   struct flowi6 *fl6)
1227 {
1228 	*dst = NULL;
1229 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1230 }
1231 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1232 
1233 /**
1234  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1235  *	@net: Network namespace to perform lookup in
1236  *	@sk: socket which provides route info
1237  *	@fl6: flow to lookup
1238  *	@final_dst: final destination address for ipsec lookup
1239  *
1240  *	This function performs a route lookup on the given flow.
1241  *
1242  *	It returns a valid dst pointer on success, or a pointer encoded
1243  *	error code.
1244  */
1245 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1246 				      const struct in6_addr *final_dst)
1247 {
1248 	struct dst_entry *dst = NULL;
1249 	int err;
1250 
1251 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1252 	if (err)
1253 		return ERR_PTR(err);
1254 	if (final_dst)
1255 		fl6->daddr = *final_dst;
1256 
1257 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1258 }
1259 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1260 
1261 /**
1262  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1263  *	@sk: socket which provides the dst cache and route info
1264  *	@fl6: flow to lookup
1265  *	@final_dst: final destination address for ipsec lookup
1266  *	@connected: whether @sk is connected or not
1267  *
1268  *	This function performs a route lookup on the given flow with the
1269  *	possibility of using the cached route in the socket if it is valid.
1270  *	It will take the socket dst lock when operating on the dst cache.
1271  *	As a result, this function can only be used in process context.
1272  *
1273  *	In addition, for a connected socket, cache the dst in the socket
1274  *	if the current cache is not valid.
1275  *
1276  *	It returns a valid dst pointer on success, or a pointer encoded
1277  *	error code.
1278  */
1279 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1280 					 const struct in6_addr *final_dst,
1281 					 bool connected)
1282 {
1283 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1284 
1285 	dst = ip6_sk_dst_check(sk, dst, fl6);
1286 	if (dst)
1287 		return dst;
1288 
1289 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1290 	if (connected && !IS_ERR(dst))
1291 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1292 
1293 	return dst;
1294 }
1295 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1296 
1297 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1298 					       gfp_t gfp)
1299 {
1300 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1301 }
1302 
1303 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1304 						gfp_t gfp)
1305 {
1306 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1307 }
1308 
1309 static void ip6_append_data_mtu(unsigned int *mtu,
1310 				int *maxfraglen,
1311 				unsigned int fragheaderlen,
1312 				struct sk_buff *skb,
1313 				struct rt6_info *rt,
1314 				unsigned int orig_mtu)
1315 {
1316 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1317 		if (!skb) {
1318 			/* first fragment, reserve header_len */
1319 			*mtu = orig_mtu - rt->dst.header_len;
1320 
1321 		} else {
1322 			/*
1323 			 * this fragment is not the first, the header
1324 			 * space is regarded as data space.
1325 			 */
1326 			*mtu = orig_mtu;
1327 		}
1328 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1329 			      + fragheaderlen - sizeof(struct frag_hdr);
1330 	}
1331 }
1332 
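/* Prepare the per-socket cork for ip6_append_data(): take over the dst
 * reference passed by the caller, deep-copy the tx options so they survive
 * until the corked data is pushed, and record hop limit, traffic class,
 * fragment size (path MTU, possibly capped by the socket's frag_size), mark
 * and timestamp flags for the pending packets.
 */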
1333 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1334 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1335 			  struct rt6_info *rt)
1336 {
1337 	struct ipv6_pinfo *np = inet6_sk(sk);
1338 	unsigned int mtu, frag_size;
1339 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1340 
1341 	/* callers pass dst together with a reference, set it first so
1342 	 * ip6_cork_release() can put it down even in case of an error.
1343 	 */
1344 	cork->base.dst = &rt->dst;
1345 
1346 	/*
1347 	 * setup for corking
1348 	 */
1349 	if (opt) {
1350 		if (WARN_ON(v6_cork->opt))
1351 			return -EINVAL;
1352 
1353 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1354 		if (unlikely(!nopt))
1355 			return -ENOBUFS;
1356 
1357 		nopt->tot_len = sizeof(*opt);
1358 		nopt->opt_flen = opt->opt_flen;
1359 		nopt->opt_nflen = opt->opt_nflen;
1360 
1361 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1362 		if (opt->dst0opt && !nopt->dst0opt)
1363 			return -ENOBUFS;
1364 
1365 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1366 		if (opt->dst1opt && !nopt->dst1opt)
1367 			return -ENOBUFS;
1368 
1369 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1370 		if (opt->hopopt && !nopt->hopopt)
1371 			return -ENOBUFS;
1372 
1373 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1374 		if (opt->srcrt && !nopt->srcrt)
1375 			return -ENOBUFS;
1376 
1377 		/* need source address above miyazawa */
1378 	}
1379 	v6_cork->hop_limit = ipc6->hlimit;
1380 	v6_cork->tclass = ipc6->tclass;
1381 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1382 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1383 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1384 	else
1385 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1386 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1387 
1388 	frag_size = READ_ONCE(np->frag_size);
1389 	if (frag_size && frag_size < mtu)
1390 		mtu = frag_size;
1391 
1392 	cork->base.fragsize = mtu;
1393 	cork->base.gso_size = ipc6->gso_size;
1394 	cork->base.tx_flags = 0;
1395 	cork->base.mark = ipc6->sockc.mark;
1396 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1397 
1398 	cork->base.length = 0;
1399 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1400 
1401 	return 0;
1402 }
1403 
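/* Core of ip6_append_data() and friends: append 'length' bytes from 'from'
 * (via getfrag) to the tail of 'queue', growing the last skb while it has
 * room and otherwise allocating new buffers sized against the corked MTU /
 * maxfraglen.  Handles MSG_MORE coalescing, MSG_ZEROCOPY, MSG_SPLICE_PAGES
 * and page_frag backed payloads; the queued skbs are only turned into real
 * packets/fragments when the cork is released.
 */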
1404 static int __ip6_append_data(struct sock *sk,
1405 			     struct sk_buff_head *queue,
1406 			     struct inet_cork_full *cork_full,
1407 			     struct inet6_cork *v6_cork,
1408 			     struct page_frag *pfrag,
1409 			     int getfrag(void *from, char *to, int offset,
1410 					 int len, int odd, struct sk_buff *skb),
1411 			     void *from, size_t length, int transhdrlen,
1412 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1413 {
1414 	struct sk_buff *skb, *skb_prev = NULL;
1415 	struct inet_cork *cork = &cork_full->base;
1416 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1417 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1418 	struct ubuf_info *uarg = NULL;
1419 	int exthdrlen = 0;
1420 	int dst_exthdrlen = 0;
1421 	int hh_len;
1422 	int copy;
1423 	int err;
1424 	int offset = 0;
1425 	bool zc = false;
1426 	u32 tskey = 0;
1427 	struct rt6_info *rt = dst_rt6_info(cork->dst);
1428 	bool paged, hold_tskey, extra_uref = false;
1429 	struct ipv6_txoptions *opt = v6_cork->opt;
1430 	int csummode = CHECKSUM_NONE;
1431 	unsigned int maxnonfragsize, headersize;
1432 	unsigned int wmem_alloc_delta = 0;
1433 
1434 	skb = skb_peek_tail(queue);
1435 	if (!skb) {
1436 		exthdrlen = opt ? opt->opt_flen : 0;
1437 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1438 	}
1439 
1440 	paged = !!cork->gso_size;
1441 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1442 	orig_mtu = mtu;
1443 
1444 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1445 
1446 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1447 			(opt ? opt->opt_nflen : 0);
1448 
1449 	headersize = sizeof(struct ipv6hdr) +
1450 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1451 		     rt->rt6i_nfheader_len;
1452 
1453 	if (mtu <= fragheaderlen ||
1454 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1455 		goto emsgsize;
1456 
1457 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1458 		     sizeof(struct frag_hdr);
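	/* Quick sanity check of the arithmetic: with mtu = 1500 and
	 * fragheaderlen = 40 (plain IPv6 header, no extension headers),
	 * maxfraglen = (1460 & ~7) + 40 - 8 = 1488, i.e. each non-final
	 * fragment carries at most 1448 bytes of payload - a multiple of 8 -
	 * and still fits the MTU once the 8-byte fragment header is added
	 * (40 + 8 + 1448 = 1496).
	 */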
1459 
1460 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1461 	 * the first fragment
1462 	 */
1463 	if (headersize + transhdrlen > mtu)
1464 		goto emsgsize;
1465 
1466 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1467 	    (sk->sk_protocol == IPPROTO_UDP ||
1468 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1469 	     sk->sk_protocol == IPPROTO_RAW)) {
1470 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1471 				sizeof(struct ipv6hdr));
1472 		goto emsgsize;
1473 	}
1474 
1475 	if (ip6_sk_ignore_df(sk))
1476 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1477 	else
1478 		maxnonfragsize = mtu;
1479 
1480 	if (cork->length + length > maxnonfragsize - headersize) {
1481 emsgsize:
1482 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1483 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1484 		return -EMSGSIZE;
1485 	}
1486 
1487 	/* CHECKSUM_PARTIAL only with no extension headers and when
1488 	 * we are not going to fragment
1489 	 */
1490 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1491 	    headersize == sizeof(struct ipv6hdr) &&
1492 	    length <= mtu - headersize &&
1493 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1494 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1495 		csummode = CHECKSUM_PARTIAL;
1496 
1497 	if ((flags & MSG_ZEROCOPY) && length) {
1498 		struct msghdr *msg = from;
1499 
1500 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1501 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1502 				return -EINVAL;
1503 
1504 			/* Leave uarg NULL if can't zerocopy, callers should
1505 			 * be able to handle it.
1506 			 */
1507 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1508 			    csummode == CHECKSUM_PARTIAL) {
1509 				paged = true;
1510 				zc = true;
1511 				uarg = msg->msg_ubuf;
1512 			}
1513 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1514 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1515 			if (!uarg)
1516 				return -ENOBUFS;
1517 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1518 			if (rt->dst.dev->features & NETIF_F_SG &&
1519 			    csummode == CHECKSUM_PARTIAL) {
1520 				paged = true;
1521 				zc = true;
1522 			} else {
1523 				uarg_to_msgzc(uarg)->zerocopy = 0;
1524 				skb_zcopy_set(skb, uarg, &extra_uref);
1525 			}
1526 		}
1527 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1528 		if (inet_test_bit(HDRINCL, sk))
1529 			return -EPERM;
1530 		if (rt->dst.dev->features & NETIF_F_SG &&
1531 		    getfrag == ip_generic_getfrag)
1532 			/* We need an empty buffer to attach stuff to */
1533 			paged = true;
1534 		else
1535 			flags &= ~MSG_SPLICE_PAGES;
1536 	}
1537 
1538 	hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
1539 		     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
1540 	if (hold_tskey)
1541 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1542 
1543 	/*
1544 	 * Let's try using as much space as possible.
1545 	 * Use MTU if total length of the message fits into the MTU.
1546 	 * Otherwise, we need to reserve fragment header and
1547 	 * fragment alignment (= 8-15 octets, in total).
1548 	 *
1549 	 * Note that we may need to "move" the data from the tail
1550 	 * of the buffer to the new fragment when we split
1551 	 * the message.
1552 	 *
1553 	 * FIXME: It may be fragmented into multiple chunks
1554 	 *        at once if non-fragmentable extension headers
1555 	 *        are too large.
1556 	 * --yoshfuji
1557 	 */
1558 
1559 	cork->length += length;
1560 	if (!skb)
1561 		goto alloc_new_skb;
1562 
1563 	while (length > 0) {
1564 		/* Check if the remaining data fits into current packet. */
1565 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1566 		if (copy < length)
1567 			copy = maxfraglen - skb->len;
1568 
1569 		if (copy <= 0) {
1570 			char *data;
1571 			unsigned int datalen;
1572 			unsigned int fraglen;
1573 			unsigned int fraggap;
1574 			unsigned int alloclen, alloc_extra;
1575 			unsigned int pagedlen;
1576 alloc_new_skb:
1577 			/* There's no room in the current skb */
1578 			if (skb)
1579 				fraggap = skb->len - maxfraglen;
1580 			else
1581 				fraggap = 0;
1582 			/* update mtu and maxfraglen if necessary */
1583 			if (!skb || !skb_prev)
1584 				ip6_append_data_mtu(&mtu, &maxfraglen,
1585 						    fragheaderlen, skb, rt,
1586 						    orig_mtu);
1587 
1588 			skb_prev = skb;
1589 
1590 			/*
1591 			 * If remaining data exceeds the mtu,
1592 			 * we know we need more fragment(s).
1593 			 */
1594 			datalen = length + fraggap;
1595 
1596 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1597 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1598 			fraglen = datalen + fragheaderlen;
1599 			pagedlen = 0;
1600 
1601 			alloc_extra = hh_len;
1602 			alloc_extra += dst_exthdrlen;
1603 			alloc_extra += rt->dst.trailer_len;
1604 
1605 			/* We just reserve space for fragment header.
1606 			 * Note: this may be an overallocation if the message
1607 			 * (without MSG_MORE) fits into the MTU.
1608 			 */
1609 			alloc_extra += sizeof(struct frag_hdr);
1610 
1611 			if ((flags & MSG_MORE) &&
1612 			    !(rt->dst.dev->features&NETIF_F_SG))
1613 				alloclen = mtu;
1614 			else if (!paged &&
1615 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1616 				  !(rt->dst.dev->features & NETIF_F_SG)))
1617 				alloclen = fraglen;
1618 			else {
1619 				alloclen = fragheaderlen + transhdrlen;
1620 				pagedlen = datalen - transhdrlen;
1621 			}
1622 			alloclen += alloc_extra;
1623 
1624 			if (datalen != length + fraggap) {
1625 				/*
1626 				 * this is not the last fragment, the trailer
1627 				 * space is regarded as data space.
1628 				 */
1629 				datalen += rt->dst.trailer_len;
1630 			}
1631 
1632 			fraglen = datalen + fragheaderlen;
1633 
1634 			copy = datalen - transhdrlen - fraggap - pagedlen;
1635 			/* [!] NOTE: copy may be negative if pagedlen>0
1636 			 * because then the equation may reduce to -fraggap.
1637 			 */
1638 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1639 				err = -EINVAL;
1640 				goto error;
1641 			}
1642 			if (transhdrlen) {
1643 				skb = sock_alloc_send_skb(sk, alloclen,
1644 						(flags & MSG_DONTWAIT), &err);
1645 			} else {
1646 				skb = NULL;
1647 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1648 				    2 * sk->sk_sndbuf)
1649 					skb = alloc_skb(alloclen,
1650 							sk->sk_allocation);
1651 				if (unlikely(!skb))
1652 					err = -ENOBUFS;
1653 			}
1654 			if (!skb)
1655 				goto error;
1656 			/*
1657 			 *	Fill in the control structures
1658 			 */
1659 			skb->protocol = htons(ETH_P_IPV6);
1660 			skb->ip_summed = csummode;
1661 			skb->csum = 0;
1662 			/* reserve for fragmentation and ipsec header */
1663 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1664 				    dst_exthdrlen);
1665 
1666 			/*
1667 			 *	Find where to start putting bytes
1668 			 */
1669 			data = skb_put(skb, fraglen - pagedlen);
1670 			skb_set_network_header(skb, exthdrlen);
1671 			data += fragheaderlen;
1672 			skb->transport_header = (skb->network_header +
1673 						 fragheaderlen);
1674 			if (fraggap) {
1675 				skb->csum = skb_copy_and_csum_bits(
1676 					skb_prev, maxfraglen,
1677 					data + transhdrlen, fraggap);
1678 				skb_prev->csum = csum_sub(skb_prev->csum,
1679 							  skb->csum);
1680 				data += fraggap;
1681 				pskb_trim_unique(skb_prev, maxfraglen);
1682 			}
1683 			if (copy > 0 &&
1684 			    getfrag(from, data + transhdrlen, offset,
1685 				    copy, fraggap, skb) < 0) {
1686 				err = -EFAULT;
1687 				kfree_skb(skb);
1688 				goto error;
1689 			} else if (flags & MSG_SPLICE_PAGES) {
1690 				copy = 0;
1691 			}
1692 
1693 			offset += copy;
1694 			length -= copy + transhdrlen;
1695 			transhdrlen = 0;
1696 			exthdrlen = 0;
1697 			dst_exthdrlen = 0;
1698 
1699 			/* Only the initial fragment is time stamped */
1700 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1701 			cork->tx_flags = 0;
1702 			skb_shinfo(skb)->tskey = tskey;
1703 			tskey = 0;
1704 			skb_zcopy_set(skb, uarg, &extra_uref);
1705 
1706 			if ((flags & MSG_CONFIRM) && !skb_prev)
1707 				skb_set_dst_pending_confirm(skb, 1);
1708 
1709 			/*
1710 			 * Put the packet on the pending queue
1711 			 */
1712 			if (!skb->destructor) {
1713 				skb->destructor = sock_wfree;
1714 				skb->sk = sk;
1715 				wmem_alloc_delta += skb->truesize;
1716 			}
1717 			__skb_queue_tail(queue, skb);
1718 			continue;
1719 		}
1720 
1721 		if (copy > length)
1722 			copy = length;
1723 
1724 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1725 		    skb_tailroom(skb) >= copy) {
1726 			unsigned int off;
1727 
1728 			off = skb->len;
1729 			if (getfrag(from, skb_put(skb, copy),
1730 						offset, copy, off, skb) < 0) {
1731 				__skb_trim(skb, off);
1732 				err = -EFAULT;
1733 				goto error;
1734 			}
1735 		} else if (flags & MSG_SPLICE_PAGES) {
1736 			struct msghdr *msg = from;
1737 
1738 			err = -EIO;
1739 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1740 				goto error;
1741 
1742 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1743 						   sk->sk_allocation);
1744 			if (err < 0)
1745 				goto error;
1746 			copy = err;
1747 			wmem_alloc_delta += copy;
1748 		} else if (!zc) {
1749 			int i = skb_shinfo(skb)->nr_frags;
1750 
1751 			err = -ENOMEM;
1752 			if (!sk_page_frag_refill(sk, pfrag))
1753 				goto error;
1754 
1755 			skb_zcopy_downgrade_managed(skb);
1756 			if (!skb_can_coalesce(skb, i, pfrag->page,
1757 					      pfrag->offset)) {
1758 				err = -EMSGSIZE;
1759 				if (i == MAX_SKB_FRAGS)
1760 					goto error;
1761 
1762 				__skb_fill_page_desc(skb, i, pfrag->page,
1763 						     pfrag->offset, 0);
1764 				skb_shinfo(skb)->nr_frags = ++i;
1765 				get_page(pfrag->page);
1766 			}
1767 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1768 			if (getfrag(from,
1769 				    page_address(pfrag->page) + pfrag->offset,
1770 				    offset, copy, skb->len, skb) < 0)
1771 				goto error_efault;
1772 
1773 			pfrag->offset += copy;
1774 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1775 			skb->len += copy;
1776 			skb->data_len += copy;
1777 			skb->truesize += copy;
1778 			wmem_alloc_delta += copy;
1779 		} else {
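			/* MSG_ZEROCOPY: reference the user pages directly
			 * from the skb frags; completion is signalled via
			 * the ubuf_info (uarg) attached above.
			 */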
1780 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1781 			if (err < 0)
1782 				goto error;
1783 		}
1784 		offset += copy;
1785 		length -= copy;
1786 	}
1787 
1788 	if (wmem_alloc_delta)
1789 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1790 	return 0;
1791 
1792 error_efault:
1793 	err = -EFAULT;
1794 error:
1795 	net_zcopy_put_abort(uarg, extra_uref);
1796 	cork->length -= length;
1797 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1798 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1799 	if (hold_tskey)
1800 		atomic_dec(&sk->sk_tskey);
1801 	return err;
1802 }
1803 
1804 int ip6_append_data(struct sock *sk,
1805 		    int getfrag(void *from, char *to, int offset, int len,
1806 				int odd, struct sk_buff *skb),
1807 		    void *from, size_t length, int transhdrlen,
1808 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1809 		    struct rt6_info *rt, unsigned int flags)
1810 {
1811 	struct inet_sock *inet = inet_sk(sk);
1812 	struct ipv6_pinfo *np = inet6_sk(sk);
1813 	int exthdrlen;
1814 	int err;
1815 
1816 	if (flags&MSG_PROBE)
1817 		return 0;
1818 	if (skb_queue_empty(&sk->sk_write_queue)) {
1819 		/*
1820 		 * setup for corking
1821 		 */
1822 		dst_hold(&rt->dst);
1823 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1824 				     ipc6, rt);
1825 		if (err)
1826 			return err;
1827 
1828 		inet->cork.fl.u.ip6 = *fl6;
1829 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1830 		length += exthdrlen;
1831 		transhdrlen += exthdrlen;
1832 	} else {
1833 		transhdrlen = 0;
1834 	}
1835 
1836 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1837 				 &np->cork, sk_page_frag(sk), getfrag,
1838 				 from, length, transhdrlen, flags, ipc6);
1839 }
1840 EXPORT_SYMBOL_GPL(ip6_append_data);
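
A minimal caller sketch (hypothetical, not part of this file) of the corking pattern ip6_append_data() expects, loosely modelled on datagram senders such as udpv6_sendmsg() and rawv6_sendmsg(). It assumes the route, flowi6 and ipcm6_cookie have already been set up, uses the stock ip_generic_getfrag() helper (declared in <net/ip.h>) to copy from the msghdr iterator, and passes a transhdrlen of 0 for simplicity; the name example_corked_send() is invented for illustration. Note that ip6_append_data() takes its own reference on the route (the dst_hold() above), so the caller's reference is untouched.

static int example_corked_send(struct sock *sk, struct msghdr *msg,
			       size_t len, struct ipcm6_cookie *ipc6,
			       struct flowi6 *fl6, struct rt6_info *rt)
{
	int err;

	lock_sock(sk);

	/* Queue the payload on sk_write_queue, corking the socket on the
	 * first append.
	 */
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
			      0 /* transhdrlen */, ipc6, fl6, rt,
			      msg->msg_flags);
	if (err)
		/* Drop everything queued so far and release the cork. */
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		/* Combine the queued skbs into one packet and transmit it. */
		err = ip6_push_pending_frames(sk);

	release_sock(sk);
	return err;
}

Real protocols that need a transport header typically reserve it via a non-zero transhdrlen and fill it in their own push path before the packet is sent.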
1841 
1842 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1843 {
1844 	struct dst_entry *dst = cork->base.dst;
1845 
1846 	cork->base.dst = NULL;
1847 	skb_dst_set(skb, dst);
1848 }
1849 
1850 static void ip6_cork_release(struct inet_cork_full *cork,
1851 			     struct inet6_cork *v6_cork)
1852 {
1853 	if (v6_cork->opt) {
1854 		struct ipv6_txoptions *opt = v6_cork->opt;
1855 
1856 		kfree(opt->dst0opt);
1857 		kfree(opt->dst1opt);
1858 		kfree(opt->hopopt);
1859 		kfree(opt->srcrt);
1860 		kfree(opt);
1861 		v6_cork->opt = NULL;
1862 	}
1863 
1864 	if (cork->base.dst) {
1865 		dst_release(cork->base.dst);
1866 		cork->base.dst = NULL;
1867 	}
1868 }
1869 
1870 struct sk_buff *__ip6_make_skb(struct sock *sk,
1871 			       struct sk_buff_head *queue,
1872 			       struct inet_cork_full *cork,
1873 			       struct inet6_cork *v6_cork)
1874 {
1875 	struct sk_buff *skb, *tmp_skb;
1876 	struct sk_buff **tail_skb;
1877 	struct in6_addr *final_dst;
1878 	struct net *net = sock_net(sk);
1879 	struct ipv6hdr *hdr;
1880 	struct ipv6_txoptions *opt = v6_cork->opt;
1881 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1882 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1883 	unsigned char proto = fl6->flowi6_proto;
1884 
1885 	skb = __skb_dequeue(queue);
1886 	if (!skb)
1887 		goto out;
1888 	tail_skb = &(skb_shinfo(skb)->frag_list);
1889 
1890 	/* move skb->data to ip header from ext header */
1891 	if (skb->data < skb_network_header(skb))
1892 		__skb_pull(skb, skb_network_offset(skb));
1893 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1894 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1895 		*tail_skb = tmp_skb;
1896 		tail_skb = &(tmp_skb->next);
1897 		skb->len += tmp_skb->len;
1898 		skb->data_len += tmp_skb->len;
1899 		skb->truesize += tmp_skb->truesize;
1900 		tmp_skb->destructor = NULL;
1901 		tmp_skb->sk = NULL;
1902 	}
1903 
1904 	/* Allow local fragmentation. */
1905 	skb->ignore_df = ip6_sk_ignore_df(sk);
1906 	__skb_pull(skb, skb_network_header_len(skb));
1907 
1908 	final_dst = &fl6->daddr;
1909 	if (opt && opt->opt_flen)
1910 		ipv6_push_frag_opts(skb, opt, &proto);
1911 	if (opt && opt->opt_nflen)
1912 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1913 
1914 	skb_push(skb, sizeof(struct ipv6hdr));
1915 	skb_reset_network_header(skb);
1916 	hdr = ipv6_hdr(skb);
1917 
1918 	ip6_flow_hdr(hdr, v6_cork->tclass,
1919 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1920 					ip6_autoflowlabel(net, sk), fl6));
1921 	hdr->hop_limit = v6_cork->hop_limit;
1922 	hdr->nexthdr = proto;
1923 	hdr->saddr = fl6->saddr;
1924 	hdr->daddr = *final_dst;
1925 
1926 	skb->priority = READ_ONCE(sk->sk_priority);
1927 	skb->mark = cork->base.mark;
1928 	if (sk_is_tcp(sk))
1929 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1930 	else
1931 		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1932 
1933 	ip6_cork_steal_dst(skb, cork);
1934 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1935 	if (proto == IPPROTO_ICMPV6) {
1936 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1937 		u8 icmp6_type;
1938 
1939 		if (sk->sk_socket->type == SOCK_RAW &&
1940 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1941 			icmp6_type = fl6->fl6_icmp_type;
1942 		else
1943 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1944 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1945 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1946 	}
1947 
1948 	ip6_cork_release(cork, v6_cork);
1949 out:
1950 	return skb;
1951 }
1952 
1953 int ip6_send_skb(struct sk_buff *skb)
1954 {
1955 	struct net *net = sock_net(skb->sk);
1956 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1957 	int err;
1958 
1959 	err = ip6_local_out(net, skb->sk, skb);
1960 	if (err) {
1961 		if (err > 0)
1962 			err = net_xmit_errno(err);
1963 		if (err)
1964 			IP6_INC_STATS(net, rt->rt6i_idev,
1965 				      IPSTATS_MIB_OUTDISCARDS);
1966 	}
1967 
1968 	return err;
1969 }
1970 
1971 int ip6_push_pending_frames(struct sock *sk)
1972 {
1973 	struct sk_buff *skb;
1974 
1975 	skb = ip6_finish_skb(sk);
1976 	if (!skb)
1977 		return 0;
1978 
1979 	return ip6_send_skb(skb);
1980 }
1981 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1982 
1983 static void __ip6_flush_pending_frames(struct sock *sk,
1984 				       struct sk_buff_head *queue,
1985 				       struct inet_cork_full *cork,
1986 				       struct inet6_cork *v6_cork)
1987 {
1988 	struct sk_buff *skb;
1989 
1990 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1991 		if (skb_dst(skb))
1992 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1993 				      IPSTATS_MIB_OUTDISCARDS);
1994 		kfree_skb(skb);
1995 	}
1996 
1997 	ip6_cork_release(cork, v6_cork);
1998 }
1999 
2000 void ip6_flush_pending_frames(struct sock *sk)
2001 {
2002 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2003 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2004 }
2005 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2006 
2007 struct sk_buff *ip6_make_skb(struct sock *sk,
2008 			     int getfrag(void *from, char *to, int offset,
2009 					 int len, int odd, struct sk_buff *skb),
2010 			     void *from, size_t length, int transhdrlen,
2011 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2012 			     unsigned int flags, struct inet_cork_full *cork)
2013 {
2014 	struct inet6_cork v6_cork;
2015 	struct sk_buff_head queue;
2016 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2017 	int err;
2018 
2019 	if (flags & MSG_PROBE) {
2020 		dst_release(&rt->dst);
2021 		return NULL;
2022 	}
2023 
2024 	__skb_queue_head_init(&queue);
2025 
2026 	cork->base.flags = 0;
2027 	cork->base.addr = 0;
2028 	cork->base.opt = NULL;
2029 	v6_cork.opt = NULL;
2030 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2031 	if (err) {
2032 		ip6_cork_release(cork, &v6_cork);
2033 		return ERR_PTR(err);
2034 	}
2035 	if (ipc6->dontfrag < 0)
2036 		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
2037 
2038 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2039 				&current->task_frag, getfrag, from,
2040 				length + exthdrlen, transhdrlen + exthdrlen,
2041 				flags, ipc6);
2042 	if (err) {
2043 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2044 		return ERR_PTR(err);
2045 	}
2046 
2047 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2048 }
2049
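
For comparison, a sketch (again hypothetical) of the uncorked fast path that ip6_make_skb() enables, loosely modelled on the non-corked branch of udpv6_sendmsg(): the whole datagram is assembled on a private queue with a caller-provided cork, so no per-socket corking state is needed, and the finished skb is handed to ip6_send_skb(). __ip6_make_skb() takes the addresses, flow label and protocol for the IPv6 header from cork->fl, so the flow has to be in the cork before the call (real callers may simply build it there to begin with). As above, example_uncorked_send(), ip_generic_getfrag() and the zero transhdrlen are illustrative simplifications; real UDP additionally fills its transport header before transmitting.

static int example_uncorked_send(struct sock *sk, struct msghdr *msg,
				 size_t len, struct ipcm6_cookie *ipc6,
				 struct flowi6 *fl6, struct rt6_info *rt)
{
	struct inet_cork_full cork;
	struct sk_buff *skb;

	/* __ip6_make_skb() reads the flow from the cork when it builds the
	 * IPv6 header, so hand it over first.
	 */
	cork.fl.u.ip6 = *fl6;

	/* Build headers and payload into a single skb chain in one call;
	 * the route reference passed in is consumed (released on MSG_PROBE,
	 * otherwise transferred to the skb via ip6_cork_steal_dst()).
	 */
	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
			   0 /* transhdrlen */, ipc6, rt,
			   msg->msg_flags, &cork);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);	/* NULL (MSG_PROBE) yields 0 */

	/* Hand the finished packet to the local output path. */
	return ip6_send_skb(skb);
}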