1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
/* Final IPv6 transmit step: resolve the L2 neighbour for the route's next
 * hop and hand the skb to neigh_output().  Also loops multicast copies back
 * to local listeners and lets lightweight tunnels take over transmission.
 * Returns 0, a negative errno, or the lwtunnel/neigh_output() result.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Not enough headroom for the link-layer header: reallocate. */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either multicast routing or a
		 * local group membership makes the copy deliverable.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				/* Hop limit 0: deliverable locally (the clone
				 * above) but must never hit the wire.
				 */
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast never leaves the host. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		/* Lightweight tunnel transmits the packet itself unless it
		 * explicitly asks us to continue.
		 */
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	/* RCU protects the noref neighbour lookup below. */
	rcu_read_lock();
	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock();
	return ret;
}
141
/* Software-segment a GSO packet whose segments exceed the egress MTU,
 * then emit each resulting segment, fragmenting those still over @mtu.
 * Consumes @skb.  Returns 0 on success or the first error encountered.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features to force full software segmentation. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb has been fully replaced by the segment list. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Remember the first failure but keep sending the rest. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
180
ip6_finish_output_gso(struct net * net,struct sock * sk,struct sk_buff * skb,unsigned int mtu)181 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
182 struct sk_buff *skb, unsigned int mtu)
183 {
184 if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
185 !skb_gso_validate_network_len(skb, mtu))
186 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
187
188 return ip6_finish_output2(net, sk, skb);
189 }
190
__ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)191 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
192 {
193 unsigned int mtu;
194
195 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
196 /* Policy lookup after SNAT yielded a new policy */
197 if (skb_dst(skb)->xfrm) {
198 IP6CB(skb)->flags |= IP6SKB_REROUTED;
199 return dst_output(net, sk, skb);
200 }
201 #endif
202
203 mtu = ip6_skb_dst_mtu(skb);
204 if (skb_is_gso(skb))
205 return ip6_finish_output_gso(net, sk, skb, mtu);
206
207 if (skb->len > mtu ||
208 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
209 return ip6_fragment(net, sk, skb, ip6_finish_output2);
210
211 return ip6_finish_output2(net, sk, skb);
212 }
213
ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)214 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 int ret;
217
218 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
219 switch (ret) {
220 case NET_XMIT_SUCCESS:
221 case NET_XMIT_CN:
222 return __ip6_finish_output(net, sk, skb) ? : ret;
223 default:
224 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
225 return ret;
226 }
227 }
228
ip6_output(struct net * net,struct sock * sk,struct sk_buff * skb)229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233
234 skb->protocol = htons(ETH_P_IPV6);
235 skb->dev = dev;
236
237 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
238 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
240 return 0;
241 }
242
243 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 net, sk, skb, indev, dev,
245 ip6_finish_output,
246 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
249
ip6_autoflowlabel(struct net * net,const struct sock * sk)250 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
251 {
252 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
253 return ip6_default_np_autolabel(net);
254 return inet6_test_bit(AUTOFLOWLABEL, sk);
255 }
256
257 /*
258 * xmit an sk_buff (used by TCP, SCTP and DCCP)
259 * Note : socket lock is not held for SYNACK packets, but might be modified
260 * by calls to skb_set_owner_w() and ipv6_local_error(),
261 * which are using proper atomic operations or spinlocks.
262 */
/* Build the IPv6 header (plus any extension headers from @opt and, for
 * payloads over IPV6_MAXPLEN, a hop-by-hop jumbogram option) in front of
 * @skb's payload and send it through NF_INET_LOCAL_OUT to dst_output().
 * Oversized non-GSO packets that may not be fragmented get an EMSGSIZE
 * local error instead.  Returns 0 or a negative errno.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Worst-case headroom: IPv6 header, jumbo option, link layer,
	 * and any extension headers requested via @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Fragmentable extension headers first... */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* ...then non-fragmentable ones; this may also rewrite the
		 * first hop (e.g. for a routing header).
		 */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		/* Payload too large for the 16-bit length field: insert a
		 * hop-by-hop jumbo payload option and zero payload_len.
		 */
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	/* Packet exceeds the MTU and may not be fragmented: report
	 * EMSGSIZE to the socket's error queue and drop.
	 */
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
374
/* Deliver a Router Alert packet to every raw socket registered on the
 * matching RA selector @sel.  Each matching socket except the last gets
 * a clone; the last receives @skb itself (so delivery consumes it).
 * Returns 1 when the packet was delivered, 0 when no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match the selector and, if the socket is bound to a
		 * device, the ingress device.
		 */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			/* RA-isolated sockets only see packets arriving in
			 * their own network namespace.
			 */
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		/* Final recipient takes ownership of the original skb. */
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
408
/* Decide how to treat a packet destined to a proxied neighbour address:
 *   1  - NDISC message; must be handled by local input, not forwarded,
 *   0  - forward normally,
 *  -1  - link-local destination; signal link failure and drop.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		/* Skip extension headers to find the transport protocol. */
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the icmp6_type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
460
/* NF_INET_FORWARD okfn: final step of forwarding.  Clears the receive
 * timestamp and hands the packet to dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	/* Hardware already forwarded this packet; just release our copy. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
474
ip6_pkt_too_big(const struct sk_buff * skb,unsigned int mtu)475 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
476 {
477 if (skb->len <= mtu)
478 return false;
479
480 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
481 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
482 return true;
483
484 if (skb->ignore_df)
485 return false;
486
487 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
488 return false;
489
490 return true;
491 }
492
/* Forward an IPv6 packet not addressed to this host.
 *
 * Runs the standard forwarding checks (forwarding enabled, packet type,
 * LRO, xfrm policy, hop limit, source address sanity, MTU), delivers
 * Router Alert packets to registered sockets, hands NDISC messages for
 * proxied addresses to local input, sends redirects where appropriate,
 * decrements the hop limit after COW, and finally passes the packet
 * through NF_INET_FORWARD to ip6_forward_finish().
 * Returns 0/hook verdict on success, a negative errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the *ingress* device, used for input-side statistics. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm may have replaced the dst entry; re-read it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
671
ip6_copy_metadata(struct sk_buff * to,struct sk_buff * from)672 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
673 {
674 to->pkt_type = from->pkt_type;
675 to->priority = from->priority;
676 to->protocol = from->protocol;
677 skb_dst_drop(to);
678 skb_dst_set(to, dst_clone(skb_dst(from)));
679 to->dev = from->dev;
680 to->mark = from->mark;
681
682 skb_copy_hash(to, from);
683
684 #ifdef CONFIG_NET_SCHED
685 to->tc_index = from->tc_index;
686 #endif
687 nf_copy(to, from);
688 skb_ext_copy(to, from);
689 skb_copy_secmark(to, from);
690 }
691
/* Prepare fast-path fragmentation over @skb's existing frag_list:
 * save a copy of the network headers in @iter, detach the frag_list,
 * and rewrite @skb itself as the first fragment (fragment header with
 * IP6_MF set, payload trimmed to the linear+page data).
 * Returns 0 or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	/* Keep a pristine copy of the headers for the follow-up fragments. */
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take over the frag_list; each entry becomes one fragment. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap after the network headers, insert the fragment
	 * header there, then restore the saved headers in front of it.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* First fragment carries only the linear + paged data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
732
/* Turn the next frag_list entry (@iter->frag) into a standalone fragment:
 * prepend the saved network headers and a fragment header, advance the
 * running fragment offset (derived from the previous fragment @skb), and
 * copy transmit metadata.  IP6_MF is set on all but the last fragment.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	/* Prepend fragment header, then the saved network headers. */
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload size of the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
757
/* Initialise the slow-path fragmentation cursor @state over @skb:
 * record header geometry, per-fragment payload budget (@mtu) and
 * head/tail room needed when allocating each fragment skb.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	/* Fragment header identity. */
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	/* Geometry of each fragment. */
	state->hlen = hlen;
	state->mtu = mtu;
	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	/* Copy cursor: start right after the unfragmentable headers. */
	state->offset = 0;
	state->ptr = hlen;		/* Where to start from */
	state->left = skb->len - hlen;	/* Space per frame */
}
EXPORT_SYMBOL(ip6_frag_init);
778
/* Slow-path fragmentation: allocate and build the next fragment of @skb
 * according to the cursor in @state (initialised by ip6_frag_init()).
 * Copies headers and up to state->mtu payload bytes (8-byte aligned for
 * all but the last fragment) and advances the cursor.
 * Returns the new fragment or ERR_PTR(-ENOMEM).
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied headers: the header that used to point at the
	 * payload protocol now points at the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More fragments follow unless this one exhausted the payload. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
854
/* Fragment @skb to fit the path MTU and emit each fragment via @output.
 *
 * Two strategies:
 *  - fast path: when @skb carries a well-formed frag_list, reuse those
 *    buffers and just prepend headers (ip6_fraglist_init/prepare);
 *  - slow path: allocate fresh skbs and copy the payload piecewise
 *    (ip6_frag_init/ip6_frag_next).
 *
 * Consumes @skb (frees it on error).  Sends ICMPV6_PKT_TOOBIG and
 * returns -EMSGSIZE when fragmentation is not permitted; otherwise
 * returns 0 or a negative errno.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Only honour the socket's frag_size for locally generated
	 * packets (not when re-entered via a tunnel device).
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* skb_checksum_help() may have reallocated the header area, so
	 * recompute prevhdr from the saved offset.
	 */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only works when the existing geometry already
		 * matches fragment requirements.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				/* Transfer write-memory accounting to the
				 * fragment so the socket is charged once.
				 */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Failure mid-stream: free the fragments not yet sent. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership/truesize donation done above for the
		 * frag_list entries we already processed.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
1042
ip6_rt_check(const struct rt6key * rt_key,const struct in6_addr * fl_addr,const struct in6_addr * addr_cache)1043 static inline int ip6_rt_check(const struct rt6key *rt_key,
1044 const struct in6_addr *fl_addr,
1045 const struct in6_addr *addr_cache)
1046 {
1047 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1048 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1049 }
1050
/* Validate a socket's cached dst against flow @fl6.  Returns the dst if
 * it is still usable for this flow, otherwise releases it and returns
 * NULL so the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		/* Cached entry is not IPv6 (e.g. mapped socket); discard. */
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1096
/* Core route-lookup helper shared by ip6_dst_lookup() and
 * ip6_dst_lookup_flow().
 *
 * On entry *@dst may be NULL or a candidate dst.  On success returns 0 with
 * *@dst holding a referenced route; on failure returns a negative errno and
 * sets *@dst to NULL (any partial reference is released).  May fill in
 * fl6->saddr when the caller left it unspecified.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		/* rt->from is RCU-protected; hold the read lock across the
		 * saddr selection that dereferences it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or do the first lookup) now that a source address is set. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* Reject mixing a v4-mapped source with a non-v4-mapped destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1213
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info (may influence source address
 *	     selection); may be NULL
 *	@dst: pointer to dst_entry * for result; set to NULL on entry and
 *	      filled with a referenced route on success
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1232
1233 /**
1234 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1235 * @net: Network namespace to perform lookup in
1236 * @sk: socket which provides route info
1237 * @fl6: flow to lookup
1238 * @final_dst: final destination address for ipsec lookup
1239 *
1240 * This function performs a route lookup on the given flow.
1241 *
1242 * It returns a valid dst pointer on success, or a pointer encoded
1243 * error code.
1244 */
ip6_dst_lookup_flow(struct net * net,const struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst)1245 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1246 const struct in6_addr *final_dst)
1247 {
1248 struct dst_entry *dst = NULL;
1249 int err;
1250
1251 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1252 if (err)
1253 return ERR_PTR(err);
1254 if (final_dst)
1255 fl6->daddr = *final_dst;
1256
1257 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1258 }
1259 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1260
1261 /**
1262 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1263 * @sk: socket which provides the dst cache and route info
1264 * @fl6: flow to lookup
1265 * @final_dst: final destination address for ipsec lookup
1266 * @connected: whether @sk is connected or not
1267 *
1268 * This function performs a route lookup on the given flow with the
1269 * possibility of using the cached route in the socket if it is valid.
1270 * It will take the socket dst lock when operating on the dst cache.
1271 * As a result, this function can only be used in process context.
1272 *
1273 * In addition, for a connected socket, cache the dst in the socket
1274 * if the current cache is not valid.
1275 *
1276 * It returns a valid dst pointer on success, or a pointer encoded
1277 * error code.
1278 */
ip6_sk_dst_lookup_flow(struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst,bool connected)1279 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1280 const struct in6_addr *final_dst,
1281 bool connected)
1282 {
1283 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1284
1285 dst = ip6_sk_dst_check(sk, dst, fl6);
1286 if (dst)
1287 return dst;
1288
1289 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1290 if (connected && !IS_ERR(dst))
1291 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1292
1293 return dst;
1294 }
1295 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1296
/* Duplicate an IPv6 option header; its on-wire length is (hdrlen + 1) * 8
 * octets.  Returns NULL if @src is NULL or the allocation fails.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1302
/* Duplicate an IPv6 routing header; its on-wire length is (hdrlen + 1) * 8
 * octets.  Returns NULL if @src is NULL or the allocation fails.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1308
ip6_append_data_mtu(unsigned int * mtu,int * maxfraglen,unsigned int fragheaderlen,struct sk_buff * skb,struct rt6_info * rt,unsigned int orig_mtu)1309 static void ip6_append_data_mtu(unsigned int *mtu,
1310 int *maxfraglen,
1311 unsigned int fragheaderlen,
1312 struct sk_buff *skb,
1313 struct rt6_info *rt,
1314 unsigned int orig_mtu)
1315 {
1316 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1317 if (!skb) {
1318 /* first fragment, reserve header_len */
1319 *mtu = orig_mtu - rt->dst.header_len;
1320
1321 } else {
1322 /*
1323 * this fragment is not first, the headers
1324 * space is regarded as data space.
1325 */
1326 *mtu = orig_mtu;
1327 }
1328 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1329 + fragheaderlen - sizeof(struct frag_hdr);
1330 }
1331 }
1332
/* Initialize the cork state for a corked send.
 *
 * Takes over the dst reference passed in via @rt.  Duplicates the caller's
 * tx options into @v6_cork and computes the effective path MTU into
 * cork->base.fragsize.  Returns 0 on success or a negative errno; on error
 * the caller is expected to run ip6_cork_release(), which frees any
 * partially duplicated options and drops the dst reference.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Deep-copy each extension header; a failed dup leaves the
		 * earlier copies for ip6_cork_release() to free.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* PMTUDISC_PROBE uses the device MTU directly; otherwise use the
	 * route MTU (for XFRM tunnels) or the underlying path MTU.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	/* A smaller user-configured fragment size overrides the path MTU. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1403
/* Core append engine behind ip6_append_data() and ip6_make_skb().
 *
 * Copies @length bytes obtained through @getfrag onto @queue, building one
 * or more skbs sized to the corked MTU so the result can later be turned
 * into an IPv6 packet (or a chain of fragments) by __ip6_make_skb().
 * Supports plain copies, page-frag coalescing, MSG_ZEROCOPY and
 * MSG_SPLICE_PAGES.  Returns 0 on success or a negative errno; on error the
 * already-queued data remains on @queue for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	/* Only the very first skb of the cork accounts for extension
	 * headers and extra dst header space (e.g. IPsec).
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header bytes: IPv6 header + per-fragment ext hdrs. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	/* First-fragment header bytes: includes fragmentable ext hdrs too. */
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: report the path MTU to the app instead of
	 * fragmenting, for datagram-style protocols.
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* Decide the data-transfer strategy: user-supplied ubuf zerocopy,
	 * socket MSG_ZEROCOPY, or spliced pages.
	 */
	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Device can't do zerocopy for this packet;
				 * fall back to copying but keep accounting.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Reserve a timestamp key now if OPT_ID timestamping is on; it is
	 * released again on the error path below.
	 */
	hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
		     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
	if (hold_tskey)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: only headers go in the linear
				 * area, the rest lands in page frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang from the
				 * previous skb into this one, fixing up both
				 * checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Append into the current skb: linear tailroom, spliced
		 * pages, page-frag copy, or zerocopy, in that priority.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all newly queued bytes to the socket's write memory. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1803
/* Append data to the socket's pending (corked) queue.
 *
 * On the first call of a cork cycle (empty sk_write_queue) this sets up the
 * cork state, taking an extra reference on @rt->dst for it.  Extension
 * header length is counted only once, on that first call.  Returns 0 on
 * success or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent appends never carry a transport header. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1841
ip6_cork_steal_dst(struct sk_buff * skb,struct inet_cork_full * cork)1842 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1843 {
1844 struct dst_entry *dst = cork->base.dst;
1845
1846 cork->base.dst = NULL;
1847 skb_dst_set(skb, dst);
1848 }
1849
ip6_cork_release(struct inet_cork_full * cork,struct inet6_cork * v6_cork)1850 static void ip6_cork_release(struct inet_cork_full *cork,
1851 struct inet6_cork *v6_cork)
1852 {
1853 if (v6_cork->opt) {
1854 struct ipv6_txoptions *opt = v6_cork->opt;
1855
1856 kfree(opt->dst0opt);
1857 kfree(opt->dst1opt);
1858 kfree(opt->hopopt);
1859 kfree(opt->srcrt);
1860 kfree(opt);
1861 v6_cork->opt = NULL;
1862 }
1863
1864 if (cork->base.dst) {
1865 dst_release(cork->base.dst);
1866 cork->base.dst = NULL;
1867 }
1868 }
1869
/* Collapse the queued cork fragments into one skb and prepend the IPv6
 * header (plus any extension headers from the cork's tx options).
 *
 * The first queued skb becomes the head; the rest are chained onto its
 * frag_list.  The cork's dst reference is moved onto the skb and the cork
 * is released.  Returns the finished skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list,
	 * folding their length/truesize into the head and detaching them
	 * from the socket's write-memory accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* A routing header may rewrite the on-wire destination; final_dst
	 * tracks what actually goes into the IPv6 header.
	 */
	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = READ_ONCE(sk->sk_priority);
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* Raw sockets built the ICMPv6 header themselves unless the
		 * nexthop is pinned; take the type from the flow then.
		 */
		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1952
ip6_send_skb(struct sk_buff * skb)1953 int ip6_send_skb(struct sk_buff *skb)
1954 {
1955 struct net *net = sock_net(skb->sk);
1956 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1957 int err;
1958
1959 err = ip6_local_out(net, skb->sk, skb);
1960 if (err) {
1961 if (err > 0)
1962 err = net_xmit_errno(err);
1963 if (err)
1964 IP6_INC_STATS(net, rt->rt6i_idev,
1965 IPSTATS_MIB_OUTDISCARDS);
1966 }
1967
1968 return err;
1969 }
1970
/* Finalize the socket's corked data and transmit it.  Returns 0 when there
 * was nothing pending (or on success), or a negative errno from the send.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1982
__ip6_flush_pending_frames(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork,struct inet6_cork * v6_cork)1983 static void __ip6_flush_pending_frames(struct sock *sk,
1984 struct sk_buff_head *queue,
1985 struct inet_cork_full *cork,
1986 struct inet6_cork *v6_cork)
1987 {
1988 struct sk_buff *skb;
1989
1990 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1991 if (skb_dst(skb))
1992 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1993 IPSTATS_MIB_OUTDISCARDS);
1994 kfree_skb(skb);
1995 }
1996
1997 ip6_cork_release(cork, v6_cork);
1998 }
1999
/* Drop all data corked on @sk's write queue and release its cork state. */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2006
/* Build a complete IPv6 packet in one shot from @length bytes of user data,
 * using caller-provided cork state instead of the socket's write queue.
 *
 * Takes ownership of the dst reference carried by @rt (released on
 * MSG_PROBE, handed to the cork otherwise).  Returns the finished skb,
 * NULL for MSG_PROBE, or an ERR_PTR() on failure; on failure any queued
 * fragments and the cork are cleaned up here.
 *
 * Fix: the page_frag argument had been corrupted to "¤t->task_frag"
 * (an HTML-entity mangling of "&current"); restored to &current->task_frag.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		/* Nothing to send; just drop the route reference we own. */
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
2049