2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * Store the next fragment identification in @fhdr.
 *
 * The current value of ipv6_fragmentation_id is written to
 * fhdr->identification in network byte order (htonl), then the counter
 * is incremented.  The value 0 is never handed out: when the increment
 * wraps to 0 the counter is reset to 1.  The read and the update are
 * done between spin_lock_bh() and spin_unlock_bh() on ip6_id_lock.
 *
 * @skb is not referenced by the statements visible here.
 */
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 static u32 ipv6_fragmentation_id = 1;
63 static DEFINE_SPINLOCK(ip6_id_lock);
65 spin_lock_bh(&ip6_id_lock);
66 fhdr->identification = htonl(ipv6_fragmentation_id);
/* Skip 0 on wrap-around so the identification is always non-zero. */
67 if (++ipv6_fragmentation_id == 0)
68 ipv6_fragmentation_id = 1;
69 spin_unlock_bh(&ip6_id_lock);
/*
 * Fill in the IPv6 payload length of @skb and pass it to the
 * NF_INET_LOCAL_OUT netfilter hook.
 *
 * len is skb->len minus the fixed IPv6 header.  Lengths above
 * IPV6_MAXPLEN are special-cased before being stored.
 * NOTE(review): the statement guarded by that check is not visible in
 * this view -- confirm what value is stored for oversized packets.
 *
 * Returns the value of nf_hook().
 */
72 int __ip6_local_out(struct sk_buff *skb)
76 len = skb->len - sizeof(struct ipv6hdr);
77 if (len > IPV6_MAXPLEN)
79 ipv6_hdr(skb)->payload_len = htons(len);
81 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
/*
 * Send a locally generated packet: run __ip6_local_out() (payload length
 * plus LOCAL_OUT hook) and then dst_output() on @skb.
 *
 * NOTE(review): the condition between the two calls is not visible
 * here; presumably dst_output() only runs when the hook result says the
 * packet should continue -- confirm.
 *
 * Exported with EXPORT_SYMBOL_GPL.
 */
85 int ip6_local_out(struct sk_buff *skb)
89 err = __ip6_local_out(skb);
91 err = dst_output(skb);
95 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * Last step of output: hand @skb to the link layer through the route
 * attached to it.
 *
 * Two transmit paths are visible: neigh_hh_output() with the route's
 * dst->hh, and otherwise the output method of dst->neighbour when a
 * neighbour is attached.  If neither is taken, IPSTATS_MIB_OUTNOROUTES
 * is incremented for the route's device.
 * NOTE(review): the test selecting the dst->hh path and the final
 * return/free are not visible in this view.
 */
97 static int ip6_output_finish(struct sk_buff *skb)
99 struct dst_entry *dst = skb_dst(skb);
102 return neigh_hh_output(dst->hh, skb);
103 else if (dst->neighbour)
104 return dst->neighbour->output(skb);
106 IP6_INC_STATS_BH(dev_net(dst->dev),
107 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
113 /* dev_loopback_xmit for use with netfilter. */
/*
 * Prepare @newskb to be looped back to the local stack: the MAC header
 * is reset, data is pulled up to the network header, the packet is
 * marked PACKET_LOOPBACK and its checksum is flagged as not needing
 * verification.  A missing dst triggers a WARN_ON.
 * Passed as the continuation to NF_HOOK() by ip6_output2().
 * NOTE(review): the statement that actually delivers the skb is not
 * visible in this view.
 */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 WARN_ON(!skb_dst(newskb));
/*
 * Post-fragmentation output step: mark @skb as ETH_P_IPV6, apply the
 * multicast special cases and run the NF_INET_POST_ROUTING hook.
 *
 * For a multicast destination:
 *  - a clone of the packet is sent through POST_ROUTING with
 *    ip6_dev_loopback_xmit() as continuation when the output device is
 *    not a loopback device, the sending socket (if any) has mc_loop
 *    set, and either a multicast routing socket exists for this netns
 *    and the packet was not already forwarded, or
 *    ipv6_chk_mcast_addr() reports a match for the destination on the
 *    device;
 *  - a hop limit of 0 is counted as IPSTATS_MIB_OUTDISCARDS;
 *  - IPSTATS_MIB_OUTMCAST statistics are updated.
 *
 * Returns the result of the POST_ROUTING NF_HOOK() for @skb.
 */
127 static int ip6_output2(struct sk_buff *skb)
129 struct dst_entry *dst = skb_dst(skb);
130 struct net_device *dev = dst->dev;
132 skb->protocol = htons(ETH_P_IPV6);
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ((mroute6_socket(dev_net(dev)) &&
141 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
142 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
143 &ipv6_hdr(skb)->saddr))) {
144 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
146 /* Do not check for IFF_ALLMULTI; multicast routing
147 is not supported in any case.
150 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
152 ip6_dev_loopback_xmit);
154 if (ipv6_hdr(skb)->hop_limit == 0) {
155 IP6_INC_STATS(dev_net(dev), idev,
156 IPSTATS_MIB_OUTDISCARDS);
162 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
166 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * Return the MTU to apply to @skb.
 *
 * If the skb belongs to a socket whose pmtudisc setting is
 * IPV6_PMTUDISC_PROBE, the MTU of the route's output device is used;
 * otherwise (including when there is no socket) the route MTU from
 * dst_mtu() is used.
 */
170 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
172 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
174 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
175 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
/*
 * Output entry point for an IPv6 packet with a route attached.
 *
 *  - If IPv6 is disabled on the route's inet6 device
 *    (cnf.disable_ipv6), IPSTATS_MIB_OUTDISCARDS is incremented.
 *    NOTE(review): the rest of that branch is not visible here.
 *  - If the packet is longer than ip6_skb_dst_mtu() and is not GSO, or
 *    the route has dst_allfrag() set, it is sent through
 *    ip6_fragment() with ip6_output2() as the per-fragment output.
 *  - Otherwise it goes straight to ip6_output2().
 */
178 int ip6_output(struct sk_buff *skb)
180 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
181 if (unlikely(idev->cnf.disable_ipv6)) {
182 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
183 IPSTATS_MIB_OUTDISCARDS);
/* Fragment when too big for the MTU (and not GSO) or when the route demands it. */
188 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
189 dst_allfrag(skb_dst(skb)))
190 return ip6_fragment(skb, ip6_output2);
192 return ip6_output2(skb);
196 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from @opt) in front
 * of the data already in @skb and send the packet for socket @sk.
 *
 * Steps visible in this function:
 *  - With options, the needed headroom is opt_nflen + opt_flen plus the
 *    fixed IPv6 header plus LL_RESERVED_SPACE() of the output device;
 *    seg_len grows by the option length.  If the skb has less headroom
 *    it is reallocated with skb_realloc_headroom(); OUTDISCARDS is
 *    counted on that path (presumably when the reallocation fails --
 *    the test itself is not visible here).
 *  - Fragmentable and non-fragmentable options are pushed with
 *    ipv6_push_frag_opts() / ipv6_push_nfrag_opts(); the latter may
 *    update first_hop, which is what ends up in hdr->daddr.
 *  - The fixed header is pushed and filled: first 32-bit word is
 *    version 6 | tclass << 20 | flow label from @fl, payload_len is
 *    seg_len, nexthdr is the (possibly updated) proto, hop limit comes
 *    from np->hop_limit or ip6_dst_hoplimit(dst), source address from
 *    @fl.
 *  - skb->priority and skb->mark are taken from the socket.
 *  - If the packet fits in mtu, or skb->local_df is set, or the skb is
 *    GSO, OUT statistics are updated and the packet is passed to the
 *    NF_INET_LOCAL_OUT hook.  Otherwise a debug message is printed, an
 *    ICMPV6_PKT_TOOBIG carrying mtu is sent and FRAGFAILS is counted.
 *
 * Exported with EXPORT_SYMBOL.
 */
199 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
200 struct ipv6_txoptions *opt, int ipfragok)
202 struct net *net = sock_net(sk);
203 struct ipv6_pinfo *np = inet6_sk(sk);
204 struct in6_addr *first_hop = &fl->fl6_dst;
205 struct dst_entry *dst = skb_dst(skb);
207 u8 proto = fl->proto;
208 int seg_len = skb->len;
213 unsigned int head_room;
215 /* First: exthdrs may take lots of space (~8K for now)
216 MAX_HEADER is not enough.
218 head_room = opt->opt_nflen + opt->opt_flen;
219 seg_len += head_room;
220 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
222 if (skb_headroom(skb) < head_room) {
223 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
225 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 IPSTATS_MIB_OUTDISCARDS);
233 skb_set_owner_w(skb, sk);
236 ipv6_push_frag_opts(skb, opt, &proto);
238 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
241 skb_push(skb, sizeof(struct ipv6hdr));
242 skb_reset_network_header(skb);
245 /* Allow local fragmentation. */
250 * Fill in the IPv6 header
255 hlimit = np->hop_limit;
257 hlimit = ip6_dst_hoplimit(dst);
265 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
267 hdr->payload_len = htons(seg_len);
268 hdr->nexthdr = proto;
269 hdr->hop_limit = hlimit;
271 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
272 ipv6_addr_copy(&hdr->daddr, first_hop);
274 skb->priority = sk->sk_priority;
275 skb->mark = sk->sk_mark;
278 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
279 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
280 IPSTATS_MIB_OUT, skb->len);
281 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
286 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
288 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
289 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
294 EXPORT_SYMBOL(ip6_xmit);
297 * To avoid extra problems ND packets are sent through this
298 * routine. It's code duplication but I really want to avoid
299 * extra checks since ipv6_build_header is used by TCP (which
300 * is for us performance critical)
/*
 * Write a plain IPv6 header for a neighbour discovery packet at the
 * current tail of @skb.
 *
 * skb->protocol is set to ETH_P_IPV6, the network header is reset to
 * the current data pointer and sizeof(struct ipv6hdr) bytes are added
 * with skb_put().  The header is filled with version 6 and zero traffic
 * class / flow label, payload length len, next header proto, the
 * socket's np->hop_limit, and the given @saddr / @daddr.
 * NOTE(review): the tail of the parameter list (proto, len) and the
 * return statement are not visible in this view.
 */
303 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
304 const struct in6_addr *saddr, const struct in6_addr *daddr,
307 struct ipv6_pinfo *np = inet6_sk(sk);
311 skb->protocol = htons(ETH_P_IPV6);
314 totlen = len + sizeof(struct ipv6hdr);
316 skb_reset_network_header(skb);
317 skb_put(skb, sizeof(struct ipv6hdr));
/* 0x6 in the top nibble is the IP version; tclass and flow label stay 0. */
320 *(__be32*)hdr = htonl(0x60000000);
322 hdr->payload_len = htons(len);
323 hdr->nexthdr = proto;
324 hdr->hop_limit = np->hop_limit;
326 ipv6_addr_copy(&hdr->saddr, saddr);
327 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * Deliver @skb to the sockets registered on ip6_ra_chain for selector
 * @sel.
 *
 * The chain is walked under read_lock(&ip6_ra_lock).  An entry matches
 * when it has a socket, its sel equals @sel, and the socket is either
 * not bound to a device or bound to the device the skb arrived on.
 * Clones of the skb are passed to rawv6_rcv() for a previously matched
 * socket ("last"), and the original skb is passed to rawv6_rcv() for
 * the final match.
 * NOTE(review): the return values and the tests around the
 * rawv6_rcv() calls are not visible here; ip6_forward() tests the
 * result for truth.
 */
332 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
334 struct ip6_ra_chain *ra;
335 struct sock *last = NULL;
337 read_lock(&ip6_ra_lock);
338 for (ra = ip6_ra_chain; ra; ra = ra->next) {
339 struct sock *sk = ra->sk;
340 if (sk && ra->sel == sel &&
341 (!sk->sk_bound_dev_if ||
342 sk->sk_bound_dev_if == skb->dev->ifindex)) {
344 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
346 rawv6_rcv(last, skb2);
353 rawv6_rcv(last, skb);
354 read_unlock(&ip6_ra_lock);
357 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet whose destination is a proxied neighbour address.
 *
 * Extension headers are skipped with ipv6_skip_exthdr() to locate the
 * upper-layer header; without extension headers the offset is just the
 * fixed header size.  For ICMPv6 the function makes sure the first
 * byte of the ICMPv6 header can be pulled and then checks for the four
 * NDISC message types listed in the switch.  A link-local destination
 * triggers dst_link_failure().
 * NOTE(review): the return statements are not visible here.  The
 * caller ip6_forward() hands the packet to ip6_input() for one outcome
 * and drops it (INDISCARDS) when the result is negative.
 */
361 static int ip6_forward_proxy_check(struct sk_buff *skb)
363 struct ipv6hdr *hdr = ipv6_hdr(skb);
364 u8 nexthdr = hdr->nexthdr;
367 if (ipv6_ext_hdr(nexthdr)) {
368 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
372 offset = sizeof(struct ipv6hdr);
374 if (nexthdr == IPPROTO_ICMPV6) {
375 struct icmp6hdr *icmp6;
377 if (!pskb_may_pull(skb, (skb_network_header(skb) +
378 offset + 1 - skb->data)))
381 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
383 switch (icmp6->icmp6_type) {
384 case NDISC_ROUTER_SOLICITATION:
385 case NDISC_ROUTER_ADVERTISEMENT:
386 case NDISC_NEIGHBOUR_SOLICITATION:
387 case NDISC_NEIGHBOUR_ADVERTISEMENT:
389 /* For reaction involving unicast neighbor discovery
390 * message destined to the proxied address, pass it to
400 * The proxying router can't forward traffic sent to a link-local
401 * address, so signal the sender and discard the packet. This
402 * behavior is clarified by the MIPv6 specification.
404 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
405 dst_link_failure(skb);
/* Final step of forwarding: send @skb out through its route with dst_output(). */
412 static inline int ip6_forward_finish(struct sk_buff *skb)
414 return dst_output(skb);
/*
 * Forward a received IPv6 packet along the route attached to @skb.
 *
 * Checks and actions visible in this function, in order:
 *  - tests whether forwarding is enabled (devconf_all->forwarding),
 *    and tests skb_warn_if_lro();
 *  - xfrm6_policy_check() for XFRM_POLICY_FWD; failure counts
 *    INDISCARDS;
 *  - skb_forward_csum();
 *  - router alert: the 16-bit value at bytes 2..3 of the option at
 *    offset opt->ra is used as selector for ip6_call_ra_chain();
 *  - hop limit <= 1: ICMPV6_TIME_EXCEED / ICMPV6_EXC_HOPLIMIT is sent
 *    and INHDRERRORS is counted;
 *  - with proxy_ndp enabled and a pneigh entry for the destination,
 *    ip6_forward_proxy_check() decides between ip6_input() and a drop
 *    counted as INDISCARDS;
 *  - xfrm6_route_forward(); failure counts INDISCARDS;
 *  - when input and output device are the same, the route has a
 *    neighbour, the packet is not source routed and carries no sec
 *    path, a redirect is sent (rate limited with
 *    xrlim_allow(dst, 1*HZ)); the target is the neighbour's key for
 *    RTF_GATEWAY routes and the packet's destination otherwise;
 *  - in the other branch the source address type is checked:
 *    unspecified, multicast and loopback sources are rejected and a
 *    link-local source gets ICMPV6_DEST_UNREACH / ICMPV6_NOT_NEIGHBOUR;
 *  - packets longer than dst_mtu(dst) get ICMPV6_PKT_TOOBIG and count
 *    INTOOBIGERRORS and FRAGFAILS;
 *  - skb_cow() for the output device's hard_header_len; failure counts
 *    OUTDISCARDS;
 *  - finally OUTFORWDATAGRAMS is counted and the packet goes through
 *    the NF_INET_FORWARD hook.
 * INADDRERRORS is counted on an error path near the end.
 * NOTE(review): the goto/return statements, the hop-limit decrement
 * and the error labels are not visible in this view.
 */
417 int ip6_forward(struct sk_buff *skb)
419 struct dst_entry *dst = skb_dst(skb);
420 struct ipv6hdr *hdr = ipv6_hdr(skb);
421 struct inet6_skb_parm *opt = IP6CB(skb);
422 struct net *net = dev_net(dst->dev);
424 if (net->ipv6.devconf_all->forwarding == 0)
427 if (skb_warn_if_lro(skb))
430 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
431 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
435 skb_forward_csum(skb);
438 * We DO NOT make any processing on
439 * RA packets, pushing them to user level AS IS
440 * without ane WARRANTY that application will be able
441 * to interpret them. The reason is that we
442 * cannot make anything clever here.
444 * We are not end-node, so that if packet contains
445 * AH/ESP, we cannot make anything.
446 * Defragmentation also would be mistake, RA packets
447 * cannot be fragmented, because there is no warranty
448 * that different fragments will go along one path. --ANK
451 u8 *ptr = skb_network_header(skb) + opt->ra;
452 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
457 * check and decrement ttl
459 if (hdr->hop_limit <= 1) {
460 /* Force OUTPUT device used as source address */
462 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
464 IP6_INC_STATS_BH(net,
465 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
471 /* XXX: idev->cnf.proxy_ndp? */
472 if (net->ipv6.devconf_all->proxy_ndp &&
473 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
474 int proxied = ip6_forward_proxy_check(skb);
476 return ip6_input(skb);
477 else if (proxied < 0) {
478 IP6_INC_STATS(net, ip6_dst_idev(dst),
479 IPSTATS_MIB_INDISCARDS);
484 if (!xfrm6_route_forward(skb)) {
485 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
490 /* IPv6 specs say nothing about it, but it is clear that we cannot
491 send redirects to source routed frames.
492 We don't send redirects to frames decapsulated from IPsec.
494 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
495 !skb_sec_path(skb)) {
496 struct in6_addr *target = NULL;
498 struct neighbour *n = dst->neighbour;
501 * incoming and outgoing devices are the same
505 rt = (struct rt6_info *) dst;
506 if ((rt->rt6i_flags & RTF_GATEWAY))
507 target = (struct in6_addr*)&n->primary_key;
509 target = &hdr->daddr;
511 /* Limit redirects both by destination (here)
512 and by source (inside ndisc_send_redirect)
514 if (xrlim_allow(dst, 1*HZ))
515 ndisc_send_redirect(skb, n, target);
517 int addrtype = ipv6_addr_type(&hdr->saddr);
519 /* This check is security critical. */
520 if (addrtype == IPV6_ADDR_ANY ||
521 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
523 if (addrtype & IPV6_ADDR_LINKLOCAL) {
524 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
525 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
530 if (skb->len > dst_mtu(dst)) {
531 /* Again, force OUTPUT device used as source address */
533 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
534 IP6_INC_STATS_BH(net,
535 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
536 IP6_INC_STATS_BH(net,
537 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
542 if (skb_cow(skb, dst->dev->hard_header_len)) {
543 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
549 /* Mangling hops number delayed to point after skb COW */
553 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
554 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
558 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from @from to @to so a fragment is treated
 * like the packet it was cut from: packet type, priority, protocol,
 * route, mark, the traffic-control index (CONFIG_NET_SCHED), the
 * netfilter trace flag (TRACE target configs) and the security mark.
 */
564 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
566 to->pkt_type = from->pkt_type;
567 to->priority = from->priority;
568 to->protocol = from->protocol;
/* dst_clone() is applied to the source route before it is attached to @to. */
570 skb_dst_set(to, dst_clone(skb_dst(from)));
572 to->mark = from->mark;
574 #ifdef CONFIG_NET_SCHED
575 to->tc_index = from->tc_index;
578 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
579 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
580 to->nf_trace = from->nf_trace;
582 skb_copy_secmark(to, from);
/*
 * Walk the extension headers of @skb to find where a fragment header
 * would have to be inserted.
 *
 * The walk starts right after the fixed IPv6 header and is bounded by
 * the data between the network header and skb->tail.  *nexthdr is
 * first pointed at the fixed header's nexthdr field and is moved to
 * the nexthdr field of each extension header that is stepped over, so
 * the caller can overwrite the link to the following header.
 * NEXTHDR_ROUTING is one of the handled header types, and with Mobile
 * IPv6 configured the walk also looks for a Home Address option
 * (IPV6_TLV_HAO) via ipv6_find_tlv().
 * NOTE(review): the switch statement, the remaining cases and the
 * return statements are not visible here.  ip6_fragment() uses the
 * return value as the length of the per-fragment header part (hlen).
 */
585 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
587 u16 offset = sizeof(struct ipv6hdr);
588 struct ipv6_opt_hdr *exthdr =
589 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
590 unsigned int packet_len = skb->tail - skb->network_header;
592 *nexthdr = &ipv6_hdr(skb)->nexthdr;
594 while (offset + 1 <= packet_len) {
600 case NEXTHDR_ROUTING:
604 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
605 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
615 offset += ipv6_optlen(exthdr);
616 *nexthdr = &exthdr->nexthdr;
617 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * Split @skb into IPv6 fragments and send each one through @output.
 *
 * hlen is the value returned by ip6_find_1stfragopt(); it is the part
 * copied in front of every fragment.  mtu starts as ip6_skb_dst_mtu()
 * and is then reduced by hlen plus the fragment header size, giving
 * the data space per fragment.  A socket frag_size smaller than the
 * MTU is also taken into account (the assignment itself is not visible
 * here).
 *
 * If skb->local_df is not set, no fragmentation is done: an
 * ICMPV6_PKT_TOOBIG with the MTU is sent and FRAGFAILS is counted.
 *
 * Two strategies are visible:
 *
 *  1. Frag-list path (skb_has_frags()): used when the head and every
 *     buffer on the frag list already have a usable geometry -- each
 *     piece fits in mtu, all but the last are a multiple of 8 bytes,
 *     each frag has at least hlen headroom and is not shared.  The
 *     frag list is detached, a copy of the first hlen bytes is kept in
 *     tmp_hdr (kmemdup), a fragment header is inserted into the head
 *     skb, and each list member gets tmp_hdr plus its own fragment
 *     header pushed in front, with the running offset, the IP6_MF flag
 *     for all but the last, and the shared identification.
 *     FRAGCREATES / FRAGOKS / FRAGFAILS are counted and the route
 *     reference taken with dst_hold() is released.
 *
 *  2. Copy path: for each fragment a new skb is allocated (GFP_ATOMIC)
 *     with room for the link-layer header, hlen, the fragment header
 *     and len bytes of data; metadata is copied with
 *     ip6_copy_metadata(), the new skb is charged to the owning
 *     socket, the header part is copied from the original, and the
 *     data block is copied with skb_copy_bits().  The identification
 *     is chosen once with ipv6_select_ident() and reused.  Allocation
 *     failure logs a message and counts FRAGFAILS.
 *
 * NOTE(review): loop constructs, labels, the calls to @output and the
 * return statements are not visible in this view.
 */
624 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
626 struct sk_buff *frag;
627 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 struct ipv6hdr *tmp_hdr;
631 unsigned int mtu, hlen, left, len;
633 int ptr, offset = 0, err=0;
634 u8 *prevhdr, nexthdr = 0;
635 struct net *net = dev_net(skb_dst(skb)->dev);
637 hlen = ip6_find_1stfragopt(skb, &prevhdr);
640 mtu = ip6_skb_dst_mtu(skb);
642 /* We must not fragment if the socket is set to force MTU discovery
643 * or if the skb it not generated by a local socket. (This last
644 * check should be redundant, but it's free.)
646 if (!skb->local_df) {
647 skb->dev = skb_dst(skb)->dev;
648 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
649 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
650 IPSTATS_MIB_FRAGFAILS);
655 if (np && np->frag_size < mtu) {
659 mtu -= hlen + sizeof(struct frag_hdr);
661 if (skb_has_frags(skb)) {
662 int first_len = skb_pagelen(skb);
665 if (first_len - hlen > mtu ||
666 ((first_len - hlen) & 7) ||
670 skb_walk_frags(skb, frag) {
671 /* Correct geometry. */
672 if (frag->len > mtu ||
673 ((frag->len & 7) && frag->next) ||
674 skb_headroom(frag) < hlen)
677 /* Partially cloned skb? */
678 if (skb_shared(frag))
684 frag->destructor = sock_wfree;
685 truesizes += frag->truesize;
691 frag = skb_shinfo(skb)->frag_list;
692 skb_frag_list_init(skb);
695 *prevhdr = NEXTHDR_FRAGMENT;
696 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
699 IPSTATS_MIB_FRAGFAILS);
703 __skb_pull(skb, hlen);
704 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 __skb_push(skb, hlen);
706 skb_reset_network_header(skb);
707 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709 ipv6_select_ident(skb, fh);
710 fh->nexthdr = nexthdr;
712 fh->frag_off = htons(IP6_MF);
713 frag_id = fh->identification;
715 first_len = skb_pagelen(skb);
716 skb->data_len = first_len - skb_headlen(skb);
717 skb->truesize -= truesizes;
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
722 dst_hold(&rt->u.dst);
725 /* Prepare header of the next frame,
726 * before previous one went down. */
728 frag->ip_summed = CHECKSUM_NONE;
729 skb_reset_transport_header(frag);
730 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 __skb_push(frag, hlen);
732 skb_reset_network_header(frag);
733 memcpy(skb_network_header(frag), tmp_hdr,
735 offset += skb->len - hlen - sizeof(struct frag_hdr);
736 fh->nexthdr = nexthdr;
738 fh->frag_off = htons(offset);
739 if (frag->next != NULL)
740 fh->frag_off |= htons(IP6_MF);
741 fh->identification = frag_id;
742 ipv6_hdr(frag)->payload_len =
744 sizeof(struct ipv6hdr));
745 ip6_copy_metadata(frag, skb);
750 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
751 IPSTATS_MIB_FRAGCREATES);
764 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
765 IPSTATS_MIB_FRAGOKS);
766 dst_release(&rt->u.dst);
776 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
777 IPSTATS_MIB_FRAGFAILS);
778 dst_release(&rt->u.dst);
783 left = skb->len - hlen; /* Space per frame */
784 ptr = hlen; /* Where to start from */
787 * Fragment the datagram.
790 *prevhdr = NEXTHDR_FRAGMENT;
793 * Keep copying data until we run out.
797 /* IF: it doesn't fit, use 'mtu' - the data space left */
800 /* IF: we are not sending upto and including the packet end
801 then align the next start on an eight byte boundary */
809 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
810 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
811 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
812 IPSTATS_MIB_FRAGFAILS);
818 * Set up data on packet
821 ip6_copy_metadata(frag, skb);
822 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
823 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
824 skb_reset_network_header(frag);
825 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
826 frag->transport_header = (frag->network_header + hlen +
827 sizeof(struct frag_hdr));
830 * Charge the memory for the fragment to any owner
834 skb_set_owner_w(frag, skb->sk);
837 * Copy the packet header into the new buffer.
839 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
842 * Build fragment header.
844 fh->nexthdr = nexthdr;
847 ipv6_select_ident(skb, fh);
848 frag_id = fh->identification;
850 fh->identification = frag_id;
853 * Copy a block of the IP datagram.
855 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
859 fh->frag_off = htons(offset);
861 fh->frag_off |= htons(IP6_MF);
862 ipv6_hdr(frag)->payload_len = htons(frag->len -
863 sizeof(struct ipv6hdr));
869 * Put this fragment into the sending queue.
875 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 IPSTATS_MIB_FRAGCREATES);
878 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 IPSTATS_MIB_FRAGOKS);
884 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 IPSTATS_MIB_FRAGFAILS);
/*
 * Check whether a cached route is unusable for flow address @fl_addr.
 *
 * Returns non-zero (route must not be reused) only when both of the
 * following hold:
 *  - @rt_key is not a host route (plen != 128) for exactly @fl_addr,
 *    and
 *  - there is no cached address, or @fl_addr differs from it.
 * Returns 0 when either the host-route key or the cached address
 * matches @fl_addr.
 */
890 static inline int ip6_rt_check(struct rt6key *rt_key,
891 struct in6_addr *fl_addr,
892 struct in6_addr *addr_cache)
894 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
895 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * Validate a socket's cached route @dst against the flow being sent.
 *
 * The route is rejected when ip6_rt_check() fails for the destination
 * (rt6i_dst vs. fl6_dst and np->daddr_cache), when it fails for the
 * source with CONFIG_IPV6_SUBTREES (rt6i_src vs. fl6_src and
 * np->saddr_cache), or when the flow names an output interface that
 * differs from the route's device.
 * NOTE(review): the flow parameter in the signature, what happens to a
 * rejected route, and the return statement are not visible here;
 * ip6_sk_dst_lookup() stores the result back into *dst.
 */
898 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
899 struct dst_entry *dst,
902 struct ipv6_pinfo *np = inet6_sk(sk);
903 struct rt6_info *rt = (struct rt6_info *)dst;
908 /* Yes, checking route validity in not connected
909 * case is not very simple. Take into account,
910 * that we do not support routing by source, TOS,
911 * and MSG_DONTROUTE --ANK (980726)
913 * 1. ip6_rt_check(): If route was host route,
914 * check that cached destination is current.
915 * If it is network route, we still may
916 * check its validity using saved pointer
917 * to the last used address: daddr_cache.
918 * We do not want to save whole address now,
919 * (because main consumer of this service
920 * is tcp, which has not this problem),
921 * so that the last trick works only on connected
923 * 2. oif also should be the same.
925 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
926 #ifdef CONFIG_IPV6_SUBTREES
927 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
929 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * Common tail of the route lookups: resolve *dst for flow @fl and fill
 * in a source address if the flow does not have one.
 *
 *  - *dst is obtained from ip6_route_output() (presumably only when it
 *    is not already set -- the test is not visible here); a route
 *    carrying an error goes to the out_err_release label.
 *  - If fl6_src is the unspecified address, one is selected with
 *    ipv6_dev_get_saddr() using the socket's srcprefs (0 without a
 *    socket).
 *  - With CONFIG_IPV6_OPTIMISTIC_DAD: if the route's neighbour is not
 *    in a NUD_VALID state and the flow's source address is flagged
 *    IFA_F_OPTIMISTIC, the lookup is redone with a copy of the flow
 *    whose destination is zeroed, i.e. towards the default router.
 *  - On the error path, -ENETUNREACH is counted as OUTNOROUTES.
 * NOTE(review): the release of the route on error and the return
 * statements are not visible in this view.
 */
938 static int ip6_dst_lookup_tail(struct sock *sk,
939 struct dst_entry **dst, struct flowi *fl)
942 struct net *net = sock_net(sk);
945 *dst = ip6_route_output(net, sk, fl);
947 if ((err = (*dst)->error))
948 goto out_err_release;
950 if (ipv6_addr_any(&fl->fl6_src)) {
951 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
953 sk ? inet6_sk(sk)->srcprefs : 0,
956 goto out_err_release;
959 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
961 * Here if the dst entry we've looked up
962 * has a neighbour entry that is in the INCOMPLETE
963 * state and the src address from the flow is
964 * marked as OPTIMISTIC, we release the found
965 * dst entry and replace it instead with the
966 * dst entry of the nexthop router
968 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
969 struct inet6_ifaddr *ifp;
973 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
976 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
982 * We need to get the dst entry for the
983 * default router instead
986 memcpy(&fl_gw, fl, sizeof(struct flowi));
987 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
988 *dst = ip6_route_output(net, sk, &fl_gw);
989 if ((err = (*dst)->error))
990 goto out_err_release;
998 if (err == -ENETUNREACH)
999 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1006 * ip6_dst_lookup - perform route lookup on flow
1007 * @sk: socket which provides route info
1008 * @dst: pointer to dst_entry * for result
1009 * @fl: flow to lookup
1011 * This function performs a route lookup on the given flow.
1013 * It returns zero on success, or a standard errno code on error.
/*
 * Uncached route lookup: the result comes straight from
 * ip6_dst_lookup_tail().  Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): any initialisation of *dst before the call is not
 * visible in this view.
 */
1015 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 return ip6_dst_lookup_tail(sk, dst, fl);
1020 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1023 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1024 * @sk: socket which provides the dst cache and route info
1025 * @dst: pointer to dst_entry * for result
1026 * @fl: flow to lookup
1028 * This function performs a route lookup on the given flow with the
1029 * possibility of using the cached route in the socket if it is valid.
1030 * It will take the socket dst lock when operating on the dst cache.
1031 * As a result, this function can only be used in process context.
1033 * It returns zero on success, or a standard errno code on error.
/*
 * Route lookup that first tries the socket's cached route.
 *
 * The cached entry is fetched with sk_dst_check() using the socket's
 * dst_cookie, filtered through ip6_sk_dst_check() against @fl, and the
 * result (possibly NULL) is handed to ip6_dst_lookup_tail().
 * Exported with EXPORT_SYMBOL_GPL.
 */
1035 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1039 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1040 *dst = ip6_sk_dst_check(sk, *dst, fl);
1043 return ip6_dst_lookup_tail(sk, dst, fl);
1045 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * Append data for a device that does UDP fragmentation offload: all
 * data goes into one large skb and the device is told how to cut it.
 *
 * If the write queue is empty a new skb is allocated with room for
 * hh_len + fragheaderlen + transhdrlen + 20 bytes; hh_len is reserved
 * for the link-layer header, the IP/transport header area is added
 * with skb_put(), the network and transport header offsets are set,
 * the checksum mode is CHECKSUM_PARTIAL and sk_sndmsg_off is reset.
 *
 * The payload (length - transhdrlen bytes) is attached as page
 * fragments by skb_append_datato_frags().  Then gso_size is set to
 * mtu minus fragheaderlen minus the fragment header size, gso_type to
 * SKB_GSO_UDP, a fragment identification from ipv6_select_ident() is
 * stored in ip6_frag_id, and the skb is queued on sk_write_queue.
 * NOTE(review): the success/failure tests and return statements are
 * not visible in this view.
 */
1047 static inline int ip6_ufo_append_data(struct sock *sk,
1048 int getfrag(void *from, char *to, int offset, int len,
1049 int odd, struct sk_buff *skb),
1050 void *from, int length, int hh_len, int fragheaderlen,
1051 int transhdrlen, int mtu,unsigned int flags)
1054 struct sk_buff *skb;
1057 /* There is support for UDP large send offload by network
1058 * device, so create one single skb packet containing complete
1061 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1062 skb = sock_alloc_send_skb(sk,
1063 hh_len + fragheaderlen + transhdrlen + 20,
1064 (flags & MSG_DONTWAIT), &err);
1068 /* reserve space for Hardware header */
1069 skb_reserve(skb, hh_len);
1071 /* create space for UDP/IP header */
1072 skb_put(skb,fragheaderlen + transhdrlen);
1074 /* initialize network header pointer */
1075 skb_reset_network_header(skb);
1077 /* initialize protocol header pointer */
1078 skb->transport_header = skb->network_header + fragheaderlen;
1080 skb->ip_summed = CHECKSUM_PARTIAL;
1082 sk->sk_sndmsg_off = 0;
1085 err = skb_append_datato_frags(sk,skb, getfrag, from,
1086 (length - transhdrlen));
1088 struct frag_hdr fhdr;
1090 /* specify the length of each IP datagram fragment*/
1091 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1092 sizeof(struct frag_hdr);
1093 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1094 ipv6_select_ident(skb, &fhdr);
1095 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1096 __skb_queue_tail(&sk->sk_write_queue, skb);
1100 /* There is not enough support do UPD LSO,
1101 * so follow normal path
/*
 * Duplicate an options extension header with kmemdup().
 * The copy size is (hdrlen + 1) * 8 bytes.  Returns NULL when @src is
 * NULL; otherwise returns whatever kmemdup() returns.
 */
1108 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1111 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate a routing extension header with kmemdup().
 * The copy size is (hdrlen + 1) * 8 bytes.  Returns NULL when @src is
 * NULL; otherwise returns whatever kmemdup() returns.
 */
1114 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1117 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Append @length bytes, produced by the @getfrag callback from @from,
 * to the socket's pending (corked) datagram on sk_write_queue, cutting
 * the data into skbs that ip6_push_pending_frames() can later send as
 * fragments.
 *
 * First call for a datagram (write queue empty):
 *  - @opt, when given, is deep-copied into np->cork.opt: the
 *    tot_len/opt_flen/opt_nflen fields are copied and each of dst0opt,
 *    dst1opt, hopopt and srcrt is duplicated with ip6_opt_dup() /
 *    ip6_rthdr_dup(); a non-NULL source with a NULL copy is checked
 *    for each (the reaction is not visible here).  A stale
 *    np->cork.opt triggers WARN_ON.
 *  - a reference to the route is taken and stored in inet->cork.dst;
 *    flow, hop limit and traffic class are saved in the cork.
 *  - the fragment size is the device MTU for IPV6_PMTUDISC_PROBE and
 *    dst_mtu() of the route's path otherwise, capped by np->frag_size
 *    when that is smaller.  IPCORK_ALLFRAG is set if the path requires
 *    fragmenting everything.
 *  - exthdrlen (route header_len plus opt_flen minus rt6i_nfheader_len)
 *    is added to both length and transhdrlen.
 * Later calls reuse the route, flow and fragment size from the cork.
 *
 * Sizes: fragheaderlen is the fixed header plus rt6i_nfheader_len plus
 * opt_nflen; maxfraglen is the largest fragment whose payload is a
 * multiple of 8 bytes, less the fragment header.  If the total would
 * exceed what an IPv6 packet can carry, EMSGSIZE is reported through
 * ipv6_local_error().
 *
 * UDP payloads larger than the MTU on a NETIF_F_UFO device go through
 * ip6_ufo_append_data().
 *
 * The main loop runs while data remains.  When the current skb has no
 * room, a new one is allocated (sock_alloc_send_skb() or
 * sock_wmalloc()), headroom is reserved for the link-layer header plus
 * a fragment header, any bytes of the previous skb beyond maxfraglen
 * (fraggap) are moved over with skb_copy_and_csum_bits() and the
 * previous skb is trimmed, then data is copied in through @getfrag and
 * the skb is queued.  When there is room, data is appended either to
 * the linear area (device without NETIF_F_SG) or to page fragments,
 * reusing sk->sk_sndmsg_page while it has space and allocating new
 * pages otherwise; data_len, truesize and sk_wmem_alloc are updated.
 *
 * On the error path the cork length is reduced by the unsent length
 * and OUTDISCARDS is counted.
 * NOTE(review): many control-flow lines (tests, gotos, labels,
 * returns) are not visible in this view, so exact error codes are not
 * documented here.
 */
1120 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1121 int offset, int len, int odd, struct sk_buff *skb),
1122 void *from, int length, int transhdrlen,
1123 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1124 struct rt6_info *rt, unsigned int flags)
1126 struct inet_sock *inet = inet_sk(sk);
1127 struct ipv6_pinfo *np = inet6_sk(sk);
1128 struct sk_buff *skb;
1129 unsigned int maxfraglen, fragheaderlen;
1136 int csummode = CHECKSUM_NONE;
1138 if (flags&MSG_PROBE)
1140 if (skb_queue_empty(&sk->sk_write_queue)) {
1145 if (WARN_ON(np->cork.opt))
1148 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1149 if (unlikely(np->cork.opt == NULL))
1152 np->cork.opt->tot_len = opt->tot_len;
1153 np->cork.opt->opt_flen = opt->opt_flen;
1154 np->cork.opt->opt_nflen = opt->opt_nflen;
1156 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1158 if (opt->dst0opt && !np->cork.opt->dst0opt)
1161 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1163 if (opt->dst1opt && !np->cork.opt->dst1opt)
1166 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1168 if (opt->hopopt && !np->cork.opt->hopopt)
1171 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1173 if (opt->srcrt && !np->cork.opt->srcrt)
1176 /* need source address above miyazawa*/
1178 dst_hold(&rt->u.dst);
1179 inet->cork.dst = &rt->u.dst;
1180 inet->cork.fl = *fl;
1181 np->cork.hop_limit = hlimit;
1182 np->cork.tclass = tclass;
1183 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1184 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1185 if (np->frag_size < mtu) {
1187 mtu = np->frag_size;
1189 inet->cork.fragsize = mtu;
1190 if (dst_allfrag(rt->u.dst.path))
1191 inet->cork.flags |= IPCORK_ALLFRAG;
1192 inet->cork.length = 0;
1193 sk->sk_sndmsg_page = NULL;
1194 sk->sk_sndmsg_off = 0;
1195 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1196 rt->rt6i_nfheader_len;
1197 length += exthdrlen;
1198 transhdrlen += exthdrlen;
1200 rt = (struct rt6_info *)inet->cork.dst;
1201 fl = &inet->cork.fl;
1205 mtu = inet->cork.fragsize;
1208 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1210 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1211 (opt ? opt->opt_nflen : 0);
1212 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1214 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1215 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1216 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1222 * Let's try using as much space as possible.
1223 * Use MTU if total length of the message fits into the MTU.
1224 * Otherwise, we need to reserve fragment header and
1225 * fragment alignment (= 8-15 octects, in total).
1227 * Note that we may need to "move" the data from the tail of
1228 * of the buffer to the new fragment when we split
1231 * FIXME: It may be fragmented into multiple chunks
1232 * at once if non-fragmentable extension headers
1237 inet->cork.length += length;
1238 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1239 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1241 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1242 fragheaderlen, transhdrlen, mtu,
1249 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1252 while (length > 0) {
1253 /* Check if the remaining data fits into current packet. */
1254 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1256 copy = maxfraglen - skb->len;
1260 unsigned int datalen;
1261 unsigned int fraglen;
1262 unsigned int fraggap;
1263 unsigned int alloclen;
1264 struct sk_buff *skb_prev;
1268 /* There's no room in the current skb */
1270 fraggap = skb_prev->len - maxfraglen;
1275 * If remaining data exceeds the mtu,
1276 * we know we need more fragment(s).
1278 datalen = length + fraggap;
1279 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1280 datalen = maxfraglen - fragheaderlen;
1282 fraglen = datalen + fragheaderlen;
1283 if ((flags & MSG_MORE) &&
1284 !(rt->u.dst.dev->features&NETIF_F_SG))
1287 alloclen = datalen + fragheaderlen;
1290 * The last fragment gets additional space at tail.
1291 * Note: we overallocate on fragments with MSG_MODE
1292 * because we have no idea if we're the last one.
1294 if (datalen == length + fraggap)
1295 alloclen += rt->u.dst.trailer_len;
1298 * We just reserve space for fragment header.
1299 * Note: this may be overallocation if the message
1300 * (without MSG_MORE) fits into the MTU.
1302 alloclen += sizeof(struct frag_hdr);
1305 skb = sock_alloc_send_skb(sk,
1307 (flags & MSG_DONTWAIT), &err);
1310 if (atomic_read(&sk->sk_wmem_alloc) <=
1312 skb = sock_wmalloc(sk,
1313 alloclen + hh_len, 1,
1315 if (unlikely(skb == NULL))
1321 * Fill in the control structures
1323 skb->ip_summed = csummode;
1325 /* reserve for fragmentation */
1326 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1329 * Find where to start putting bytes
1331 data = skb_put(skb, fraglen);
1332 skb_set_network_header(skb, exthdrlen);
1333 data += fragheaderlen;
1334 skb->transport_header = (skb->network_header +
1337 skb->csum = skb_copy_and_csum_bits(
1338 skb_prev, maxfraglen,
1339 data + transhdrlen, fraggap, 0);
1340 skb_prev->csum = csum_sub(skb_prev->csum,
1343 pskb_trim_unique(skb_prev, maxfraglen);
1345 copy = datalen - transhdrlen - fraggap;
1350 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1357 length -= datalen - fraggap;
1360 csummode = CHECKSUM_NONE;
1363 * Put the packet on the pending queue
1365 __skb_queue_tail(&sk->sk_write_queue, skb);
1372 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1376 if (getfrag(from, skb_put(skb, copy),
1377 offset, copy, off, skb) < 0) {
1378 __skb_trim(skb, off);
1383 int i = skb_shinfo(skb)->nr_frags;
1384 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1385 struct page *page = sk->sk_sndmsg_page;
1386 int off = sk->sk_sndmsg_off;
1389 if (page && (left = PAGE_SIZE - off) > 0) {
1392 if (page != frag->page) {
1393 if (i == MAX_SKB_FRAGS) {
1398 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1399 frag = &skb_shinfo(skb)->frags[i];
1401 } else if(i < MAX_SKB_FRAGS) {
1402 if (copy > PAGE_SIZE)
1404 page = alloc_pages(sk->sk_allocation, 0);
1409 sk->sk_sndmsg_page = page;
1410 sk->sk_sndmsg_off = 0;
1412 skb_fill_page_desc(skb, i, page, 0, 0);
1413 frag = &skb_shinfo(skb)->frags[i];
1418 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1422 sk->sk_sndmsg_off += copy;
1425 skb->data_len += copy;
1426 skb->truesize += copy;
1427 atomic_add(copy, &sk->sk_wmem_alloc);
1434 inet->cork.length -= length;
1435 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * Drop all state held by the cork after a pending datagram has been
 * sent or flushed.
 *
 * Frees the four option buffers and the options structure itself and
 * clears np->cork.opt; releases the route reference held in
 * inet->cork.dst, clears the pointer and the IPCORK_ALLFRAG flag; and
 * zeroes the saved flow.
 * NOTE(review): the guard around the option frees is not visible here;
 * the kfree() calls dereference np->cork.opt, so it is presumably a
 * NULL check on that pointer.
 */
1439 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1442 kfree(np->cork.opt->dst0opt);
1443 kfree(np->cork.opt->dst1opt);
1444 kfree(np->cork.opt->hopopt);
1445 kfree(np->cork.opt->srcrt);
1446 kfree(np->cork.opt);
1447 np->cork.opt = NULL;
1450 if (inet->cork.dst) {
1451 dst_release(inet->cork.dst);
1452 inet->cork.dst = NULL;
1453 inet->cork.flags &= ~IPCORK_ALLFRAG;
1455 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * Turn the skbs queued by ip6_append_data() into one IPv6 packet and
 * send it.
 *
 *  - The first queued skb becomes the head; every further skb is
 *    dequeued, has the head's network header length pulled from its
 *    data, is chained onto the head's frag_list, and has its len and
 *    truesize added to the head.  Each chained skb gives up its socket
 *    reference (__sock_put) and destructor.
 *  - Extension headers from the corked options are pushed, then the
 *    fixed header: flow label, traffic class and hop limit come from
 *    the cork, nexthdr is the resulting proto, source address comes
 *    from the corked flow, and destination is final_dst (which
 *    ipv6_push_nfrag_opts() may change).
 *  - Priority and mark are taken from the socket, the corked route is
 *    attached with its own reference, OUT statistics are updated, and
 *    for ICMPv6 the per-type and OUTMSGS counters are incremented.
 *  - The packet is sent with ip6_local_out().  The error is then
 *    rewritten: with np->recverr it becomes net_xmit_errno(err),
 *    otherwise 0.  NOTE(review): the test guarding that rewrite is
 *    not visible here.
 *  - The cork is released with ip6_cork_release().
 * NOTE(review): labels, the local_df assignment under the pmtudisc
 * test and the return statement are not visible in this view.
 */
1458 int ip6_push_pending_frames(struct sock *sk)
1460 struct sk_buff *skb, *tmp_skb;
1461 struct sk_buff **tail_skb;
1462 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1463 struct inet_sock *inet = inet_sk(sk);
1464 struct ipv6_pinfo *np = inet6_sk(sk);
1465 struct net *net = sock_net(sk);
1466 struct ipv6hdr *hdr;
1467 struct ipv6_txoptions *opt = np->cork.opt;
1468 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1469 struct flowi *fl = &inet->cork.fl;
1470 unsigned char proto = fl->proto;
1473 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1475 tail_skb = &(skb_shinfo(skb)->frag_list);
1477 /* move skb->data to ip header from ext header */
1478 if (skb->data < skb_network_header(skb))
1479 __skb_pull(skb, skb_network_offset(skb));
1480 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1481 __skb_pull(tmp_skb, skb_network_header_len(skb));
1482 *tail_skb = tmp_skb;
1483 tail_skb = &(tmp_skb->next);
1484 skb->len += tmp_skb->len;
1485 skb->data_len += tmp_skb->len;
1486 skb->truesize += tmp_skb->truesize;
1487 __sock_put(tmp_skb->sk);
1488 tmp_skb->destructor = NULL;
1492 /* Allow local fragmentation. */
1493 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1496 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1497 __skb_pull(skb, skb_network_header_len(skb));
1498 if (opt && opt->opt_flen)
1499 ipv6_push_frag_opts(skb, opt, &proto);
1500 if (opt && opt->opt_nflen)
1501 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1503 skb_push(skb, sizeof(struct ipv6hdr));
1504 skb_reset_network_header(skb);
1505 hdr = ipv6_hdr(skb);
1507 *(__be32*)hdr = fl->fl6_flowlabel |
1508 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1510 hdr->hop_limit = np->cork.hop_limit;
1511 hdr->nexthdr = proto;
1512 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1513 ipv6_addr_copy(&hdr->daddr, final_dst);
1515 skb->priority = sk->sk_priority;
1516 skb->mark = sk->sk_mark;
1518 skb_dst_set(skb, dst_clone(&rt->u.dst));
1519 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1520 if (proto == IPPROTO_ICMPV6) {
1521 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1523 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1524 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1527 err = ip6_local_out(skb);
1530 err = np->recverr ? net_xmit_errno(err) : 0;
1536 ip6_cork_release(inet, np);
/*
 * Throw away everything queued by ip6_append_data() for @sk.
 *
 * Skbs are removed from the tail of sk_write_queue one by one, with
 * OUTDISCARDS counted against the device of the skb's route, and the
 * cork state is released afterwards with ip6_cork_release().
 * NOTE(review): any guard on the statistics update and the freeing of
 * each skb are not visible in this view.
 */
1542 void ip6_flush_pending_frames(struct sock *sk)
1544 struct sk_buff *skb;
1546 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1548 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1549 IPSTATS_MIB_OUTDISCARDS);
1553 ip6_cork_release(inet_sk(sk), inet6_sk(sk));