2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetics in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
60 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * ipv6_select_ident - stamp a fragment header with the next fragment ID.
 *
 * Writes the function-static counter ipv6_fragmentation_id (initialised
 * to 1) into fhdr->identification in network byte order, then advances
 * the counter.  The read and the update both happen under ip6_id_lock
 * with bottom halves disabled.  @skb is not referenced by any of the
 * lines shown here.
 *
 * NOTE(review): this listing omits some lines of the original function
 * (at least the braces); only the visible statements are described.
 */
62 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
64 static u32 ipv6_fragmentation_id = 1;
65 static DEFINE_SPINLOCK(ip6_id_lock);
67 spin_lock_bh(&ip6_id_lock);
68 fhdr->identification = htonl(ipv6_fragmentation_id);
/* On wrap-around restart at 1, so the counter never rests at 0. */
69 if (++ipv6_fragmentation_id == 0)
70 ipv6_fragmentation_id = 1;
71 spin_unlock_bh(&ip6_id_lock);
/*
 * __ip6_local_out - fill in payload_len and run the LOCAL_OUT hook.
 *
 * len is skb->len minus the fixed IPv6 header size; it is stored in the
 * header's payload_len field via htons().  The packet is then handed to
 * nf_hook() for PF_INET6 / NF_INET_LOCAL_OUT with a NULL input device and
 * skb->dst->dev as the output device, and the hook's result is returned.
 *
 * NOTE(review): the statement executed when len > IPV6_MAXPLEN and the
 * final nf_hook() argument are not present in this listing.
 */
74 int __ip6_local_out(struct sk_buff *skb)
78 len = skb->len - sizeof(struct ipv6hdr);
79 if (len > IPV6_MAXPLEN)
81 ipv6_hdr(skb)->payload_len = htons(len);
83 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * ip6_local_out - send a locally generated IPv6 packet.
 *
 * Calls __ip6_local_out() and dst_output(), storing each result in err.
 * The symbol is exported to GPL modules.
 *
 * NOTE(review): the condition deciding whether dst_output() runs and the
 * return statement are missing from this listing; presumably
 * dst_output() is only reached when the hook let the packet pass --
 * confirm against the full source.
 */
87 int ip6_local_out(struct sk_buff *skb)
91 err = __ip6_local_out(skb);
93 err = dst_output(skb);
97 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_output_finish - hand the packet to the link layer.
 *
 * Uses the dst attached to the skb: one branch transmits through
 * neigh_hh_output() with dst->hh, the next through the output method of
 * dst->neighbour when a neighbour entry exists.  Otherwise
 * IPSTATS_MIB_OUTNOROUTES is incremented for the dst's inet6 device.
 *
 * NOTE(review): the condition guarding the neigh_hh_output() call and
 * everything after the statistics update (cleanup, return value) are
 * not visible in this listing.
 */
99 static int ip6_output_finish(struct sk_buff *skb)
101 struct dst_entry *dst = skb->dst;
104 return neigh_hh_output(dst->hh, skb);
105 else if (dst->neighbour)
106 return dst->neighbour->output(skb);
108 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_dev_loopback_xmit - prepare a looped-back copy of an outgoing packet.
 *
 * Resets the MAC header, pulls the data pointer up to the network header,
 * marks the buffer PACKET_LOOPBACK with CHECKSUM_UNNECESSARY, and asserts
 * via BUG_TRAP() that a dst is attached.  ip6_output2() passes this
 * function as the continuation of its NF_INET_POST_ROUTING hook for the
 * cloned multicast packet.
 *
 * NOTE(review): the delivery call and the return statement are missing
 * from this listing.
 */
114 /* dev_loopback_xmit for use with netfilter. */
115 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
117 skb_reset_mac_header(newskb);
118 __skb_pull(newskb, skb_network_offset(newskb));
119 newskb->pkt_type = PACKET_LOOPBACK;
120 newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 BUG_TRAP(newskb->dst);
/*
 * ip6_output2 - post-routing output step, including multicast loopback.
 *
 * Marks the skb as ETH_P_IPV6.  For a multicast destination address the
 * visible condition requires that the output device is not a loopback
 * device, that there is no sending socket or the socket has mc_loop set,
 * and that either mroute6_socket is set for a packet not flagged
 * IP6SKB_FORWARDED or ipv6_chk_mcast_addr() matches the destination and
 * source addresses on the device.  In that case the skb is cloned and the
 * clone is run through NF_INET_POST_ROUTING towards
 * ip6_dev_loopback_xmit().  A hop limit of 0 is counted as
 * IPSTATS_MIB_OUTDISCARDS and IPSTATS_MIB_OUTMCASTPKTS is incremented.
 * The packet itself is finally passed to the NF_INET_POST_ROUTING hook
 * and that hook's result is returned.
 *
 * NOTE(review): several lines are missing from this listing, including
 * the check of the clone, what follows the OUTDISCARDS count, and the
 * continuation function of the final hook.
 */
128 static int ip6_output2(struct sk_buff *skb)
130 struct dst_entry *dst = skb->dst;
131 struct net_device *dev = dst->dev;
133 skb->protocol = htons(ETH_P_IPV6);
136 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
137 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
138 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
140 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
141 ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
142 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
143 &ipv6_hdr(skb)->saddr))) {
144 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
146 /* Do not check for IFF_ALLMULTI; multicast routing
147 is not supported in any case.
150 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
152 ip6_dev_loopback_xmit);
154 if (ipv6_hdr(skb)->hop_limit == 0) {
155 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
161 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
164 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * ip6_skb_dst_mtu - MTU to apply to an outgoing skb.
 *
 * When the skb has an owning socket whose pmtudisc setting is
 * IPV6_PMTUDISC_PROBE, the MTU of the output device (skb->dst->dev->mtu)
 * is returned; in every other case, including no socket at all, the
 * result is dst_mtu(skb->dst).
 */
168 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
170 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
172 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
173 skb->dst->dev->mtu : dst_mtu(skb->dst);
/*
 * ip6_output - output entry point that fragments when required.
 *
 * The packet goes through ip6_fragment(), with ip6_output2() as the
 * per-fragment output function, when it is longer than ip6_skb_dst_mtu()
 * and is not a GSO packet, or when dst_allfrag() is set on its dst.
 * Otherwise ip6_output2() is called directly.
 *
 * NOTE(review): the line joining the two return statements (presumably
 * an else) is missing from this listing.
 */
176 int ip6_output(struct sk_buff *skb)
178 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
179 dst_allfrag(skb->dst))
180 return ip6_fragment(skb, ip6_output2);
182 return ip6_output2(skb);
186 * xmit an sk_buff (used by TCP)
/*
 * ip6_xmit - build the IPv6 header for a socket's skb and send it.
 *
 * @sk:       sending socket (source of hop limit, priority and mark)
 * @skb:      packet, with the dst already attached
 * @fl:       flow providing protocol, flow label and addresses
 * @opt:      extension header options to push in front of the payload
 * @ipfragok: non-zero allows sending even when skb->len exceeds the mtu
 *
 * Visible steps: the required headroom is computed from the option
 * lengths plus the IPv6 header and LL_RESERVED_SPACE() of the output
 * device, and the skb is reallocated with skb_realloc_headroom() when its
 * headroom is too small (IPSTATS_MIB_OUTDISCARDS is counted on that
 * path).  Fragmentable and non-fragmentable options are pushed, then the
 * fixed header is pushed and filled: version/traffic class/flow label
 * word, payload_len = seg_len, nexthdr, hop limit, source address from
 * the flow and destination address first_hop (initially &fl->fl6_dst;
 * its address is passed to ipv6_push_nfrag_opts()).  skb->priority and
 * skb->mark are copied from the socket.  If the packet fits the mtu,
 * ipfragok is set, or the skb is GSO, OUTREQUESTS is counted and the
 * packet enters the NF_INET_LOCAL_OUT hook.  Otherwise a debug message
 * is logged, an ICMPV6_PKT_TOOBIG is sent with skb->dev as device
 * argument, and FRAGFAILS is counted.
 *
 * NOTE(review): many lines are missing from this listing (declarations of
 * hdr, mtu, tclass and hlimit, the conditions around the option and hop
 * limit handling, the failure cleanup and return values).
 */
189 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
190 struct ipv6_txoptions *opt, int ipfragok)
192 struct ipv6_pinfo *np = inet6_sk(sk);
193 struct in6_addr *first_hop = &fl->fl6_dst;
194 struct dst_entry *dst = skb->dst;
196 u8 proto = fl->proto;
197 int seg_len = skb->len;
202 unsigned int head_room;
204 /* First: exthdrs may take lots of space (~8K for now)
205 MAX_HEADER is not enough.
207 head_room = opt->opt_nflen + opt->opt_flen;
208 seg_len += head_room;
209 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
211 if (skb_headroom(skb) < head_room) {
212 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
214 IP6_INC_STATS(ip6_dst_idev(skb->dst),
215 IPSTATS_MIB_OUTDISCARDS);
222 skb_set_owner_w(skb, sk);
225 ipv6_push_frag_opts(skb, opt, &proto);
227 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
230 skb_push(skb, sizeof(struct ipv6hdr));
231 skb_reset_network_header(skb);
235 * Fill in the IPv6 header
240 hlimit = np->hop_limit;
242 hlimit = ip6_dst_hoplimit(dst);
250 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
252 hdr->payload_len = htons(seg_len);
253 hdr->nexthdr = proto;
254 hdr->hop_limit = hlimit;
256 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
257 ipv6_addr_copy(&hdr->daddr, first_hop);
259 skb->priority = sk->sk_priority;
260 skb->mark = sk->sk_mark;
263 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
264 IP6_INC_STATS(ip6_dst_idev(skb->dst),
265 IPSTATS_MIB_OUTREQUESTS);
266 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
271 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
273 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
274 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
279 EXPORT_SYMBOL(ip6_xmit);
282 * To avoid extra problems ND packets are sent through this
283 * routine. It's code duplication but I really want to avoid
284 * extra checks since ipv6_build_header is used by TCP (which
285 * is for us performance critical)
/*
 * ip6_nd_hdr - append a bare IPv6 header to an skb.
 *
 * Sets skb->protocol to ETH_P_IPV6, marks the current data position as
 * the network header and extends the skb by sizeof(struct ipv6hdr) with
 * skb_put().  The first 32-bit word is set to htonl(0x60000000);
 * payload_len is htons(len), nexthdr is proto, the hop limit is the
 * socket's np->hop_limit, and the addresses are copied from @saddr and
 * @daddr.  totlen is computed as len plus the header size.
 *
 * NOTE(review): the last line of the parameter list (where proto and len
 * are presumably declared), the assignment of hdr, any use of @dev or
 * totlen, and the return statement are missing from this listing.
 */
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 const struct in6_addr *saddr, const struct in6_addr *daddr,
292 struct ipv6_pinfo *np = inet6_sk(sk);
296 skb->protocol = htons(ETH_P_IPV6);
299 totlen = len + sizeof(struct ipv6hdr);
301 skb_reset_network_header(skb);
302 skb_put(skb, sizeof(struct ipv6hdr));
305 *(__be32*)hdr = htonl(0x60000000);
307 hdr->payload_len = htons(len);
308 hdr->nexthdr = proto;
309 hdr->hop_limit = np->hop_limit;
311 ipv6_addr_copy(&hdr->saddr, saddr);
312 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - offer a packet to the sockets on ip6_ra_chain.
 *
 * Walks ip6_ra_chain under read_lock(&ip6_ra_lock).  An entry is
 * considered when it has a socket, its sel equals @sel, and the socket is
 * either not bound to a device or bound to the ifindex of skb->dev.
 * Clones of @skb, and finally @skb itself, are handed to rawv6_rcv() for
 * the socket held in `last`.  The lock is dropped on both visible exit
 * paths.
 *
 * NOTE(review): the lines that update `last`, test the clone and return
 * a value are missing from this listing.  ip6_forward() in this file
 * tests the return value for non-zero.
 */
317 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
319 struct ip6_ra_chain *ra;
320 struct sock *last = NULL;
322 read_lock(&ip6_ra_lock);
323 for (ra = ip6_ra_chain; ra; ra = ra->next) {
324 struct sock *sk = ra->sk;
325 if (sk && ra->sel == sel &&
326 (!sk->sk_bound_dev_if ||
327 sk->sk_bound_dev_if == skb->dev->ifindex)) {
329 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
331 rawv6_rcv(last, skb2);
338 rawv6_rcv(last, skb);
339 read_unlock(&ip6_ra_lock);
342 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet addressed to a proxied
 * neighbour.
 *
 * Finds the upper-layer header: when the first next-header value is an
 * extension header, ipv6_skip_exthdr() is used, otherwise the offset is
 * the size of the fixed IPv6 header.  For ICMPv6 the first byte of the
 * ICMPv6 header is made accessible with pskb_may_pull() and the message
 * type is examined; router/neighbour solicitations and advertisements
 * are singled out.  For a link-local destination address
 * dst_link_failure() is called on the skb.
 *
 * NOTE(review): every return statement is missing from this listing.
 * The caller, ip6_forward(), stores the result in `proxied` and treats a
 * negative value as "discard".
 */
346 static int ip6_forward_proxy_check(struct sk_buff *skb)
348 struct ipv6hdr *hdr = ipv6_hdr(skb);
349 u8 nexthdr = hdr->nexthdr;
352 if (ipv6_ext_hdr(nexthdr)) {
353 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
357 offset = sizeof(struct ipv6hdr);
359 if (nexthdr == IPPROTO_ICMPV6) {
360 struct icmp6hdr *icmp6;
362 if (!pskb_may_pull(skb, (skb_network_header(skb) +
363 offset + 1 - skb->data)))
366 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
368 switch (icmp6->icmp6_type) {
369 case NDISC_ROUTER_SOLICITATION:
370 case NDISC_ROUTER_ADVERTISEMENT:
371 case NDISC_NEIGHBOUR_SOLICITATION:
372 case NDISC_NEIGHBOUR_ADVERTISEMENT:
374 /* For reaction involving unicast neighbor discovery
375 * message destined to the proxied address, pass it to
385 * The proxying router can't forward traffic sent to a link-local
386 * address, so signal the sender and discard the packet. This
387 * behavior is clarified by the MIPv6 specification.
389 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
390 dst_link_failure(skb);
/*
 * ip6_forward_finish - last step of forwarding: pass the skb to
 * dst_output() and return its result.
 */
397 static inline int ip6_forward_finish(struct sk_buff *skb)
399 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet along its dst.
 *
 * Checks visible in this listing, in order:
 *  - ipv6_devconf.forwarding is tested against 0;
 *  - xfrm6_policy_check() for XFRM_POLICY_FWD, failure counts INDISCARDS;
 *  - skb_forward_csum() is applied;
 *  - a Router Alert value is read from bytes 2 and 3 at offset opt->ra of
 *    the network header and offered to ip6_call_ra_chain();
 *  - hop_limit <= 1 sends ICMPV6_TIME_EXCEED / ICMPV6_EXC_HOPLIMIT and
 *    counts INHDRERRORS;
 *  - with proxy_ndp enabled and a pneigh entry for the destination, the
 *    result of ip6_forward_proxy_check() selects between ip6_input() and
 *    a discard (negative result, INDISCARDS);
 *  - xfrm6_route_forward() failure counts INDISCARDS;
 *  - when the packet would leave on the device it arrived on, a neighbour
 *    exists and no source route was used, a redirect is sent through
 *    ndisc_send_redirect(), rate limited by xrlim_allow(dst, 1*HZ); the
 *    target is the neighbour's key for RTF_GATEWAY routes, otherwise the
 *    destination address;
 *  - in the alternative branch the source address type is checked:
 *    multicast and loopback sources are rejected, link-local sources get
 *    ICMPV6_DEST_UNREACH / ICMPV6_NOT_NEIGHBOUR;
 *  - a packet longer than dst_mtu(dst) triggers ICMPV6_PKT_TOOBIG and
 *    counts INTOOBIGERRORS and FRAGFAILS;
 *  - skb_cow() for the output device's hard_header_len, failure counts
 *    OUTDISCARDS.
 * On success OUTFORWDATAGRAMS is counted and the packet enters the
 * NF_INET_FORWARD hook with skb->dev as input and dst->dev as output
 * device.  An error path counts INADDRERRORS.
 *
 * NOTE(review): jump targets, the hop limit decrement, the rest of the
 * redirect condition and all error returns are missing from this
 * listing, so exact control flow between the steps above is not shown.
 */
402 int ip6_forward(struct sk_buff *skb)
404 struct dst_entry *dst = skb->dst;
405 struct ipv6hdr *hdr = ipv6_hdr(skb);
406 struct inet6_skb_parm *opt = IP6CB(skb);
407 struct net *net = dev_net(dst->dev);
409 if (ipv6_devconf.forwarding == 0)
412 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
413 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
417 skb_forward_csum(skb);
420 * We DO NOT make any processing on
421 * RA packets, pushing them to user level AS IS
422 * without ane WARRANTY that application will be able
423 * to interpret them. The reason is that we
424 * cannot make anything clever here.
426 * We are not end-node, so that if packet contains
427 * AH/ESP, we cannot make anything.
428 * Defragmentation also would be mistake, RA packets
429 * cannot be fragmented, because there is no warranty
430 * that different fragments will go along one path. --ANK
433 u8 *ptr = skb_network_header(skb) + opt->ra;
434 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
439 * check and decrement ttl
441 if (hdr->hop_limit <= 1) {
442 /* Force OUTPUT device used as source address */
444 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
446 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
452 /* XXX: idev->cnf.proxy_ndp? */
453 if (ipv6_devconf.proxy_ndp &&
454 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
455 int proxied = ip6_forward_proxy_check(skb);
457 return ip6_input(skb);
458 else if (proxied < 0) {
459 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
464 if (!xfrm6_route_forward(skb)) {
465 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
470 /* IPv6 specs say nothing about it, but it is clear that we cannot
471 send redirects to source routed frames.
472 We don't send redirects to frames decapsulated from IPsec.
474 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
476 struct in6_addr *target = NULL;
478 struct neighbour *n = dst->neighbour;
481 * incoming and outgoing devices are the same
485 rt = (struct rt6_info *) dst;
486 if ((rt->rt6i_flags & RTF_GATEWAY))
487 target = (struct in6_addr*)&n->primary_key;
489 target = &hdr->daddr;
491 /* Limit redirects both by destination (here)
492 and by source (inside ndisc_send_redirect)
494 if (xrlim_allow(dst, 1*HZ))
495 ndisc_send_redirect(skb, n, target);
497 int addrtype = ipv6_addr_type(&hdr->saddr);
499 /* This check is security critical. */
500 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
502 if (addrtype & IPV6_ADDR_LINKLOCAL) {
503 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
504 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
509 if (skb->len > dst_mtu(dst)) {
510 /* Again, force OUTPUT device used as source address */
512 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
513 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
514 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
519 if (skb_cow(skb, dst->dev->hard_header_len)) {
520 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
526 /* Mangling hops number delayed to point after skb COW */
530 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata from one skb to another.
 *
 * Copies pkt_type, priority, protocol and mark from @from to @to.  The
 * dst currently attached to @to is released and replaced by a new
 * reference on @from's dst (dst_clone()).  tc_index is copied under
 * CONFIG_NET_SCHED, nf_trace when the xt TRACE target is built in or as
 * a module, and the security mark through skb_copy_secmark().  Both
 * fragmentation paths of ip6_fragment() call this for each fragment.
 *
 * NOTE(review): some lines of the original (including the preprocessor
 * block terminators) are missing from this listing.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 to->pkt_type = from->pkt_type;
544 to->priority = from->priority;
545 to->protocol = from->protocol;
546 dst_release(to->dst);
547 to->dst = dst_clone(from->dst);
549 to->mark = from->mark;
551 #ifdef CONFIG_NET_SCHED
552 to->tc_index = from->tc_index;
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 to->nf_trace = from->nf_trace;
559 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the leading extension headers of a packet.
 *
 * Starts directly behind the fixed IPv6 header (offset =
 * sizeof(struct ipv6hdr)) and iterates while offset + 1 is within the
 * bytes between the network header and skb->tail.  *nexthdr initially
 * points at the nexthdr field of the fixed header and is moved to the
 * nexthdr field of each extension header that is stepped over; offset
 * grows by ipv6_optlen() of that header.  NEXTHDR_ROUTING is one of the
 * cases handled, and with Mobile IPv6 configured ipv6_find_tlv() looks
 * for an IPV6_TLV_HAO option.
 *
 * ip6_fragment() uses the return value as hlen, the number of leading
 * header bytes replicated into every fragment, and writes
 * NEXTHDR_FRAGMENT through the pointer stored in *nexthdr.
 *
 * NOTE(review): the switch statement, its other cases and all return
 * statements are missing from this listing.
 */
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
564 u16 offset = sizeof(struct ipv6hdr);
565 struct ipv6_opt_hdr *exthdr =
566 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 unsigned int packet_len = skb->tail - skb->network_header;
569 *nexthdr = &ipv6_hdr(skb)->nexthdr;
571 while (offset + 1 <= packet_len) {
577 case NEXTHDR_ROUTING:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592 offset += ipv6_optlen(exthdr);
593 *nexthdr = &exthdr->nexthdr;
594 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ip6_fragment - split an skb into IPv6 fragments and send each one.
 *
 * @skb:    packet to fragment
 * @output: function called to transmit a fragment
 *
 * Common setup: hlen comes from ip6_find_1stfragopt() and is the length
 * of the headers copied into every fragment; mtu comes from
 * ip6_skb_dst_mtu().  When skb->local_df is not set, an
 * ICMPV6_PKT_TOOBIG is sent and FRAGFAILS is counted instead of
 * fragmenting.  The socket's frag_size is compared against mtu, and the
 * per-fragment payload budget is mtu minus hlen and the fragment header.
 *
 * Path 1 (skb already carries a frag_list): the geometry is validated --
 * the head and every list member must fit the budget, all but the last
 * must be a multiple of 8 bytes, list members need hlen bytes of
 * headroom and must not be shared.  The list is then detached, the
 * header bytes are saved with kmemdup(), a fragment header is inserted
 * into the head skb (new identification, IP6_MF set) and each list
 * member gets the saved headers plus a fragment header with the running
 * offset and the same identification.  FRAGCREATES / FRAGOKS /
 * FRAGFAILS are counted and the route reference taken with dst_hold()
 * is dropped on the visible exits.
 *
 * Path 2 (copying): for each chunk a new skb is allocated with room for
 * the link-layer reserve, hlen, the fragment header and the data;
 * metadata is copied with ip6_copy_metadata(), the skb is charged to
 * the owning socket, the headers are copied from the original, the
 * fragment header is filled in (the identification is selected once and
 * reused through frag_id) and the data is copied with skb_copy_bits().
 * Allocation failure logs a message and counts FRAGFAILS.
 *
 * NOTE(review): large parts of the control flow (loops, gotos, labels,
 * the output() calls, frees and return values) are missing from this
 * listing; the description covers only the visible statements.
 */
601 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
603 struct net_device *dev;
604 struct sk_buff *frag;
605 struct rt6_info *rt = (struct rt6_info*)skb->dst;
606 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
607 struct ipv6hdr *tmp_hdr;
609 unsigned int mtu, hlen, left, len;
611 int ptr, offset = 0, err=0;
612 u8 *prevhdr, nexthdr = 0;
615 hlen = ip6_find_1stfragopt(skb, &prevhdr);
618 mtu = ip6_skb_dst_mtu(skb);
620 /* We must not fragment if the socket is set to force MTU discovery
621 * or if the skb it not generated by a local socket. (This last
622 * check should be redundant, but it's free.)
624 if (!skb->local_df) {
625 skb->dev = skb->dst->dev;
626 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
627 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
632 if (np && np->frag_size < mtu) {
636 mtu -= hlen + sizeof(struct frag_hdr);
638 if (skb_shinfo(skb)->frag_list) {
639 int first_len = skb_pagelen(skb);
642 if (first_len - hlen > mtu ||
643 ((first_len - hlen) & 7) ||
647 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
648 /* Correct geometry. */
649 if (frag->len > mtu ||
650 ((frag->len & 7) && frag->next) ||
651 skb_headroom(frag) < hlen)
654 /* Partially cloned skb? */
655 if (skb_shared(frag))
662 frag->destructor = sock_wfree;
663 truesizes += frag->truesize;
669 frag = skb_shinfo(skb)->frag_list;
670 skb_shinfo(skb)->frag_list = NULL;
673 *prevhdr = NEXTHDR_FRAGMENT;
674 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
676 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
680 __skb_pull(skb, hlen);
681 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
682 __skb_push(skb, hlen);
683 skb_reset_network_header(skb);
684 memcpy(skb_network_header(skb), tmp_hdr, hlen);
686 ipv6_select_ident(skb, fh);
687 fh->nexthdr = nexthdr;
689 fh->frag_off = htons(IP6_MF);
690 frag_id = fh->identification;
692 first_len = skb_pagelen(skb);
693 skb->data_len = first_len - skb_headlen(skb);
694 skb->truesize -= truesizes;
695 skb->len = first_len;
696 ipv6_hdr(skb)->payload_len = htons(first_len -
697 sizeof(struct ipv6hdr));
699 dst_hold(&rt->u.dst);
702 /* Prepare header of the next frame,
703 * before previous one went down. */
705 frag->ip_summed = CHECKSUM_NONE;
706 skb_reset_transport_header(frag);
707 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
708 __skb_push(frag, hlen);
709 skb_reset_network_header(frag);
710 memcpy(skb_network_header(frag), tmp_hdr,
712 offset += skb->len - hlen - sizeof(struct frag_hdr);
713 fh->nexthdr = nexthdr;
715 fh->frag_off = htons(offset);
716 if (frag->next != NULL)
717 fh->frag_off |= htons(IP6_MF);
718 fh->identification = frag_id;
719 ipv6_hdr(frag)->payload_len =
721 sizeof(struct ipv6hdr));
722 ip6_copy_metadata(frag, skb);
727 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
740 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
741 dst_release(&rt->u.dst);
751 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
752 dst_release(&rt->u.dst);
757 left = skb->len - hlen; /* Space per frame */
758 ptr = hlen; /* Where to start from */
761 * Fragment the datagram.
764 *prevhdr = NEXTHDR_FRAGMENT;
767 * Keep copying data until we run out.
771 /* IF: it doesn't fit, use 'mtu' - the data space left */
774 /* IF: we are not sending upto and including the packet end
775 then align the next start on an eight byte boundary */
783 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
784 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
785 IP6_INC_STATS(ip6_dst_idev(skb->dst),
786 IPSTATS_MIB_FRAGFAILS);
792 * Set up data on packet
795 ip6_copy_metadata(frag, skb);
796 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
797 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
798 skb_reset_network_header(frag);
799 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
800 frag->transport_header = (frag->network_header + hlen +
801 sizeof(struct frag_hdr));
804 * Charge the memory for the fragment to any owner
808 skb_set_owner_w(frag, skb->sk);
811 * Copy the packet header into the new buffer.
813 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
816 * Build fragment header.
818 fh->nexthdr = nexthdr;
821 ipv6_select_ident(skb, fh);
822 frag_id = fh->identification;
824 fh->identification = frag_id;
827 * Copy a block of the IP datagram.
829 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
833 fh->frag_off = htons(offset);
835 fh->frag_off |= htons(IP6_MF);
836 ipv6_hdr(frag)->payload_len = htons(frag->len -
837 sizeof(struct ipv6hdr));
843 * Put this fragment into the sending queue.
849 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
851 IP6_INC_STATS(ip6_dst_idev(skb->dst),
852 IPSTATS_MIB_FRAGOKS);
857 IP6_INC_STATS(ip6_dst_idev(skb->dst),
858 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route key fails to cover an address.
 *
 * Returns non-zero when BOTH of the following hold:
 *  - the route key is not a /128 entry equal to @fl_addr, and
 *  - @addr_cache is NULL or differs from @fl_addr.
 * A zero result therefore means the flow address is confirmed either by
 * an exact host route or by the cached address.  ip6_sk_dst_check()
 * treats a non-zero result as a reason to reject the cached dst.
 */
863 static inline int ip6_rt_check(struct rt6key *rt_key,
864 struct in6_addr *fl_addr,
865 struct in6_addr *addr_cache)
867 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
868 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a socket's cached dst against a flow.
 *
 * The cached route is rejected when ip6_rt_check() fails for the
 * destination key (using np->daddr_cache), when it fails for the source
 * key (only with CONFIG_IPV6_SUBTREES, using np->saddr_cache), or when
 * the flow names an output interface different from the dst's device.
 *
 * NOTE(review): the third parameter (the flow, used as `fl` below), the
 * body of the rejection branch and the return statements are missing
 * from this listing.  ip6_sk_dst_lookup() stores the return value back
 * into *dst.
 */
871 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
872 struct dst_entry *dst,
875 struct ipv6_pinfo *np = inet6_sk(sk);
876 struct rt6_info *rt = (struct rt6_info *)dst;
881 /* Yes, checking route validity in not connected
882 * case is not very simple. Take into account,
883 * that we do not support routing by source, TOS,
884 * and MSG_DONTROUTE --ANK (980726)
886 * 1. ip6_rt_check(): If route was host route,
887 * check that cached destination is current.
888 * If it is network route, we still may
889 * check its validity using saved pointer
890 * to the last used address: daddr_cache.
891 * We do not want to save whole address now,
892 * (because main consumer of this service
893 * is tcp, which has not this problem),
894 * so that the last trick works only on connected
896 * 2. oif also should be the same.
898 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
899 #ifdef CONFIG_IPV6_SUBTREES
900 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
902 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - route lookup shared by the ip6_*dst_lookup()
 * entry points.
 *
 * Performs ip6_route_output() for the flow and bails out through
 * out_err_release when the resulting dst carries an error.  When the
 * flow has no source address, one is selected with ipv6_dev_get_saddr()
 * using the socket's srcprefs (0 without a socket).  With
 * CONFIG_IPV6_OPTIMISTIC_DAD, if the dst's neighbour is not in a
 * NUD_VALID state and the flow's source address is flagged
 * IFA_F_OPTIMISTIC, a second lookup is done on a copy of the flow whose
 * destination is zeroed, i.e. towards the default router.  An
 * -ENETUNREACH error is counted as IPSTATS_MIB_OUTNOROUTES.
 *
 * NOTE(review): `net` is obtained with sock_net(sk) unconditionally,
 * while the srcprefs expression explicitly tolerates sk == NULL --
 * confirm that no caller passes a NULL socket.
 * NOTE(review): labels, releases and return statements are missing from
 * this listing.
 */
911 static int ip6_dst_lookup_tail(struct sock *sk,
912 struct dst_entry **dst, struct flowi *fl)
915 struct net *net = sock_net(sk);
918 *dst = ip6_route_output(net, sk, fl);
920 if ((err = (*dst)->error))
921 goto out_err_release;
923 if (ipv6_addr_any(&fl->fl6_src)) {
924 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
926 sk ? inet6_sk(sk)->srcprefs : 0,
929 goto out_err_release;
932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
934 * Here if the dst entry we've looked up
935 * has a neighbour entry that is in the INCOMPLETE
936 * state and the src address from the flow is
937 * marked as OPTIMISTIC, we release the found
938 * dst entry and replace it instead with the
939 * dst entry of the nexthop router
941 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
942 struct inet6_ifaddr *ifp;
946 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
949 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
955 * We need to get the dst entry for the
956 * default router instead
959 memcpy(&fl_gw, fl, sizeof(struct flowi));
960 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
961 *dst = ip6_route_output(net, sk, &fl_gw);
962 if ((err = (*dst)->error))
963 goto out_err_release;
971 if (err == -ENETUNREACH)
972 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
979 * ip6_dst_lookup - perform route lookup on flow
980 * @sk: socket which provides route info
981 * @dst: pointer to dst_entry * for result
982 * @fl: flow to lookup
984 * This function performs a route lookup on the given flow.
986 * It returns zero on success, or a standard errno code on error.
/*
 * Uncached variant: the result is whatever ip6_dst_lookup_tail()
 * returns for @sk, @dst and @fl.  Exported to GPL modules.
 * NOTE(review): at least one body line is missing from this listing.
 */
988 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
991 return ip6_dst_lookup_tail(sk, dst, fl);
993 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
996 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
997 * @sk: socket which provides the dst cache and route info
998 * @dst: pointer to dst_entry * for result
999 * @fl: flow to lookup
1001 * This function performs a route lookup on the given flow with the
1002 * possibility of using the cached route in the socket if it is valid.
1003 * It will take the socket dst lock when operating on the dst cache.
1004 * As a result, this function can only be used in process context.
1006 * It returns zero on success, or a standard errno code on error.
/*
 * Cached variant: *dst is first loaded from the socket's dst cache via
 * sk_dst_check() with the socket's dst_cookie, then filtered through
 * ip6_sk_dst_check() against @fl, and finally ip6_dst_lookup_tail() is
 * called with it.  Exported to GPL modules.
 * NOTE(review): the conditions around the two cache statements are
 * missing from this listing.
 */
1008 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1012 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1013 *dst = ip6_sk_dst_check(sk, *dst, fl);
1016 return ip6_dst_lookup_tail(sk, dst, fl);
1018 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - queue data as one large skb for UDP
 * fragmentation offload.
 *
 * When the socket's write queue is empty, a new skb is allocated with
 * sock_alloc_send_skb() (hh_len + fragheaderlen + transhdrlen + 20
 * bytes, non-blocking when MSG_DONTWAIT is set).  hh_len bytes are
 * reserved for the hardware header, room for the IP and transport
 * headers is added with skb_put(), the network and transport header
 * offsets are set, ip_summed becomes CHECKSUM_PARTIAL and
 * sk->sk_sndmsg_off is reset.  The payload (length - transhdrlen bytes)
 * is then attached as page fragments by skb_append_datato_frags().
 * Afterwards gso_size is set to mtu minus fragheaderlen and the fragment
 * header size, gso_type to SKB_GSO_UDP, a fragment identification is
 * drawn via ipv6_select_ident() into a temporary header and stored in
 * ip6_frag_id, and the skb is queued on sk_write_queue.
 *
 * NOTE(review): the checks on allocation and append results, the
 * failure handling and the return statements are missing from this
 * listing.
 */
1020 static inline int ip6_ufo_append_data(struct sock *sk,
1021 int getfrag(void *from, char *to, int offset, int len,
1022 int odd, struct sk_buff *skb),
1023 void *from, int length, int hh_len, int fragheaderlen,
1024 int transhdrlen, int mtu,unsigned int flags)
1027 struct sk_buff *skb;
1030 /* There is support for UDP large send offload by network
1031 * device, so create one single skb packet containing complete
1034 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1035 skb = sock_alloc_send_skb(sk,
1036 hh_len + fragheaderlen + transhdrlen + 20,
1037 (flags & MSG_DONTWAIT), &err);
1041 /* reserve space for Hardware header */
1042 skb_reserve(skb, hh_len);
1044 /* create space for UDP/IP header */
1045 skb_put(skb,fragheaderlen + transhdrlen);
1047 /* initialize network header pointer */
1048 skb_reset_network_header(skb);
1050 /* initialize protocol header pointer */
1051 skb->transport_header = skb->network_header + fragheaderlen;
1053 skb->ip_summed = CHECKSUM_PARTIAL;
1055 sk->sk_sndmsg_off = 0;
1058 err = skb_append_datato_frags(sk,skb, getfrag, from,
1059 (length - transhdrlen));
1061 struct frag_hdr fhdr;
1063 /* specify the length of each IP datagram fragment*/
1064 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1065 sizeof(struct frag_hdr);
1066 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1067 ipv6_select_ident(skb, &fhdr);
1068 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1069 __skb_queue_tail(&sk->sk_write_queue, skb);
1073 /* There is not enough support do UPD LSO,
1074 * so follow normal path
/*
 * ip6_append_data - append user data to the socket's pending (corked)
 * IPv6 datagram.
 *
 * @getfrag copies `len` bytes at `offset` of @from into a buffer; it is
 * called for every chunk placed into an skb and a negative return is
 * treated as failure.
 *
 * First call (write queue empty): the cork state is set up.  @opt is
 * copied into np->cork.opt (allocated with kmalloc() of opt->tot_len; an
 * existing smaller buffer triggers the "invalid option length" debug
 * message), a reference on @rt is taken and stored in inet->cork.dst,
 * the flow, hop limit and traffic class are saved, and the fragment size
 * is derived from the device MTU (IPV6_PMTUDISC_PROBE) or the path MTU,
 * compared against np->frag_size.  IPCORK_ALLFRAG mirrors dst_allfrag().
 * The extension header length is added to both length and transhdrlen.
 * Later calls reuse rt, fl and mtu from the cork.
 *
 * Sizing: fragheaderlen is the IPv6 header plus the route's
 * non-fragmentable header length plus opt_nflen; maxfraglen is the
 * largest 8-byte aligned fragment that still leaves room for a fragment
 * header.  A datagram that would exceed the IPv6 maximum payload is
 * reported with ipv6_local_error(EMSGSIZE).
 *
 * Data placement: UDP data larger than the mtu on a NETIF_F_UFO device
 * goes through ip6_ufo_append_data().  Otherwise the loop fills the tail
 * skb, allocating a new one when it is full (moving the `fraggap`
 * overhang of the previous skb into the new one and fixing up its
 * checksum), and copies data either into the linear area (device
 * without NETIF_F_SG) or into page fragments tracked by
 * sk->sk_sndmsg_page / sk_sndmsg_off.
 *
 * On the visible error path cork.length is reduced by the unsent length
 * and IPSTATS_MIB_OUTDISCARDS is counted.
 *
 * NOTE(review): many declarations, conditions, labels and returns are
 * missing from this listing; the summary covers only visible statements.
 */
1081 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1082 int offset, int len, int odd, struct sk_buff *skb),
1083 void *from, int length, int transhdrlen,
1084 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1085 struct rt6_info *rt, unsigned int flags)
1087 struct inet_sock *inet = inet_sk(sk);
1088 struct ipv6_pinfo *np = inet6_sk(sk);
1089 struct sk_buff *skb;
1090 unsigned int maxfraglen, fragheaderlen;
1097 int csummode = CHECKSUM_NONE;
1099 if (flags&MSG_PROBE)
1101 if (skb_queue_empty(&sk->sk_write_queue)) {
1106 if (np->cork.opt == NULL) {
1107 np->cork.opt = kmalloc(opt->tot_len,
1109 if (unlikely(np->cork.opt == NULL))
1111 } else if (np->cork.opt->tot_len < opt->tot_len) {
1112 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1115 memcpy(np->cork.opt, opt, opt->tot_len);
1116 inet->cork.flags |= IPCORK_OPT;
1117 /* need source address above miyazawa*/
1119 dst_hold(&rt->u.dst);
1120 inet->cork.dst = &rt->u.dst;
1121 inet->cork.fl = *fl;
1122 np->cork.hop_limit = hlimit;
1123 np->cork.tclass = tclass;
1124 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1125 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1126 if (np->frag_size < mtu) {
1128 mtu = np->frag_size;
1130 inet->cork.fragsize = mtu;
1131 if (dst_allfrag(rt->u.dst.path))
1132 inet->cork.flags |= IPCORK_ALLFRAG;
1133 inet->cork.length = 0;
1134 sk->sk_sndmsg_page = NULL;
1135 sk->sk_sndmsg_off = 0;
1136 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1137 rt->rt6i_nfheader_len;
1138 length += exthdrlen;
1139 transhdrlen += exthdrlen;
1141 rt = (struct rt6_info *)inet->cork.dst;
1142 fl = &inet->cork.fl;
1143 if (inet->cork.flags & IPCORK_OPT)
1147 mtu = inet->cork.fragsize;
1150 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1152 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1153 (opt ? opt->opt_nflen : 0);
1154 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1156 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1157 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1158 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1164 * Let's try using as much space as possible.
1165 * Use MTU if total length of the message fits into the MTU.
1166 * Otherwise, we need to reserve fragment header and
1167 * fragment alignment (= 8-15 octects, in total).
1169 * Note that we may need to "move" the data from the tail of
1170 * of the buffer to the new fragment when we split
1173 * FIXME: It may be fragmented into multiple chunks
1174 * at once if non-fragmentable extension headers
1179 inet->cork.length += length;
1180 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1181 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1183 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1184 fragheaderlen, transhdrlen, mtu,
1191 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1194 while (length > 0) {
1195 /* Check if the remaining data fits into current packet. */
1196 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1198 copy = maxfraglen - skb->len;
1202 unsigned int datalen;
1203 unsigned int fraglen;
1204 unsigned int fraggap;
1205 unsigned int alloclen;
1206 struct sk_buff *skb_prev;
1210 /* There's no room in the current skb */
1212 fraggap = skb_prev->len - maxfraglen;
1217 * If remaining data exceeds the mtu,
1218 * we know we need more fragment(s).
1220 datalen = length + fraggap;
1221 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1222 datalen = maxfraglen - fragheaderlen;
1224 fraglen = datalen + fragheaderlen;
1225 if ((flags & MSG_MORE) &&
1226 !(rt->u.dst.dev->features&NETIF_F_SG))
1229 alloclen = datalen + fragheaderlen;
1232 * The last fragment gets additional space at tail.
1233 * Note: we overallocate on fragments with MSG_MODE
1234 * because we have no idea if we're the last one.
1236 if (datalen == length + fraggap)
1237 alloclen += rt->u.dst.trailer_len;
1240 * We just reserve space for fragment header.
1241 * Note: this may be overallocation if the message
1242 * (without MSG_MORE) fits into the MTU.
1244 alloclen += sizeof(struct frag_hdr);
1247 skb = sock_alloc_send_skb(sk,
1249 (flags & MSG_DONTWAIT), &err);
1252 if (atomic_read(&sk->sk_wmem_alloc) <=
1254 skb = sock_wmalloc(sk,
1255 alloclen + hh_len, 1,
1257 if (unlikely(skb == NULL))
1263 * Fill in the control structures
1265 skb->ip_summed = csummode;
1267 /* reserve for fragmentation */
1268 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1271 * Find where to start putting bytes
1273 data = skb_put(skb, fraglen);
1274 skb_set_network_header(skb, exthdrlen);
1275 data += fragheaderlen;
1276 skb->transport_header = (skb->network_header +
1279 skb->csum = skb_copy_and_csum_bits(
1280 skb_prev, maxfraglen,
1281 data + transhdrlen, fraggap, 0);
1282 skb_prev->csum = csum_sub(skb_prev->csum,
1285 pskb_trim_unique(skb_prev, maxfraglen);
1287 copy = datalen - transhdrlen - fraggap;
1292 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1299 length -= datalen - fraggap;
1302 csummode = CHECKSUM_NONE;
1305 * Put the packet on the pending queue
1307 __skb_queue_tail(&sk->sk_write_queue, skb);
1314 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1318 if (getfrag(from, skb_put(skb, copy),
1319 offset, copy, off, skb) < 0) {
1320 __skb_trim(skb, off);
1325 int i = skb_shinfo(skb)->nr_frags;
1326 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1327 struct page *page = sk->sk_sndmsg_page;
1328 int off = sk->sk_sndmsg_off;
1331 if (page && (left = PAGE_SIZE - off) > 0) {
1334 if (page != frag->page) {
1335 if (i == MAX_SKB_FRAGS) {
1340 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1341 frag = &skb_shinfo(skb)->frags[i];
1343 } else if(i < MAX_SKB_FRAGS) {
1344 if (copy > PAGE_SIZE)
1346 page = alloc_pages(sk->sk_allocation, 0);
1351 sk->sk_sndmsg_page = page;
1352 sk->sk_sndmsg_off = 0;
1354 skb_fill_page_desc(skb, i, page, 0, 0);
1355 frag = &skb_shinfo(skb)->frags[i];
1360 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1364 sk->sk_sndmsg_off += copy;
1367 skb->data_len += copy;
1368 skb->truesize += copy;
1369 atomic_add(copy, &sk->sk_wmem_alloc);
1376 inet->cork.length -= length;
1377 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - drop the state held for a corked datagram.
 *
 * Clears IPCORK_OPT and frees the option copy in np->cork.opt (the
 * pointer is reset to NULL).  If a route is cached in inet->cork.dst its
 * reference is released, the pointer is cleared and IPCORK_ALLFRAG is
 * removed from the cork flags.  The saved flow is zeroed.  Called from
 * both ip6_push_pending_frames() and ip6_flush_pending_frames().
 */
1381 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1383 inet->cork.flags &= ~IPCORK_OPT;
1384 kfree(np->cork.opt);
1385 np->cork.opt = NULL;
1386 if (inet->cork.dst) {
1387 dst_release(inet->cork.dst);
1388 inet->cork.dst = NULL;
1389 inet->cork.flags &= ~IPCORK_ALLFRAG;
1391 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - build and send the datagram queued by
 * ip6_append_data().
 *
 * The first skb is dequeued from sk_write_queue; every further queued
 * skb has its network header pulled and is chained onto the first one's
 * frag_list, with len, data_len and truesize of the head adjusted, the
 * socket reference dropped via __sock_put() and the destructor cleared.
 * The extension headers saved in the cork are pushed
 * (ipv6_push_frag_opts() / ipv6_push_nfrag_opts(), the latter may
 * replace final_dst), then the fixed IPv6 header is pushed and filled
 * from the corked flow: flow label and traffic class, hop limit, next
 * header, source address and final destination.  priority and mark come
 * from the socket and the corked route is attached with dst_clone().
 * OUTREQUESTS is counted, plus the ICMPv6 output counters for ICMPv6
 * packets, before ip6_local_out() sends the packet.  A transmit error is
 * converted with net_xmit_errno() only when the socket has recverr set,
 * otherwise it becomes 0.  The cork state is released at the end.
 *
 * NOTE(review): the payload_len assignment, the local_df handling under
 * the pmtudisc test, labels and return statements are missing from this
 * listing.
 */
1394 int ip6_push_pending_frames(struct sock *sk)
1396 struct sk_buff *skb, *tmp_skb;
1397 struct sk_buff **tail_skb;
1398 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1399 struct inet_sock *inet = inet_sk(sk);
1400 struct ipv6_pinfo *np = inet6_sk(sk);
1401 struct ipv6hdr *hdr;
1402 struct ipv6_txoptions *opt = np->cork.opt;
1403 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1404 struct flowi *fl = &inet->cork.fl;
1405 unsigned char proto = fl->proto;
1408 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1410 tail_skb = &(skb_shinfo(skb)->frag_list);
1412 /* move skb->data to ip header from ext header */
1413 if (skb->data < skb_network_header(skb))
1414 __skb_pull(skb, skb_network_offset(skb));
1415 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1416 __skb_pull(tmp_skb, skb_network_header_len(skb));
1417 *tail_skb = tmp_skb;
1418 tail_skb = &(tmp_skb->next);
1419 skb->len += tmp_skb->len;
1420 skb->data_len += tmp_skb->len;
1421 skb->truesize += tmp_skb->truesize;
1422 __sock_put(tmp_skb->sk);
1423 tmp_skb->destructor = NULL;
1427 /* Allow local fragmentation. */
1428 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1431 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1432 __skb_pull(skb, skb_network_header_len(skb));
1433 if (opt && opt->opt_flen)
1434 ipv6_push_frag_opts(skb, opt, &proto);
1435 if (opt && opt->opt_nflen)
1436 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1438 skb_push(skb, sizeof(struct ipv6hdr));
1439 skb_reset_network_header(skb);
1440 hdr = ipv6_hdr(skb);
1442 *(__be32*)hdr = fl->fl6_flowlabel |
1443 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1445 hdr->hop_limit = np->cork.hop_limit;
1446 hdr->nexthdr = proto;
1447 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1448 ipv6_addr_copy(&hdr->daddr, final_dst);
1450 skb->priority = sk->sk_priority;
1451 skb->mark = sk->sk_mark;
1453 skb->dst = dst_clone(&rt->u.dst);
1454 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1455 if (proto == IPPROTO_ICMPV6) {
1456 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1458 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1459 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1462 err = ip6_local_out(skb);
1465 err = np->recverr ? net_xmit_errno(err) : 0;
1471 ip6_cork_release(inet, np);
/*
 * ip6_flush_pending_frames - discard everything queued by
 * ip6_append_data().
 *
 * Removes skbs from the tail of sk_write_queue until it is empty,
 * counting IPSTATS_MIB_OUTDISCARDS against the inet6 device of the
 * skb's dst, and then releases the cork state with ip6_cork_release().
 *
 * NOTE(review): the condition around the statistics update and the
 * statement that frees each skb are missing from this listing.
 */
1477 void ip6_flush_pending_frames(struct sock *sk)
1479 struct sk_buff *skb;
1481 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1483 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1484 IPSTATS_MIB_OUTDISCARDS);
1488 ip6_cork_release(inet_sk(sk), inet6_sk(sk));