/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
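/*
 * Pick the fragment identification for an outgoing packet.  A single
 * global counter, protected by a spinlock, is shared by all IPv6
 * fragmentation on this host; the value 0 is skipped on wrap-around.
 */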
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
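/*
 * __ip6_local_out() fixes up payload_len and runs the netfilter
 * NF_INET_LOCAL_OUT hook; ip6_local_out() additionally hands the packet
 * to dst_output() when the hook accepts it.
 */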
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
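/*
 * Final output step: hand the packet to the neighbour layer, using the
 * cached hardware header if one exists.  Packets without a usable
 * neighbour entry are counted as OUTNOROUTES and dropped.
 */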
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
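/*
 * Post-routing output.  For multicast destinations a clone may be looped
 * back to local listeners (and to the multicast router socket) before the
 * original is handed to the NF_INET_POST_ROUTING hook and, from there, to
 * ip6_output_finish().
 */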
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}
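/*
 * Entry point installed in dst->output: fragment when the packet exceeds
 * the path MTU (and is not GSO), or when the route demands fragmentation
 * on every packet (dst_allfrag); otherwise send it out directly.
 */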
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
/*
 *	xmit an sk_buff (used by TCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance-critical for us).
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
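/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered (via the IPV6_ROUTER_ALERT socket option) for this
 * alert value.  Returns 1 if at least one socket consumed the packet.
 */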
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
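/*
 * Decide what to do with a packet whose destination is subject to proxy
 * neighbour discovery: 1 means deliver it locally (NDISC messages), -1
 * means drop it (link-local destination), 0 means forward as usual.
 */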
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
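/*
 * Main forwarding path.  Validates forwarding policy, hop limit and MTU,
 * generates redirects where appropriate, decrements hop_limit only after
 * the skb has been made writable (COW), and finally runs the
 * NF_INET_FORWARD hook.
 */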
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (ipv6_devconf.proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */
		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */
	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
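/* Copy per-packet metadata (type, priority, dst, netfilter state, ...)
 * from the original skb to a freshly allocated fragment.
 */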
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
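/*
 * Walk the extension header chain and return the offset at which the
 * fragment header has to be inserted (i.e. after any per-fragment
 * headers such as hop-by-hop, routing and certain destination options).
 * *nexthdr is left pointing at the nexthdr field that must be patched
 * to NEXTHDR_FRAGMENT.
 */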
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
			(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
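/*
 * Fragment an over-sized packet.  The fast path reuses an existing
 * frag_list when its geometry already matches the MTU; otherwise the
 * slow path allocates a new skb per fragment and copies the data.
 */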
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;

		/* BUILD HEADER */
		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/*
		 *	Allocate buffer.
		 */
		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */
		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess.
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
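/*
 * Helpers for validating a socket's cached route: ip6_rt_check() tells
 * whether a cached route key no longer matches the flow, and
 * ip6_sk_dst_check() drops the cached dst when it has gone stale.
 */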
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
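/*
 * Common tail of the dst lookup helpers: resolve the route if the caller
 * did not supply one, pick a source address when the flow has none, and
 * (with optimistic DAD) fall back to the default router's dst entry.
 */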
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	*dst = ip6_sk_dst_check(sk, *dst, fl);

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
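/*
 * UFO path of ip6_append_data(): build one large GSO skb and let the
 * device (or the GSO layer) segment it, instead of fragmenting in
 * software.
 */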
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}
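/*
 * Append data to the pending (corked) queue of a socket.  Datagram
 * senders such as UDPv6 and raw sockets drive this roughly as follows
 * (sketch only; locking, corking flags and error handling omitted):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      hlimit, tclass, opt, &fl, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */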
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}
		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
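/* Release the resources held while a socket is corked: saved options,
 * cached dst and the OPT/ALLFRAG cork flags.
 */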
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
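/*
 * Collapse the per-socket write queue into one packet chain, prepend the
 * extension headers and the IPv6 header that were saved at cork time, and
 * send the result via ip6_local_out().
 */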
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}
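/*
 * Drop everything queued by ip6_append_data() without sending it, e.g.
 * after an error, and release the cork state.
 */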
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}