2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
/*
 * Forward declaration: ip6_fragment() is defined further down in this file
 * but is already called from ip6_output(), which appears before it.
 */
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * ipv6_select_ident - stamp a fragment header with the next fragment id.
 * @skb:  not referenced by any statement visible in this listing
 * @fhdr: fragment header whose identification field is written
 *
 * The id comes from a single function-local static counter that starts at 1.
 * It is stored through htonl(), i.e. in network byte order, and the counter
 * is advanced while holding ip6_id_lock (taken with the _bh variant).  When
 * the increment wraps to 0 the counter is reset to 1, so 0 is never used as
 * the next id.
 *
 * NOTE(review): the embedded original line numbers skip (61 -> 63, 64 -> 66),
 * so braces and possibly other short lines are not shown here.
 */
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
/*
 * __ip6_local_out - fill in the payload length and run the LOCAL_OUT hook.
 * @skb: packet whose IPv6 header is already in place
 *
 * The payload length is computed as the skb length minus the fixed IPv6
 * header and written to the header with htons().  The packet is then passed
 * to nf_hook() for PF_INET6 / NF_INET_LOCAL_OUT with the route's device as
 * the output device; the hook's result is returned.
 *
 * NOTE(review): the statement guarded by the IPV6_MAXPLEN comparison and the
 * final argument of nf_hook() are not visible in this listing (original
 * lines 79 and 83 are missing) - confirm against the full file.
 */
73 int __ip6_local_out(struct sk_buff *skb)
77 len = skb->len - sizeof(struct ipv6hdr);
78 if (len > IPV6_MAXPLEN)
80 ipv6_hdr(skb)->payload_len = htons(len);
82 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * ip6_local_out - send a locally generated packet.
 * @skb: packet to transmit
 *
 * Calls __ip6_local_out() and, further down, dst_output(); both results are
 * stored in err.  Exported to GPL modules.
 *
 * NOTE(review): the condition that decides whether dst_output() runs after
 * __ip6_local_out(), and the return statement, are not visible in this
 * listing (original lines 91 and 93-95 are missing).
 */
86 int ip6_local_out(struct sk_buff *skb)
90 err = __ip6_local_out(skb);
92 err = dst_output(skb);
96 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_output_finish - hand a packet to the link layer via the route entry.
 * @skb: packet whose skb->dst supplies the output information
 *
 * Two transmit paths are visible: neigh_hh_output() using dst->hh, and,
 * as the alternative, the output method of dst->neighbour when a neighbour
 * is attached.  IPSTATS_MIB_OUTNOROUTES is bumped on the remaining path.
 *
 * NOTE(review): the condition selecting the neigh_hh_output() branch and
 * whatever follows the statistics update are not visible in this listing
 * (original lines 102 and 108+ are missing).
 */
98 static int ip6_output_finish(struct sk_buff *skb)
100 struct dst_entry *dst = skb->dst;
103 return neigh_hh_output(dst->hh, skb);
104 else if (dst->neighbour)
105 return dst->neighbour->output(skb);
107 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
113 /* dev_loopback_xmit for use with netfilter. */
/*
 * ip6_dev_loopback_xmit - prepare a looped-back copy of an outgoing packet.
 * @newskb: the copy to be looped back (ip6_output2() passes a clone here as
 *          the continuation of its POST_ROUTING hook)
 *
 * The MAC header is reset to the current data pointer, data is pulled up to
 * the network header, the packet is marked PACKET_LOOPBACK and its checksum
 * state is set to CHECKSUM_UNNECESSARY.  BUG_TRAP() asserts that a route is
 * attached.
 *
 * NOTE(review): the statement that actually delivers newskb and the return
 * value are not visible in this listing (original lines 121+ are missing).
 */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 BUG_TRAP(newskb->dst);
/*
 * ip6_output2 - post-routing output step, with multicast loopback handling.
 * @skb: packet to send; skb->dst must be set
 *
 * Marks the packet as ETH_P_IPV6.  For a multicast destination:
 *  - if the output device is not a loopback device, the sending socket (if
 *    any) has mc_loop enabled, and ipv6_chk_mcast_addr() matches the
 *    destination/source pair on that device, a clone is pushed through the
 *    NF_INET_POST_ROUTING hook with ip6_dev_loopback_xmit() as continuation;
 *  - a packet whose hop limit is 0 is counted in IPSTATS_MIB_OUTDISCARDS;
 *  - IPSTATS_MIB_OUTMCASTPKTS is incremented.
 * The packet itself is finally passed to the NF_INET_POST_ROUTING hook.
 *
 * NOTE(review): several lines are missing from this listing (e.g. the check
 * of the clone result, what happens to a hop-limit-0 packet after it is
 * counted, and the continuation argument of the final NF_HOOK) - confirm
 * against the full file.
 */
127 static int ip6_output2(struct sk_buff *skb)
129 struct dst_entry *dst = skb->dst;
130 struct net_device *dev = dst->dev;
132 skb->protocol = htons(ETH_P_IPV6);
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 &ipv6_hdr(skb)->saddr)) {
142 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
144 /* Do not check for IFF_ALLMULTI; multicast routing
145 is not supported in any case.
148 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
150 ip6_dev_loopback_xmit);
152 if (ipv6_hdr(skb)->hop_limit == 0) {
153 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
159 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * ip6_skb_dst_mtu - MTU to apply to an outgoing packet.
 * @skb: packet with skb->dst set; skb->sk may be NULL
 *
 * Returns the raw MTU of the route's device when the owning socket has
 * pmtudisc set to IPV6_PMTUDISC_PROBE; in every other case (including no
 * socket) returns dst_mtu() of the route.
 */
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
168 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
170 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171 skb->dst->dev->mtu : dst_mtu(skb->dst);
/*
 * ip6_output - output entry point that fragments when required.
 * @skb: packet to send
 *
 * The packet goes through ip6_fragment() (with ip6_output2() as the
 * per-fragment output callback) when either
 *  - it is longer than ip6_skb_dst_mtu() and is not a GSO packet, or
 *  - dst_allfrag() is set on the route.
 * Otherwise it is handed to ip6_output2() directly.
 */
174 int ip6_output(struct sk_buff *skb)
176 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177 dst_allfrag(skb->dst))
178 return ip6_fragment(skb, ip6_output2);
180 return ip6_output2(skb);
184 * xmit an sk_buff (used by TCP)
/*
 * ip6_xmit - build the IPv6 header on an skb and send it.
 * @sk:       sending socket (source of hop limit and priority)
 * @skb:      packet; skb->dst must already be set
 * @fl:       flow describing addresses, protocol and flow label
 * @opt:      extension headers to push, dereferenced for opt_nflen/opt_flen
 * @ipfragok: non-zero lets an over-MTU packet through to the output path
 *
 * Visible steps, in order:
 *  - compute the head room needed for the extension headers, the fixed
 *    header and the link-layer reserve; if the skb has less, obtain a copy
 *    with enough head room via skb_realloc_headroom() (a failure path counts
 *    IPSTATS_MIB_OUTDISCARDS) and charge the skb to the socket;
 *  - push fragmentable and non-fragmentable options; the latter may replace
 *    first_hop, which becomes the header's destination address;
 *  - push and fill the fixed header: version 6, traffic class and flow
 *    label in the first word, payload length, next header, hop limit,
 *    source from the flow, destination from first_hop;
 *  - hop limit is taken from the socket, the route metric RTAX_HOPLIMIT or
 *    the device default;
 *  - if the packet fits the MTU, or ipfragok is set, or it is a GSO packet,
 *    count IPSTATS_MIB_OUTREQUESTS and run the NF_INET_LOCAL_OUT hook;
 *  - otherwise log a debug message, send ICMPV6_PKT_TOOBIG carrying the MTU
 *    and count IPSTATS_MIB_FRAGFAILS.
 *
 * NOTE(review): many lines are missing from this listing, including the
 * guard around the option handling, the conditions choosing between the
 * three hop-limit sources, the origin of tclass and mtu, and the return
 * value of the too-big path - confirm against the full file.
 */
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 struct ipv6_txoptions *opt, int ipfragok)
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb->dst;
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(ip6_dst_idev(skb->dst),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
233 * Fill in the IPv6 header
238 hlimit = np->hop_limit;
240 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
242 hlimit = ipv6_get_hoplimit(dst->dev);
250 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
252 hdr->payload_len = htons(seg_len);
253 hdr->nexthdr = proto;
254 hdr->hop_limit = hlimit;
256 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
257 ipv6_addr_copy(&hdr->daddr, first_hop);
259 skb->priority = sk->sk_priority;
262 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
263 IP6_INC_STATS(ip6_dst_idev(skb->dst),
264 IPSTATS_MIB_OUTREQUESTS);
265 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
270 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
272 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
273 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
278 EXPORT_SYMBOL(ip6_xmit);
281 * To avoid extra problems ND packets are sent through this
282 * routine. It's code duplication but I really want to avoid
283 * extra checks since ipv6_build_header is used by TCP (which
284 * is for us performance critical)
/*
 * ip6_nd_hdr - write a bare IPv6 header at the tail of an skb.
 * @sk:    socket supplying the hop limit (np->hop_limit)
 * @skb:   buffer to extend; the header is appended with skb_put() and the
 *         network header pointer is set to where it starts
 * @dev:   not referenced by any statement visible in this listing
 * @saddr: source address copied into the header
 * @daddr: destination address copied into the header
 *
 * The first header word is htonl(0x60000000): version 6 with traffic class
 * and flow label left at zero.  payload_len is set from len and nexthdr
 * from proto.
 *
 * NOTE(review): the rest of the parameter list (proto and len are used but
 * their declaration line is missing), the assignment of hdr, the use of
 * totlen and the return value are not visible in this listing.
 */
287 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
288 struct in6_addr *saddr, struct in6_addr *daddr,
291 struct ipv6_pinfo *np = inet6_sk(sk);
295 skb->protocol = htons(ETH_P_IPV6);
298 totlen = len + sizeof(struct ipv6hdr);
300 skb_reset_network_header(skb);
301 skb_put(skb, sizeof(struct ipv6hdr));
304 *(__be32*)hdr = htonl(0x60000000);
306 hdr->payload_len = htons(len);
307 hdr->nexthdr = proto;
308 hdr->hop_limit = np->hop_limit;
310 ipv6_addr_copy(&hdr->saddr, saddr);
311 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a packet to sockets on the ip6_ra_chain list.
 * @skb: packet to deliver
 * @sel: selector value compared against each chain entry's ra->sel
 *
 * Walks ip6_ra_chain under read_lock(&ip6_ra_lock).  An entry matches when
 * it has a socket, its sel equals @sel, and the socket is either not bound
 * to a device or bound to the ifindex of skb->dev.  Clones of the packet
 * are delivered through rawv6_rcv() to "last", and the original skb is
 * delivered through rawv6_rcv() as well before the lock is dropped.
 *
 * NOTE(review): the bookkeeping of "last", the clone failure check and the
 * return values are not visible in this listing.  ip6_forward() tests the
 * result for truth - presumably non-zero means the skb was consumed;
 * confirm against the full file.
 */
316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
318 struct ip6_ra_chain *ra;
319 struct sock *last = NULL;
321 read_lock(&ip6_ra_lock);
322 for (ra = ip6_ra_chain; ra; ra = ra->next) {
323 struct sock *sk = ra->sk;
324 if (sk && ra->sel == sel &&
325 (!sk->sk_bound_dev_if ||
326 sk->sk_bound_dev_if == skb->dev->ifindex)) {
328 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
330 rawv6_rcv(last, skb2);
337 rawv6_rcv(last, skb);
338 read_unlock(&ip6_ra_lock);
341 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet addressed to a proxied address.
 * @skb: packet being considered for forwarding
 *
 * Locates the upper-layer header: extension headers are skipped with
 * ipv6_skip_exthdr(), otherwise the offset is just past the fixed header.
 * For ICMPv6 the first byte of the ICMPv6 header is made available with
 * pskb_may_pull() and the type is examined; router/neighbour solicitation
 * and advertisement types are singled out.  Separately, a link-local
 * destination triggers dst_link_failure().
 *
 * NOTE(review): all return statements are missing from this listing.  The
 * caller in ip6_forward() passes the packet to ip6_input() for one outcome
 * and drops it when the result is negative - presumably a positive result
 * means "deliver locally"; confirm against the full file.
 */
345 static int ip6_forward_proxy_check(struct sk_buff *skb)
347 struct ipv6hdr *hdr = ipv6_hdr(skb);
348 u8 nexthdr = hdr->nexthdr;
351 if (ipv6_ext_hdr(nexthdr)) {
352 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
356 offset = sizeof(struct ipv6hdr);
358 if (nexthdr == IPPROTO_ICMPV6) {
359 struct icmp6hdr *icmp6;
361 if (!pskb_may_pull(skb, (skb_network_header(skb) +
362 offset + 1 - skb->data)))
365 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
367 switch (icmp6->icmp6_type) {
368 case NDISC_ROUTER_SOLICITATION:
369 case NDISC_ROUTER_ADVERTISEMENT:
370 case NDISC_NEIGHBOUR_SOLICITATION:
371 case NDISC_NEIGHBOUR_ADVERTISEMENT:
373 /* For reaction involving unicast neighbor discovery
374 * message destined to the proxied address, pass it to
384 * The proxying router can't forward traffic sent to a link-local
385 * address, so signal the sender and discard the packet. This
386 * behavior is clarified by the MIPv6 specification.
388 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
389 dst_link_failure(skb);
/*
 * ip6_forward_finish - last step of forwarding.
 * @skb: packet that passed the forwarding checks
 *
 * Simply returns dst_output(skb).
 */
396 static inline int ip6_forward_finish(struct sk_buff *skb)
398 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet towards its next hop.
 * @skb: received packet with skb->dst already resolved
 *
 * Checks visible in this listing, in order:
 *  1. ipv6_devconf.forwarding must be non-zero.
 *  2. xfrm6_policy_check() for XFRM_POLICY_FWD; failure counts
 *     IPSTATS_MIB_INDISCARDS.
 *  3. skb_forward_csum() is applied.
 *  4. Router-alert handling: a 16-bit value is read from bytes 2 and 3 at
 *     offset opt->ra in the network header and passed to
 *     ip6_call_ra_chain().
 *  5. Hop limit <= 1: ICMPV6_TIME_EXCEED / ICMPV6_EXC_HOPLIMIT is sent and
 *     IPSTATS_MIB_INHDRERRORS is counted.
 *  6. With proxy_ndp enabled and a pneigh entry for the destination,
 *     ip6_forward_proxy_check() decides between local delivery via
 *     ip6_input() and a drop counted in IPSTATS_MIB_INDISCARDS.
 *  7. xfrm6_route_forward(); failure counts IPSTATS_MIB_INDISCARDS.
 *  8. Redirect: when the packet would leave on the device it arrived on,
 *     a neighbour exists and the packet was not source routed, a redirect
 *     is sent (rate limited by xrlim_allow(dst, 1*HZ)).  The target is the
 *     neighbour's key for gateway routes, else the packet's destination.
 *  9. Otherwise the source address type is checked: the multicast/loopback
 *     test is flagged as security critical, and a link-local source gets
 *     ICMPV6_DEST_UNREACH / ICMPV6_NOT_NEIGHBOUR.
 * 10. Packets larger than dst_mtu() get ICMPV6_PKT_TOOBIG and are counted
 *     in IPSTATS_MIB_INTOOBIGERRORS and IPSTATS_MIB_FRAGFAILS.
 * 11. skb_cow() makes room for the link-layer header; failure counts
 *     IPSTATS_MIB_OUTDISCARDS.
 * 12. IPSTATS_MIB_OUTFORWDATAGRAMS is counted and the packet is passed to
 *     the NF_INET_FORWARD hook.
 * IPSTATS_MIB_INADDRERRORS is counted on an error path near the end.
 *
 * NOTE(review): the gotos/returns for each failure, the hop-limit decrement
 * announced by the "Mangling hops number" comment, part of the redirect
 * condition and the NF_HOOK continuation are missing from this listing -
 * confirm against the full file.
 */
401 int ip6_forward(struct sk_buff *skb)
403 struct dst_entry *dst = skb->dst;
404 struct ipv6hdr *hdr = ipv6_hdr(skb);
405 struct inet6_skb_parm *opt = IP6CB(skb);
407 if (ipv6_devconf.forwarding == 0)
410 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
411 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
415 skb_forward_csum(skb);
418 * We DO NOT make any processing on
419 * RA packets, pushing them to user level AS IS
420 * without ane WARRANTY that application will be able
421 * to interpret them. The reason is that we
422 * cannot make anything clever here.
424 * We are not end-node, so that if packet contains
425 * AH/ESP, we cannot make anything.
426 * Defragmentation also would be mistake, RA packets
427 * cannot be fragmented, because there is no warranty
428 * that different fragments will go along one path. --ANK
431 u8 *ptr = skb_network_header(skb) + opt->ra;
432 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
437 * check and decrement ttl
439 if (hdr->hop_limit <= 1) {
440 /* Force OUTPUT device used as source address */
442 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
444 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
450 /* XXX: idev->cnf.proxy_ndp? */
451 if (ipv6_devconf.proxy_ndp &&
452 pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
453 int proxied = ip6_forward_proxy_check(skb);
455 return ip6_input(skb);
456 else if (proxied < 0) {
457 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
462 if (!xfrm6_route_forward(skb)) {
463 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
468 /* IPv6 specs say nothing about it, but it is clear that we cannot
469 send redirects to source routed frames.
470 We don't send redirects to frames decapsulated from IPsec.
472 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
474 struct in6_addr *target = NULL;
476 struct neighbour *n = dst->neighbour;
479 * incoming and outgoing devices are the same
483 rt = (struct rt6_info *) dst;
484 if ((rt->rt6i_flags & RTF_GATEWAY))
485 target = (struct in6_addr*)&n->primary_key;
487 target = &hdr->daddr;
489 /* Limit redirects both by destination (here)
490 and by source (inside ndisc_send_redirect)
492 if (xrlim_allow(dst, 1*HZ))
493 ndisc_send_redirect(skb, n, target);
495 int addrtype = ipv6_addr_type(&hdr->saddr);
497 /* This check is security critical. */
498 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
500 if (addrtype & IPV6_ADDR_LINKLOCAL) {
501 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
502 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
507 if (skb->len > dst_mtu(dst)) {
508 /* Again, force OUTPUT device used as source address */
510 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
511 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
512 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 /* Mangling hops number delayed to point after skb COW */
528 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
533 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - make a fragment inherit the parent's per-packet state.
 * @to:   fragment being prepared
 * @from: packet the fragment was cut from
 *
 * Copies pkt_type, priority, protocol and mark.  The route previously held
 * by @to is released and replaced by a new reference to @from's route
 * (dst_clone).  tc_index is copied under CONFIG_NET_SCHED and nf_trace when
 * the xt TRACE target is configured; the security mark is copied with
 * skb_copy_secmark().
 *
 * NOTE(review): original lines 546, 548 and the #endif lines are missing
 * from this listing, so further copied fields may exist.
 */
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
544 dst_release(to->dst);
545 to->dst = dst_clone(from->dst);
547 to->mark = from->mark;
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
557 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain of a packet.
 * @skb:     packet with its network header set
 * @nexthdr: out parameter; updated to point at the nexthdr byte of the
 *           header most recently stepped over (initially the one in the
 *           fixed IPv6 header)
 *
 * Starts just past the fixed header and loops while at least one more byte
 * lies within the linear area (tail minus network header).  On each step
 * the offset advances by ipv6_optlen() of the current extension header.
 * NEXTHDR_ROUTING is treated specially, and with Mobile IPv6 configured the
 * header is searched for an IPV6_TLV_HAO option.  ip6_fragment() uses the
 * return value as the length of the part of the packet that is not
 * fragmented.  Exported to GPL modules.
 *
 * NOTE(review): the switch statement, the other case labels and all return
 * statements are missing from this listing - confirm against the full file.
 */
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
569 while (offset + 1 <= packet_len) {
575 case NEXTHDR_ROUTING:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
598 EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
/*
 * ip6_fragment - split a packet into IPv6 fragments and send each one.
 * @skb:    packet to fragment
 * @output: callback used to transmit fragments (ip6_output() passes
 *          ip6_output2)
 *
 * Setup: hlen is the length returned by ip6_find_1stfragopt() and mtu comes
 * from ip6_skb_dst_mtu().  If there is no owning socket, or the socket's
 * pmtudisc is IPV6_PMTUDISC_DO or higher, ICMPV6_PKT_TOOBIG is sent and
 * IPSTATS_MIB_FRAGFAILS is counted instead of fragmenting.  A socket
 * frag_size smaller than the mtu is also tested.  The per-fragment data
 * budget is then mtu minus hlen minus the fragment header.
 *
 * Fast path (skb already carries a frag_list): the head and every list
 * member are checked for usable geometry (size within budget, length a
 * multiple of 8 except for the last member, enough head room for hlen, not
 * shared).  Each member gets sock_wfree as destructor and its truesize is
 * subtracted from the head.  The unfragmentable header is saved with
 * kmemdup(), a fragment header is inserted into the head (new id from
 * ipv6_select_ident(), IP6_MF set), and each list member in turn gets the
 * saved header, a fragment header with the running offset, the same id and
 * the parent's metadata.  IPSTATS_MIB_FRAGCREATES / FRAGOKS / FRAGFAILS are
 * counted and the extra route reference taken with dst_hold() is released
 * on both the success and the failure exit.
 *
 * Slow path: fragments are allocated one by one with alloc_skb(), sized for
 * the data chunk, hlen, the fragment header and the link-layer reserve.
 * Each is charged to the owning socket, receives a copy of the first hlen
 * bytes of the packet, a fragment header (id chosen once and reused) and
 * its data via skb_copy_bits().  The same three counters are updated.
 *
 * NOTE(review): a large number of lines are missing from this listing: the
 * loop headers, the gotos between the paths, the calls to output(), the
 * freeing of tmp_hdr and of skbs, the assignment of nexthdr and len, and
 * every return statement.  Confirm control flow against the full file
 * before relying on this summary.
 */
600 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
602 struct net_device *dev;
603 struct sk_buff *frag;
604 struct rt6_info *rt = (struct rt6_info*)skb->dst;
605 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
606 struct ipv6hdr *tmp_hdr;
608 unsigned int mtu, hlen, left, len;
610 int ptr, offset = 0, err=0;
611 u8 *prevhdr, nexthdr = 0;
614 hlen = ip6_find_1stfragopt(skb, &prevhdr);
617 mtu = ip6_skb_dst_mtu(skb);
619 /* We must not fragment if the socket is set to force MTU discovery
620 * or if the skb it not generated by a local socket. (This last
621 * check should be redundant, but it's free.)
623 if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
624 skb->dev = skb->dst->dev;
625 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
626 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
631 if (np && np->frag_size < mtu) {
635 mtu -= hlen + sizeof(struct frag_hdr);
637 if (skb_shinfo(skb)->frag_list) {
638 int first_len = skb_pagelen(skb);
640 if (first_len - hlen > mtu ||
641 ((first_len - hlen) & 7) ||
645 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
646 /* Correct geometry. */
647 if (frag->len > mtu ||
648 ((frag->len & 7) && frag->next) ||
649 skb_headroom(frag) < hlen)
652 /* Partially cloned skb? */
653 if (skb_shared(frag))
660 frag->destructor = sock_wfree;
661 skb->truesize -= frag->truesize;
667 frag = skb_shinfo(skb)->frag_list;
668 skb_shinfo(skb)->frag_list = NULL;
671 *prevhdr = NEXTHDR_FRAGMENT;
672 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
674 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
678 __skb_pull(skb, hlen);
679 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
680 __skb_push(skb, hlen);
681 skb_reset_network_header(skb);
682 memcpy(skb_network_header(skb), tmp_hdr, hlen);
684 ipv6_select_ident(skb, fh);
685 fh->nexthdr = nexthdr;
687 fh->frag_off = htons(IP6_MF);
688 frag_id = fh->identification;
690 first_len = skb_pagelen(skb);
691 skb->data_len = first_len - skb_headlen(skb);
692 skb->len = first_len;
693 ipv6_hdr(skb)->payload_len = htons(first_len -
694 sizeof(struct ipv6hdr));
696 dst_hold(&rt->u.dst);
699 /* Prepare header of the next frame,
700 * before previous one went down. */
702 frag->ip_summed = CHECKSUM_NONE;
703 skb_reset_transport_header(frag);
704 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
705 __skb_push(frag, hlen);
706 skb_reset_network_header(frag);
707 memcpy(skb_network_header(frag), tmp_hdr,
709 offset += skb->len - hlen - sizeof(struct frag_hdr);
710 fh->nexthdr = nexthdr;
712 fh->frag_off = htons(offset);
713 if (frag->next != NULL)
714 fh->frag_off |= htons(IP6_MF);
715 fh->identification = frag_id;
716 ipv6_hdr(frag)->payload_len =
718 sizeof(struct ipv6hdr));
719 ip6_copy_metadata(frag, skb);
724 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
737 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
738 dst_release(&rt->u.dst);
748 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
749 dst_release(&rt->u.dst);
754 left = skb->len - hlen; /* Space per frame */
755 ptr = hlen; /* Where to start from */
758 * Fragment the datagram.
761 *prevhdr = NEXTHDR_FRAGMENT;
764 * Keep copying data until we run out.
768 /* IF: it doesn't fit, use 'mtu' - the data space left */
771 /* IF: we are not sending upto and including the packet end
772 then align the next start on an eight byte boundary */
780 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
781 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
782 IP6_INC_STATS(ip6_dst_idev(skb->dst),
783 IPSTATS_MIB_FRAGFAILS);
789 * Set up data on packet
792 ip6_copy_metadata(frag, skb);
793 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
794 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
795 skb_reset_network_header(frag);
796 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
797 frag->transport_header = (frag->network_header + hlen +
798 sizeof(struct frag_hdr));
801 * Charge the memory for the fragment to any owner
805 skb_set_owner_w(frag, skb->sk);
808 * Copy the packet header into the new buffer.
810 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
813 * Build fragment header.
815 fh->nexthdr = nexthdr;
818 ipv6_select_ident(skb, fh);
819 frag_id = fh->identification;
821 fh->identification = frag_id;
824 * Copy a block of the IP datagram.
826 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
830 fh->frag_off = htons(offset);
832 fh->frag_off |= htons(IP6_MF);
833 ipv6_hdr(frag)->payload_len = htons(frag->len -
834 sizeof(struct ipv6hdr));
840 * Put this fragment into the sending queue.
846 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
848 IP6_INC_STATS(ip6_dst_idev(skb->dst),
849 IPSTATS_MIB_FRAGOKS);
854 IP6_INC_STATS(ip6_dst_idev(skb->dst),
855 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route key no longer covers an address.
 * @rt_key:     route key (address and prefix length) of the cached route
 * @fl_addr:    address taken from the flow being sent
 * @addr_cache: last address used with the cached route, or NULL
 *
 * Returns non-zero only when BOTH of these hold:
 *  - the key is not a /128 host entry equal to @fl_addr, and
 *  - there is no @addr_cache, or it differs from @fl_addr.
 * ip6_sk_dst_check() uses a non-zero result as one of its mismatch
 * conditions.
 */
860 static inline int ip6_rt_check(struct rt6key *rt_key,
861 struct in6_addr *fl_addr,
862 struct in6_addr *addr_cache)
864 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
865 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a socket's cached route against a flow.
 * @sk:  socket owning the cached route and the daddr/saddr caches
 * @dst: cached route to validate
 *
 * The cached route is treated as mismatching when any of these is true:
 *  - ip6_rt_check() fails for the destination key and np->daddr_cache;
 *  - with CONFIG_IPV6_SUBTREES, ip6_rt_check() fails for the source key and
 *    np->saddr_cache;
 *  - the flow names an output interface different from the route's device.
 *
 * NOTE(review): the remaining parameter line (the flow "fl" is used but its
 * declaration is missing), the handling of a NULL dst, the action taken on
 * a mismatch and the return statement are not visible in this listing.
 * ip6_sk_dst_lookup() stores the result back into its dst pointer -
 * presumably the route is released and NULL returned on mismatch; confirm
 * against the full file.
 */
868 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
869 struct dst_entry *dst,
872 struct ipv6_pinfo *np = inet6_sk(sk);
873 struct rt6_info *rt = (struct rt6_info *)dst;
878 /* Yes, checking route validity in not connected
879 * case is not very simple. Take into account,
880 * that we do not support routing by source, TOS,
881 * and MSG_DONTROUTE --ANK (980726)
883 * 1. ip6_rt_check(): If route was host route,
884 * check that cached destination is current.
885 * If it is network route, we still may
886 * check its validity using saved pointer
887 * to the last used address: daddr_cache.
888 * We do not want to save whole address now,
889 * (because main consumer of this service
890 * is tcp, which has not this problem),
891 * so that the last trick works only on connected
893 * 2. oif also should be the same.
895 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
896 #ifdef CONFIG_IPV6_SUBTREES
897 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
899 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - common tail of the route lookup helpers.
 * @sk:  socket passed through to ip6_route_output()
 * @dst: in/out route pointer; filled by ip6_route_output() on the visible
 *       path
 * @fl:  flow to look up; its source address may be filled in here
 *
 * Visible steps:
 *  - look up the route with ip6_route_output(); a non-zero (*dst)->error
 *    becomes the function's error and jumps to out_err_release;
 *  - if the flow has no source address, choose one with ipv6_get_saddr();
 *  - with CONFIG_IPV6_OPTIMISTIC_DAD: when the route's neighbour is not in
 *    a NUD_VALID state and the flow's source address carries
 *    IFA_F_OPTIMISTIC, the lookup is repeated on a copy of the flow whose
 *    destination is zeroed, replacing *dst with that result;
 *  - on -ENETUNREACH, IPSTATS_MIB_OUTNOROUTES is counted.
 *
 * NOTE(review): the guard in front of the first ip6_route_output() call,
 * the release of the replaced route and of ifp, the out_err_release label
 * body and the return statements are missing from this listing.
 */
908 static int ip6_dst_lookup_tail(struct sock *sk,
909 struct dst_entry **dst, struct flowi *fl)
914 *dst = ip6_route_output(sk, fl);
916 if ((err = (*dst)->error))
917 goto out_err_release;
919 if (ipv6_addr_any(&fl->fl6_src)) {
920 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
922 goto out_err_release;
925 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
927 * Here if the dst entry we've looked up
928 * has a neighbour entry that is in the INCOMPLETE
929 * state and the src address from the flow is
930 * marked as OPTIMISTIC, we release the found
931 * dst entry and replace it instead with the
932 * dst entry of the nexthop router
934 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
935 struct inet6_ifaddr *ifp;
939 ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
942 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
948 * We need to get the dst entry for the
949 * default router instead
952 memcpy(&fl_gw, fl, sizeof(struct flowi));
953 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
954 *dst = ip6_route_output(sk, &fl_gw);
955 if ((err = (*dst)->error))
956 goto out_err_release;
964 if (err == -ENETUNREACH)
965 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
972 * ip6_dst_lookup - perform route lookup on flow
973 * @sk: socket which provides route info
974 * @dst: pointer to dst_entry * for result
975 * @fl: flow to lookup
977 * This function performs a route lookup on the given flow.
979 * It returns zero on success, or a standard errno code on error.
/*
 * Thin exported wrapper: the visible body forwards its three arguments
 * unchanged to ip6_dst_lookup_tail() and returns that result.
 *
 * NOTE(review): original line 983 is missing from this listing, so any
 * initialisation of *dst before the call is not shown.
 */
981 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
984 return ip6_dst_lookup_tail(sk, dst, fl);
986 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
989 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
990 * @sk: socket which provides the dst cache and route info
991 * @dst: pointer to dst_entry * for result
992 * @fl: flow to lookup
994 * This function performs a route lookup on the given flow with the
995 * possibility of using the cached route in the socket if it is valid.
996 * It will take the socket dst lock when operating on the dst cache.
997 * As a result, this function can only be used in process context.
999 * It returns zero on success, or a standard errno code on error.
/*
 * Exported wrapper that tries the socket's cached route first: *dst is
 * taken from sk_dst_check() using the socket's dst_cookie, filtered
 * through ip6_sk_dst_check() against the flow, and then handed to
 * ip6_dst_lookup_tail(), whose result is returned.
 *
 * NOTE(review): original lines 1002-1004 and 1007-1008 are missing from
 * this listing, so any guard around the cache lookup (e.g. a NULL sk test)
 * is not shown.
 */
1001 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1005 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1006 *dst = ip6_sk_dst_check(sk, *dst, fl);
1009 return ip6_dst_lookup_tail(sk, dst, fl);
1011 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - append data for UDP fragmentation offload.
 * @sk:            socket whose write queue receives the skb
 * @getfrag:       callback that copies user data into the skb
 * @from:          opaque source handle passed to @getfrag
 * @length:        bytes to append, including @transhdrlen
 * @hh_len:        link-layer head room to reserve
 * @fragheaderlen: length of the IPv6 header plus unfragmentable part
 * @transhdrlen:   transport header length
 * @mtu:           path MTU used to derive gso_size
 * @flags:         MSG_* flags; MSG_DONTWAIT is passed to the allocator
 *
 * If the write queue is empty, one skb is allocated with room for the
 * link-layer, network and transport headers (plus 20 spare bytes); the
 * header area is reserved, the network and transport header pointers are
 * set, and the skb is marked CHECKSUM_PARTIAL.  The payload is attached as
 * page fragments with skb_append_datato_frags().  On that path gso_size is
 * set to mtu minus fragheaderlen minus the fragment header, gso_type to
 * SKB_GSO_UDP, a fragment id is chosen with ipv6_select_ident() and stored
 * in ip6_frag_id, and the skb is queued on sk_write_queue.
 *
 * NOTE(review): the allocation failure check, the condition on err around
 * the GSO setup, the fallback path and the return statements are missing
 * from this listing.
 */
1013 static inline int ip6_ufo_append_data(struct sock *sk,
1014 int getfrag(void *from, char *to, int offset, int len,
1015 int odd, struct sk_buff *skb),
1016 void *from, int length, int hh_len, int fragheaderlen,
1017 int transhdrlen, int mtu,unsigned int flags)
1020 struct sk_buff *skb;
1023 /* There is support for UDP large send offload by network
1024 * device, so create one single skb packet containing complete
1027 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1028 skb = sock_alloc_send_skb(sk,
1029 hh_len + fragheaderlen + transhdrlen + 20,
1030 (flags & MSG_DONTWAIT), &err);
1034 /* reserve space for Hardware header */
1035 skb_reserve(skb, hh_len);
1037 /* create space for UDP/IP header */
1038 skb_put(skb,fragheaderlen + transhdrlen);
1040 /* initialize network header pointer */
1041 skb_reset_network_header(skb);
1043 /* initialize protocol header pointer */
1044 skb->transport_header = skb->network_header + fragheaderlen;
1046 skb->ip_summed = CHECKSUM_PARTIAL;
1048 sk->sk_sndmsg_off = 0;
1051 err = skb_append_datato_frags(sk,skb, getfrag, from,
1052 (length - transhdrlen));
1054 struct frag_hdr fhdr;
1056 /* specify the length of each IP datagram fragment*/
1057 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1058 sizeof(struct frag_hdr);
1059 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1060 ipv6_select_ident(skb, &fhdr);
1061 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1062 __skb_queue_tail(&sk->sk_write_queue, skb);
1066 /* There is not enough support do UPD LSO,
1067 * so follow normal path
/*
 * ip6_append_data - queue user data on a socket's pending (corked) output.
 * @sk:          socket whose sk_write_queue collects the pending skbs
 * @getfrag:     callback copying @len bytes of user data to @to
 * @from:        opaque source handle passed to @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: transport header length to leave room for
 * @hlimit:      hop limit saved in the cork on the first call
 * @tclass:      traffic class saved in the cork on the first call
 * @opt:         tx options; copied into the cork on the first call
 * @fl:          flow; copied into the cork on the first call
 * @rt:          route; a reference is taken on the first call
 * @flags:       MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are tested)
 *
 * First call (write queue empty) sets up the cork: options are copied into
 * np->cork.opt (allocated if needed; an existing buffer that is too small
 * is reported with a debug printk), IPCORK_OPT is set, the route reference
 * is held, flow / hop limit / traffic class are stored, and the fragment
 * size is the device MTU for IPV6_PMTUDISC_PROBE or dst_mtu() of the route
 * path otherwise, clamped by np->frag_size.  IPCORK_ALLFRAG mirrors
 * dst_allfrag().  The extension header length is added to both length and
 * transhdrlen.  Later calls reuse the corked flow and fragment size.
 *
 * Sizing: fragheaderlen is the fixed header plus non-fragmentable headers;
 * maxfraglen is the largest 8-byte-aligned fragment that still leaves room
 * for a fragment header.  An over-long total is reported through
 * ipv6_local_error() with EMSGSIZE.
 *
 * UDP payloads larger than the MTU on a NETIF_F_UFO device are handed to
 * ip6_ufo_append_data().
 *
 * Main loop (while length > 0): if the tail skb has no room, a new skb is
 * allocated (sock_alloc_send_skb() or sock_wmalloc()), head room for the
 * link layer and a fragment header is reserved, any bytes of the previous
 * skb beyond maxfraglen ("fraggap") are moved into the new one with the
 * checksum adjusted, user data is copied with getfrag(), and the skb is
 * queued.  If there is room, data is appended either linearly (device
 * without NETIF_F_SG) or into page fragments using the socket's
 * sk_sndmsg_page / sk_sndmsg_off, updating data_len, truesize and
 * sk_wmem_alloc.
 *
 * Error exit: the bytes not queued are subtracted from cork.length and
 * IPSTATS_MIB_OUTDISCARDS is counted.
 *
 * NOTE(review): a large number of lines are missing from this listing
 * (conditions choosing between allocation calls, most error gotos, offset
 * and length bookkeeping, every return statement).  Treat this summary as a
 * guide to the visible statements only and confirm against the full file.
 */
1074 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1075 int offset, int len, int odd, struct sk_buff *skb),
1076 void *from, int length, int transhdrlen,
1077 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1078 struct rt6_info *rt, unsigned int flags)
1080 struct inet_sock *inet = inet_sk(sk);
1081 struct ipv6_pinfo *np = inet6_sk(sk);
1082 struct sk_buff *skb;
1083 unsigned int maxfraglen, fragheaderlen;
1090 int csummode = CHECKSUM_NONE;
1092 if (flags&MSG_PROBE)
1094 if (skb_queue_empty(&sk->sk_write_queue)) {
1099 if (np->cork.opt == NULL) {
1100 np->cork.opt = kmalloc(opt->tot_len,
1102 if (unlikely(np->cork.opt == NULL))
1104 } else if (np->cork.opt->tot_len < opt->tot_len) {
1105 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1108 memcpy(np->cork.opt, opt, opt->tot_len);
1109 inet->cork.flags |= IPCORK_OPT;
1110 /* need source address above miyazawa*/
1112 dst_hold(&rt->u.dst);
1114 inet->cork.fl = *fl;
1115 np->cork.hop_limit = hlimit;
1116 np->cork.tclass = tclass;
1117 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1118 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1119 if (np->frag_size < mtu) {
1121 mtu = np->frag_size;
1123 inet->cork.fragsize = mtu;
1124 if (dst_allfrag(rt->u.dst.path))
1125 inet->cork.flags |= IPCORK_ALLFRAG;
1126 inet->cork.length = 0;
1127 sk->sk_sndmsg_page = NULL;
1128 sk->sk_sndmsg_off = 0;
1129 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1130 rt->rt6i_nfheader_len;
1131 length += exthdrlen;
1132 transhdrlen += exthdrlen;
1135 fl = &inet->cork.fl;
1136 if (inet->cork.flags & IPCORK_OPT)
1140 mtu = inet->cork.fragsize;
1143 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1145 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1146 (opt ? opt->opt_nflen : 0);
1147 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1149 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1150 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1151 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1157 * Let's try using as much space as possible.
1158 * Use MTU if total length of the message fits into the MTU.
1159 * Otherwise, we need to reserve fragment header and
1160 * fragment alignment (= 8-15 octects, in total).
1162 * Note that we may need to "move" the data from the tail of
1163 * of the buffer to the new fragment when we split
1166 * FIXME: It may be fragmented into multiple chunks
1167 * at once if non-fragmentable extension headers
1172 inet->cork.length += length;
1173 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1174 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1176 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1177 fragheaderlen, transhdrlen, mtu,
1184 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1187 while (length > 0) {
1188 /* Check if the remaining data fits into current packet. */
1189 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1191 copy = maxfraglen - skb->len;
1195 unsigned int datalen;
1196 unsigned int fraglen;
1197 unsigned int fraggap;
1198 unsigned int alloclen;
1199 struct sk_buff *skb_prev;
1203 /* There's no room in the current skb */
1205 fraggap = skb_prev->len - maxfraglen;
1210 * If remaining data exceeds the mtu,
1211 * we know we need more fragment(s).
1213 datalen = length + fraggap;
1214 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1215 datalen = maxfraglen - fragheaderlen;
1217 fraglen = datalen + fragheaderlen;
1218 if ((flags & MSG_MORE) &&
1219 !(rt->u.dst.dev->features&NETIF_F_SG))
1222 alloclen = datalen + fragheaderlen;
1225 * The last fragment gets additional space at tail.
1226 * Note: we overallocate on fragments with MSG_MODE
1227 * because we have no idea if we're the last one.
1229 if (datalen == length + fraggap)
1230 alloclen += rt->u.dst.trailer_len;
1233 * We just reserve space for fragment header.
1234 * Note: this may be overallocation if the message
1235 * (without MSG_MORE) fits into the MTU.
1237 alloclen += sizeof(struct frag_hdr);
1240 skb = sock_alloc_send_skb(sk,
1242 (flags & MSG_DONTWAIT), &err);
1245 if (atomic_read(&sk->sk_wmem_alloc) <=
1247 skb = sock_wmalloc(sk,
1248 alloclen + hh_len, 1,
1250 if (unlikely(skb == NULL))
1256 * Fill in the control structures
1258 skb->ip_summed = csummode;
1260 /* reserve for fragmentation */
1261 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1264 * Find where to start putting bytes
1266 data = skb_put(skb, fraglen);
1267 skb_set_network_header(skb, exthdrlen);
1268 data += fragheaderlen;
1269 skb->transport_header = (skb->network_header +
1272 skb->csum = skb_copy_and_csum_bits(
1273 skb_prev, maxfraglen,
1274 data + transhdrlen, fraggap, 0);
1275 skb_prev->csum = csum_sub(skb_prev->csum,
1278 pskb_trim_unique(skb_prev, maxfraglen);
1280 copy = datalen - transhdrlen - fraggap;
1285 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1292 length -= datalen - fraggap;
1295 csummode = CHECKSUM_NONE;
1298 * Put the packet on the pending queue
1300 __skb_queue_tail(&sk->sk_write_queue, skb);
1307 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1311 if (getfrag(from, skb_put(skb, copy),
1312 offset, copy, off, skb) < 0) {
1313 __skb_trim(skb, off);
1318 int i = skb_shinfo(skb)->nr_frags;
1319 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1320 struct page *page = sk->sk_sndmsg_page;
1321 int off = sk->sk_sndmsg_off;
1324 if (page && (left = PAGE_SIZE - off) > 0) {
1327 if (page != frag->page) {
1328 if (i == MAX_SKB_FRAGS) {
1333 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1334 frag = &skb_shinfo(skb)->frags[i];
1336 } else if(i < MAX_SKB_FRAGS) {
1337 if (copy > PAGE_SIZE)
1339 page = alloc_pages(sk->sk_allocation, 0);
1344 sk->sk_sndmsg_page = page;
1345 sk->sk_sndmsg_off = 0;
1347 skb_fill_page_desc(skb, i, page, 0, 0);
1348 frag = &skb_shinfo(skb)->frags[i];
1353 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1357 sk->sk_sndmsg_off += copy;
1360 skb->data_len += copy;
1361 skb->truesize += copy;
1362 atomic_add(copy, &sk->sk_wmem_alloc);
1369 inet->cork.length -= length;
1370 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - drop everything the cork holds for a socket.
 * @inet: inet part of the socket (cork flags and saved flow)
 * @np:   IPv6 part of the socket (saved options and route)
 *
 * Clears IPCORK_OPT and frees the saved options, releases the reference on
 * the corked route, clears IPCORK_ALLFRAG and zeroes the saved flow.
 *
 * NOTE(review): original lines 1379, 1381 and 1383 are missing from this
 * listing - presumably a NULL guard around the route release and the reset
 * of np->cork.rt; confirm against the full file.
 */
1374 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1376 inet->cork.flags &= ~IPCORK_OPT;
1377 kfree(np->cork.opt);
1378 np->cork.opt = NULL;
1380 dst_release(&np->cork.rt->u.dst);
1382 inet->cork.flags &= ~IPCORK_ALLFRAG;
1384 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - turn the corked queue into one packet and send it.
 * @sk: socket whose sk_write_queue holds the pending skbs
 *
 * Visible steps:
 *  - dequeue the first skb; it becomes the head of the packet.  If its data
 *    pointer lies before the network header, data is pulled up to it;
 *  - every further queued skb has the head's network header length pulled
 *    off, is linked into the head's frag_list, and its len / truesize are
 *    added to the head (len also to data_len).  Its socket reference is
 *    dropped with __sock_put() and its destructor cleared;
 *  - the head's network header is pulled, then fragmentable and
 *    non-fragmentable options from the cork are pushed (the latter may
 *    rewrite final_dst), followed by the fixed IPv6 header;
 *  - the header is filled from the cork: version 6, corked traffic class
 *    and flow label, hop limit, next header, source from the flow and
 *    destination from final_dst; priority is taken from the socket;
 *  - the skb receives its own reference to the corked route and
 *    IPSTATS_MIB_OUTREQUESTS is counted; for ICMPv6 the per-type and
 *    OUTMSGS counters are bumped as well;
 *  - the packet is sent with ip6_local_out(); on the visible error path the
 *    result is mapped through net_xmit_errno() when np->recverr is set and
 *    to 0 otherwise;
 *  - ip6_cork_release() clears the cork.
 *
 * NOTE(review): the goto for an empty queue, the condition guarding the
 * error mapping, the error counter path and the return statement are
 * missing from this listing.
 */
1387 int ip6_push_pending_frames(struct sock *sk)
1389 struct sk_buff *skb, *tmp_skb;
1390 struct sk_buff **tail_skb;
1391 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1392 struct inet_sock *inet = inet_sk(sk);
1393 struct ipv6_pinfo *np = inet6_sk(sk);
1394 struct ipv6hdr *hdr;
1395 struct ipv6_txoptions *opt = np->cork.opt;
1396 struct rt6_info *rt = np->cork.rt;
1397 struct flowi *fl = &inet->cork.fl;
1398 unsigned char proto = fl->proto;
1401 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1403 tail_skb = &(skb_shinfo(skb)->frag_list);
1405 /* move skb->data to ip header from ext header */
1406 if (skb->data < skb_network_header(skb))
1407 __skb_pull(skb, skb_network_offset(skb));
1408 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1409 __skb_pull(tmp_skb, skb_network_header_len(skb));
1410 *tail_skb = tmp_skb;
1411 tail_skb = &(tmp_skb->next);
1412 skb->len += tmp_skb->len;
1413 skb->data_len += tmp_skb->len;
1414 skb->truesize += tmp_skb->truesize;
1415 __sock_put(tmp_skb->sk);
1416 tmp_skb->destructor = NULL;
1420 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1421 __skb_pull(skb, skb_network_header_len(skb));
1422 if (opt && opt->opt_flen)
1423 ipv6_push_frag_opts(skb, opt, &proto);
1424 if (opt && opt->opt_nflen)
1425 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1427 skb_push(skb, sizeof(struct ipv6hdr));
1428 skb_reset_network_header(skb);
1429 hdr = ipv6_hdr(skb);
1431 *(__be32*)hdr = fl->fl6_flowlabel |
1432 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1434 hdr->hop_limit = np->cork.hop_limit;
1435 hdr->nexthdr = proto;
1436 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1437 ipv6_addr_copy(&hdr->daddr, final_dst);
1439 skb->priority = sk->sk_priority;
1441 skb->dst = dst_clone(&rt->u.dst);
1442 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1443 if (proto == IPPROTO_ICMPV6) {
1444 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1446 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1447 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1450 err = ip6_local_out(skb);
1453 err = np->recverr ? net_xmit_errno(err) : 0;
1459 ip6_cork_release(inet, np);
/*
 * ip6_flush_pending_frames - throw away a socket's corked output.
 * @sk: socket whose sk_write_queue is emptied
 *
 * Removes skbs from the tail of the write queue until it is empty, counting
 * IPSTATS_MIB_OUTDISCARDS for each, then clears the cork state with
 * ip6_cork_release().
 *
 * NOTE(review): original lines 1470 and 1473 are missing from this listing -
 * presumably a guard on skb->dst and the freeing of each skb; confirm
 * against the full file.
 */
1465 void ip6_flush_pending_frames(struct sock *sk)
1467 struct sk_buff *skb;
1469 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1471 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1472 IPSTATS_MIB_OUTDISCARDS);
1476 ip6_cork_release(inet_sk(sk), inet6_sk(sk));