/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

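/* Fragment identification: a single global counter, protected by
 * ip6_id_lock and skipping zero, is converted to network byte order
 * (cf. the "frag id should be in NBO" changelog entry above) and
 * written into the fragment header.
 */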
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

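/* __ip6_local_out() fills in the payload length of a finished packet
 * and runs the NF_INET_LOCAL_OUT netfilter hook; a return value of 1
 * means the packet was accepted and the caller should pass it on to
 * dst_output(), which ip6_local_out() below does.
 */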
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

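/* For multicast, ip6_output2() may loop a clone of the packet back to
 * the local stack: when the socket has multicast loopback enabled and
 * this host is itself a member of the destination group (or a
 * multicast routing socket is listening), the clone is fed through
 * the POST_ROUTING hook into ip6_dev_loopback_xmit() above. A hop
 * limit of zero means the packet is for loopback only, so the
 * original is then discarded.
 */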
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

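/* ip6_output() fragments when the packet exceeds the path MTU and is
 * not a GSO packet, or when the route demands a fragment header on
 * every packet (dst_allfrag(), used when the path MTU reported to us
 * is below the IPv6 minimum of 1280 bytes); otherwise the packet
 * goes straight out.
 */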
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = np ? np->hop_limit : -1;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = np ? np->tclass : -1;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
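	/* The first 32 bits of the header hold the version (6) in the
	 * top nibble, the 8-bit traffic class in bits 20-27 and the
	 * 20-bit flow label in the low bits; e.g. tclass 0 yields
	 * 0x60000000 before the flow label is or'ed in.
	 */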
	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

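/* When acting as an NDP proxy, decide whether a packet addressed to a
 * proxied address should be handed to the local input path (unicast
 * neighbour discovery messages), forwarded normally, or dropped
 * (link-local destinations, which a proxy cannot legally forward):
 * the return values 1, 0 and -1 below encode those three outcomes.
 */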
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

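/* Each fragment produced by ip6_fragment() must look like the original
 * packet to the rest of the stack: ip6_copy_metadata() propagates the
 * packet type, priority, protocol, dst reference, mark and the various
 * netfilter/security annotations from the original skb.
 */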
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
			(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}
		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

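/* ip6_fragment() has two strategies: a fast path that re-uses an
 * already fragment-shaped frag_list (each chunk correctly sized and
 * 8-byte aligned, with headroom for the headers), and a slow path
 * that allocates fresh skbs and copies the payload block by block.
 */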
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;

		/* BUILD HEADER */
		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

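/* Slow path: the payload is copied into newly allocated skbs, one per
 * fragment, each carrying a copy of the unfragmentable header block
 * followed by its fragment header.
 */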
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer. */
		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

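/* ip6_rt_check() returns true when the cached route can no longer be
 * trusted for this flow: it is neither a host route to exactly this
 * destination nor a route whose last-used address matches the cache.
 */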
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

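/* UDP fragmentation offload: instead of fragmenting in software, build
 * one oversized skb, record the per-fragment payload size in gso_size
 * and a shared fragment ID in ip6_frag_id, and let the device (or the
 * GSO layer) split it on transmit.
 */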
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

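/*
 *	ip6_append_data() implements corked sends: the first call sets up
 *	the cork state (options, route, MTU) and later calls append more
 *	data to the queued skbs, until ip6_push_pending_frames() builds
 *	and sends the final packet(s).
 */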
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
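	/* Example: with an MTU of 1500 and fragheaderlen of 40 this gives
	 * ((1460 & ~7) + 40) - 8 = 1488, so each queued fragment leaves
	 * 8 bytes of slack for the fragment header inserted later and
	 * keeps the fragmentable part a multiple of 8 bytes.
	 */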

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea which fragment will be
			 * the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

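/* ip6_push_pending_frames() coalesces the socket write queue into a
 * single skb (the rest chained via frag_list), pushes any extension
 * headers, builds the IPv6 header and hands the result to
 * ip6_local_out(), which fills in the payload length.
 */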
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

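/* Drop anything still sitting on the write queue, e.g. after an error
 * in ip6_append_data(), and release the cork state.
 */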
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}