/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/netevent.h>
#include <net/netlink.h>

#include <asm/uaccess.h>

#include <linux/sysctl.h>

/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

#define CLONE_OFFLINK_ROUTE 0
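/*
 * Note: CLONE_OFFLINK_ROUTE is only consulted by the #if CLONE_OFFLINK_ROUTE
 * blocks in ip6_pol_route_input() and ip6_pol_route_output() below; it
 * selects whether an off-link (non-gateway) route gets a per-destination
 * clone via rt6_alloc_clone() on lookup.
 */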

static int ip6_rt_max_size = 4096;
static int ip6_rt_gc_min_interval = HZ / 2;
static int ip6_rt_gc_timeout = 60*HZ;
int ip6_rt_gc_interval = 30*HZ;
static int ip6_rt_gc_elasticity = 9;
static int ip6_rt_mtu_expires = 10*60*HZ;
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
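/*
 * With IPV6_MIN_MTU = 1280, the default minimum advertised MSS works out
 * to 1280 - 20 (TCP header) - 40 (IPv6 header) = 1220 bytes.
 */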

static struct rt6_info *ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		ip6_dst_gc(void);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex);
#endif

static struct dst_ops ip6_dst_ops = {
	.protocol		=	__constant_htons(ETH_P_IPV6),
	.check			=	ip6_dst_check,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.entry_size		=	sizeof(struct rt6_info),
};

struct rt6_info ip6_null_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.dev		= &loopback_dev,
			.error		= -ENETUNREACH,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.path		= (struct dst_entry*)&ip6_null_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.dev		= &loopback_dev,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.path		= (struct dst_entry*)&ip6_prohibit_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.dev		= &loopback_dev,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
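/*
 * Note: ip6_null_entry backs "unreachable" lookup results (its dst.error is
 * -ENETUNREACH); ip6_prohibit_entry and ip6_blk_hole_entry exist only with
 * CONFIG_IPV6_MULTIPLE_TABLES and back the policy-routing "prohibit" and
 * "blackhole" actions. All three are reject routes
 * (RTF_REJECT | RTF_NONEXTHOP) pinned to the loopback device.
 */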
/* allocate dst with ip6_dst_ops */
static __inline__ struct rt6_info *ip6_dst_alloc(void)
{
	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (idev != NULL) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
		if (loopback_idev != NULL) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & RTF_EXPIRES &&
		time_after(jiffies, rt->rt6i_expires));
}

static inline int rt6_need_strict(struct in6_addr *daddr)
{
	return (ipv6_addr_type(daddr) &
		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
}

/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
						    int oif, int strict)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	for (sprt = rt; sprt; sprt = sprt->u.next) {
		struct net_device *dev = sprt->rt6i_dev;
		if (dev->ifindex == oif)
		if (dev->flags & IFF_LOOPBACK) {
			if (sprt->rt6i_idev == NULL ||
			    sprt->rt6i_idev->dev->ifindex != oif) {
			if (local && (!oif ||
				      local->rt6i_idev->dev->ifindex == oif))

	return &ip6_null_entry;

#ifdef CONFIG_IPV6_ROUTER_PREF
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	read_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		read_unlock_bh(&neigh->lock);
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static int inline rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->rt6i_dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static int inline rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh = rt->rt6i_nexthop;
	int m = 0;
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		m = 1;
	else if (neigh) {
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			m = 2;
		read_unlock_bh(&neigh->lock);
	}
	return m;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m, n;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return -1;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	n = rt6_check_neigh(rt);
	if (n > 1)
		m |= 16;
	else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
		return -1;
	return m;
}
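
/*
 * A rough sketch of how the scoring above feeds rt6_select() (derived
 * from the code, not a normative description): rt6_check_dev()
 * contributes the low bits (2 for an interface match, 1 for a loopback
 * alias, 0 otherwise), the RA router-preference bits are shifted in above
 * that with "<< 2", and a route whose neighbour is not verified reachable
 * is rejected outright (-1) when RT6_LOOKUP_F_REACHABLE is requested.
 */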

static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
				   int strict)
{
	struct rt6_info *match = NULL, *last = NULL;
	struct rt6_info *rt, *rt0 = *head;

	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
		  __FUNCTION__, head, head ? *head : NULL, oif);

	for (rt = rt0, metric = rt0->rt6i_metric;
	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
	     rt = rt->u.next) {
		if (rt6_check_expired(rt))
			continue;
		m = rt6_score_route(rt, oif, strict);
	}

	if (!match &&
	    (strict & RT6_LOOKUP_F_REACHABLE) &&
	    last && last != rt0) {
		/* no entries matched; do round-robin */
		static DEFINE_SPINLOCK(lock);
		rt0->u.next = last->u.next;
	}

	RT6_TRACE("%s() => %p, score=%d\n",
		  __FUNCTION__, match, mpri);

	return (match ? match : &ip6_null_entry);
}
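
/*
 * Note on the round-robin above: when none of the equal-metric routers
 * scored as reachable, the list is rotated so that the next lookup starts
 * from a different router, matching the "round-robin the list" behaviour
 * described in the header comment.
 */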

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	lifetime = htonl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
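
/*
 * For reference (RFC 4191): the Route Information Option length field is
 * in units of 8 octets, so length 1 carries no prefix bits, length 2
 * carries up to 64 prefix bits, and length 3 carries a full 128-bit
 * prefix -- which is why prefix_len > 64 demands a longer option and
 * length == 3 lets the code point straight at rinfo->prefix above.
 */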

#define BACKTRACK(saddr) \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		if (fn->fn_flags & RTN_TL_ROOT) \
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
			fn = fib6_lookup(pn->subtree, NULL, saddr); \
		if (fn->fn_flags & RTN_RTINFO) \

static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
					     struct flowi *fl, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
	rt = rt6_device_match(rt, fl->oif, flags);
	BACKTRACK(&fl->fl6_src);
	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	rt->u.dst.lastuse = jiffies;
	return rt;
}

struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
			    int oif, int strict)
{
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);

	return (struct rt6_info *) dst;
}
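
/*
 * Typical usage elsewhere in this file (a sketch, not additional API
 * documentation): rt6_pmtu_discovery() calls
 * rt6_lookup(daddr, saddr, dev->ifindex, 0) and ip6_route_add() calls
 * rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1); the caller owns a
 * reference on the returned route and drops it with
 * dst_release(&rt->u.dst) when done.
 */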

/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes the new route entry; if the addition fails for any reason the
   route is freed. In any case, if the caller does not hold it, it may
   be destroyed.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	return __ip6_ins_rt(rt, NULL);
}

static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
				      struct in6_addr *saddr)
{
	struct rt6_info *rt = ip6_rt_copy(ort);

	if (!(rt->rt6i_flags&RTF_GATEWAY)) {
		if (rt->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
		ipv6_addr_copy(&rt->rt6i_gateway, daddr);
	}

	ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
	rt->rt6i_dst.plen = 128;
	rt->rt6i_flags |= RTF_CACHE;
	rt->u.dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	if (rt->rt6i_src.plen && saddr) {
		ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
		rt->rt6i_src.plen = 128;
	}
#endif

	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);

	return rt;
}

static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
{
	struct rt6_info *rt = ip6_rt_copy(ort);

	ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
	rt->rt6i_dst.plen = 128;
	rt->rt6i_flags |= RTF_CACHE;
	if (rt->rt6i_flags & RTF_REJECT)
		rt->u.dst.error = ort->u.dst.error;
	rt->u.dst.flags |= DST_HOST;
	rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);

	return rt;
}
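
/*
 * The difference between the two helpers above: rt6_alloc_cow() builds a
 * /128 RTF_CACHE entry whose nexthop is resolved freshly via
 * ndisc_get_neigh(), while rt6_alloc_clone() just copies the parent and
 * reuses its neighbour entry (neigh_clone()); the comment in
 * rt6_pmtu_discovery() below spells out when each one is used.
 */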

static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
					    struct flowi *fl, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int err;
	int reachable = RT6_LOOKUP_F_REACHABLE;

	strict |= flags & RT6_LOOKUP_F_IFACE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
	BACKTRACK(&fl->fl6_src);
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)

	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#endif
	}

	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);

	err = ip6_ins_rt(nrt);

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);

	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	rt->u.dst.lastuse = jiffies;
}

void ip6_route_input(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct flowi fl = {
		.iif = skb->dev->ifindex,
#ifdef CONFIG_IPV6_ROUTE_FWMARK
		.fwmark = skb->nfmark,
#endif
		.flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
		.proto = iph->nexthdr,
	};

	if (rt6_need_strict(&iph->daddr))
		flags |= RT6_LOOKUP_F_IFACE;

	skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
}
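
/*
 * Sketch of the input path above (descriptive only): the flow key is taken
 * from the received IPv6 header (flow label, next header), RT6_LOOKUP_F_IFACE
 * is forced for link-local or multicast destinations via rt6_need_strict(),
 * and fib6_rule_lookup() dispatches to ip6_pol_route_input(), which runs
 * rt6_select() with RT6_LOOKUP_F_REACHABLE set and clones a per-destination
 * RTF_CACHE entry when the winning route is not already one.
 */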

static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
					     struct flowi *fl, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int err;
	int reachable = RT6_LOOKUP_F_REACHABLE;

	strict |= flags & RT6_LOOKUP_F_IFACE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	BACKTRACK(&fl->fl6_src);
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)

	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#endif
	}

	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);

	err = ip6_ins_rt(nrt);

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);

	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	rt->u.dst.lastuse = jiffies;
}

struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
{
	int flags = 0;

	if (rt6_need_strict(&fl->fl6_dst))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!ipv6_addr_any(&fl->fl6_src))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
}
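
/*
 * Usage sketch (mirrors the call in inet6_rtm_getroute() further down, not
 * a new API): fill a struct flowi with fl6_dst/fl6_src/oif, call
 * ip6_route_output(NULL, &fl), and release the result with dst_release()
 * when done; inet6_rtm_getroute() below attaches it to a skb via skb->dst
 * instead.
 */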

/*
 *	Destination cache support functions
 */

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
		return dst;

	return NULL;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt->rt6i_flags & RTF_CACHE)
		ip6_del_rt(rt);
	else
		dst_release(dst);
	return NULL;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);

	rt = (struct rt6_info *) skb->dst;
	if (rt->rt6i_flags&RTF_CACHE) {
		dst_set_expires(&rt->u.dst, 0);
		rt->rt6i_flags |= RTF_EXPIRES;
	} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
		rt->rt6i_node->fn_sernum = -1;
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info*)dst;

	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
		rt6->rt6i_flags |= RTF_MODIFIED;
		if (mtu < IPV6_MIN_MTU) {
			mtu = IPV6_MIN_MTU;
			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static int ipv6_get_mtu(struct net_device *dev);

static inline unsigned int ipv6_advmss(unsigned int mtu)
{
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < ip6_rt_min_advmss)
		mtu = ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static struct dst_entry *ndisc_dst_gc_list;
static DEFINE_SPINLOCK(ndisc_lock);

struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
				  struct neighbour *neigh,
				  struct in6_addr *addr,
				  int (*output)(struct sk_buff *))
{
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);

	if (unlikely(idev == NULL))
		return NULL;

	rt = ip6_dst_alloc();
	if (unlikely(rt == NULL)) {
		in6_dev_put(idev);
	}

	neigh = ndisc_get_neigh(dev, addr);

	rt->rt6i_idev = idev;
	rt->rt6i_nexthop = neigh;
	atomic_set(&rt->u.dst.__refcnt, 1);
	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	rt->u.dst.output = output;

#if 0	/* there's no chance to use these for ndisc */
	rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
	rt->rt6i_dst.plen = 128;
#endif

	spin_lock_bh(&ndisc_lock);
	rt->u.dst.next = ndisc_dst_gc_list;
	ndisc_dst_gc_list = &rt->u.dst;
	spin_unlock_bh(&ndisc_lock);

	fib6_force_start_gc();

	return (struct dst_entry *)rt;
}

int ndisc_dst_gc(int *more)
{
	struct dst_entry *dst, *next, **pprev;

	spin_lock_bh(&ndisc_lock);
	pprev = &ndisc_dst_gc_list;

	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {

	spin_unlock_bh(&ndisc_lock);

static int ip6_dst_gc(void)
{
	static unsigned expire = 30*HZ;
	static unsigned long last_gc;
	unsigned long now = jiffies;

	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)

	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
		expire = ip6_rt_gc_timeout>>1;

	expire -= expire>>ip6_rt_gc_elasticity;
	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
}

/* Clean host part of a prefix. Not necessary in radix tree,
   but results in cleaner routing tables.
   Remove it only when all the things will work!
 */

static int ipv6_get_mtu(struct net_device *dev)
{
	int mtu = IPV6_MIN_MTU;
	struct inet6_dev *idev;

	idev = in6_dev_get(dev);
	if (idev) {
		mtu = idev->cnf.mtu6;
		in6_dev_put(idev);
	}
	return mtu;
}

int ipv6_get_hoplimit(struct net_device *dev)
{
	int hoplimit = ipv6_devconf.hop_limit;
	struct inet6_dev *idev;

	idev = in6_dev_get(dev);
	if (idev) {
		hoplimit = idev->cnf.hop_limit;
		in6_dev_put(idev);
	}
	return hoplimit;
}

int ip6_route_add(struct fib6_config *cfg)
{
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)

#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len)
#endif

	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(cfg->fc_ifindex);
		idev = in6_dev_get(dev);
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(cfg->fc_table);
	if (table == NULL) {

	rt = ip6_dst_alloc();

	rt->u.dst.obsolete = -1;
	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->u.dst.input = ip6_mc_input;
	else
		rt->u.dst.input = ip6_forward;

	rt->u.dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->u.dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here;
	   they would result in kernel looping, so promote them to reject routes.
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != &loopback_dev) {
			dev = &loopback_dev;
			idev = in6_dev_get(dev);
		}
		rt->u.dst.output = ip6_pkt_discard_out;
		rt->u.dst.input = ip6_pkt_discard;
		rt->u.dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		struct in6_addr *gw_addr;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly forbids using non-link-local
			   addresses as a nexthop address.
			   Otherwise, the router will not be able to send redirects.
			   That is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type&IPV6_ADDR_UNICAST))

			grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (dev != grt->rt6i_dev) {
				dst_release(&grt->u.dst);
			}

			dev = grt->rt6i_dev;
			idev = grt->rt6i_idev;
			in6_dev_hold(grt->rt6i_idev);

			if (!(grt->rt6i_flags&RTF_GATEWAY))
			dst_release(&grt->u.dst);
		}
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
	}

	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(rt->rt6i_nexthop)) {
			err = PTR_ERR(rt->rt6i_nexthop);
			rt->rt6i_nexthop = NULL;
		}
	}

	rt->rt6i_flags = cfg->fc_flags;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla->nla_type;

		if (type > RTAX_MAX) {
		}

		rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
	}

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
	if (!rt->u.dst.metrics[RTAX_MTU-1])
		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	rt->u.dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;
	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
	dst_free((struct dst_entry *) rt);
}
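
/*
 * For orientation (descriptive, not new behaviour): struct fib6_config is
 * the common argument for every route-manipulation entry point in this
 * file -- rtmsg_to_fib6_config() builds it from the SIOCADDRT/SIOCDELRT
 * ioctl, rtm_to_fib6_config() from an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message, and rt6_add_route_info()/rt6_add_dflt_router() fill one in by
 * hand before calling ip6_route_add() just as done above.
 */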

static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;

	if (rt == &ip6_null_entry)
		return -ENOENT;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	err = fib6_del(rt, info);
	dst_release(&rt->u.dst);

	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_del_rt(struct rt6_info *rt)
{
	return __ip6_del_rt(rt, NULL);
}

static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;

	table = fib6_get_table(cfg->fc_table);

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	for (rt = fn->leaf; rt; rt = rt->u.next) {
		if (cfg->fc_ifindex &&
		    (rt->rt6i_dev == NULL ||
		     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
			continue;
		if (cfg->fc_flags & RTF_GATEWAY &&
		    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
			continue;
		if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
			continue;
		dst_hold(&rt->u.dst);
		read_unlock_bh(&table->tb6_lock);

		return __ip6_del_rt(rt, &cfg->fc_nlinfo);
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}

struct ip6rd_flowi {
	struct flowi fl;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
					     struct flowi *fl, int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routers.
	 */
	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
	for (rt = fn->leaf; rt; rt = rt->u.next) {
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, the previous statement is not true. It could
		 * be a node which sees us as on-link (e.g. proxy ndisc).
		 * But then the router serving it might decide that we
		 * should know the truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl->oif != rt->rt6i_dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = &ip6_null_entry;
	BACKTRACK(&fl->fl6_src);

	dst_hold(&rt->u.dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
}

static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
					   struct in6_addr *src,
					   struct in6_addr *gateway,
					   struct net_device *dev)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl = {
		.fl = {
			.oif = dev->ifindex,
		},
		.gateway = *gateway,
	};

	if (rt6_need_strict(dest))
		flags |= RT6_LOOKUP_F_IFACE;

	return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
}

void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
		  struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct rt6_info *rt, *nrt = NULL;
	struct netevent_redirect netevent;

	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

	if (rt == &ip6_null_entry) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		goto out;
	}

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)));

	/*
	 * Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->u.dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == rt->u.dst.neighbour)
		goto out;

	nrt = ip6_rt_copy(rt);
	if (nrt == NULL)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
	nrt->rt6i_dst.plen = 128;
	nrt->u.dst.flags |= DST_HOST;

	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
	nrt->rt6i_nexthop = neigh_clone(neigh);
	/* Reset pmtu, it may be better */
	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->u.dst;
	netevent.new = &nrt->u.dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	if (rt->rt6i_flags&RTF_CACHE) {
		ip6_del_rt(rt);
		return;
	}

out:
	dst_release(&rt->u.dst);
	return;
}

/*
 *	Handle ICMP "packet too big" messages
 *	i.e. Path MTU discovery
 */

void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
			struct net_device *dev, u32 pmtu)
{
	struct rt6_info *rt, *nrt;
	int allfrag = 0;

	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
	if (rt == NULL)
		return;

	if (pmtu >= dst_mtu(&rt->u.dst))
		goto out;

	if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
		 * MTU (1280) and a fragment header should always be included
		 * after a node receiving Too Big message reporting PMTU is
		 * less than the IPv6 Minimum Link MTU.
		 */
		pmtu = IPV6_MIN_MTU;
		allfrag = 1;
	}

	/* New mtu received -> path was valid.
	   They are sent only in response to data packets,
	   so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->u.dst);

	/* Host route. If it is static, it would be better
	   not to override it but to add a new one, so that
	   when the cache entry expires the old pmtu
	   returns automatically.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
		if (allfrag)
			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
		goto out;
	}

	/* Network route.
	   Two cases are possible:
	   1. It is a connected route. Action: COW it.
	   2. It is a gatewayed route or a NONEXTHOP route. Action: clone it.
	 */
	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, daddr, saddr);
	else
		nrt = rt6_alloc_clone(rt, daddr);

	if (nrt) {
		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
		if (allfrag)
			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;

		/* According to RFC 1981, a PMTU increase should not be
		 * detected within 5 minutes; the recommended timer is 10
		 * minutes. Here the route expiration time is set to
		 * ip6_rt_mtu_expires, which is 10 minutes, so after that the
		 * decreased pmtu expires and a PMTU increase can be detected
		 * automatically.
		 */
		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;

		ip6_ins_rt(nrt);
	}
out:
	dst_release(&rt->u.dst);
}
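
/*
 * Summary of the PMTU handling above (descriptive only): a cached /128
 * route has its RTAX_MTU metric lowered in place and is given an
 * ip6_rt_mtu_expires (10 min) expiry, while a network route is first
 * copied -- COW for connected routes, clone for gatewayed/NONEXTHOP
 * ones -- and the reduced MTU is applied to the RTF_DYNAMIC copy; MTUs
 * below IPV6_MIN_MTU additionally set RTAX_FEATURE_ALLFRAG.
 */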

/*
 *	Misc support functions
 */

static struct rt6_info *ip6_rt_copy(struct rt6_info *ort)
{
	struct rt6_info *rt = ip6_dst_alloc();

	if (rt) {
		rt->u.dst.input = ort->u.dst.input;
		rt->u.dst.output = ort->u.dst.output;

		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
		rt->u.dst.dev = ort->u.dst.dev;
		if (rt->u.dst.dev)
			dev_hold(rt->u.dst.dev);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
		rt->u.dst.lastuse = jiffies;
		rt->rt6i_expires = 0;

		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
		rt->rt6i_table = ort->rt6i_table;
	}
	return rt;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(RT6_TABLE_INFO);
	if (table == NULL)
		return NULL;

	write_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->u.next) {
		if (rt->rt6i_dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		dst_hold(&rt->u.dst);
		break;
	}
out:
	write_unlock_bh(&table->tb6_lock);
	return rt;
}

static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_INFO,
		.fc_ifindex	= ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
	};

	ipv6_addr_copy(&cfg.fc_dst, prefix);
	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg);

	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
}
#endif

struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(RT6_TABLE_DFLT);
	if (table == NULL)
		return NULL;

	write_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
		if (dev == rt->rt6i_dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->u.dst);
	write_unlock_bh(&table->tb6_lock);
	return rt;
}

struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_DFLT,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
	};

	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

	ip6_route_add(&cfg);

	return rt6_get_dflt_router(gwaddr, dev);
}

void rt6_purge_dflt_routers(void)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(RT6_TABLE_DFLT);
	if (table == NULL)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
			dst_hold(&rt->u.dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}

static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
}

int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(&rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg);
			break;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_discard(struct sk_buff *skb)
{
	int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
	if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
		IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);

	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard_out(struct sk_buff *skb)
{
	skb->dev = skb->dst->dev;
	return ip6_pkt_discard(skb);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    int anycast)
{
	struct rt6_info *rt = ip6_dst_alloc();

	if (rt == NULL)
		return ERR_PTR(-ENOMEM);

	dev_hold(&loopback_dev);
	in6_dev_hold(idev);

	rt->u.dst.flags = DST_HOST;
	rt->u.dst.input = ip6_input;
	rt->u.dst.output = ip6_output;
	rt->rt6i_dev = &loopback_dev;
	rt->rt6i_idev = idev;
	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
	rt->u.dst.obsolete = -1;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;
	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
	if (rt->rt6i_nexthop == NULL) {
		dst_free((struct dst_entry *) rt);
		return ERR_PTR(-ENOMEM);
	}

	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
	rt->rt6i_dst.plen = 128;
	rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);

	atomic_set(&rt->u.dst.__refcnt, 1);

	return rt;
}

static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
	    rt != &ip6_null_entry) {
		RT6_TRACE("deleted by ifdown %p\n", rt);
		return -1;
	}
	return 0;
}

void rt6_ifdown(struct net_device *dev)
{
	fib6_clean_all(fib6_ifdown, 0, dev);
}

struct rt6_mtu_change_arg
{
	struct net_device *dev;
	unsigned mtu;
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6, pmtu discovery is not optional,
	   so the RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	 */
	idev = __in6_dev_get(arg->dev);
	if (idev == NULL)
		return 0;

	/* For an administrative MTU increase there is no way to discover an
	   IPv6 PMTU increase, so updating the PMTU on an increase is a MUST
	   here (i.e. jumbo frames); RFC 1981 does not cover administrative
	   MTU increases.

	   If the new MTU is less than the route PMTU, this new MTU will be
	   the lowest MTU in the path; update the route PMTU to reflect the
	   decrease. If the new MTU is greater than the route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU to
	   reflect the increase. In that case, if the other nodes' MTU is
	   also the lowest in the path, a TOO BIG message will be sent and
	   the PMTU lowered again.
	 */
	if (rt->rt6i_dev == arg->dev &&
	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
	    (dst_mtu(&rt->u.dst) > arg->mtu ||
	     (dst_mtu(&rt->u.dst) < arg->mtu &&
	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
	return 0;
}
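
/*
 * In other words (a restatement of the comment above, not new policy):
 * if eth0's MTU drops from 1500 to 1400, every route through eth0 with a
 * PMTU above 1400 is clamped down to 1400; if the MTU is raised back to
 * 1500, only routes whose PMTU still equals the old device MTU (i.e. the
 * device itself was the path bottleneck) are raised along with it.
 */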

void rt6_mtu_change(struct net_device *dev, unsigned mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(rt6_mtu_change_route, 0, &arg);
}

static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
	[RTA_GATEWAY]	= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]	= { .type = NLA_U32 },
	[RTA_IIF]	= { .type = NLA_U32 },
	[RTA_PRIORITY]	= { .type = NLA_U32 },
	[RTA_METRICS]	= { .type = NLA_NESTED },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	err = 0;
errout:
	return err;
}

int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_del(&cfg);
}

int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_add(&cfg);
}

static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	struct rta_cacheinfo ci;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif)
		NLA_PUT_U32(skb, RTA_IIF, iif);
	else if (dst) {
		struct in6_addr saddr_buf;
		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	if (rt->u.dst.neighbour)
		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	if (rt->rt6i_expires)
		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	ci.rta_error = rt->u.dst.error;
	NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);

	return nlmsg_end(skb, nlh);

nla_put_failure:
	return nlmsg_cancel(skb, nlh);
}

int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	int prefix;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
	} else
		prefix = 0;

	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
		     prefix, NLM_F_MULTI);
}

int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi fl;
	int err, iif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	memset(&fl, 0, sizeof(fl));

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		fl.oif = nla_get_u32(tb[RTA_OIF]);

	if (iif) {
		struct net_device *dev;
		dev = __dev_get_by_index(iif);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
	skb->dst = &rt->u.dst;

	err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
			    nlh->nlmsg_seq, 0, 0);

	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
errout:
	return err;
}

void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
	struct sk_buff *skb;
	u32 pid = 0, seq = 0;
	struct nlmsghdr *nlh = NULL;
	int payload = sizeof(struct rtmsg) + 256;
	int err;

	if (info) {
		pid = info->pid;
		nlh = info->nlh;
		if (nlh)
			seq = nlh->nlmsg_seq;
	}

	skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());

	err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);

	err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
	if (err < 0)
		rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
}

#ifdef CONFIG_PROC_FS

#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
	int i;

	if (arg->skip < arg->offset / RT6_INFO_LEN) {
		arg->skip++;
		return 0;
	}

	if (arg->len >= arg->length)
		return 0;

	for (i=0; i<16; i++) {
		sprintf(arg->buffer + arg->len, "%02x",
			rt->rt6i_dst.addr.s6_addr[i]);
		arg->len += 2;
	}
	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
			    rt->rt6i_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	for (i=0; i<16; i++) {
		sprintf(arg->buffer + arg->len, "%02x",
			rt->rt6i_src.addr.s6_addr[i]);
		arg->len += 2;
	}
	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
			    rt->rt6i_src.plen);
#else
	sprintf(arg->buffer + arg->len,
		"00000000000000000000000000000000 00 ");
	arg->len += 36;
#endif

	if (rt->rt6i_nexthop) {
		for (i=0; i<16; i++) {
			sprintf(arg->buffer + arg->len, "%02x",
				rt->rt6i_nexthop->primary_key[i]);
			arg->len += 2;
		}
	} else {
		sprintf(arg->buffer + arg->len,
			"00000000000000000000000000000000");
		arg->len += 32;
	}

	arg->len += sprintf(arg->buffer + arg->len,
			    " %08x %08x %08x %08x %8s\n",
			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
			    rt->u.dst.__use, rt->rt6i_flags,
			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
	return 0;
}

static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
{
	struct rt6_proc_arg arg = {
		.buffer = buffer,
		.length = length,
	};

	fib6_clean_all(rt6_info_route, 0, &arg);

	*start = buffer;
	if (offset)
		*start += offset % RT6_INFO_LEN;

	arg.len -= offset % RT6_INFO_LEN;

	if (arg.len > length)
		arg.len = length;
	if (arg.len < 0)
		arg.len = 0;

	return arg.len;
}

static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
		   rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
		   rt6_stats.fib_rt_cache,
		   atomic_read(&ip6_dst_ops.entries),
		   rt6_stats.fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt6_stats_seq_show, NULL);
}

static struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.llseek	 = seq_lseek,
	.release = single_release,
};

#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static int flush_delay;

int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
		return 0;
	} else
		return -EINVAL;
}

ctl_table ipv6_route_table[] = {
	{
		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
		.procname	=	"flush",
		.data		=	&flush_delay,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&ipv6_sysctl_rtcache_flush
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops.gc_thresh,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
		.procname	=	"max_size",
		.data		=	&ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
		.procname	=	"gc_min_interval",
		.data		=	&ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
		.procname	=	"gc_timeout",
		.data		=	&ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
		.procname	=	"gc_interval",
		.data		=	&ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
		.procname	=	"gc_elasticity",
		.data		=	&ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
		.procname	=	"mtu_expires",
		.data		=	&ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
		.procname	=	"min_adv_mss",
		.data		=	&ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_jiffies,
		.strategy	=	&sysctl_jiffies,
	},
	{
		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	=	"gc_min_interval_ms",
		.data		=	&ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.proc_handler	=	&proc_dointvec_ms_jiffies,
		.strategy	=	&sysctl_ms_jiffies,
	},
};

#endif	/* CONFIG_SYSCTL */
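
/*
 * Note: gc_min_interval and gc_min_interval_ms deliberately point at the
 * same variable (ip6_rt_gc_min_interval); the first is interpreted in
 * jiffies via proc_dointvec_jiffies, the second in milliseconds via
 * proc_dointvec_ms_jiffies.
 */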

void __init ip6_route_init(void)
{
	struct proc_dir_entry *p;

	ip6_dst_ops.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

#ifdef CONFIG_PROC_FS
	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
	if (p)
		p->owner = THIS_MODULE;

	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	fib6_rules_init();
#endif
}

void ip6_route_cleanup(void)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	fib6_rules_cleanup();
#endif
#ifdef CONFIG_PROC_FS
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif

	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}