/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  Otherwise, round-robin the list.
 *
 *	Fixed routing subtrees.
 */
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
60 #include <asm/uaccess.h>
63 #include <linux/sysctl.h>
/* Set RT6_DEBUG to 3 or higher to get tracing. */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
77 #define CLONE_OFFLINK_ROUTE 0
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
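/*
 * These defaults are runtime-tunable via the ipv6_route_table sysctl
 * entries defined at the bottom of this file (max_size, gc_min_interval,
 * gc_timeout, gc_interval, gc_elasticity, mtu_expires, min_adv_mss).
 */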
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void ip6_dst_destroy(struct dst_entry *);
91 static void ip6_dst_ifdown(struct dst_entry *,
92 struct net_device *dev, int how);
93 static int ip6_dst_gc(void);
95 static int ip6_pkt_discard(struct sk_buff *skb);
96 static int ip6_pkt_discard_out(struct sk_buff *skb);
97 static void ip6_link_failure(struct sk_buff *skb);
98 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102 struct in6_addr *gwaddr, int ifindex,
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105 struct in6_addr *gwaddr, int ifindex);
108 static struct dst_ops ip6_dst_ops = {
110 .protocol = __constant_htons(ETH_P_IPV6),
113 .check = ip6_dst_check,
114 .destroy = ip6_dst_destroy,
115 .ifdown = ip6_dst_ifdown,
116 .negative_advice = ip6_negative_advice,
117 .link_failure = ip6_link_failure,
118 .update_pmtu = ip6_rt_update_pmtu,
119 .entry_size = sizeof(struct rt6_info),
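/*
 * ip6_null_entry is a statically initialized "no route" destination:
 * its input and output handlers discard the packet and its dst error is
 * -ENETUNREACH.  Lookups return it instead of NULL when nothing matches.
 * The prohibit and blackhole entries below play the same role for
 * policy-routing rules when CONFIG_IPV6_MULTIPLE_TABLES is enabled.
 */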
122 struct rt6_info ip6_null_entry = {
125 .__refcnt = ATOMIC_INIT(1),
127 .dev = &loopback_dev,
129 .error = -ENETUNREACH,
130 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
131 .input = ip6_pkt_discard,
132 .output = ip6_pkt_discard_out,
134 .path = (struct dst_entry*)&ip6_null_entry,
137 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
138 .rt6i_metric = ~(u32) 0,
139 .rt6i_ref = ATOMIC_INIT(1),
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
144 struct rt6_info ip6_prohibit_entry = {
147 .__refcnt = ATOMIC_INIT(1),
149 .dev = &loopback_dev,
152 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
153 .input = ip6_pkt_discard,
154 .output = ip6_pkt_discard_out,
156 .path = (struct dst_entry*)&ip6_prohibit_entry,
159 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
160 .rt6i_metric = ~(u32) 0,
161 .rt6i_ref = ATOMIC_INIT(1),
164 struct rt6_info ip6_blk_hole_entry = {
167 .__refcnt = ATOMIC_INIT(1),
169 .dev = &loopback_dev,
172 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
173 .input = ip6_pkt_discard,
174 .output = ip6_pkt_discard_out,
176 .path = (struct dst_entry*)&ip6_blk_hole_entry,
179 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
180 .rt6i_metric = ~(u32) 0,
181 .rt6i_ref = ATOMIC_INIT(1),
186 /* allocate dst with ip6_dst_ops */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
192 static void ip6_dst_destroy(struct dst_entry *dst)
194 struct rt6_info *rt = (struct rt6_info *)dst;
195 struct inet6_dev *idev = rt->rt6i_idev;
198 rt->rt6i_idev = NULL;
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206 struct rt6_info *rt = (struct rt6_info *)dst;
207 struct inet6_dev *idev = rt->rt6i_idev;
209 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211 if (loopback_idev != NULL) {
212 rt->rt6i_idev = loopback_idev;
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 return (rt->rt6i_flags & RTF_EXPIRES &&
221 time_after(jiffies, rt->rt6i_expires));
224 static inline int rt6_need_strict(struct in6_addr *daddr)
226 return (ipv6_addr_type(daddr) &
227 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
* Route lookup.  The appropriate table->tb6_lock is assumed to be held by the caller.
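/*
 * rt6_device_match(): walk the routes sharing this fib6_node leaf and
 * prefer the one whose device matches the requested output interface;
 * a loopback route only qualifies if its inet6_dev is bound to that
 * interface.  When a strict (RT6_LOOKUP_F_IFACE) lookup finds nothing,
 * ip6_null_entry is returned.
 */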
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
238 struct rt6_info *local = NULL;
239 struct rt6_info *sprt;
242 for (sprt = rt; sprt; sprt = sprt->u.next) {
243 struct net_device *dev = sprt->rt6i_dev;
244 if (dev->ifindex == oif)
246 if (dev->flags & IFF_LOOPBACK) {
247 if (sprt->rt6i_idev == NULL ||
248 sprt->rt6i_idev->dev->ifindex != oif) {
251 if (local && (!oif ||
252 local->rt6i_idev->dev->ifindex == oif))
263 return &ip6_null_entry;
268 #ifdef CONFIG_IPV6_ROUTER_PREF
269 static void rt6_probe(struct rt6_info *rt)
271 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
273 * Okay, this does not seem to be appropriate
274 * for now, however, we need to check if it
275 * is really so; aka Router Reachability Probing.
277 * Router Reachability Probe MUST be rate-limited
278 * to no more than one per minute.
280 if (!neigh || (neigh->nud_state & NUD_VALID))
282 read_lock_bh(&neigh->lock);
283 if (!(neigh->nud_state & NUD_VALID) &&
284 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
285 struct in6_addr mcaddr;
286 struct in6_addr *target;
288 neigh->updated = jiffies;
289 read_unlock_bh(&neigh->lock);
291 target = (struct in6_addr *)&neigh->primary_key;
292 addrconf_addr_solict_mult(target, &mcaddr);
293 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
295 read_unlock_bh(&neigh->lock);
298 static inline void rt6_probe(struct rt6_info *rt)
305 * Default Router Selection (RFC 2461 6.3.6)
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
309 struct net_device *dev = rt->rt6i_dev;
310 if (!oif || dev->ifindex == oif)
312 if ((dev->flags & IFF_LOOPBACK) &&
313 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
static inline int rt6_check_neigh(struct rt6_info *rt)
320 struct neighbour *neigh = rt->rt6i_nexthop;
322 if (rt->rt6i_flags & RTF_NONEXTHOP ||
323 !(rt->rt6i_flags & RTF_GATEWAY))
326 read_lock_bh(&neigh->lock);
327 if (neigh->nud_state & NUD_VALID)
329 read_unlock_bh(&neigh->lock);
334 static int rt6_score_route(struct rt6_info *rt, int oif,
339 m = rt6_check_dev(rt, oif);
340 if (!m && (strict & RT6_LOOKUP_F_IFACE))
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 n = rt6_check_neigh(rt);
348 else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
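/*
 * rt6_select(): among the leaf routes that share the best metric, pick
 * the highest-scoring one.  rt6_score_route() rewards an interface match,
 * the RFC 4191 router-preference bits and a neighbour in a VALID state.
 * If RT6_LOOKUP_F_REACHABLE was requested and no candidate qualifies,
 * the list head is rotated so a different router is tried next time
 * (the round-robin behaviour described in the file header).
 */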
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
356 struct rt6_info *match = NULL, *last = NULL;
357 struct rt6_info *rt, *rt0 = *head;
361 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362 __FUNCTION__, head, head ? *head : NULL, oif);
364 for (rt = rt0, metric = rt0->rt6i_metric;
365 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
369 if (rt6_check_expired(rt))
374 m = rt6_score_route(rt, oif, strict);
388 (strict & RT6_LOOKUP_F_REACHABLE) &&
389 last && last != rt0) {
390 /* no entries matched; do round-robin */
391 static DEFINE_SPINLOCK(lock);
394 rt0->u.next = last->u.next;
399 RT6_TRACE("%s() => %p, score=%d\n",
400 __FUNCTION__, match, mpri);
402 return (match ? match : &ip6_null_entry);
405 #ifdef CONFIG_IPV6_ROUTE_INFO
406 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
407 struct in6_addr *gwaddr)
409 struct route_info *rinfo = (struct route_info *) opt;
410 struct in6_addr prefix_buf, *prefix;
415 if (len < sizeof(struct route_info)) {
419 /* Sanity check for prefix_len and length */
420 if (rinfo->length > 3) {
422 } else if (rinfo->prefix_len > 128) {
424 } else if (rinfo->prefix_len > 64) {
425 if (rinfo->length < 2) {
428 } else if (rinfo->prefix_len > 0) {
429 if (rinfo->length < 1) {
434 pref = rinfo->route_pref;
435 if (pref == ICMPV6_ROUTER_PREF_INVALID)
436 pref = ICMPV6_ROUTER_PREF_MEDIUM;
lifetime = ntohl(rinfo->lifetime);
439 if (lifetime == 0xffffffff) {
441 } else if (lifetime > 0x7fffffff/HZ) {
442 /* Avoid arithmetic overflow */
443 lifetime = 0x7fffffff/HZ - 1;
446 if (rinfo->length == 3)
447 prefix = (struct in6_addr *)rinfo->prefix;
449 /* this function is safe */
450 ipv6_addr_prefix(&prefix_buf,
451 (struct in6_addr *)rinfo->prefix,
453 prefix = &prefix_buf;
456 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
458 if (rt && !lifetime) {
464 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
467 rt->rt6i_flags = RTF_ROUTEINFO |
468 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
471 if (lifetime == 0xffffffff) {
472 rt->rt6i_flags &= ~RTF_EXPIRES;
474 rt->rt6i_expires = jiffies + HZ * lifetime;
475 rt->rt6i_flags |= RTF_EXPIRES;
477 dst_release(&rt->u.dst);
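/*
 * BACKTRACK(): shared by the lookup paths below.  If the current
 * fib6_node yielded only ip6_null_entry, walk back up towards the tree
 * root (descending into a source-routed subtree where one exists) until
 * a node carrying route information (RTN_RTINFO) is found, and restart
 * the search there.
 */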
483 #define BACKTRACK(saddr) \
485 if (rt == &ip6_null_entry) { \
486 struct fib6_node *pn; \
488 if (fn->fn_flags & RTN_TL_ROOT) \
491 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
495 if (fn->fn_flags & RTN_RTINFO) \
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502 struct flowi *fl, int flags)
504 struct fib6_node *fn;
507 read_lock_bh(&table->tb6_lock);
508 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
511 rt = rt6_device_match(rt, fl->oif, flags);
512 BACKTRACK(&fl->fl6_src);
514 dst_hold(&rt->u.dst);
515 read_unlock_bh(&table->tb6_lock);
517 rt->u.dst.lastuse = jiffies;
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
536 struct dst_entry *dst;
537 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
539 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
541 return (struct rt6_info *) dst;
/* ip6_ins_rt is called with a FREE table->tb6_lock.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed.  In any case, if the caller does not hold a
   reference, the route may be destroyed at any time.
554 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
557 struct fib6_table *table;
559 table = rt->rt6i_table;
560 write_lock_bh(&table->tb6_lock);
561 err = fib6_add(&table->tb6_root, rt, info);
562 write_unlock_bh(&table->tb6_lock);
567 int ip6_ins_rt(struct rt6_info *rt)
569 return __ip6_ins_rt(rt, NULL);
572 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
573 struct in6_addr *saddr)
581 rt = ip6_rt_copy(ort);
584 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
585 if (rt->rt6i_dst.plen != 128 &&
586 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
587 rt->rt6i_flags |= RTF_ANYCAST;
588 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
591 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
592 rt->rt6i_dst.plen = 128;
593 rt->rt6i_flags |= RTF_CACHE;
594 rt->u.dst.flags |= DST_HOST;
596 #ifdef CONFIG_IPV6_SUBTREES
597 if (rt->rt6i_src.plen && saddr) {
598 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
599 rt->rt6i_src.plen = 128;
603 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
610 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
612 struct rt6_info *rt = ip6_rt_copy(ort);
614 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
615 rt->rt6i_dst.plen = 128;
616 rt->rt6i_flags |= RTF_CACHE;
617 if (rt->rt6i_flags & RTF_REJECT)
618 rt->u.dst.error = ort->u.dst.error;
619 rt->u.dst.flags |= DST_HOST;
620 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
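/*
 * ip6_pol_route_input(): per-table lookup for locally received packets.
 * A matching route that is not already a cache entry is specialised into
 * a host cache route: rt6_alloc_cow() for connected routes (it binds a
 * neighbour entry for the destination itself), or rt6_alloc_clone() for
 * gatewayed/NONEXTHOP routes when CLONE_OFFLINK_ROUTE is enabled.  The
 * new entry is inserted with ip6_ins_rt(); if that races with another
 * insertion while tb6_lock is dropped, the lookup is retried.
 */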
625 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
626 struct flowi *fl, int flags)
628 struct fib6_node *fn;
629 struct rt6_info *rt, *nrt;
633 int reachable = RT6_LOOKUP_F_REACHABLE;
635 strict |= flags & RT6_LOOKUP_F_IFACE;
638 read_lock_bh(&table->tb6_lock);
641 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
644 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
645 BACKTRACK(&fl->fl6_src);
646 if (rt == &ip6_null_entry ||
647 rt->rt6i_flags & RTF_CACHE)
650 dst_hold(&rt->u.dst);
651 read_unlock_bh(&table->tb6_lock);
653 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
654 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
656 #if CLONE_OFFLINK_ROUTE
657 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
663 dst_release(&rt->u.dst);
664 rt = nrt ? : &ip6_null_entry;
666 dst_hold(&rt->u.dst);
668 err = ip6_ins_rt(nrt);
* Race condition!  In the gap, while table->tb6_lock was
* released, someone else could have inserted this route.  Relookup.
680 dst_release(&rt->u.dst);
688 dst_hold(&rt->u.dst);
689 read_unlock_bh(&table->tb6_lock);
691 rt->u.dst.lastuse = jiffies;
697 void ip6_route_input(struct sk_buff *skb)
699 struct ipv6hdr *iph = skb->nh.ipv6h;
701 .iif = skb->dev->ifindex,
706 #ifdef CONFIG_IPV6_ROUTE_FWMARK
707 .fwmark = skb->nfmark,
709 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
712 .proto = iph->nexthdr,
714 int flags = rt6_need_strict(&iph->daddr) ? RT6_LOOKUP_F_IFACE : 0;
716 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
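/*
 * ip6_pol_route_output() mirrors ip6_pol_route_input(), but matches on
 * the requested outgoing interface (fl->oif) rather than the incoming
 * one (fl->iif).
 */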
719 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
720 struct flowi *fl, int flags)
722 struct fib6_node *fn;
723 struct rt6_info *rt, *nrt;
727 int reachable = RT6_LOOKUP_F_REACHABLE;
729 strict |= flags & RT6_LOOKUP_F_IFACE;
732 read_lock_bh(&table->tb6_lock);
735 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
738 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
739 BACKTRACK(&fl->fl6_src);
740 if (rt == &ip6_null_entry ||
741 rt->rt6i_flags & RTF_CACHE)
744 dst_hold(&rt->u.dst);
745 read_unlock_bh(&table->tb6_lock);
747 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
748 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
750 #if CLONE_OFFLINK_ROUTE
751 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
757 dst_release(&rt->u.dst);
758 rt = nrt ? : &ip6_null_entry;
760 dst_hold(&rt->u.dst);
762 err = ip6_ins_rt(nrt);
* Race condition!  In the gap, while table->tb6_lock was
* released, someone else could have inserted this route.  Relookup.
774 dst_release(&rt->u.dst);
782 dst_hold(&rt->u.dst);
783 read_unlock_bh(&table->tb6_lock);
785 rt->u.dst.lastuse = jiffies;
790 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
794 if (rt6_need_strict(&fl->fl6_dst))
795 flags |= RT6_LOOKUP_F_IFACE;
797 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
802 * Destination cache support functions
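/*
 * ip6_dst_check(): a cached dst remains valid only while the serial
 * number of its fib6_node (fn_sernum) still matches the cookie the
 * caller stored; any tree modification bumps the sernum and forces a
 * fresh lookup.
 */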
805 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
809 rt = (struct rt6_info *) dst;
811 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
817 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
819 struct rt6_info *rt = (struct rt6_info *) dst;
822 if (rt->rt6i_flags & RTF_CACHE)
830 static void ip6_link_failure(struct sk_buff *skb)
834 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
836 rt = (struct rt6_info *) skb->dst;
838 if (rt->rt6i_flags&RTF_CACHE) {
839 dst_set_expires(&rt->u.dst, 0);
840 rt->rt6i_flags |= RTF_EXPIRES;
841 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
842 rt->rt6i_node->fn_sernum = -1;
846 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
848 struct rt6_info *rt6 = (struct rt6_info*)dst;
850 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
851 rt6->rt6i_flags |= RTF_MODIFIED;
852 if (mtu < IPV6_MIN_MTU) {
854 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
856 dst->metrics[RTAX_MTU-1] = mtu;
857 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
861 static int ipv6_get_mtu(struct net_device *dev);
863 static inline unsigned int ipv6_advmss(unsigned int mtu)
865 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
867 if (mtu < ip6_rt_min_advmss)
868 mtu = ip6_rt_min_advmss;
871 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
872 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
873 * IPV6_MAXPLEN is also valid and means: "any MSS,
874 * rely only on pmtu discovery"
876 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
881 static struct dst_entry *ndisc_dst_gc_list;
882 static DEFINE_SPINLOCK(ndisc_lock);
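/*
 * dsts allocated here for neighbour-discovery packets are never inserted
 * into the FIB; they are chained on ndisc_dst_gc_list under ndisc_lock
 * and reclaimed by ndisc_dst_gc() once their refcount drops to zero.
 */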
884 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
885 struct neighbour *neigh,
886 struct in6_addr *addr,
887 int (*output)(struct sk_buff *))
890 struct inet6_dev *idev = in6_dev_get(dev);
892 if (unlikely(idev == NULL))
895 rt = ip6_dst_alloc();
896 if (unlikely(rt == NULL)) {
905 neigh = ndisc_get_neigh(dev, addr);
908 rt->rt6i_idev = idev;
909 rt->rt6i_nexthop = neigh;
910 atomic_set(&rt->u.dst.__refcnt, 1);
911 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
912 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
913 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
914 rt->u.dst.output = output;
916 #if 0 /* there's no chance to use these for ndisc */
917 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
920 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
921 rt->rt6i_dst.plen = 128;
924 spin_lock_bh(&ndisc_lock);
925 rt->u.dst.next = ndisc_dst_gc_list;
926 ndisc_dst_gc_list = &rt->u.dst;
927 spin_unlock_bh(&ndisc_lock);
929 fib6_force_start_gc();
932 return (struct dst_entry *)rt;
935 int ndisc_dst_gc(int *more)
937 struct dst_entry *dst, *next, **pprev;
943 spin_lock_bh(&ndisc_lock);
944 pprev = &ndisc_dst_gc_list;
946 while ((dst = *pprev) != NULL) {
947 if (!atomic_read(&dst->__refcnt)) {
957 spin_unlock_bh(&ndisc_lock);
962 static int ip6_dst_gc(void)
964 static unsigned expire = 30*HZ;
965 static unsigned long last_gc;
966 unsigned long now = jiffies;
968 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
969 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
975 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
976 expire = ip6_rt_gc_timeout>>1;
979 expire -= expire>>ip6_rt_gc_elasticity;
980 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
/* Clean the host part of a prefix.  Not necessary for the radix tree,
   but it results in cleaner routing tables.

   Remove this only once everything else is known to work!
989 static int ipv6_get_mtu(struct net_device *dev)
991 int mtu = IPV6_MIN_MTU;
992 struct inet6_dev *idev;
994 idev = in6_dev_get(dev);
996 mtu = idev->cnf.mtu6;
1002 int ipv6_get_hoplimit(struct net_device *dev)
1004 int hoplimit = ipv6_devconf.hop_limit;
1005 struct inet6_dev *idev;
1007 idev = in6_dev_get(dev);
1009 hoplimit = idev->cnf.hop_limit;
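/*
 * ip6_route_add(): build an rt6_info from a fib6_config (produced by the
 * ioctl and rtnetlink front-ends further down in this file) and insert it
 * into the requested routing table.
 */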
1019 int ip6_route_add(struct fib6_config *cfg)
1022 struct rt6_info *rt = NULL;
1023 struct net_device *dev = NULL;
1024 struct inet6_dev *idev = NULL;
1025 struct fib6_table *table;
1028 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1030 #ifndef CONFIG_IPV6_SUBTREES
1031 if (cfg->fc_src_len)
1034 if (cfg->fc_ifindex) {
1036 dev = dev_get_by_index(cfg->fc_ifindex);
1039 idev = in6_dev_get(dev);
1044 if (cfg->fc_metric == 0)
1045 cfg->fc_metric = IP6_RT_PRIO_USER;
1047 table = fib6_new_table(cfg->fc_table);
1048 if (table == NULL) {
1053 rt = ip6_dst_alloc();
1060 rt->u.dst.obsolete = -1;
1061 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1063 if (cfg->fc_protocol == RTPROT_UNSPEC)
1064 cfg->fc_protocol = RTPROT_BOOT;
1065 rt->rt6i_protocol = cfg->fc_protocol;
1067 addr_type = ipv6_addr_type(&cfg->fc_dst);
1069 if (addr_type & IPV6_ADDR_MULTICAST)
1070 rt->u.dst.input = ip6_mc_input;
1072 rt->u.dst.input = ip6_forward;
1074 rt->u.dst.output = ip6_output;
1076 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1077 rt->rt6i_dst.plen = cfg->fc_dst_len;
1078 if (rt->rt6i_dst.plen == 128)
1079 rt->u.dst.flags = DST_HOST;
1081 #ifdef CONFIG_IPV6_SUBTREES
1082 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1083 rt->rt6i_src.plen = cfg->fc_src_len;
1086 rt->rt6i_metric = cfg->fc_metric;
/* We cannot add true routes via loopback here;
   they would result in kernel looping.  Promote them to reject routes instead.
1091 if ((cfg->fc_flags & RTF_REJECT) ||
1092 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1093 /* hold loopback dev/idev if we haven't done so. */
1094 if (dev != &loopback_dev) {
1099 dev = &loopback_dev;
1101 idev = in6_dev_get(dev);
1107 rt->u.dst.output = ip6_pkt_discard_out;
1108 rt->u.dst.input = ip6_pkt_discard;
1109 rt->u.dst.error = -ENETUNREACH;
1110 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1114 if (cfg->fc_flags & RTF_GATEWAY) {
1115 struct in6_addr *gw_addr;
1118 gw_addr = &cfg->fc_gateway;
1119 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1120 gwa_type = ipv6_addr_type(gw_addr);
1122 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1123 struct rt6_info *grt;
/* IPv6 strictly forbids using non-link-local
   addresses as nexthop addresses.
   Otherwise, the router would not be able to send redirects.
   That is a good rule, but in some (rare!) circumstances
   (SIT, PtP, NBMA NOARP links) it is handy to allow
   a few exceptions. --ANK
1133 if (!(gwa_type&IPV6_ADDR_UNICAST))
1136 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1138 err = -EHOSTUNREACH;
1142 if (dev != grt->rt6i_dev) {
1143 dst_release(&grt->u.dst);
1147 dev = grt->rt6i_dev;
1148 idev = grt->rt6i_idev;
1150 in6_dev_hold(grt->rt6i_idev);
1152 if (!(grt->rt6i_flags&RTF_GATEWAY))
1154 dst_release(&grt->u.dst);
1160 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1168 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1169 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1170 if (IS_ERR(rt->rt6i_nexthop)) {
1171 err = PTR_ERR(rt->rt6i_nexthop);
1172 rt->rt6i_nexthop = NULL;
1177 rt->rt6i_flags = cfg->fc_flags;
1184 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1185 int type = nla->nla_type;
1188 if (type > RTAX_MAX) {
1193 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1198 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1199 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1200 if (!rt->u.dst.metrics[RTAX_MTU-1])
1201 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1202 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1203 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1204 rt->u.dst.dev = dev;
1205 rt->rt6i_idev = idev;
1206 rt->rt6i_table = table;
1207 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1215 dst_free((struct dst_entry *) rt);
1219 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1222 struct fib6_table *table;
1224 if (rt == &ip6_null_entry)
1227 table = rt->rt6i_table;
1228 write_lock_bh(&table->tb6_lock);
1230 err = fib6_del(rt, info);
1231 dst_release(&rt->u.dst);
1233 write_unlock_bh(&table->tb6_lock);
1238 int ip6_del_rt(struct rt6_info *rt)
1240 return __ip6_del_rt(rt, NULL);
1243 static int ip6_route_del(struct fib6_config *cfg)
1245 struct fib6_table *table;
1246 struct fib6_node *fn;
1247 struct rt6_info *rt;
1250 table = fib6_get_table(cfg->fc_table);
1254 read_lock_bh(&table->tb6_lock);
1256 fn = fib6_locate(&table->tb6_root,
1257 &cfg->fc_dst, cfg->fc_dst_len,
1258 &cfg->fc_src, cfg->fc_src_len);
1261 for (rt = fn->leaf; rt; rt = rt->u.next) {
1262 if (cfg->fc_ifindex &&
1263 (rt->rt6i_dev == NULL ||
1264 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1266 if (cfg->fc_flags & RTF_GATEWAY &&
1267 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1269 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1271 dst_hold(&rt->u.dst);
1272 read_unlock_bh(&table->tb6_lock);
1274 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1277 read_unlock_bh(&table->tb6_lock);
1285 struct ip6rd_flowi {
1287 struct in6_addr gateway;
1290 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1294 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1295 struct rt6_info *rt;
1296 struct fib6_node *fn;
1299 * Get the "current" route for this destination and
* check if the redirect has come from an appropriate router.
1302 * RFC 2461 specifies that redirects should only be
1303 * accepted if they come from the nexthop to the target.
1304 * Due to the way the routes are chosen, this notion
1305 * is a bit fuzzy and one might need to check all possible
1309 read_lock_bh(&table->tb6_lock);
1310 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1312 for (rt = fn->leaf; rt; rt = rt->u.next) {
* Current route is on-link; redirect is always invalid.
*
* It seems the previous statement is not quite true: there could be
* a node which regards us as on-link (e.g. a proxy ndisc host),
* but the router serving it might then decide that we should
* learn the truth 8)8) --ANK (980726).
1321 if (rt6_check_expired(rt))
1323 if (!(rt->rt6i_flags & RTF_GATEWAY))
1325 if (fl->oif != rt->rt6i_dev->ifindex)
1327 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1333 rt = &ip6_null_entry;
1334 BACKTRACK(&fl->fl6_src);
1336 dst_hold(&rt->u.dst);
1338 read_unlock_bh(&table->tb6_lock);
1343 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1344 struct in6_addr *src,
1345 struct in6_addr *gateway,
1346 struct net_device *dev)
1348 struct ip6rd_flowi rdfl = {
1350 .oif = dev->ifindex,
1358 .gateway = *gateway,
1360 int flags = rt6_need_strict(dest) ? RT6_LOOKUP_F_IFACE : 0;
1362 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
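/*
 * rt6_redirect(): handle a validated ICMPv6 redirect.  The new next hop
 * is recorded by cloning the current route into an RTF_DYNAMIC|RTF_CACHE
 * host route pointing at the neighbour supplied by the redirect, and
 * interested subsystems are told via a NETEVENT_REDIRECT notification.
 */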
1365 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1366 struct in6_addr *saddr,
1367 struct neighbour *neigh, u8 *lladdr, int on_link)
1369 struct rt6_info *rt, *nrt = NULL;
1370 struct netevent_redirect netevent;
1372 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1374 if (rt == &ip6_null_entry) {
1375 if (net_ratelimit())
1376 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1377 "for redirect target\n");
1382 * We have finally decided to accept it.
1385 neigh_update(neigh, lladdr, NUD_STALE,
1386 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1387 NEIGH_UPDATE_F_OVERRIDE|
1388 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1389 NEIGH_UPDATE_F_ISROUTER))
* Redirect received -> path was valid.
* Redirects are sent only in response to data packets,
* so this nexthop is apparently reachable. --ANK
1397 dst_confirm(&rt->u.dst);
1399 /* Duplicate redirect: silently ignore. */
1400 if (neigh == rt->u.dst.neighbour)
1403 nrt = ip6_rt_copy(rt);
1407 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1409 nrt->rt6i_flags &= ~RTF_GATEWAY;
1411 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1412 nrt->rt6i_dst.plen = 128;
1413 nrt->u.dst.flags |= DST_HOST;
1415 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1416 nrt->rt6i_nexthop = neigh_clone(neigh);
1417 /* Reset pmtu, it may be better */
1418 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1419 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1421 if (ip6_ins_rt(nrt))
1424 netevent.old = &rt->u.dst;
1425 netevent.new = &nrt->u.dst;
1426 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1428 if (rt->rt6i_flags&RTF_CACHE) {
1434 dst_release(&rt->u.dst);
1439 * Handle ICMP "packet too big" messages
1440 * i.e. Path MTU discovery
1443 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1444 struct net_device *dev, u32 pmtu)
1446 struct rt6_info *rt, *nrt;
1449 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1453 if (pmtu >= dst_mtu(&rt->u.dst))
1456 if (pmtu < IPV6_MIN_MTU) {
1458 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1459 * MTU (1280) and a fragment header should always be included
1460 * after a node receiving Too Big message reporting PMTU is
1461 * less than the IPv6 Minimum Link MTU.
1463 pmtu = IPV6_MIN_MTU;
/* New MTU received -> path was valid.
   PMTU messages are sent only in response to data packets,
   so this nexthop is apparently reachable. --ANK
1471 dst_confirm(&rt->u.dst);
/* Host route.  If it is static, it would be better
   not to override it but to add a new one, so that
   the old PMTU comes back automatically once the
   cache entry expires.
1478 if (rt->rt6i_flags & RTF_CACHE) {
1479 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1481 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1482 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1483 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
Two cases are possible:
1. It is a connected route.  Action: COW it.
2. It is a gatewayed or NONEXTHOP route.  Action: clone it.
1492 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1493 nrt = rt6_alloc_cow(rt, daddr, saddr);
1495 nrt = rt6_alloc_clone(rt, daddr);
1498 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1500 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
/* According to RFC 1981, probing for a PMTU increase shouldn't
 * happen within 5 minutes; the recommended timer is 10 minutes.
 * Here the route expiration time is set to ip6_rt_mtu_expires,
 * which is 10 minutes.  After 10 minutes the decreased PMTU expires
 * and detection of a PMTU increase happens automatically.
1508 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1509 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1514 dst_release(&rt->u.dst);
1518 * Misc support functions
1521 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1523 struct rt6_info *rt = ip6_dst_alloc();
1526 rt->u.dst.input = ort->u.dst.input;
1527 rt->u.dst.output = ort->u.dst.output;
1529 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1530 rt->u.dst.dev = ort->u.dst.dev;
1532 dev_hold(rt->u.dst.dev);
1533 rt->rt6i_idev = ort->rt6i_idev;
1535 in6_dev_hold(rt->rt6i_idev);
1536 rt->u.dst.lastuse = jiffies;
1537 rt->rt6i_expires = 0;
1539 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1540 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1541 rt->rt6i_metric = 0;
1543 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1544 #ifdef CONFIG_IPV6_SUBTREES
1545 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1547 rt->rt6i_table = ort->rt6i_table;
1552 #ifdef CONFIG_IPV6_ROUTE_INFO
1553 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1554 struct in6_addr *gwaddr, int ifindex)
1556 struct fib6_node *fn;
1557 struct rt6_info *rt = NULL;
1558 struct fib6_table *table;
1560 table = fib6_get_table(RT6_TABLE_INFO);
1564 write_lock_bh(&table->tb6_lock);
fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
1569 for (rt = fn->leaf; rt; rt = rt->u.next) {
1570 if (rt->rt6i_dev->ifindex != ifindex)
1572 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1574 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1576 dst_hold(&rt->u.dst);
1580 write_unlock_bh(&table->tb6_lock);
1584 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1585 struct in6_addr *gwaddr, int ifindex,
1588 struct fib6_config cfg = {
1589 .fc_table = RT6_TABLE_INFO,
1591 .fc_ifindex = ifindex,
1592 .fc_dst_len = prefixlen,
1593 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1594 RTF_UP | RTF_PREF(pref),
1597 ipv6_addr_copy(&cfg.fc_dst, prefix);
1598 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1600 /* We should treat it as a default route if prefix length is 0. */
1602 cfg.fc_flags |= RTF_DEFAULT;
1604 ip6_route_add(&cfg);
1606 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1610 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1612 struct rt6_info *rt;
1613 struct fib6_table *table;
1615 table = fib6_get_table(RT6_TABLE_DFLT);
1619 write_lock_bh(&table->tb6_lock);
1620 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1621 if (dev == rt->rt6i_dev &&
1622 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1623 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1627 dst_hold(&rt->u.dst);
1628 write_unlock_bh(&table->tb6_lock);
1632 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1633 struct net_device *dev,
1636 struct fib6_config cfg = {
1637 .fc_table = RT6_TABLE_DFLT,
1639 .fc_ifindex = dev->ifindex,
1640 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1641 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1644 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1646 ip6_route_add(&cfg);
1648 return rt6_get_dflt_router(gwaddr, dev);
1651 void rt6_purge_dflt_routers(void)
1653 struct rt6_info *rt;
1654 struct fib6_table *table;
1656 /* NOTE: Keep consistent with rt6_get_dflt_router */
1657 table = fib6_get_table(RT6_TABLE_DFLT);
1662 read_lock_bh(&table->tb6_lock);
1663 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1664 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1665 dst_hold(&rt->u.dst);
1666 read_unlock_bh(&table->tb6_lock);
1671 read_unlock_bh(&table->tb6_lock);
1674 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1675 struct fib6_config *cfg)
1677 memset(cfg, 0, sizeof(*cfg));
1679 cfg->fc_table = RT6_TABLE_MAIN;
1680 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1681 cfg->fc_metric = rtmsg->rtmsg_metric;
1682 cfg->fc_expires = rtmsg->rtmsg_info;
1683 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1684 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1685 cfg->fc_flags = rtmsg->rtmsg_flags;
1687 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1688 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1689 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
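/*
 * ipv6_route_ioctl(): legacy SIOCADDRT/SIOCDELRT entry point.  It copies
 * an in6_rtmsg from user space, converts it to a fib6_config with
 * rtmsg_to_fib6_config() and calls ip6_route_add()/ip6_route_del().
 * CAP_NET_ADMIN is required.
 */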
1692 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1694 struct fib6_config cfg;
1695 struct in6_rtmsg rtmsg;
1699 case SIOCADDRT: /* Add a route */
1700 case SIOCDELRT: /* Delete a route */
1701 if (!capable(CAP_NET_ADMIN))
1703 err = copy_from_user(&rtmsg, arg,
1704 sizeof(struct in6_rtmsg));
1708 rtmsg_to_fib6_config(&rtmsg, &cfg);
1713 err = ip6_route_add(&cfg);
1716 err = ip6_route_del(&cfg);
1730 * Drop the packet on the floor
1733 static int ip6_pkt_discard(struct sk_buff *skb)
1735 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1736 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1737 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1739 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1740 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1745 static int ip6_pkt_discard_out(struct sk_buff *skb)
1747 skb->dev = skb->dst->dev;
1748 return ip6_pkt_discard(skb);
1752 * Allocate a dst for local (unicast / anycast) address.
1755 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1756 const struct in6_addr *addr,
1759 struct rt6_info *rt = ip6_dst_alloc();
1762 return ERR_PTR(-ENOMEM);
1764 dev_hold(&loopback_dev);
1767 rt->u.dst.flags = DST_HOST;
1768 rt->u.dst.input = ip6_input;
1769 rt->u.dst.output = ip6_output;
1770 rt->rt6i_dev = &loopback_dev;
1771 rt->rt6i_idev = idev;
1772 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1773 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1774 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1775 rt->u.dst.obsolete = -1;
1777 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1779 rt->rt6i_flags |= RTF_ANYCAST;
1781 rt->rt6i_flags |= RTF_LOCAL;
1782 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1783 if (rt->rt6i_nexthop == NULL) {
1784 dst_free((struct dst_entry *) rt);
1785 return ERR_PTR(-ENOMEM);
1788 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1789 rt->rt6i_dst.plen = 128;
1790 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1792 atomic_set(&rt->u.dst.__refcnt, 1);
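/*
 * fib6_ifdown() is the fib6_clean_all() callback used by rt6_ifdown():
 * every route bound to the dying device (or every route at all when the
 * argument is NULL) is removed, except the static ip6_null_entry.
 */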
1797 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1799 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1800 rt != &ip6_null_entry) {
1801 RT6_TRACE("deleted by ifdown %p\n", rt);
1807 void rt6_ifdown(struct net_device *dev)
1809 fib6_clean_all(fib6_ifdown, 0, dev);
1812 struct rt6_mtu_change_arg
1814 struct net_device *dev;
1818 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1820 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1821 struct inet6_dev *idev;
/* In IPv6, PMTU discovery is not optional,
   so an RTAX_MTU lock cannot disable it.
   We still use this lock to block changes
   caused by addrconf/ndisc.
1829 idev = __in6_dev_get(arg->dev);
/* After an administrative MTU increase there is no way to discover
   a corresponding IPv6 PMTU increase, so the PMTU has to be updated here.
   Since RFC 1981 doesn't cover administrative MTU increases,
   updating the PMTU on an increase is a MUST (e.g. for jumbo frames).

   If the new MTU is less than the route PMTU, the new MTU will be the
   lowest MTU in the path; update the route PMTU to reflect the PMTU
   decrease.  If the new MTU is greater than the route PMTU, and the
   old MTU was the lowest MTU in the path, update the route PMTU
   to reflect the increase.  In that case, if another node's MTU is now
   the lowest in the path, a Packet Too Big message will lead to
   PMTU discovery again.
1847 if (rt->rt6i_dev == arg->dev &&
1848 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1849 (dst_mtu(&rt->u.dst) > arg->mtu ||
1850 (dst_mtu(&rt->u.dst) < arg->mtu &&
1851 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1852 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1853 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1857 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1859 struct rt6_mtu_change_arg arg = {
1864 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
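/*
 * rtnetlink front-end: rtm_ipv6_policy lists the attributes accepted from
 * user space, rtm_to_fib6_config() turns an RTM_NEWROUTE/RTM_DELROUTE
 * message into a fib6_config, and rt6_fill_node() performs the reverse
 * translation when dumping routes or sending notifications.
 */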
1867 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1868 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
1869 [RTA_OIF] = { .type = NLA_U32 },
1870 [RTA_IIF] = { .type = NLA_U32 },
1871 [RTA_PRIORITY] = { .type = NLA_U32 },
1872 [RTA_METRICS] = { .type = NLA_NESTED },
1875 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1876 struct fib6_config *cfg)
1879 struct nlattr *tb[RTA_MAX+1];
1882 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1887 rtm = nlmsg_data(nlh);
1888 memset(cfg, 0, sizeof(*cfg));
1890 cfg->fc_table = rtm->rtm_table;
1891 cfg->fc_dst_len = rtm->rtm_dst_len;
1892 cfg->fc_src_len = rtm->rtm_src_len;
1893 cfg->fc_flags = RTF_UP;
1894 cfg->fc_protocol = rtm->rtm_protocol;
1896 if (rtm->rtm_type == RTN_UNREACHABLE)
1897 cfg->fc_flags |= RTF_REJECT;
1899 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1900 cfg->fc_nlinfo.nlh = nlh;
1902 if (tb[RTA_GATEWAY]) {
1903 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1904 cfg->fc_flags |= RTF_GATEWAY;
1908 int plen = (rtm->rtm_dst_len + 7) >> 3;
1910 if (nla_len(tb[RTA_DST]) < plen)
1913 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1917 int plen = (rtm->rtm_src_len + 7) >> 3;
1919 if (nla_len(tb[RTA_SRC]) < plen)
1922 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1926 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1928 if (tb[RTA_PRIORITY])
1929 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1931 if (tb[RTA_METRICS]) {
1932 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1933 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1937 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1944 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1946 struct fib6_config cfg;
1949 err = rtm_to_fib6_config(skb, nlh, &cfg);
1953 return ip6_route_del(&cfg);
1956 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1958 struct fib6_config cfg;
1961 err = rtm_to_fib6_config(skb, nlh, &cfg);
1965 return ip6_route_add(&cfg);
1968 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1969 struct in6_addr *dst, struct in6_addr *src,
1970 int iif, int type, u32 pid, u32 seq,
1971 int prefix, unsigned int flags)
1974 struct nlmsghdr *nlh;
1975 struct rta_cacheinfo ci;
1978 if (prefix) { /* user wants prefix routes only */
1979 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1980 /* success since this is not a prefix route */
1985 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1989 rtm = nlmsg_data(nlh);
1990 rtm->rtm_family = AF_INET6;
1991 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1992 rtm->rtm_src_len = rt->rt6i_src.plen;
1995 table = rt->rt6i_table->tb6_id;
1997 table = RT6_TABLE_UNSPEC;
1998 rtm->rtm_table = table;
1999 NLA_PUT_U32(skb, RTA_TABLE, table);
2000 if (rt->rt6i_flags&RTF_REJECT)
2001 rtm->rtm_type = RTN_UNREACHABLE;
2002 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2003 rtm->rtm_type = RTN_LOCAL;
2005 rtm->rtm_type = RTN_UNICAST;
2007 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2008 rtm->rtm_protocol = rt->rt6i_protocol;
2009 if (rt->rt6i_flags&RTF_DYNAMIC)
2010 rtm->rtm_protocol = RTPROT_REDIRECT;
2011 else if (rt->rt6i_flags & RTF_ADDRCONF)
2012 rtm->rtm_protocol = RTPROT_KERNEL;
2013 else if (rt->rt6i_flags&RTF_DEFAULT)
2014 rtm->rtm_protocol = RTPROT_RA;
2016 if (rt->rt6i_flags&RTF_CACHE)
2017 rtm->rtm_flags |= RTM_F_CLONED;
2020 NLA_PUT(skb, RTA_DST, 16, dst);
2021 rtm->rtm_dst_len = 128;
2022 } else if (rtm->rtm_dst_len)
2023 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2024 #ifdef CONFIG_IPV6_SUBTREES
2026 NLA_PUT(skb, RTA_SRC, 16, src);
2027 rtm->rtm_src_len = 128;
2028 } else if (rtm->rtm_src_len)
2029 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2032 NLA_PUT_U32(skb, RTA_IIF, iif);
2034 struct in6_addr saddr_buf;
2035 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2036 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2039 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2040 goto nla_put_failure;
2042 if (rt->u.dst.neighbour)
2043 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2046 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2048 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2049 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2050 if (rt->rt6i_expires)
2051 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2054 ci.rta_used = rt->u.dst.__use;
2055 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2056 ci.rta_error = rt->u.dst.error;
2060 NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2062 return nlmsg_end(skb, nlh);
2065 return nlmsg_cancel(skb, nlh);
2068 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2070 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2073 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2074 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2075 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2079 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2080 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2081 prefix, NLM_F_MULTI);
2084 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2086 struct nlattr *tb[RTA_MAX+1];
2087 struct rt6_info *rt;
2088 struct sk_buff *skb;
2093 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2098 memset(&fl, 0, sizeof(fl));
2101 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2104 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2108 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2111 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2115 iif = nla_get_u32(tb[RTA_IIF]);
2118 fl.oif = nla_get_u32(tb[RTA_OIF]);
2121 struct net_device *dev;
2122 dev = __dev_get_by_index(iif);
2129 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Reserve room for dummy headers; this skb can pass
   through a good chunk of the routing engine.
2138 skb->mac.raw = skb->data;
2139 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2141 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2142 skb->dst = &rt->u.dst;
2144 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2145 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2146 nlh->nlmsg_seq, 0, 0);
2152 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2157 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2159 struct sk_buff *skb;
2160 u32 pid = 0, seq = 0;
2161 struct nlmsghdr *nlh = NULL;
2162 int payload = sizeof(struct rtmsg) + 256;
2169 seq = nlh->nlmsg_seq;
2172 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2176 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2182 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2185 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2192 #ifdef CONFIG_PROC_FS
2194 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
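/*
 * rt6_info_route() emits one fixed-width line per route into
 * /proc/net/ipv6_route: destination and source prefixes (address plus
 * prefix length, in hex), the next hop, then metric, reference count,
 * use count, flags and the device name.  RT6_INFO_LEN is the length of
 * one such line.
 */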
2205 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2207 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2210 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2215 if (arg->len >= arg->length)
2218 for (i=0; i<16; i++) {
2219 sprintf(arg->buffer + arg->len, "%02x",
2220 rt->rt6i_dst.addr.s6_addr[i]);
2223 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2226 #ifdef CONFIG_IPV6_SUBTREES
2227 for (i=0; i<16; i++) {
2228 sprintf(arg->buffer + arg->len, "%02x",
2229 rt->rt6i_src.addr.s6_addr[i]);
2232 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2235 sprintf(arg->buffer + arg->len,
2236 "00000000000000000000000000000000 00 ");
2240 if (rt->rt6i_nexthop) {
2241 for (i=0; i<16; i++) {
2242 sprintf(arg->buffer + arg->len, "%02x",
2243 rt->rt6i_nexthop->primary_key[i]);
2247 sprintf(arg->buffer + arg->len,
2248 "00000000000000000000000000000000");
2251 arg->len += sprintf(arg->buffer + arg->len,
2252 " %08x %08x %08x %08x %8s\n",
2253 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2254 rt->u.dst.__use, rt->rt6i_flags,
2255 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2259 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2261 struct rt6_proc_arg arg = {
2267 fib6_clean_all(rt6_info_route, 0, &arg);
2271 *start += offset % RT6_INFO_LEN;
2273 arg.len -= offset % RT6_INFO_LEN;
2275 if (arg.len > length)
2283 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2285 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2286 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2287 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2288 rt6_stats.fib_rt_cache,
2289 atomic_read(&ip6_dst_ops.entries),
2290 rt6_stats.fib_discarded_routes);
2295 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2297 return single_open(file, rt6_stats_seq_show, NULL);
2300 static struct file_operations rt6_stats_seq_fops = {
2301 .owner = THIS_MODULE,
2302 .open = rt6_stats_seq_open,
2304 .llseek = seq_lseek,
2305 .release = single_release,
2307 #endif /* CONFIG_PROC_FS */
2309 #ifdef CONFIG_SYSCTL
2311 static int flush_delay;
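/*
 * Writing to the "flush" sysctl (net.ipv6.route.flush) triggers an
 * immediate fib6_run_gc() pass over the routing tree.
 */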
2314 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2315 void __user *buffer, size_t *lenp, loff_t *ppos)
2318 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2319 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2325 ctl_table ipv6_route_table[] = {
2327 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2328 .procname = "flush",
2329 .data = &flush_delay,
2330 .maxlen = sizeof(int),
2332 .proc_handler = &ipv6_sysctl_rtcache_flush
2335 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2336 .procname = "gc_thresh",
2337 .data = &ip6_dst_ops.gc_thresh,
2338 .maxlen = sizeof(int),
2340 .proc_handler = &proc_dointvec,
2343 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2344 .procname = "max_size",
2345 .data = &ip6_rt_max_size,
2346 .maxlen = sizeof(int),
2348 .proc_handler = &proc_dointvec,
2351 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2352 .procname = "gc_min_interval",
2353 .data = &ip6_rt_gc_min_interval,
2354 .maxlen = sizeof(int),
2356 .proc_handler = &proc_dointvec_jiffies,
2357 .strategy = &sysctl_jiffies,
2360 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2361 .procname = "gc_timeout",
2362 .data = &ip6_rt_gc_timeout,
2363 .maxlen = sizeof(int),
2365 .proc_handler = &proc_dointvec_jiffies,
2366 .strategy = &sysctl_jiffies,
2369 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2370 .procname = "gc_interval",
2371 .data = &ip6_rt_gc_interval,
2372 .maxlen = sizeof(int),
2374 .proc_handler = &proc_dointvec_jiffies,
2375 .strategy = &sysctl_jiffies,
2378 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2379 .procname = "gc_elasticity",
2380 .data = &ip6_rt_gc_elasticity,
2381 .maxlen = sizeof(int),
2383 .proc_handler = &proc_dointvec_jiffies,
2384 .strategy = &sysctl_jiffies,
2387 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2388 .procname = "mtu_expires",
2389 .data = &ip6_rt_mtu_expires,
2390 .maxlen = sizeof(int),
2392 .proc_handler = &proc_dointvec_jiffies,
2393 .strategy = &sysctl_jiffies,
2396 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2397 .procname = "min_adv_mss",
2398 .data = &ip6_rt_min_advmss,
2399 .maxlen = sizeof(int),
2401 .proc_handler = &proc_dointvec_jiffies,
2402 .strategy = &sysctl_jiffies,
2405 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2406 .procname = "gc_min_interval_ms",
2407 .data = &ip6_rt_gc_min_interval,
2408 .maxlen = sizeof(int),
2410 .proc_handler = &proc_dointvec_ms_jiffies,
2411 .strategy = &sysctl_ms_jiffies,
2418 void __init ip6_route_init(void)
2420 struct proc_dir_entry *p;
2422 ip6_dst_ops.kmem_cachep =
2423 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2424 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2426 #ifdef CONFIG_PROC_FS
2427 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2429 p->owner = THIS_MODULE;
2431 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2436 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2441 void ip6_route_cleanup(void)
2443 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2444 fib6_rules_cleanup();
2446 #ifdef CONFIG_PROC_FS
2447 proc_net_remove("ipv6_route");
2448 proc_net_remove("rt6_stats");
2455 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);