/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable; otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
60 #include <asm/uaccess.h>
63 #include <linux/sysctl.h>
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
77 #define CLONE_OFFLINK_ROUTE 0
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
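/*
 * These values are only boot-time defaults: each of them is exported
 * through the ipv6_route_table sysctl array at the bottom of this file
 * (e.g. /proc/sys/net/ipv6/route/gc_timeout), and ip6_dst_gc() below
 * consults the GC-related ones when deciding how aggressively to shrink
 * the dst cache.
 */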
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void ip6_dst_destroy(struct dst_entry *);
91 static void ip6_dst_ifdown(struct dst_entry *,
92 struct net_device *dev, int how);
93 static int ip6_dst_gc(void);
95 static int ip6_pkt_discard(struct sk_buff *skb);
96 static int ip6_pkt_discard_out(struct sk_buff *skb);
97 static void ip6_link_failure(struct sk_buff *skb);
98 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102 struct in6_addr *gwaddr, int ifindex,
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105 struct in6_addr *gwaddr, int ifindex);
108 static struct dst_ops ip6_dst_ops = {
110 .protocol = __constant_htons(ETH_P_IPV6),
113 .check = ip6_dst_check,
114 .destroy = ip6_dst_destroy,
115 .ifdown = ip6_dst_ifdown,
116 .negative_advice = ip6_negative_advice,
117 .link_failure = ip6_link_failure,
118 .update_pmtu = ip6_rt_update_pmtu,
119 .entry_size = sizeof(struct rt6_info),
122 struct rt6_info ip6_null_entry = {
125 .__refcnt = ATOMIC_INIT(1),
127 .dev = &loopback_dev,
129 .error = -ENETUNREACH,
130 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
131 .input = ip6_pkt_discard,
132 .output = ip6_pkt_discard_out,
134 .path = (struct dst_entry*)&ip6_null_entry,
137 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
138 .rt6i_metric = ~(u32) 0,
139 .rt6i_ref = ATOMIC_INIT(1),
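/*
 * ip6_null_entry is the sentinel leaf used when no route matches: its
 * dst.error is -ENETUNREACH and its input/output handlers simply drop
 * the packet.  The lookup helpers below return a pointer to it rather
 * than NULL, so callers compare against &ip6_null_entry.
 */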
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
144 static int ip6_pkt_prohibit(struct sk_buff *skb);
145 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
146 static int ip6_pkt_blk_hole(struct sk_buff *skb);
148 struct rt6_info ip6_prohibit_entry = {
151 .__refcnt = ATOMIC_INIT(1),
153 .dev = &loopback_dev,
156 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
157 .input = ip6_pkt_prohibit,
158 .output = ip6_pkt_prohibit_out,
160 .path = (struct dst_entry*)&ip6_prohibit_entry,
163 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
164 .rt6i_metric = ~(u32) 0,
165 .rt6i_ref = ATOMIC_INIT(1),
168 struct rt6_info ip6_blk_hole_entry = {
171 .__refcnt = ATOMIC_INIT(1),
173 .dev = &loopback_dev,
176 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
177 .input = ip6_pkt_blk_hole,
178 .output = ip6_pkt_blk_hole,
180 .path = (struct dst_entry*)&ip6_blk_hole_entry,
183 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
184 .rt6i_metric = ~(u32) 0,
185 .rt6i_ref = ATOMIC_INIT(1),
190 /* allocate dst with ip6_dst_ops */
191 static __inline__ struct rt6_info *ip6_dst_alloc(void)
193 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
196 static void ip6_dst_destroy(struct dst_entry *dst)
198 struct rt6_info *rt = (struct rt6_info *)dst;
199 struct inet6_dev *idev = rt->rt6i_idev;
202 rt->rt6i_idev = NULL;
207 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
210 struct rt6_info *rt = (struct rt6_info *)dst;
211 struct inet6_dev *idev = rt->rt6i_idev;
213 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
214 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
215 if (loopback_idev != NULL) {
216 rt->rt6i_idev = loopback_idev;
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
224 return (rt->rt6i_flags & RTF_EXPIRES &&
225 time_after(jiffies, rt->rt6i_expires));
228 static inline int rt6_need_strict(struct in6_addr *daddr)
230 return (ipv6_addr_type(daddr) &
231 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
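/*
 * Multicast and link-local destinations are only meaningful on a
 * specific link, so rt6_need_strict() makes the callers below add
 * RT6_LOOKUP_F_IFACE and match the interface strictly instead of
 * falling back to any device.
 */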
235 * Route lookup. Any table->tb6_lock is implied.
238 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
242 struct rt6_info *local = NULL;
243 struct rt6_info *sprt;
246 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
247 struct net_device *dev = sprt->rt6i_dev;
248 if (dev->ifindex == oif)
250 if (dev->flags & IFF_LOOPBACK) {
251 if (sprt->rt6i_idev == NULL ||
252 sprt->rt6i_idev->dev->ifindex != oif) {
255 if (local && (!oif ||
256 local->rt6i_idev->dev->ifindex == oif))
267 return &ip6_null_entry;
272 #ifdef CONFIG_IPV6_ROUTER_PREF
273 static void rt6_probe(struct rt6_info *rt)
275 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
277 * Okay, this does not seem to be appropriate
278 * for now, however, we need to check if it
279 * is really so; aka Router Reachability Probing.
281 * Router Reachability Probe MUST be rate-limited
282 * to no more than one per minute.
284 if (!neigh || (neigh->nud_state & NUD_VALID))
286 read_lock_bh(&neigh->lock);
287 if (!(neigh->nud_state & NUD_VALID) &&
288 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
289 struct in6_addr mcaddr;
290 struct in6_addr *target;
292 neigh->updated = jiffies;
293 read_unlock_bh(&neigh->lock);
295 target = (struct in6_addr *)&neigh->primary_key;
296 addrconf_addr_solict_mult(target, &mcaddr);
297 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
299 read_unlock_bh(&neigh->lock);
302 static inline void rt6_probe(struct rt6_info *rt)
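/*
 * rt6_probe() implements the router reachability probing mentioned
 * above (only when CONFIG_IPV6_ROUTER_PREF is enabled; otherwise the
 * inline stub is a no-op): when a candidate router's neighbour entry
 * is not in a NUD_VALID state, a single Neighbour Solicitation is sent
 * to its solicited-node multicast address, rate-limited by
 * rtr_probe_interval, so an unreachable router can be noticed and
 * skipped by the selection logic below.
 */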
309 * Default Router Selection (RFC 2461 6.3.6)
311 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
313 struct net_device *dev = rt->rt6i_dev;
318 if (dev->flags & IFF_LOOPBACK) {
319 if (!WARN_ON(rt->rt6i_idev == NULL) &&
320 rt->rt6i_idev->dev->ifindex == oif)
325 if (dev->ifindex == oif)
331 static int inline rt6_check_neigh(struct rt6_info *rt)
333 struct neighbour *neigh = rt->rt6i_nexthop;
335 if (rt->rt6i_flags & RTF_NONEXTHOP ||
336 !(rt->rt6i_flags & RTF_GATEWAY))
339 read_lock_bh(&neigh->lock);
340 if (neigh->nud_state & NUD_VALID)
342 else if (!(neigh->nud_state & NUD_FAILED))
344 read_unlock_bh(&neigh->lock);
349 static int rt6_score_route(struct rt6_info *rt, int oif,
354 m = rt6_check_dev(rt, oif);
355 if (!m && (strict & RT6_LOOKUP_F_IFACE))
357 #ifdef CONFIG_IPV6_ROUTER_PREF
358 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
360 n = rt6_check_neigh(rt);
361 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
366 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
369 struct rt6_info *match = NULL, *last = NULL;
370 struct rt6_info *rt, *rt0 = *head;
374 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
375 __FUNCTION__, head, head ? *head : NULL, oif);
377 for (rt = rt0, metric = rt0->rt6i_metric;
378 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
379 rt = rt->u.dst.rt6_next) {
382 if (rt6_check_expired(rt))
387 m = rt6_score_route(rt, oif, strict);
392 if (strict & RT6_LOOKUP_F_REACHABLE)
396 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
402 (strict & RT6_LOOKUP_F_REACHABLE) &&
403 last && last != rt0) {
404 /* no entries matched; do round-robin */
405 static DEFINE_SPINLOCK(lock);
407 *head = rt0->u.dst.rt6_next;
408 rt0->u.dst.rt6_next = last->u.dst.rt6_next;
409 last->u.dst.rt6_next = rt0;
413 RT6_TRACE("%s() => %p, score=%d\n",
414 __FUNCTION__, match, mpri);
416 return (match ? match : &ip6_null_entry);
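/*
 * How the pieces above combine for default router selection
 * (RFC 2461 6.3.6, as reworked by the USAGI change noted in the
 * header): every route of the best metric is scored by
 * rt6_score_route(), which combines the interface check, the RA
 * preference bits (with CONFIG_IPV6_ROUTER_PREF) and neighbour
 * reachability.  The highest scorer wins; if RT6_LOOKUP_F_REACHABLE
 * was requested and no entry matched, the list head is rotated
 * (round-robin) so a different router is tried next time, and
 * ip6_null_entry is returned only when nothing matched at all.
 */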
419 #ifdef CONFIG_IPV6_ROUTE_INFO
420 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
421 struct in6_addr *gwaddr)
423 struct route_info *rinfo = (struct route_info *) opt;
424 struct in6_addr prefix_buf, *prefix;
429 if (len < sizeof(struct route_info)) {
433 /* Sanity check for prefix_len and length */
434 if (rinfo->length > 3) {
436 } else if (rinfo->prefix_len > 128) {
438 } else if (rinfo->prefix_len > 64) {
439 if (rinfo->length < 2) {
442 } else if (rinfo->prefix_len > 0) {
443 if (rinfo->length < 1) {
448 pref = rinfo->route_pref;
449 if (pref == ICMPV6_ROUTER_PREF_INVALID)
450 pref = ICMPV6_ROUTER_PREF_MEDIUM;
452 lifetime = ntohl(rinfo->lifetime);
453 if (lifetime == 0xffffffff) {
455 } else if (lifetime > 0x7fffffff/HZ) {
456 /* Avoid arithmetic overflow */
457 lifetime = 0x7fffffff/HZ - 1;
460 if (rinfo->length == 3)
461 prefix = (struct in6_addr *)rinfo->prefix;
463 /* this function is safe */
464 ipv6_addr_prefix(&prefix_buf,
465 (struct in6_addr *)rinfo->prefix,
467 prefix = &prefix_buf;
470 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
472 if (rt && !lifetime) {
478 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
481 rt->rt6i_flags = RTF_ROUTEINFO |
482 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
485 if (lifetime == 0xffffffff) {
486 rt->rt6i_flags &= ~RTF_EXPIRES;
488 rt->rt6i_expires = jiffies + HZ * lifetime;
489 rt->rt6i_flags |= RTF_EXPIRES;
491 dst_release(&rt->u.dst);
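/*
 * rt6_route_rcv() is the receive side of the RFC 4191 Route Information
 * option: a sanity-checked prefix/length pair from a Router
 * Advertisement either refreshes or adds (rt6_add_route_info) the
 * corresponding RTF_ROUTEINFO route, removes it when the advertised
 * lifetime is zero, and makes it permanent (clearing RTF_EXPIRES) when
 * the lifetime is 0xffffffff.
 */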
497 #define BACKTRACK(saddr) \
499 if (rt == &ip6_null_entry) { \
500 struct fib6_node *pn; \
502 if (fn->fn_flags & RTN_TL_ROOT) \
505 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
506 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
509 if (fn->fn_flags & RTN_RTINFO) \
515 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
516 struct flowi *fl, int flags)
518 struct fib6_node *fn;
521 read_lock_bh(&table->tb6_lock);
522 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
525 rt = rt6_device_match(rt, fl->oif, flags);
526 BACKTRACK(&fl->fl6_src);
528 dst_hold(&rt->u.dst);
529 read_unlock_bh(&table->tb6_lock);
531 rt->u.dst.lastuse = jiffies;
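/*
 * ip6_pol_route_lookup() is the simplest of the lookup flavours: walk
 * the fib6 tree under tb6_lock, pick the first route whose device
 * matches via rt6_device_match(), and let the BACKTRACK() macro climb
 * back towards the tree root (and, with CONFIG_IPV6_SUBTREES, descend
 * into the per-source subtree) whenever a node only yielded
 * ip6_null_entry, until a node carrying RTN_RTINFO is found or the
 * root (RTN_TL_ROOT) is reached.
 */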
538 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
549 struct dst_entry *dst;
550 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
553 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
554 flags |= RT6_LOOKUP_F_HAS_SADDR;
557 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
559 return (struct rt6_info *) dst;
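/*
 * A minimal usage sketch of rt6_lookup() (illustrative only, not called
 * from here): a caller that wants the route towards a destination on a
 * known interface, without holding any table lock, would do roughly
 *
 *	struct rt6_info *rt = rt6_lookup(&daddr, NULL, dev->ifindex, 0);
 *	if (rt) {
 *		... use rt->rt6i_dev, rt->rt6i_gateway ...
 *		dst_release(&rt->u.dst);
 *	}
 *
 * i.e. the returned entry is reference-counted (dst_hold() was taken in
 * ip6_pol_route_lookup), may be NULL when the lookup fails, and must be
 * released with dst_release(), as rt6_pmtu_discovery() and
 * ip6_route_add() do below.
 */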
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed.  In any case, if the caller does not hold a reference,
   it may be destroyed.
 */
572 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
575 struct fib6_table *table;
577 table = rt->rt6i_table;
578 write_lock_bh(&table->tb6_lock);
579 err = fib6_add(&table->tb6_root, rt, info);
580 write_unlock_bh(&table->tb6_lock);
585 int ip6_ins_rt(struct rt6_info *rt)
587 return __ip6_ins_rt(rt, NULL);
590 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
591 struct in6_addr *saddr)
599 rt = ip6_rt_copy(ort);
602 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
603 if (rt->rt6i_dst.plen != 128 &&
604 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
605 rt->rt6i_flags |= RTF_ANYCAST;
606 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
609 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
610 rt->rt6i_dst.plen = 128;
611 rt->rt6i_flags |= RTF_CACHE;
612 rt->u.dst.flags |= DST_HOST;
614 #ifdef CONFIG_IPV6_SUBTREES
615 if (rt->rt6i_src.plen && saddr) {
616 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
617 rt->rt6i_src.plen = 128;
621 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
628 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
630 struct rt6_info *rt = ip6_rt_copy(ort);
632 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
633 rt->rt6i_dst.plen = 128;
634 rt->rt6i_flags |= RTF_CACHE;
635 rt->u.dst.flags |= DST_HOST;
636 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
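/*
 * The two helpers above create the per-destination RTF_CACHE clones
 * that the input/output lookup paths insert on demand: rt6_alloc_cow()
 * is used for routes that still need neighbour resolution (it narrows
 * the copy to a /128 host route and attaches a fresh ndisc neighbour),
 * while rt6_alloc_clone() merely duplicates an off-link route and
 * reuses its existing nexthop neighbour; the latter path is only
 * compiled in when CLONE_OFFLINK_ROUTE is enabled.
 */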
641 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
642 struct flowi *fl, int flags)
644 struct fib6_node *fn;
645 struct rt6_info *rt, *nrt;
649 int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
651 strict |= flags & RT6_LOOKUP_F_IFACE;
654 read_lock_bh(&table->tb6_lock);
657 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
660 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
661 BACKTRACK(&fl->fl6_src);
662 if (rt == &ip6_null_entry ||
663 rt->rt6i_flags & RTF_CACHE)
666 dst_hold(&rt->u.dst);
667 read_unlock_bh(&table->tb6_lock);
669 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
670 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
672 #if CLONE_OFFLINK_ROUTE
673 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
679 dst_release(&rt->u.dst);
680 rt = nrt ? : &ip6_null_entry;
682 dst_hold(&rt->u.dst);
684 err = ip6_ins_rt(nrt);
693 * Race condition! In the gap, when table->tb6_lock was
694 * released someone could insert this route. Relookup.
696 dst_release(&rt->u.dst);
704 dst_hold(&rt->u.dst);
705 read_unlock_bh(&table->tb6_lock);
707 rt->u.dst.lastuse = jiffies;
713 void ip6_route_input(struct sk_buff *skb)
715 struct ipv6hdr *iph = skb->nh.ipv6h;
716 int flags = RT6_LOOKUP_F_HAS_SADDR;
718 .iif = skb->dev->ifindex,
723 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
727 .proto = iph->nexthdr,
730 if (rt6_need_strict(&iph->daddr))
731 flags |= RT6_LOOKUP_F_IFACE;
733 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
736 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
737 struct flowi *fl, int flags)
739 struct fib6_node *fn;
740 struct rt6_info *rt, *nrt;
744 int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
746 strict |= flags & RT6_LOOKUP_F_IFACE;
749 read_lock_bh(&table->tb6_lock);
752 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
755 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
756 BACKTRACK(&fl->fl6_src);
757 if (rt == &ip6_null_entry ||
758 rt->rt6i_flags & RTF_CACHE)
761 dst_hold(&rt->u.dst);
762 read_unlock_bh(&table->tb6_lock);
764 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
765 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
767 #if CLONE_OFFLINK_ROUTE
768 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
774 dst_release(&rt->u.dst);
775 rt = nrt ? : &ip6_null_entry;
777 dst_hold(&rt->u.dst);
779 err = ip6_ins_rt(nrt);
788 * Race condition! In the gap, when table->tb6_lock was
789 * released someone could insert this route. Relookup.
791 dst_release(&rt->u.dst);
799 dst_hold(&rt->u.dst);
800 read_unlock_bh(&table->tb6_lock);
802 rt->u.dst.lastuse = jiffies;
807 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
811 if (rt6_need_strict(&fl->fl6_dst))
812 flags |= RT6_LOOKUP_F_IFACE;
814 if (!ipv6_addr_any(&fl->fl6_src))
815 flags |= RT6_LOOKUP_F_HAS_SADDR;
817 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
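/*
 * ip6_route_input() and ip6_route_output() differ mainly in which
 * interface index feeds the lookup (skb->dev for input, fl->oif for
 * output) and in where the result lands (skb->dst vs. the returned
 * dst_entry).  Both go through fib6_rule_lookup() so policy routing
 * rules can pick the table, and both request RT6_LOOKUP_F_REACHABLE
 * only on non-forwarding hosts, where sticking to a reachable default
 * router matters more than on a router.
 */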
822 * Destination cache support functions
825 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
829 rt = (struct rt6_info *) dst;
831 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
837 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
839 struct rt6_info *rt = (struct rt6_info *) dst;
842 if (rt->rt6i_flags & RTF_CACHE)
850 static void ip6_link_failure(struct sk_buff *skb)
854 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
856 rt = (struct rt6_info *) skb->dst;
858 if (rt->rt6i_flags&RTF_CACHE) {
859 dst_set_expires(&rt->u.dst, 0);
860 rt->rt6i_flags |= RTF_EXPIRES;
861 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
862 rt->rt6i_node->fn_sernum = -1;
866 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
868 struct rt6_info *rt6 = (struct rt6_info*)dst;
870 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
871 rt6->rt6i_flags |= RTF_MODIFIED;
872 if (mtu < IPV6_MIN_MTU) {
874 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
876 dst->metrics[RTAX_MTU-1] = mtu;
877 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
881 static int ipv6_get_mtu(struct net_device *dev);
883 static inline unsigned int ipv6_advmss(unsigned int mtu)
885 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
887 if (mtu < ip6_rt_min_advmss)
888 mtu = ip6_rt_min_advmss;
891 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
892 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
893 * IPV6_MAXPLEN is also valid and means: "any MSS,
894 * rely only on pmtu discovery"
896 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
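/*
 * Worked example (assuming the common case of an Ethernet-sized MTU):
 * for mtu = 1500 the advertised MSS becomes
 * 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr) = 1500 - 40 - 20
 * = 1440, clamped from below by ip6_rt_min_advmss
 * (IPV6_MIN_MTU - 20 - 40 = 1220) and from above by
 * IPV6_MAXPLEN - sizeof(struct tcphdr).
 */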
901 static struct dst_entry *ndisc_dst_gc_list;
902 static DEFINE_SPINLOCK(ndisc_lock);
904 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
905 struct neighbour *neigh,
906 struct in6_addr *addr,
907 int (*output)(struct sk_buff *))
910 struct inet6_dev *idev = in6_dev_get(dev);
912 if (unlikely(idev == NULL))
915 rt = ip6_dst_alloc();
916 if (unlikely(rt == NULL)) {
925 neigh = ndisc_get_neigh(dev, addr);
928 rt->rt6i_idev = idev;
929 rt->rt6i_nexthop = neigh;
930 atomic_set(&rt->u.dst.__refcnt, 1);
931 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
932 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
933 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
934 rt->u.dst.output = output;
936 #if 0 /* there's no chance to use these for ndisc */
937 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
940 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
941 rt->rt6i_dst.plen = 128;
944 spin_lock_bh(&ndisc_lock);
945 rt->u.dst.next = ndisc_dst_gc_list;
946 ndisc_dst_gc_list = &rt->u.dst;
947 spin_unlock_bh(&ndisc_lock);
949 fib6_force_start_gc();
955 int ndisc_dst_gc(int *more)
957 struct dst_entry *dst, *next, **pprev;
963 spin_lock_bh(&ndisc_lock);
964 pprev = &ndisc_dst_gc_list;
966 while ((dst = *pprev) != NULL) {
967 if (!atomic_read(&dst->__refcnt)) {
977 spin_unlock_bh(&ndisc_lock);
982 static int ip6_dst_gc(void)
984 static unsigned expire = 30*HZ;
985 static unsigned long last_gc;
986 unsigned long now = jiffies;
988 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
989 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
995 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
996 expire = ip6_rt_gc_timeout>>1;
999 expire -= expire>>ip6_rt_gc_elasticity;
1000 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
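/*
 * ip6_dst_gc() adapts its effective timeout: after a run that brings
 * the cache back under gc_thresh, expire is reset to half of
 * ip6_rt_gc_timeout, and on the way out of every call it is shrunk by
 * expire >> ip6_rt_gc_elasticity, so repeated GC under load expires
 * progressively younger entries.  The return value tells the dst core
 * whether the cache is still over ip6_rt_max_size.
 */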
1003 /* Clean host part of a prefix. Not necessary in radix tree,
1004 but results in cleaner routing tables.
1006 Remove it only when all the things will work!
1009 static int ipv6_get_mtu(struct net_device *dev)
1011 int mtu = IPV6_MIN_MTU;
1012 struct inet6_dev *idev;
1014 idev = in6_dev_get(dev);
1016 mtu = idev->cnf.mtu6;
1022 int ipv6_get_hoplimit(struct net_device *dev)
1024 int hoplimit = ipv6_devconf.hop_limit;
1025 struct inet6_dev *idev;
1027 idev = in6_dev_get(dev);
1029 hoplimit = idev->cnf.hop_limit;
1039 int ip6_route_add(struct fib6_config *cfg)
1042 struct rt6_info *rt = NULL;
1043 struct net_device *dev = NULL;
1044 struct inet6_dev *idev = NULL;
1045 struct fib6_table *table;
1048 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1050 #ifndef CONFIG_IPV6_SUBTREES
1051 if (cfg->fc_src_len)
1054 if (cfg->fc_ifindex) {
1056 dev = dev_get_by_index(cfg->fc_ifindex);
1059 idev = in6_dev_get(dev);
1064 if (cfg->fc_metric == 0)
1065 cfg->fc_metric = IP6_RT_PRIO_USER;
1067 table = fib6_new_table(cfg->fc_table);
1068 if (table == NULL) {
1073 rt = ip6_dst_alloc();
1080 rt->u.dst.obsolete = -1;
1081 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1083 if (cfg->fc_protocol == RTPROT_UNSPEC)
1084 cfg->fc_protocol = RTPROT_BOOT;
1085 rt->rt6i_protocol = cfg->fc_protocol;
1087 addr_type = ipv6_addr_type(&cfg->fc_dst);
1089 if (addr_type & IPV6_ADDR_MULTICAST)
1090 rt->u.dst.input = ip6_mc_input;
1092 rt->u.dst.input = ip6_forward;
1094 rt->u.dst.output = ip6_output;
1096 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1097 rt->rt6i_dst.plen = cfg->fc_dst_len;
1098 if (rt->rt6i_dst.plen == 128)
1099 rt->u.dst.flags = DST_HOST;
1101 #ifdef CONFIG_IPV6_SUBTREES
1102 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1103 rt->rt6i_src.plen = cfg->fc_src_len;
1106 rt->rt6i_metric = cfg->fc_metric;
1108 /* We cannot add true routes via loopback here,
1109 they would result in kernel looping; promote them to reject routes
1111 if ((cfg->fc_flags & RTF_REJECT) ||
1112 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1113 /* hold loopback dev/idev if we haven't done so. */
1114 if (dev != &loopback_dev) {
1119 dev = &loopback_dev;
1121 idev = in6_dev_get(dev);
1127 rt->u.dst.output = ip6_pkt_discard_out;
1128 rt->u.dst.input = ip6_pkt_discard;
1129 rt->u.dst.error = -ENETUNREACH;
1130 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1134 if (cfg->fc_flags & RTF_GATEWAY) {
1135 struct in6_addr *gw_addr;
1138 gw_addr = &cfg->fc_gateway;
1139 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1140 gwa_type = ipv6_addr_type(gw_addr);
1142 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1143 struct rt6_info *grt;
/* IPv6 strictly inhibits using non-link-local
   addresses as nexthop address.
   Otherwise, the router will not be able to send redirects.
   It is very good, but in some (rare!) circumstances
   (SIT, PtP, NBMA NOARP links) it is handy to allow
   some exceptions. --ANK
 */
1153 if (!(gwa_type&IPV6_ADDR_UNICAST))
1156 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1158 err = -EHOSTUNREACH;
1162 if (dev != grt->rt6i_dev) {
1163 dst_release(&grt->u.dst);
1167 dev = grt->rt6i_dev;
1168 idev = grt->rt6i_idev;
1170 in6_dev_hold(grt->rt6i_idev);
1172 if (!(grt->rt6i_flags&RTF_GATEWAY))
1174 dst_release(&grt->u.dst);
1180 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1188 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1189 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1190 if (IS_ERR(rt->rt6i_nexthop)) {
1191 err = PTR_ERR(rt->rt6i_nexthop);
1192 rt->rt6i_nexthop = NULL;
1197 rt->rt6i_flags = cfg->fc_flags;
1204 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1205 int type = nla->nla_type;
1208 if (type > RTAX_MAX) {
1213 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1218 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1219 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1220 if (!rt->u.dst.metrics[RTAX_MTU-1])
1221 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1222 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1223 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1224 rt->u.dst.dev = dev;
1225 rt->rt6i_idev = idev;
1226 rt->rt6i_table = table;
1227 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1235 dst_free(&rt->u.dst);
1239 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1242 struct fib6_table *table;
1244 if (rt == &ip6_null_entry)
1247 table = rt->rt6i_table;
1248 write_lock_bh(&table->tb6_lock);
1250 err = fib6_del(rt, info);
1251 dst_release(&rt->u.dst);
1253 write_unlock_bh(&table->tb6_lock);
1258 int ip6_del_rt(struct rt6_info *rt)
1260 return __ip6_del_rt(rt, NULL);
1263 static int ip6_route_del(struct fib6_config *cfg)
1265 struct fib6_table *table;
1266 struct fib6_node *fn;
1267 struct rt6_info *rt;
1270 table = fib6_get_table(cfg->fc_table);
1274 read_lock_bh(&table->tb6_lock);
1276 fn = fib6_locate(&table->tb6_root,
1277 &cfg->fc_dst, cfg->fc_dst_len,
1278 &cfg->fc_src, cfg->fc_src_len);
1281 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1282 if (cfg->fc_ifindex &&
1283 (rt->rt6i_dev == NULL ||
1284 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1286 if (cfg->fc_flags & RTF_GATEWAY &&
1287 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1289 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1291 dst_hold(&rt->u.dst);
1292 read_unlock_bh(&table->tb6_lock);
1294 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1297 read_unlock_bh(&table->tb6_lock);
1305 struct ip6rd_flowi {
1307 struct in6_addr gateway;
1310 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1314 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1315 struct rt6_info *rt;
1316 struct fib6_node *fn;
 * Get the "current" route for this destination and
 * check if the redirect has come from the appropriate router.
 *
 * RFC 2461 specifies that redirects should only be
 * accepted if they come from the nexthop to the target.
 * Due to the way the routes are chosen, this notion
 * is a bit fuzzy and one might need to check all possible
 * routes.
1329 read_lock_bh(&table->tb6_lock);
1330 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1332 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
 * Current route is on-link; redirect is always invalid.
 *
 * Seems, the previous statement is not true. It could
 * be a node which regards us as on-link (e.g. proxy ndisc).
 * But then the router serving it might decide that we should
 * know the truth 8)8) --ANK (980726).
1341 if (rt6_check_expired(rt))
1343 if (!(rt->rt6i_flags & RTF_GATEWAY))
1345 if (fl->oif != rt->rt6i_dev->ifindex)
1347 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1353 rt = &ip6_null_entry;
1354 BACKTRACK(&fl->fl6_src);
1356 dst_hold(&rt->u.dst);
1358 read_unlock_bh(&table->tb6_lock);
1363 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1364 struct in6_addr *src,
1365 struct in6_addr *gateway,
1366 struct net_device *dev)
1368 int flags = RT6_LOOKUP_F_HAS_SADDR;
1369 struct ip6rd_flowi rdfl = {
1371 .oif = dev->ifindex,
1379 .gateway = *gateway,
1382 if (rt6_need_strict(dest))
1383 flags |= RT6_LOOKUP_F_IFACE;
1385 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1388 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1389 struct in6_addr *saddr,
1390 struct neighbour *neigh, u8 *lladdr, int on_link)
1392 struct rt6_info *rt, *nrt = NULL;
1393 struct netevent_redirect netevent;
1395 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1397 if (rt == &ip6_null_entry) {
1398 if (net_ratelimit())
1399 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1400 "for redirect target\n");
1405 * We have finally decided to accept it.
1408 neigh_update(neigh, lladdr, NUD_STALE,
1409 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1410 NEIGH_UPDATE_F_OVERRIDE|
1411 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1412 NEIGH_UPDATE_F_ISROUTER))
1416 * Redirect received -> path was valid.
1417 * Look, redirects are sent only in response to data packets,
1418 * so that this nexthop apparently is reachable. --ANK
1420 dst_confirm(&rt->u.dst);
1422 /* Duplicate redirect: silently ignore. */
1423 if (neigh == rt->u.dst.neighbour)
1426 nrt = ip6_rt_copy(rt);
1430 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1432 nrt->rt6i_flags &= ~RTF_GATEWAY;
1434 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1435 nrt->rt6i_dst.plen = 128;
1436 nrt->u.dst.flags |= DST_HOST;
1438 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1439 nrt->rt6i_nexthop = neigh_clone(neigh);
1440 /* Reset pmtu, it may be better */
1441 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1442 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1444 if (ip6_ins_rt(nrt))
1447 netevent.old = &rt->u.dst;
1448 netevent.new = &nrt->u.dst;
1449 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1451 if (rt->rt6i_flags&RTF_CACHE) {
1457 dst_release(&rt->u.dst);
1462 * Handle ICMP "packet too big" messages
1463 * i.e. Path MTU discovery
1466 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1467 struct net_device *dev, u32 pmtu)
1469 struct rt6_info *rt, *nrt;
1472 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1476 if (pmtu >= dst_mtu(&rt->u.dst))
1479 if (pmtu < IPV6_MIN_MTU) {
 * According to RFC 2460, when a node receives a Packet Too Big
 * message reporting a PMTU less than the IPv6 Minimum Link MTU
 * (1280), it keeps the PMTU at that minimum, but a fragment header
 * should then always be included.
1486 pmtu = IPV6_MIN_MTU;
1490 /* New mtu received -> path was valid.
1491 They are sent only in response to data packets,
1492 so that this nexthop apparently is reachable. --ANK
1494 dst_confirm(&rt->u.dst);
/* Host route. If it is static, it would be better
   not to override it but to add a new one, so that
   when the cache entry expires the old pmtu
   would be restored automatically.
 */
1501 if (rt->rt6i_flags & RTF_CACHE) {
1502 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1504 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1505 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1506 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1511 Two cases are possible:
1512 1. It is connected route. Action: COW
1513 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1515 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1516 nrt = rt6_alloc_cow(rt, daddr, saddr);
1518 nrt = rt6_alloc_clone(rt, daddr);
1521 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1523 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
/* According to RFC 1981, detection of a PMTU increase shouldn't
 * happen within 5 minutes; the recommended timer is 10 minutes.
 * Here the route expiration time is set to ip6_rt_mtu_expires,
 * which is 10 minutes. After 10 minutes the decreased pmtu expires
 * and detection of a PMTU increase happens automatically.
1531 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1532 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1537 dst_release(&rt->u.dst);
1541 * Misc support functions
1544 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1546 struct rt6_info *rt = ip6_dst_alloc();
1549 rt->u.dst.input = ort->u.dst.input;
1550 rt->u.dst.output = ort->u.dst.output;
1552 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1553 rt->u.dst.error = ort->u.dst.error;
1554 rt->u.dst.dev = ort->u.dst.dev;
1556 dev_hold(rt->u.dst.dev);
1557 rt->rt6i_idev = ort->rt6i_idev;
1559 in6_dev_hold(rt->rt6i_idev);
1560 rt->u.dst.lastuse = jiffies;
1561 rt->rt6i_expires = 0;
1563 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1564 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1565 rt->rt6i_metric = 0;
1567 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1568 #ifdef CONFIG_IPV6_SUBTREES
1569 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1571 rt->rt6i_table = ort->rt6i_table;
1576 #ifdef CONFIG_IPV6_ROUTE_INFO
1577 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1578 struct in6_addr *gwaddr, int ifindex)
1580 struct fib6_node *fn;
1581 struct rt6_info *rt = NULL;
1582 struct fib6_table *table;
1584 table = fib6_get_table(RT6_TABLE_INFO);
1588 write_lock_bh(&table->tb6_lock);
1589 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1593 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1594 if (rt->rt6i_dev->ifindex != ifindex)
1596 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1598 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1600 dst_hold(&rt->u.dst);
1604 write_unlock_bh(&table->tb6_lock);
1608 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1609 struct in6_addr *gwaddr, int ifindex,
1612 struct fib6_config cfg = {
1613 .fc_table = RT6_TABLE_INFO,
1615 .fc_ifindex = ifindex,
1616 .fc_dst_len = prefixlen,
1617 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1618 RTF_UP | RTF_PREF(pref),
1621 ipv6_addr_copy(&cfg.fc_dst, prefix);
1622 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1624 /* We should treat it as a default route if prefix length is 0. */
1626 cfg.fc_flags |= RTF_DEFAULT;
1628 ip6_route_add(&cfg);
1630 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1634 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1636 struct rt6_info *rt;
1637 struct fib6_table *table;
1639 table = fib6_get_table(RT6_TABLE_DFLT);
1643 write_lock_bh(&table->tb6_lock);
1644 for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1645 if (dev == rt->rt6i_dev &&
1646 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1647 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1651 dst_hold(&rt->u.dst);
1652 write_unlock_bh(&table->tb6_lock);
1656 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1657 struct net_device *dev,
1660 struct fib6_config cfg = {
1661 .fc_table = RT6_TABLE_DFLT,
1663 .fc_ifindex = dev->ifindex,
1664 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1665 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1668 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1670 ip6_route_add(&cfg);
1672 return rt6_get_dflt_router(gwaddr, dev);
1675 void rt6_purge_dflt_routers(void)
1677 struct rt6_info *rt;
1678 struct fib6_table *table;
1680 /* NOTE: Keep consistent with rt6_get_dflt_router */
1681 table = fib6_get_table(RT6_TABLE_DFLT);
1686 read_lock_bh(&table->tb6_lock);
1687 for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1688 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1689 dst_hold(&rt->u.dst);
1690 read_unlock_bh(&table->tb6_lock);
1695 read_unlock_bh(&table->tb6_lock);
1698 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1699 struct fib6_config *cfg)
1701 memset(cfg, 0, sizeof(*cfg));
1703 cfg->fc_table = RT6_TABLE_MAIN;
1704 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1705 cfg->fc_metric = rtmsg->rtmsg_metric;
1706 cfg->fc_expires = rtmsg->rtmsg_info;
1707 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1708 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1709 cfg->fc_flags = rtmsg->rtmsg_flags;
1711 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1712 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1713 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1716 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1718 struct fib6_config cfg;
1719 struct in6_rtmsg rtmsg;
1723 case SIOCADDRT: /* Add a route */
1724 case SIOCDELRT: /* Delete a route */
1725 if (!capable(CAP_NET_ADMIN))
1727 err = copy_from_user(&rtmsg, arg,
1728 sizeof(struct in6_rtmsg));
1732 rtmsg_to_fib6_config(&rtmsg, &cfg);
1737 err = ip6_route_add(&cfg);
1740 err = ip6_route_del(&cfg);
1754 * Drop the packet on the floor
1757 static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
1759 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1760 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1761 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1763 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTNOROUTES);
1764 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1769 static int ip6_pkt_discard(struct sk_buff *skb)
1771 return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
1774 static int ip6_pkt_discard_out(struct sk_buff *skb)
1776 skb->dev = skb->dst->dev;
1777 return ip6_pkt_discard(skb);
1780 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1782 static int ip6_pkt_prohibit(struct sk_buff *skb)
1784 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
1787 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1789 skb->dev = skb->dst->dev;
1790 return ip6_pkt_prohibit(skb);
1793 static int ip6_pkt_blk_hole(struct sk_buff *skb)
1802 * Allocate a dst for local (unicast / anycast) address.
1805 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1806 const struct in6_addr *addr,
1809 struct rt6_info *rt = ip6_dst_alloc();
1812 return ERR_PTR(-ENOMEM);
1814 dev_hold(&loopback_dev);
1817 rt->u.dst.flags = DST_HOST;
1818 rt->u.dst.input = ip6_input;
1819 rt->u.dst.output = ip6_output;
1820 rt->rt6i_dev = &loopback_dev;
1821 rt->rt6i_idev = idev;
1822 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1823 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1824 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1825 rt->u.dst.obsolete = -1;
1827 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1829 rt->rt6i_flags |= RTF_ANYCAST;
1831 rt->rt6i_flags |= RTF_LOCAL;
1832 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1833 if (rt->rt6i_nexthop == NULL) {
1834 dst_free(&rt->u.dst);
1835 return ERR_PTR(-ENOMEM);
1838 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1839 rt->rt6i_dst.plen = 128;
1840 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1842 atomic_set(&rt->u.dst.__refcnt, 1);
1847 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1849 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1850 rt != &ip6_null_entry) {
1851 RT6_TRACE("deleted by ifdown %p\n", rt);
1857 void rt6_ifdown(struct net_device *dev)
1859 fib6_clean_all(fib6_ifdown, 0, dev);
1862 struct rt6_mtu_change_arg
1864 struct net_device *dev;
1868 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1870 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1871 struct inet6_dev *idev;
/* In IPv6, pmtu discovery is not optional,
   so the RTAX_MTU lock cannot disable it.
   We still use this lock to block changes
   caused by addrconf/ndisc.
 */
1879 idev = __in6_dev_get(arg->dev);
/* For an administrative MTU increase, there is no way to discover
   an IPv6 PMTU increase, so the PMTU should be updated here.
   Since RFC 1981 doesn't cover administrative MTU increases,
   updating the PMTU on an increase is a MUST (i.e. jumbo frames).

   If the new MTU is less than the route PMTU, this new MTU will be the
   lowest MTU in the path; update the route PMTU to reflect the PMTU
   decrease.  If the new MTU is greater than the route PMTU, and the
   old MTU was the lowest MTU in the path, update the route PMTU
   to reflect the increase.  In this case, if the other nodes' MTU
   is also the lowest MTU in the path, a Packet Too Big message will
   trigger PMTU discovery.
 */
1897 if (rt->rt6i_dev == arg->dev &&
1898 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1899 (dst_mtu(&rt->u.dst) > arg->mtu ||
1900 (dst_mtu(&rt->u.dst) < arg->mtu &&
1901 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1902 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1903 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1907 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1909 struct rt6_mtu_change_arg arg = {
1914 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1917 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1918 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
1919 [RTA_OIF] = { .type = NLA_U32 },
1920 [RTA_IIF] = { .type = NLA_U32 },
1921 [RTA_PRIORITY] = { .type = NLA_U32 },
1922 [RTA_METRICS] = { .type = NLA_NESTED },
1925 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1926 struct fib6_config *cfg)
1929 struct nlattr *tb[RTA_MAX+1];
1932 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1937 rtm = nlmsg_data(nlh);
1938 memset(cfg, 0, sizeof(*cfg));
1940 cfg->fc_table = rtm->rtm_table;
1941 cfg->fc_dst_len = rtm->rtm_dst_len;
1942 cfg->fc_src_len = rtm->rtm_src_len;
1943 cfg->fc_flags = RTF_UP;
1944 cfg->fc_protocol = rtm->rtm_protocol;
1946 if (rtm->rtm_type == RTN_UNREACHABLE)
1947 cfg->fc_flags |= RTF_REJECT;
1949 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1950 cfg->fc_nlinfo.nlh = nlh;
1952 if (tb[RTA_GATEWAY]) {
1953 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1954 cfg->fc_flags |= RTF_GATEWAY;
1958 int plen = (rtm->rtm_dst_len + 7) >> 3;
1960 if (nla_len(tb[RTA_DST]) < plen)
1963 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1967 int plen = (rtm->rtm_src_len + 7) >> 3;
1969 if (nla_len(tb[RTA_SRC]) < plen)
1972 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1976 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1978 if (tb[RTA_PRIORITY])
1979 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1981 if (tb[RTA_METRICS]) {
1982 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1983 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1987 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1994 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1996 struct fib6_config cfg;
1999 err = rtm_to_fib6_config(skb, nlh, &cfg);
2003 return ip6_route_del(&cfg);
2006 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2008 struct fib6_config cfg;
2011 err = rtm_to_fib6_config(skb, nlh, &cfg);
2015 return ip6_route_add(&cfg);
2018 static inline size_t rt6_nlmsg_size(void)
2020 return NLMSG_ALIGN(sizeof(struct rtmsg))
2021 + nla_total_size(16) /* RTA_SRC */
2022 + nla_total_size(16) /* RTA_DST */
2023 + nla_total_size(16) /* RTA_GATEWAY */
2024 + nla_total_size(16) /* RTA_PREFSRC */
2025 + nla_total_size(4) /* RTA_TABLE */
2026 + nla_total_size(4) /* RTA_IIF */
2027 + nla_total_size(4) /* RTA_OIF */
2028 + nla_total_size(4) /* RTA_PRIORITY */
2029 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2030 + nla_total_size(sizeof(struct rta_cacheinfo));
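/*
 * rt6_nlmsg_size() is only an upper bound used to size the skb in
 * inet6_rt_notify(): every optional attribute is counted even though
 * rt6_fill_node() may omit some of them, which is why a shortfall
 * there is treated as a bug (-EMSGSIZE triggers the WARN_ON below).
 */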
2033 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2034 struct in6_addr *dst, struct in6_addr *src,
2035 int iif, int type, u32 pid, u32 seq,
2036 int prefix, unsigned int flags)
2039 struct nlmsghdr *nlh;
2043 if (prefix) { /* user wants prefix routes only */
2044 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2045 /* success since this is not a prefix route */
2050 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2054 rtm = nlmsg_data(nlh);
2055 rtm->rtm_family = AF_INET6;
2056 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2057 rtm->rtm_src_len = rt->rt6i_src.plen;
2060 table = rt->rt6i_table->tb6_id;
2062 table = RT6_TABLE_UNSPEC;
2063 rtm->rtm_table = table;
2064 NLA_PUT_U32(skb, RTA_TABLE, table);
2065 if (rt->rt6i_flags&RTF_REJECT)
2066 rtm->rtm_type = RTN_UNREACHABLE;
2067 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2068 rtm->rtm_type = RTN_LOCAL;
2070 rtm->rtm_type = RTN_UNICAST;
2072 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2073 rtm->rtm_protocol = rt->rt6i_protocol;
2074 if (rt->rt6i_flags&RTF_DYNAMIC)
2075 rtm->rtm_protocol = RTPROT_REDIRECT;
2076 else if (rt->rt6i_flags & RTF_ADDRCONF)
2077 rtm->rtm_protocol = RTPROT_KERNEL;
2078 else if (rt->rt6i_flags&RTF_DEFAULT)
2079 rtm->rtm_protocol = RTPROT_RA;
2081 if (rt->rt6i_flags&RTF_CACHE)
2082 rtm->rtm_flags |= RTM_F_CLONED;
2085 NLA_PUT(skb, RTA_DST, 16, dst);
2086 rtm->rtm_dst_len = 128;
2087 } else if (rtm->rtm_dst_len)
2088 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2089 #ifdef CONFIG_IPV6_SUBTREES
2091 NLA_PUT(skb, RTA_SRC, 16, src);
2092 rtm->rtm_src_len = 128;
2093 } else if (rtm->rtm_src_len)
2094 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2097 NLA_PUT_U32(skb, RTA_IIF, iif);
2099 struct in6_addr saddr_buf;
2100 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2101 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2104 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2105 goto nla_put_failure;
2107 if (rt->u.dst.neighbour)
2108 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2111 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2113 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2115 expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2116 if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2117 expires, rt->u.dst.error) < 0)
2118 goto nla_put_failure;
2120 return nlmsg_end(skb, nlh);
2123 nlmsg_cancel(skb, nlh);
2127 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2129 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2132 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2133 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2134 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2138 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2139 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2140 prefix, NLM_F_MULTI);
2143 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2145 struct nlattr *tb[RTA_MAX+1];
2146 struct rt6_info *rt;
2147 struct sk_buff *skb;
2152 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2157 memset(&fl, 0, sizeof(fl));
2160 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2163 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2167 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2170 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2174 iif = nla_get_u32(tb[RTA_IIF]);
2177 fl.oif = nla_get_u32(tb[RTA_OIF]);
2180 struct net_device *dev;
2181 dev = __dev_get_by_index(iif);
2188 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Reserve room for dummy headers; this skb can pass
   through a good chunk of the routing engine.
 */
2197 skb->mac.raw = skb->data;
2198 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2200 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2201 skb->dst = &rt->u.dst;
2203 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2204 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2205 nlh->nlmsg_seq, 0, 0);
2211 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2216 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2218 struct sk_buff *skb;
2219 u32 pid = 0, seq = 0;
2220 struct nlmsghdr *nlh = NULL;
2227 seq = nlh->nlmsg_seq;
2230 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2234 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2236 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2237 WARN_ON(err == -EMSGSIZE);
2241 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2244 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2251 #ifdef CONFIG_PROC_FS
2253 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2264 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2266 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2268 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2273 if (arg->len >= arg->length)
2276 arg->len += sprintf(arg->buffer + arg->len,
2277 NIP6_SEQFMT " %02x ",
2278 NIP6(rt->rt6i_dst.addr),
2281 #ifdef CONFIG_IPV6_SUBTREES
2282 arg->len += sprintf(arg->buffer + arg->len,
2283 NIP6_SEQFMT " %02x ",
2284 NIP6(rt->rt6i_src.addr),
2287 arg->len += sprintf(arg->buffer + arg->len,
2288 "00000000000000000000000000000000 00 ");
2291 if (rt->rt6i_nexthop) {
2292 arg->len += sprintf(arg->buffer + arg->len,
2294 NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2296 arg->len += sprintf(arg->buffer + arg->len,
2297 "00000000000000000000000000000000");
2299 arg->len += sprintf(arg->buffer + arg->len,
2300 " %08x %08x %08x %08x %8s\n",
2301 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2302 rt->u.dst.__use, rt->rt6i_flags,
2303 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2307 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2309 struct rt6_proc_arg arg = {
2315 fib6_clean_all(rt6_info_route, 0, &arg);
2319 *start += offset % RT6_INFO_LEN;
2321 arg.len -= offset % RT6_INFO_LEN;
2323 if (arg.len > length)
2331 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2333 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2334 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2335 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2336 rt6_stats.fib_rt_cache,
2337 atomic_read(&ip6_dst_ops.entries),
2338 rt6_stats.fib_discarded_routes);
2343 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2345 return single_open(file, rt6_stats_seq_show, NULL);
2348 static const struct file_operations rt6_stats_seq_fops = {
2349 .owner = THIS_MODULE,
2350 .open = rt6_stats_seq_open,
2352 .llseek = seq_lseek,
2353 .release = single_release,
2355 #endif /* CONFIG_PROC_FS */
2357 #ifdef CONFIG_SYSCTL
2359 static int flush_delay;
2362 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2363 void __user *buffer, size_t *lenp, loff_t *ppos)
2366 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2367 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
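/*
 * The "flush" sysctl entry below is a trigger rather than a tunable:
 * writing to /proc/sys/net/ipv6/route/flush stores the value in
 * flush_delay and immediately runs fib6_run_gc(), with any non-positive
 * value forcing an unconditional (~0UL) sweep of the routing cache.
 */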
2373 ctl_table ipv6_route_table[] = {
2375 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2376 .procname = "flush",
2377 .data = &flush_delay,
2378 .maxlen = sizeof(int),
2380 .proc_handler = &ipv6_sysctl_rtcache_flush
2383 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2384 .procname = "gc_thresh",
2385 .data = &ip6_dst_ops.gc_thresh,
2386 .maxlen = sizeof(int),
2388 .proc_handler = &proc_dointvec,
2391 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2392 .procname = "max_size",
2393 .data = &ip6_rt_max_size,
2394 .maxlen = sizeof(int),
2396 .proc_handler = &proc_dointvec,
2399 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2400 .procname = "gc_min_interval",
2401 .data = &ip6_rt_gc_min_interval,
2402 .maxlen = sizeof(int),
2404 .proc_handler = &proc_dointvec_jiffies,
2405 .strategy = &sysctl_jiffies,
2408 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2409 .procname = "gc_timeout",
2410 .data = &ip6_rt_gc_timeout,
2411 .maxlen = sizeof(int),
2413 .proc_handler = &proc_dointvec_jiffies,
2414 .strategy = &sysctl_jiffies,
2417 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2418 .procname = "gc_interval",
2419 .data = &ip6_rt_gc_interval,
2420 .maxlen = sizeof(int),
2422 .proc_handler = &proc_dointvec_jiffies,
2423 .strategy = &sysctl_jiffies,
2426 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2427 .procname = "gc_elasticity",
2428 .data = &ip6_rt_gc_elasticity,
2429 .maxlen = sizeof(int),
2431 .proc_handler = &proc_dointvec_jiffies,
2432 .strategy = &sysctl_jiffies,
2435 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2436 .procname = "mtu_expires",
2437 .data = &ip6_rt_mtu_expires,
2438 .maxlen = sizeof(int),
2440 .proc_handler = &proc_dointvec_jiffies,
2441 .strategy = &sysctl_jiffies,
2444 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2445 .procname = "min_adv_mss",
2446 .data = &ip6_rt_min_advmss,
2447 .maxlen = sizeof(int),
2449 .proc_handler = &proc_dointvec_jiffies,
2450 .strategy = &sysctl_jiffies,
2453 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2454 .procname = "gc_min_interval_ms",
2455 .data = &ip6_rt_gc_min_interval,
2456 .maxlen = sizeof(int),
2458 .proc_handler = &proc_dointvec_ms_jiffies,
2459 .strategy = &sysctl_ms_jiffies,
2466 void __init ip6_route_init(void)
2468 struct proc_dir_entry *p;
2470 ip6_dst_ops.kmem_cachep =
2471 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2472 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2474 #ifdef CONFIG_PROC_FS
2475 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2477 p->owner = THIS_MODULE;
2479 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2484 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2489 void ip6_route_cleanup(void)
2491 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2492 fib6_rules_cleanup();
2494 #ifdef CONFIG_PROC_FS
2495 proc_net_remove("ipv6_route");
2496 proc_net_remove("rt6_stats");
2503 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);