2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
/* Debug tracing helper: one variant prints via printk, the other compiles
 * to a no-op.  NOTE(review): the #if/#else lines selecting between them are
 * elided in this view. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
/* When 0, off-link (gatewayed/NONEXTHOP) routes are not cloned into the
 * routing cache on the input/output slow paths (see ip6_route_input). */
75 #define CLONE_OFFLINK_ROUTE 0
/* strict-mode flags for rt6_select()/rt6_score_route():
 * F_IFACE     - candidate must match the requested outgoing interface
 * F_REACHABLE - candidate's next hop must look (probably) reachable */
77 #define RT6_SELECT_F_IFACE 0x1
78 #define RT6_SELECT_F_REACHABLE 0x2
/* Routing-cache garbage-collection tunables (presumably exported via
 * sysctl elsewhere in the file -- the sysctl table is not visible here). */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
/* Cached-PMTU lifetime: 10 minutes, per RFC 1981 recommendation
 * (see rt6_pmtu_discovery below). */
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Minimum advertised MSS: minimum MTU minus TCP and IPv6 header sizes. */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
/* Forward declarations for the dst_ops callbacks and helpers below. */
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* RFC 4191-style Route Information option support. */
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
/* dst_ops vtable for IPv6 routing cache entries; wires the generic
 * destination-cache core to the IPv6-specific callbacks above. */
109 static struct dst_ops ip6_dst_ops = {
111 .protocol = __constant_htons(ETH_P_IPV6),
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
/* The "null route": a permanent REJECT entry returned when no route
 * matches.  Any packet hitting it is discarded with -ENETUNREACH. */
123 struct rt6_info ip6_null_entry = {
126 .__refcnt = ATOMIC_INIT(1),
128 .dev = &loopback_dev,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
135 .path = (struct dst_entry*)&ip6_null_entry,
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Worst possible metric so real routes always win. */
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
/* Root of the single (pre-multi-table) IPv6 FIB; its leaf defaults to the
 * null entry above. */
143 struct fib6_node ip6_routing_table = {
144 .leaf = &ip6_null_entry,
145 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
148 /* Protects all the ip6 fib */
150 DEFINE_RWLOCK(rt6_lock);
153 /* allocate dst with ip6_dst_ops */
/* Thin wrapper: allocate a dst_entry sized for rt6_info using the IPv6
 * dst_ops table and cast it to rt6_info. */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
156 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops.destroy callback: drop the route's reference on its inet6_dev.
 * NOTE(review): the in6_dev_put() call is in lines elided from this view. */
159 static void ip6_dst_destroy(struct dst_entry *dst)
161 struct rt6_info *rt = (struct rt6_info *)dst;
162 struct inet6_dev *idev = rt->rt6i_idev;
165 rt->rt6i_idev = NULL;
/* dst_ops.ifdown callback: when the route's device goes away, repoint the
 * entry's idev at the loopback device so the dst stays usable until freed. */
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
173 struct rt6_info *rt = (struct rt6_info *)dst;
174 struct inet6_dev *idev = rt->rt6i_idev;
176 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 if (loopback_idev != NULL) {
/* Swap in loopback's idev; the old idev's release is elided here. */
179 rt->rt6i_idev = loopback_idev;
/* True if the route carries RTF_EXPIRES and its expiry time has passed. */
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
187 return (rt->rt6i_flags & RTF_EXPIRES &&
188 time_after(jiffies, rt->rt6i_expires));
192 * Route lookup. Any rt6_lock is implied.
/* Walk the leaf chain and pick the entry matching the requested output
 * interface (oif); loopback routes may match via their idev's ifindex.
 * Falls back to the null entry when strict matching finds nothing.
 * NOTE(review): several branches of this loop are elided in this view. */
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
199 struct rt6_info *local = NULL;
200 struct rt6_info *sprt;
203 for (sprt = rt; sprt; sprt = sprt->u.next) {
204 struct net_device *dev = sprt->rt6i_dev;
205 if (dev->ifindex == oif)
207 if (dev->flags & IFF_LOOPBACK) {
208 if (sprt->rt6i_idev == NULL ||
209 sprt->rt6i_idev->dev->ifindex != oif) {
212 if (local && (!oif ||
213 local->rt6i_idev->dev->ifindex == oif))
224 return &ip6_null_entry;
229 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (RFC 4191): if the next-hop neighbour is not
 * in a VALID NUD state and hasn't been probed within rtr_probe_interval,
 * send a unicast-solicit NS to re-verify it.  Rate-limited via
 * neigh->updated.  No-op stub when ROUTER_PREF is not configured. */
230 static void rt6_probe(struct rt6_info *rt)
232 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
234 * Okay, this does not seem to be appropriate
235 * for now, however, we need to check if it
236 * is really so; aka Router Reachability Probing.
238 * Router Reachability Probe MUST be rate-limited
239 * to no more than one per minute.
241 if (!neigh || (neigh->nud_state & NUD_VALID))
243 read_lock_bh(&neigh->lock);
244 if (!(neigh->nud_state & NUD_VALID) &&
245 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
246 struct in6_addr mcaddr;
247 struct in6_addr *target;
/* Stamp before dropping the lock so concurrent probers back off. */
249 neigh->updated = jiffies;
250 read_unlock_bh(&neigh->lock);
252 target = (struct in6_addr *)&neigh->primary_key;
253 addrconf_addr_solict_mult(target, &mcaddr);
254 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
256 read_unlock_bh(&neigh->lock);
259 static inline void rt6_probe(struct rt6_info *rt)
266 * Default Router Selection (RFC 2461 6.3.6)
/* Interface-match component of route scoring: non-zero when the route's
 * device matches oif (or no oif was requested); loopback routes match
 * through their idev's ifindex.  Return values are elided in this view. */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
270 struct net_device *dev = rt->rt6i_dev;
271 if (!oif || dev->ifindex == oif)
273 if ((dev->flags & IFF_LOOPBACK) &&
274 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Reachability component of route scoring: gateway routes score by whether
 * their next-hop neighbour is in a VALID NUD state; non-gateway and
 * NONEXTHOP routes are handled by the first branch. */
279 static int inline rt6_check_neigh(struct rt6_info *rt)
281 struct neighbour *neigh = rt->rt6i_nexthop;
283 if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 !(rt->rt6i_flags & RTF_GATEWAY))
287 read_lock_bh(&neigh->lock);
288 if (neigh->nud_state & NUD_VALID)
290 read_unlock_bh(&neigh->lock);
/* Combine device match, router preference (when configured) and neighbour
 * reachability into a single comparable score; strict-mode flags veto
 * candidates that fail the iface or reachability requirement. */
295 static int rt6_score_route(struct rt6_info *rt, int oif,
300 m = rt6_check_dev(rt, oif);
301 if (!m && (strict & RT6_SELECT_F_IFACE))
303 #ifdef CONFIG_IPV6_ROUTER_PREF
/* RFC 4191 router-preference bits, weighted above the device match. */
304 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
306 n = rt6_check_neigh(rt);
309 else if (!n && strict & RT6_SELECT_F_REACHABLE)
/* Default router selection (RFC 2461 6.3.6): scan the equal-metric run at
 * the head of the leaf chain, score each candidate, and return the best.
 * When nothing reachable matched under F_REACHABLE, round-robin the list
 * so unreachable routers are tried in turn.  Falls back to the null entry. */
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
317 struct rt6_info *match = NULL, *last = NULL;
318 struct rt6_info *rt, *rt0 = *head;
322 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323 __FUNCTION__, head, head ? *head : NULL, oif);
/* Only routes sharing rt0's metric compete; stop when the metric changes
 * or we wrap back to the start. */
325 for (rt = rt0, metric = rt0->rt6i_metric;
326 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
330 if (rt6_check_expired(rt))
335 m = rt6_score_route(rt, oif, strict);
349 (strict & RT6_SELECT_F_REACHABLE) &&
350 last && last != rt0) {
351 /* no entries matched; do round-robin */
352 static spinlock_t lock = SPIN_LOCK_UNLOCKED;
355 rt0->u.next = last->u.next;
360 RT6_TRACE("%s() => %p, score=%d\n",
361 __FUNCTION__, match, mpri);
363 return (match ? match : &ip6_null_entry);
366 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) from a Router
 * Advertisement: validate option length against prefix_len, normalize the
 * preference, then add/update/delete the corresponding RTF_ROUTEINFO route
 * for gwaddr on this device.  lifetime 0 deletes; 0xffffffff means
 * infinite (no RTF_EXPIRES). */
367 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
368 struct in6_addr *gwaddr)
370 struct route_info *rinfo = (struct route_info *) opt;
371 struct in6_addr prefix_buf, *prefix;
376 if (len < sizeof(struct route_info)) {
380 /* Sanity check for prefix_len and length */
/* Option length is in units of 8 octets: 1..3 are valid, and must be
 * large enough to actually carry prefix_len bits of prefix. */
381 if (rinfo->length > 3) {
383 } else if (rinfo->prefix_len > 128) {
385 } else if (rinfo->prefix_len > 64) {
386 if (rinfo->length < 2) {
389 } else if (rinfo->prefix_len > 0) {
390 if (rinfo->length < 1) {
395 pref = rinfo->route_pref;
396 if (pref == ICMPV6_ROUTER_PREF_INVALID)
397 pref = ICMPV6_ROUTER_PREF_MEDIUM;
399 lifetime = htonl(rinfo->lifetime);
400 if (lifetime == 0xffffffff) {
402 } else if (lifetime > 0x7fffffff/HZ) {
403 /* Avoid arithmetic overflow */
404 lifetime = 0x7fffffff/HZ - 1;
407 if (rinfo->length == 3)
408 prefix = (struct in6_addr *)rinfo->prefix;
410 /* this function is safe */
/* Shorter options carry a truncated prefix; expand into a local buffer. */
411 ipv6_addr_prefix(&prefix_buf,
412 (struct in6_addr *)rinfo->prefix,
414 prefix = &prefix_buf;
417 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
419 if (rt && !lifetime) {
420 ip6_del_rt(rt, NULL, NULL, NULL);
425 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
/* Refresh the preference bits on an existing entry. */
428 rt->rt6i_flags = RTF_ROUTEINFO |
429 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
432 if (lifetime == 0xffffffff) {
433 rt->rt6i_flags &= ~RTF_EXPIRES;
435 rt->rt6i_expires = jiffies + HZ * lifetime;
436 rt->rt6i_flags |= RTF_EXPIRES;
438 dst_release(&rt->u.dst);
/* Simple FIB lookup: find the best node for (daddr, saddr), match the
 * device, and return the route with a held reference -- or release it and
 * fall through when the entry carries an error. */
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
447 struct fib6_node *fn;
450 read_lock_bh(&rt6_lock);
451 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 rt = rt6_device_match(fn->leaf, oif, strict);
453 dst_hold(&rt->u.dst);
455 read_unlock_bh(&rt6_lock);
457 rt->u.dst.lastuse = jiffies;
458 if (rt->u.dst.error == 0)
460 dst_release(&rt->u.dst);
464 /* ip6_ins_rt is called with FREE rt6_lock.
465 It takes new route entry, the addition fails by any reason the
466 route is freed. In any case, if caller does not hold it, it may
/* Insert a new route into the FIB under the write lock; ownership of rt
 * passes to the tree (it is freed on failure). */
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 void *_rtattr, struct netlink_skb_parms *req)
475 write_lock_bh(&rt6_lock);
476 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477 write_unlock_bh(&rt6_lock);
/* Clone an on-link (non-gateway) route into a /128 RTF_CACHE host entry
 * for daddr, resolving a fresh next-hop neighbour.  An exact-prefix match
 * that is not already a host route is treated as anycast. */
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 struct in6_addr *saddr)
491 rt = ip6_rt_copy(ort);
494 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 if (rt->rt6i_dst.plen != 128 &&
496 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 rt->rt6i_flags |= RTF_ANYCAST;
/* On-link: destination itself is the next hop. */
498 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
501 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502 rt->rt6i_dst.plen = 128;
503 rt->rt6i_flags |= RTF_CACHE;
504 rt->u.dst.flags |= DST_HOST;
506 #ifdef CONFIG_IPV6_SUBTREES
507 if (rt->rt6i_src.plen && saddr) {
508 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 rt->rt6i_src.plen = 128;
513 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Clone a gateway/NONEXTHOP route into a /128 RTF_CACHE host entry,
 * sharing (cloning) the parent's next-hop neighbour rather than resolving
 * a new one -- contrast with rt6_alloc_cow() above. */
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
522 struct rt6_info *rt = ip6_rt_copy(ort);
524 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 rt->rt6i_dst.plen = 128;
526 rt->rt6i_flags |= RTF_CACHE;
527 if (rt->rt6i_flags & RTF_REJECT)
528 rt->u.dst.error = ort->u.dst.error;
529 rt->u.dst.flags |= DST_HOST;
530 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Lookup backtracking: when the chosen route is the null entry, climb
 * toward the tree root until a node carrying route info (RTN_RTINFO) is
 * found, then retry from there.  Used by ip6_route_input/output. */
535 #define BACKTRACK() \
536 if (rt == &ip6_null_entry) { \
537 while ((fn = fn->parent) != NULL) { \
538 if (fn->fn_flags & RTN_ROOT) { \
541 if (fn->fn_flags & RTN_RTINFO) \
/* Input slow path: look up the route for an incoming packet, preferring
 * reachable next hops; on a miss of the cache, COW-clone on-link routes
 * (and, if CLONE_OFFLINK_ROUTE, clone off-link ones), insert the clone,
 * and attach the result to skb->dst.  Retries handle the insert race. */
547 void ip6_route_input(struct sk_buff *skb)
549 struct fib6_node *fn;
550 struct rt6_info *rt, *nrt;
554 int reachable = RT6_SELECT_F_REACHABLE;
/* Multicast/link-local destinations must stay on the arrival interface. */
556 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
559 read_lock_bh(&rt6_lock);
562 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563 &skb->nh.ipv6h->saddr);
566 rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
568 if (rt == &ip6_null_entry ||
569 rt->rt6i_flags & RTF_CACHE)
572 dst_hold(&rt->u.dst);
573 read_unlock_bh(&rt6_lock);
575 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
578 #if CLONE_OFFLINK_ROUTE
579 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
585 dst_release(&rt->u.dst);
586 rt = nrt ? : &ip6_null_entry;
588 dst_hold(&rt->u.dst);
590 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
599 * Race condition! In the gap, when rt6_lock was
600 * released someone could insert this route. Relookup.
602 dst_release(&rt->u.dst);
610 dst_hold(&rt->u.dst);
611 read_unlock_bh(&rt6_lock);
613 rt->u.dst.lastuse = jiffies;
615 skb->dst = (struct dst_entry *) rt;
/* Output slow path: same structure as ip6_route_input(), but keyed off the
 * flow (fl) instead of a packet, and returning the dst to the caller. */
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
621 struct fib6_node *fn;
622 struct rt6_info *rt, *nrt;
626 int reachable = RT6_SELECT_F_REACHABLE;
628 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
631 read_lock_bh(&rt6_lock);
634 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
637 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
639 if (rt == &ip6_null_entry ||
640 rt->rt6i_flags & RTF_CACHE)
643 dst_hold(&rt->u.dst);
644 read_unlock_bh(&rt6_lock);
646 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
649 #if CLONE_OFFLINK_ROUTE
650 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
656 dst_release(&rt->u.dst);
657 rt = nrt ? : &ip6_null_entry;
659 dst_hold(&rt->u.dst);
661 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
670 * Race condition! In the gap, when rt6_lock was
671 * released someone could insert this route. Relookup.
673 dst_release(&rt->u.dst);
681 dst_hold(&rt->u.dst);
682 read_unlock_bh(&rt6_lock);
684 rt->u.dst.lastuse = jiffies;
691 * Destination cache support functions
/* dst_ops.check: a cached dst stays valid while its fib node's serial
 * number still matches the cookie recorded at lookup time. */
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
698 rt = (struct rt6_info *) dst;
700 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops.negative_advice: the caller found this dst unusable -- drop
 * cache entries from the tree outright. */
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
708 struct rt6_info *rt = (struct rt6_info *) dst;
711 if (rt->rt6i_flags & RTF_CACHE)
712 ip6_del_rt(rt, NULL, NULL, NULL);
/* dst_ops.link_failure: report address unreachable to the sender, then
 * expire a cache entry immediately, or invalidate the node's serial number
 * for a default route so subsequent lookups re-select. */
719 static void ip6_link_failure(struct sk_buff *skb)
723 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
725 rt = (struct rt6_info *) skb->dst;
727 if (rt->rt6i_flags&RTF_CACHE) {
728 dst_set_expires(&rt->u.dst, 0);
729 rt->rt6i_flags |= RTF_EXPIRES;
730 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu: lower a host route's cached MTU; values below
 * IPV6_MIN_MTU keep the minimum MTU but force the ALLFRAG feature
 * (fragment header on every packet, per RFC 2460). */
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
737 struct rt6_info *rt6 = (struct rt6_info*)dst;
739 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 rt6->rt6i_flags |= RTF_MODIFIED;
741 if (mtu < IPV6_MIN_MTU) {
743 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
745 dst->metrics[RTAX_MTU-1] = mtu;
749 /* Protected by rt6_lock. */
750 static struct dst_entry *ndisc_dst_gc_list;
751 static int ipv6_get_mtu(struct net_device *dev);
/* Derive the advertised MSS from an MTU: subtract IPv6+TCP header sizes
 * and clamp into [ip6_rt_min_advmss, IPV6_MAXPLEN-based ceiling]. */
753 static inline unsigned int ipv6_advmss(unsigned int mtu)
755 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
757 if (mtu < ip6_rt_min_advmss)
758 mtu = ip6_rt_min_advmss;
761 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
762 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
763 * IPV6_MAXPLEN is also valid and means: "any MSS,
764 * rely only on pmtu discovery"
766 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Allocate a standalone dst for neighbour-discovery traffic to addr on
 * dev, with the given output handler.  The entry is chained onto
 * ndisc_dst_gc_list (under rt6_lock) for later collection by
 * ndisc_dst_gc(), and fib6 GC is kicked to reap it eventually. */
771 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
772 struct neighbour *neigh,
773 struct in6_addr *addr,
774 int (*output)(struct sk_buff *))
777 struct inet6_dev *idev = in6_dev_get(dev);
779 if (unlikely(idev == NULL))
782 rt = ip6_dst_alloc();
783 if (unlikely(rt == NULL)) {
/* Caller may pass a neigh; otherwise resolve one here. */
792 neigh = ndisc_get_neigh(dev, addr);
795 rt->rt6i_idev = idev;
796 rt->rt6i_nexthop = neigh;
797 atomic_set(&rt->u.dst.__refcnt, 1);
798 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
799 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
800 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
801 rt->u.dst.output = output;
803 #if 0 /* there's no chance to use these for ndisc */
804 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
807 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
808 rt->rt6i_dst.plen = 128;
811 write_lock_bh(&rt6_lock);
812 rt->u.dst.next = ndisc_dst_gc_list;
813 ndisc_dst_gc_list = &rt->u.dst;
814 write_unlock_bh(&rt6_lock);
816 fib6_force_start_gc();
819 return (struct dst_entry *)rt;
/* Sweep ndisc_dst_gc_list, unlinking and freeing entries whose refcount
 * has dropped to zero; *more presumably reports entries still pending --
 * the freeing/unlink statements are elided in this view. */
822 int ndisc_dst_gc(int *more)
824 struct dst_entry *dst, *next, **pprev;
828 pprev = &ndisc_dst_gc_list;
830 while ((dst = *pprev) != NULL) {
831 if (!atomic_read(&dst->__refcnt)) {
/* dst_ops.gc callback: rate-limited cache shrink.  The adaptive 'expire'
 * threshold shortens while the cache stays over gc_thresh and relaxes
 * otherwise; returns non-zero if the cache is still over ip6_rt_max_size. */
844 static int ip6_dst_gc(void)
846 static unsigned expire = 30*HZ;
847 static unsigned long last_gc;
848 unsigned long now = jiffies;
/* Skip a run entirely if we GC'd recently and are under the size cap. */
850 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
851 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
857 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
858 expire = ip6_rt_gc_timeout>>1;
861 expire -= expire>>ip6_rt_gc_elasticity;
862 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
865 /* Clean host part of a prefix. Not necessary in radix tree,
866 but results in cleaner routing tables.
868 Remove it only when all the things will work!
/* Return the device's IPv6 MTU (cnf.mtu6) when an inet6_dev exists,
 * falling back to IPV6_MIN_MTU; the in6_dev_put is elided in this view. */
871 static int ipv6_get_mtu(struct net_device *dev)
873 int mtu = IPV6_MIN_MTU;
874 struct inet6_dev *idev;
876 idev = in6_dev_get(dev);
878 mtu = idev->cnf.mtu6;
/* Return the device's configured hop limit, defaulting to the global
 * ipv6_devconf value when the device has no inet6_dev. */
884 int ipv6_get_hoplimit(struct net_device *dev)
886 int hoplimit = ipv6_devconf.hop_limit;
887 struct inet6_dev *idev;
889 idev = in6_dev_get(dev);
891 hoplimit = idev->cnf.hop_limit;
/* Build and insert a route from an in6_rtmsg request (ioctl or netlink):
 * validate prefix lengths, resolve the device/idev, fill addresses, flags
 * and metrics, resolve the next-hop neighbour for gateway routes, and hand
 * the finished entry to ip6_ins_rt().  On error the rt is dst_free'd. */
901 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
902 void *_rtattr, struct netlink_skb_parms *req)
907 struct rt6_info *rt = NULL;
908 struct net_device *dev = NULL;
909 struct inet6_dev *idev = NULL;
912 rta = (struct rtattr **) _rtattr;
/* Prefix lengths beyond /128 are invalid. */
914 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
916 #ifndef CONFIG_IPV6_SUBTREES
/* Source-routing prefixes require subtree support. */
917 if (rtmsg->rtmsg_src_len)
920 if (rtmsg->rtmsg_ifindex) {
922 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
925 idev = in6_dev_get(dev);
930 if (rtmsg->rtmsg_metric == 0)
931 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
933 rt = ip6_dst_alloc();
940 rt->u.dst.obsolete = -1;
941 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
942 if (nlh && (r = NLMSG_DATA(nlh))) {
943 rt->rt6i_protocol = r->rtm_protocol;
945 rt->rt6i_protocol = RTPROT_BOOT;
948 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
950 if (addr_type & IPV6_ADDR_MULTICAST)
951 rt->u.dst.input = ip6_mc_input;
953 rt->u.dst.input = ip6_forward;
955 rt->u.dst.output = ip6_output;
957 ipv6_addr_prefix(&rt->rt6i_dst.addr,
958 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
959 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
960 if (rt->rt6i_dst.plen == 128)
961 rt->u.dst.flags = DST_HOST;
963 #ifdef CONFIG_IPV6_SUBTREES
964 ipv6_addr_prefix(&rt->rt6i_src.addr,
965 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
966 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
969 rt->rt6i_metric = rtmsg->rtmsg_metric;
971 /* We cannot add true routes via loopback here,
972 they would result in kernel looping; promote them to reject routes
974 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
975 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
976 /* hold loopback dev/idev if we haven't done so. */
977 if (dev != &loopback_dev) {
984 idev = in6_dev_get(dev);
990 rt->u.dst.output = ip6_pkt_discard_out;
991 rt->u.dst.input = ip6_pkt_discard;
992 rt->u.dst.error = -ENETUNREACH;
993 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
997 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
998 struct in6_addr *gw_addr;
1001 gw_addr = &rtmsg->rtmsg_gateway;
1002 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1003 gwa_type = ipv6_addr_type(gw_addr);
1005 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1006 struct rt6_info *grt;
1008 /* IPv6 strictly inhibits using not link-local
1009 addresses as nexthop address.
1010 Otherwise, router will not able to send redirects.
1011 It is very good, but in some (rare!) circumstances
1012 (SIT, PtP, NBMA NOARP links) it is handy to allow
1013 some exceptions. --ANK
1016 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* Non-link-local gateway must itself be reachable via an existing
 * non-gateway route; borrow its device/idev. */
1019 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1021 err = -EHOSTUNREACH;
1025 if (dev != grt->rt6i_dev) {
1026 dst_release(&grt->u.dst);
1030 dev = grt->rt6i_dev;
1031 idev = grt->rt6i_idev;
1033 in6_dev_hold(grt->rt6i_idev);
1035 if (!(grt->rt6i_flags&RTF_GATEWAY))
1037 dst_release(&grt->u.dst);
1043 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1051 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1052 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1053 if (IS_ERR(rt->rt6i_nexthop)) {
1054 err = PTR_ERR(rt->rt6i_nexthop);
1055 rt->rt6i_nexthop = NULL;
1060 rt->rt6i_flags = rtmsg->rtmsg_flags;
/* Copy caller-supplied per-route metrics from the RTA_METRICS attribute. */
1063 if (rta && rta[RTA_METRICS-1]) {
1064 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1065 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1067 while (RTA_OK(attr, attrlen)) {
1068 unsigned flavor = attr->rta_type;
1070 if (flavor > RTAX_MAX) {
1074 rt->u.dst.metrics[flavor-1] =
1075 *(u32 *)RTA_DATA(attr);
1077 attr = RTA_NEXT(attr, attrlen);
/* Fill defaults for any metrics the caller left unset. */
1081 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1082 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1083 if (!rt->u.dst.metrics[RTAX_MTU-1])
1084 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1085 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1086 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1087 rt->u.dst.dev = dev;
1088 rt->rt6i_idev = idev;
1089 return ip6_ins_rt(rt, nlh, _rtattr, req);
1097 dst_free((struct dst_entry *) rt);
/* Remove a route from the FIB under the write lock and drop the caller's
 * reference on it. */
1101 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1105 write_lock_bh(&rt6_lock);
1107 err = fib6_del(rt, nlh, _rtattr, req);
1108 dst_release(&rt->u.dst);
1110 write_unlock_bh(&rt6_lock);
/* Delete the first route in the FIB matching the request's exact prefix,
 * and -- when specified -- its interface, gateway and metric. */
1115 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1117 struct fib6_node *fn;
1118 struct rt6_info *rt;
1121 read_lock_bh(&rt6_lock);
1123 fn = fib6_locate(&ip6_routing_table,
1124 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1125 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1128 for (rt = fn->leaf; rt; rt = rt->u.next) {
1129 if (rtmsg->rtmsg_ifindex &&
1130 (rt->rt6i_dev == NULL ||
1131 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1133 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1134 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1136 if (rtmsg->rtmsg_metric &&
1137 rtmsg->rtmsg_metric != rt->rt6i_metric)
/* Hold the match across the lock drop; ip6_del_rt releases it. */
1139 dst_hold(&rt->u.dst);
1140 read_unlock_bh(&rt6_lock);
1142 return ip6_del_rt(rt, nlh, _rtattr, req);
1145 read_unlock_bh(&rt6_lock);
/* Handle an ICMPv6 Redirect for dest coming from saddr via neigh: verify
 * the sender is the current next hop (RFC 2461), update the neighbour
 * cache from lladdr, then install an RTF_DYNAMIC /128 cache route through
 * the redirect target, replacing any prior cache entry for dest. */
1153 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1154 struct neighbour *neigh, u8 *lladdr, int on_link)
1156 struct rt6_info *rt, *nrt = NULL;
1158 struct fib6_node *fn;
1161 * Get the "current" route for this destination and
1162 * check if the redirect has come from appropriate router.
1164 * RFC 2461 specifies that redirects should only be
1165 * accepted if they come from the nexthop to the target.
1166 * Due to the way the routes are chosen, this notion
1167 * is a bit fuzzy and one might need to check all possible
1170 strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1172 read_lock_bh(&rt6_lock);
1173 fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1175 for (rt = fn->leaf; rt; rt = rt->u.next) {
1177 * Current route is on-link; redirect is always invalid.
1179 * Seems, previous statement is not true. It could
1180 * be node, which looks for us as on-link (f.e. proxy ndisc)
1181 * But then router serving it might decide, that we should
1182 * know truth 8)8) --ANK (980726).
1184 if (rt6_check_expired(rt))
1186 if (!(rt->rt6i_flags & RTF_GATEWAY))
1188 if (neigh->dev != rt->rt6i_dev)
1190 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1195 dst_hold(&rt->u.dst);
/* No match at this node: backtrack toward the root (cf. BACKTRACK()). */
1197 while ((fn = fn->parent) != NULL) {
1198 if (fn->fn_flags & RTN_ROOT)
1200 if (fn->fn_flags & RTN_RTINFO)
1204 read_unlock_bh(&rt6_lock);
1207 if (net_ratelimit())
1208 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1209 "for redirect target\n");
1214 * We have finally decided to accept it.
1217 neigh_update(neigh, lladdr, NUD_STALE,
1218 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1219 NEIGH_UPDATE_F_OVERRIDE|
1220 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1221 NEIGH_UPDATE_F_ISROUTER))
1225 * Redirect received -> path was valid.
1226 * Look, redirects are sent only in response to data packets,
1227 * so that this nexthop apparently is reachable. --ANK
1229 dst_confirm(&rt->u.dst);
1231 /* Duplicate redirect: silently ignore. */
1232 if (neigh == rt->u.dst.neighbour)
1235 nrt = ip6_rt_copy(rt);
1239 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1241 nrt->rt6i_flags &= ~RTF_GATEWAY;
1243 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1244 nrt->rt6i_dst.plen = 128;
1245 nrt->u.dst.flags |= DST_HOST;
1247 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1248 nrt->rt6i_nexthop = neigh_clone(neigh);
1249 /* Reset pmtu, it may be better */
1250 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1251 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1253 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
/* The old cache entry for dest is superseded -- remove it. */
1256 if (rt->rt6i_flags&RTF_CACHE) {
1257 ip6_del_rt(rt, NULL, NULL, NULL);
1262 dst_release(&rt->u.dst);
1267 * Handle ICMP "packet too big" messages
1268 * i.e. Path MTU discovery
/* Apply a Packet Too Big report: clamp pmtu to IPV6_MIN_MTU (forcing
 * ALLFRAG below the minimum), update an existing cache route in place, or
 * clone the covering route into a /128 cache entry carrying the new MTU
 * with a 10-minute expiry (RFC 1981). */
1271 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1272 struct net_device *dev, u32 pmtu)
1274 struct rt6_info *rt, *nrt;
1277 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
/* Only ever shrink the path MTU. */
1281 if (pmtu >= dst_mtu(&rt->u.dst))
1284 if (pmtu < IPV6_MIN_MTU) {
1286 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1287 * MTU (1280) and a fragment header should always be included
1288 * after a node receiving Too Big message reporting PMTU is
1289 * less than the IPv6 Minimum Link MTU.
1291 pmtu = IPV6_MIN_MTU;
1295 /* New mtu received -> path was valid.
1296 They are sent only in response to data packets,
1297 so that this nexthop apparently is reachable. --ANK
1299 dst_confirm(&rt->u.dst);
1301 /* Host route. If it is static, it would be better
1302 not to override it, but add new one, so that
1303 when cache entry will expire old pmtu
1304 would return automatically.
1306 if (rt->rt6i_flags & RTF_CACHE) {
1307 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1309 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1310 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1311 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1316 Two cases are possible:
1317 1. It is connected route. Action: COW
1318 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1320 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1321 nrt = rt6_alloc_cow(rt, daddr, saddr);
1323 nrt = rt6_alloc_clone(rt, daddr);
1326 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1328 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1330 /* According to RFC 1981, detecting PMTU increase shouldn't be
1331 * happened within 5 mins, the recommended timer is 10 mins.
1332 * Here this route expiration time is set to ip6_rt_mtu_expires
1333 * which is 10 mins. After 10 mins the decreased pmtu is expired
1334 * and detecting PMTU increase will be automatically happened.
1336 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1337 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1339 ip6_ins_rt(nrt, NULL, NULL, NULL);
1342 dst_release(&rt->u.dst);
1346 * Misc support functions
/* Allocate a new rt6_info and copy the routable state of ort into it:
 * handlers, metrics, device (+ref), idev (+ref), gateway, flags (minus
 * RTF_EXPIRES) and destination/source keys.  Metric resets to 0. */
1349 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1351 struct rt6_info *rt = ip6_dst_alloc();
1354 rt->u.dst.input = ort->u.dst.input;
1355 rt->u.dst.output = ort->u.dst.output;
1357 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1358 rt->u.dst.dev = ort->u.dst.dev;
1360 dev_hold(rt->u.dst.dev);
1361 rt->rt6i_idev = ort->rt6i_idev;
1363 in6_dev_hold(rt->rt6i_idev);
1364 rt->u.dst.lastuse = jiffies;
1365 rt->rt6i_expires = 0;
1367 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1368 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1369 rt->rt6i_metric = 0;
1371 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1372 #ifdef CONFIG_IPV6_SUBTREES
1373 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key))
1379 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RTF_ROUTEINFO route for (prefix/prefixlen, gwaddr,
 * ifindex); returns it with a held reference, or NULL. */
1380 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1381 struct in6_addr *gwaddr, int ifindex)
1383 struct fib6_node *fn;
1384 struct rt6_info *rt = NULL;
1386 write_lock_bh(&rt6_lock);
1387 fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1391 for (rt = fn->leaf; rt; rt = rt->u.next) {
1392 if (rt->rt6i_dev->ifindex != ifindex)
1394 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1396 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1398 dst_hold(&rt->u.dst);
1402 write_unlock_bh(&rt6_lock);
/* Install an RTF_ROUTEINFO route learnt from an RA Route Information
 * option, then re-look it up to return the inserted entry. */
1406 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1407 struct in6_addr *gwaddr, int ifindex,
1410 struct in6_rtmsg rtmsg;
1412 memset(&rtmsg, 0, sizeof(rtmsg));
1413 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1414 ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1415 rtmsg.rtmsg_dst_len = prefixlen;
1416 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1417 rtmsg.rtmsg_metric = 1024;
1418 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1419 /* We should treat it as a default route if prefix length is 0. */
1421 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1422 rtmsg.rtmsg_ifindex = ifindex;
1424 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1426 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/* Find the RA-learnt (RTF_ADDRCONF|RTF_DEFAULT) default route via addr on
 * dev; returns it with a held reference, or NULL. */
1430 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1432 struct rt6_info *rt;
1433 struct fib6_node *fn;
1435 fn = &ip6_routing_table;
1437 write_lock_bh(&rt6_lock);
1438 for (rt = fn->leaf; rt; rt=rt->u.next) {
1439 if (dev == rt->rt6i_dev &&
1440 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1441 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1445 dst_hold(&rt->u.dst);
1446 write_unlock_bh(&rt6_lock);
/* Install a default route learnt from a Router Advertisement via gwaddr on
 * dev (metric 1024, expiring), then re-look it up to return the entry. */
1450 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1451 struct net_device *dev,
1454 struct in6_rtmsg rtmsg;
1456 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1457 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1458 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1459 rtmsg.rtmsg_metric = 1024;
1460 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1463 rtmsg.rtmsg_ifindex = dev->ifindex;
1465 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1466 return rt6_get_dflt_router(gwaddr, dev);
/* Remove every RA-learnt default route.  The read lock is dropped around
 * each ip6_del_rt() call, so the scan restarts after every deletion
 * (restart statement elided in this view). */
1469 void rt6_purge_dflt_routers(void)
1471 struct rt6_info *rt;
1474 read_lock_bh(&rt6_lock);
1475 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1476 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1477 dst_hold(&rt->u.dst);
1479 read_unlock_bh(&rt6_lock);
1481 ip6_del_rt(rt, NULL, NULL, NULL);
1486 read_unlock_bh(&rt6_lock);
/* SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN, copies
 * the in6_rtmsg from user space and dispatches to add/del. */
1489 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1491 struct in6_rtmsg rtmsg;
1495 case SIOCADDRT: /* Add a route */
1496 case SIOCDELRT: /* Delete a route */
1497 if (!capable(CAP_NET_ADMIN))
1499 err = copy_from_user(&rtmsg, arg,
1500 sizeof(struct in6_rtmsg));
1507 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1510 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1524 * Drop the packet on the floor
/* dst.input handler for reject/null routes: count the no-route event, send
 * ICMPv6 "no route to destination", and drop the packet. */
1527 static int ip6_pkt_discard(struct sk_buff *skb)
1529 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1530 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
/* dst.output variant: fix up skb->dev first, then discard as above. */
1535 static int ip6_pkt_discard_out(struct sk_buff *skb)
1537 skb->dev = skb->dst->dev;
1538 return ip6_pkt_discard(skb);
1542 * Allocate a dst for local (unicast / anycast) address.
/* Build a host route for a local address: delivered via loopback with
 * ip6_input, flagged RTF_LOCAL or RTF_ANYCAST as appropriate.  Returns
 * ERR_PTR(-ENOMEM) on allocation or neighbour-resolution failure. */
1545 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1546 const struct in6_addr *addr,
1549 struct rt6_info *rt = ip6_dst_alloc();
1552 return ERR_PTR(-ENOMEM);
1554 dev_hold(&loopback_dev);
1557 rt->u.dst.flags = DST_HOST;
1558 rt->u.dst.input = ip6_input;
1559 rt->u.dst.output = ip6_output;
1560 rt->rt6i_dev = &loopback_dev;
1561 rt->rt6i_idev = idev;
1562 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1563 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1564 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1565 rt->u.dst.obsolete = -1;
1567 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1569 rt->rt6i_flags |= RTF_ANYCAST;
1571 rt->rt6i_flags |= RTF_LOCAL;
1572 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1573 if (rt->rt6i_nexthop == NULL) {
1574 dst_free((struct dst_entry *) rt);
1575 return ERR_PTR(-ENOMEM);
1578 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1579 rt->rt6i_dst.plen = 128;
1581 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_clean_tree() callback used at device shutdown: select a route
 * for deletion when it uses the downed device (arg == dev), or select
 * everything when @arg is NULL — but never the sentinel null entry.
 */
1586 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1588 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1589 rt != &ip6_null_entry) {
1590 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * Purge all routes through @dev from the routing tree, e.g. when the
 * interface goes down.  Runs fib6_ifdown() over the whole tree under
 * the write lock.
 */
1596 void rt6_ifdown(struct net_device *dev)
1598 write_lock_bh(&rt6_lock);
1599 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1600 write_unlock_bh(&rt6_lock);
/*
 * Argument bundle threaded through the tree walk in rt6_mtu_change():
 * the device whose MTU changed (rt6_mtu_change_route() also reads an
 * 'mtu' member carrying the new value — see arg->mtu uses there).
 */
1603 struct rt6_mtu_change_arg
1605 struct net_device *dev;
/*
 * fib6_clean_tree() callback run on every route after a device MTU
 * change: update the cached path MTU (RTAX_MTU) of routes over
 * arg->dev, per the RFC 1981 reasoning in the comments below, and
 * refresh the advertised MSS accordingly.
 */
1609 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1611 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1612 struct inet6_dev *idev;
1614 /* In IPv6 pmtu discovery is not optional,
1615 so that RTAX_MTU lock cannot disable it.
1616 We still use this lock to block changes
1617 caused by addrconf/ndisc.
1620 idev = __in6_dev_get(arg->dev);
1624 /* For administrative MTU increase, there is no way to discover
1625 IPv6 PMTU increase, so PMTU increase should be updated here.
1626 Since RFC 1981 doesn't include administrative MTU increase
1627 update PMTU increase is a MUST. (i.e. jumbo frame)
1630 If new MTU is less than route PMTU, this new MTU will be the
1631 lowest MTU in the path, update the route PMTU to reflect PMTU
1632 decreases; if new MTU is greater than route PMTU, and the
1633 old MTU is the lowest MTU in the path, update the route PMTU
1634 to reflect the increase. In this case if the other nodes' MTU
1635 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1638 if (rt->rt6i_dev == arg->dev &&
1639 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1640 (dst_mtu(&rt->u.dst) > arg->mtu ||
1641 (dst_mtu(&rt->u.dst) < arg->mtu &&
1642 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1643 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1644 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * Propagate a device MTU change to all cached routes: walk the whole
 * routing tree with rt6_mtu_change_route() under the read lock.
 */
1648 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1650 struct rt6_mtu_change_arg arg;
1654 read_lock_bh(&rt6_lock);
1655 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1656 read_unlock_bh(&rt6_lock);
/*
 * Translate a netlink rtmsg header plus attribute vector into the
 * legacy in6_rtmsg consumed by ip6_route_add()/ip6_route_del().
 * Each attribute's length is validated before copying; RTN_UNREACHABLE
 * maps to RTF_REJECT.  Non-zero return indicates a malformed attribute.
 */
1659 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1660 struct in6_rtmsg *rtmsg)
1662 memset(rtmsg, 0, sizeof(*rtmsg));
1664 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1665 rtmsg->rtmsg_src_len = r->rtm_src_len;
1666 rtmsg->rtmsg_flags = RTF_UP;
1667 if (r->rtm_type == RTN_UNREACHABLE)
1668 rtmsg->rtmsg_flags |= RTF_REJECT;
/* Gateway must be a full 16-byte IPv6 address. */
1670 if (rta[RTA_GATEWAY-1]) {
1671 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1673 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1674 rtmsg->rtmsg_flags |= RTF_GATEWAY;
/* Dst/src payloads need only cover the prefix length in bytes. */
1676 if (rta[RTA_DST-1]) {
1677 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1679 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1681 if (rta[RTA_SRC-1]) {
1682 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1684 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1686 if (rta[RTA_OIF-1]) {
1687 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1689 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1691 if (rta[RTA_PRIORITY-1]) {
1692 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1694 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
/*
 * RTM_DELROUTE netlink handler: convert the request with
 * inet6_rtm_to_rtmsg() and delete the matching route.
 */
1699 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1701 struct rtmsg *r = NLMSG_DATA(nlh);
1702 struct in6_rtmsg rtmsg;
1704 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1706 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
/*
 * RTM_NEWROUTE netlink handler: convert the request with
 * inet6_rtm_to_rtmsg() and install the route.
 */
1709 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1711 struct rtmsg *r = NLMSG_DATA(nlh);
1712 struct in6_rtmsg rtmsg;
1714 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1716 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
/* State carried through the fib6 walk while dumping routes to netlink:
 * the destination skb and the netlink dump callback context. */
1719 struct rt6_rtnl_dump_arg
1721 struct sk_buff *skb;
1722 struct netlink_callback *cb;
/*
 * Serialize one rt6_info into an rtnetlink message appended to @skb.
 * When @dst/@src are non-NULL (getroute replies for a specific lookup)
 * they are emitted with a /128 prefix instead of the route's own
 * prefix.  @prefix restricts output to RTF_PREFIX_RT routes; other
 * routes then count as trivially successful.  On skb overflow the
 * NLMSG_NEW/RTA_PUT macros jump to the failure labels, the partial
 * message is trimmed away and a negative value is returned.
 */
1725 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1726 struct in6_addr *dst, struct in6_addr *src,
1727 int iif, int type, u32 pid, u32 seq,
1728 int prefix, unsigned int flags)
1731 struct nlmsghdr *nlh;
1732 unsigned char *b = skb->tail;
1733 struct rta_cacheinfo ci;
1735 if (prefix) { /* user wants prefix routes only */
1736 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1737 /* success since this is not a prefix route */
1742 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1743 rtm = NLMSG_DATA(nlh);
1744 rtm->rtm_family = AF_INET6;
1745 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1746 rtm->rtm_src_len = rt->rt6i_src.plen;
1748 rtm->rtm_table = RT_TABLE_MAIN;
/* Map internal RTF_* flags onto the closest rtnetlink route type. */
1749 if (rt->rt6i_flags&RTF_REJECT)
1750 rtm->rtm_type = RTN_UNREACHABLE;
1751 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1752 rtm->rtm_type = RTN_LOCAL;
1754 rtm->rtm_type = RTN_UNICAST;
1756 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1757 rtm->rtm_protocol = rt->rt6i_protocol;
1758 if (rt->rt6i_flags&RTF_DYNAMIC)
1759 rtm->rtm_protocol = RTPROT_REDIRECT;
1760 else if (rt->rt6i_flags & RTF_ADDRCONF)
1761 rtm->rtm_protocol = RTPROT_KERNEL;
1762 else if (rt->rt6i_flags&RTF_DEFAULT)
1763 rtm->rtm_protocol = RTPROT_RA;
1765 if (rt->rt6i_flags&RTF_CACHE)
1766 rtm->rtm_flags |= RTM_F_CLONED;
/* Explicit lookup dst/src override the route's own prefix (/128). */
1769 RTA_PUT(skb, RTA_DST, 16, dst);
1770 rtm->rtm_dst_len = 128;
1771 } else if (rtm->rtm_dst_len)
1772 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1773 #ifdef CONFIG_IPV6_SUBTREES
1775 RTA_PUT(skb, RTA_SRC, 16, src);
1776 rtm->rtm_src_len = 128;
1777 } else if (rtm->rtm_src_len)
1778 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1781 RTA_PUT(skb, RTA_IIF, 4, &iif);
1783 struct in6_addr saddr_buf;
1784 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1785 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1787 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1788 goto rtattr_failure;
1789 if (rt->u.dst.neighbour)
1790 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1792 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1793 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
/* Cache bookkeeping reported via RTA_CACHEINFO (times in clock_t). */
1794 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1795 if (rt->rt6i_expires)
1796 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1799 ci.rta_used = rt->u.dst.__use;
1800 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1801 ci.rta_error = rt->u.dst.error;
1805 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
/* Patch the final message length now that all attributes are in. */
1806 nlh->nlmsg_len = skb->tail - b;
/* Failure path: drop the partially-built message from the skb. */
1811 skb_trim(skb, b - skb->data);
/*
 * Per-route dump callback: honour the RTM_F_PREFIX filter if the
 * request carried a full rtmsg header, then forward the route to
 * rt6_fill_node() as part of a multi-part dump.
 */
1815 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1817 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1820 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1821 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1822 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1826 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1827 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1828 prefix, NLM_F_MULTI);
/*
 * fib6 walker callback: dump every route hanging off the current leaf.
 * A non-zero result from rt6_dump_route() means the skb is full, so
 * the walk is suspended to be resumed on the next dump call.
 */
1831 static int fib6_dump_node(struct fib6_walker_t *w)
1834 struct rt6_info *rt;
1836 for (rt = w->leaf; rt; rt = rt->u.next) {
1837 res = rt6_dump_route(rt, w->args);
1839 /* Frame is full, suspend walking */
/*
 * Tear down the dump state stashed in the netlink callback: unlink the
 * walker from the tree and restore the caller's original done() hook
 * saved in cb->args[1].
 */
1849 static void fib6_dump_end(struct netlink_callback *cb)
1851 struct fib6_walker_t *w = (void*)cb->args[0];
1855 fib6_walker_unlink(w);
1858 cb->done = (void*)cb->args[1];
/*
 * Netlink done() trampoline installed by inet6_dump_fib(): chain to
 * the original done() callback, if one was registered.
 */
1862 static int fib6_dump_done(struct netlink_callback *cb)
1865 return cb->done ? cb->done(cb) : 0;
/*
 * Netlink dump entry point for the IPv6 FIB.  On the first call it
 * hooks fib6_dump_done() as the destructor (saving the old done() in
 * cb->args[1]) and allocates a fib6 walker stored in cb->args[0];
 * subsequent calls resume the suspended walk under rt6_lock.
 * Returns skb->len while more data remains, <= 0 when complete/failed.
 */
1868 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1870 struct rt6_rtnl_dump_arg arg;
1871 struct fib6_walker_t *w;
1877 w = (void*)cb->args[0];
1881 * 1. hook callback destructor.
1883 cb->args[1] = (long)cb->done;
1884 cb->done = fib6_dump_done;
1887 * 2. allocate and initialize walker.
1889 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1892 RT6_TRACE("dump<%p", w);
1893 w->root = &ip6_routing_table;
1894 w->func = fib6_dump_node;
/* Remember the walker so the next dump call can resume it. */
1896 cb->args[0] = (long)w;
1897 read_lock_bh(&rt6_lock);
1899 read_unlock_bh(&rt6_lock);
1902 read_lock_bh(&rt6_lock);
1903 res = fib6_walk_continue(w);
1904 read_unlock_bh(&rt6_lock);
1907 if (res <= 0 && skb->len == 0)
1908 RT6_TRACE("%p>dump end\n", w);
1910 res = res < 0 ? res : skb->len;
1911 /* res < 0 is an error. (really, impossible)
1912 res == 0 means that dump is complete, but skb still can contain data.
1913 res > 0 dump is not complete, but frame is full.
1915 /* Destroy walker, if dump of this table is complete. */
/*
 * RTM_GETROUTE handler: build a flow from RTA_SRC/RTA_DST/RTA_IIF/
 * RTA_OIF attributes, resolve it with ip6_route_output(), serialize
 * the winning route with rt6_fill_node() and unicast the reply back
 * to the requesting pid.
 */
1921 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1923 struct rtattr **rta = arg;
1926 struct sk_buff *skb;
1928 struct rt6_info *rt;
1930 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1934 /* Reserve room for dummy headers, this skb can pass
1935 through good chunk of routing engine.
1937 skb->mac.raw = skb->data;
1938 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1940 memset(&fl, 0, sizeof(fl));
1942 ipv6_addr_copy(&fl.fl6_src,
1943 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1945 ipv6_addr_copy(&fl.fl6_dst,
1946 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1949 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
/* Requests with an input interface must name a real device. */
1952 struct net_device *dev;
1953 dev = __dev_get_by_index(iif);
1962 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1964 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
/* Attach the resolved dst so rt6_fill_node() can report from it. */
1966 skb->dst = &rt->u.dst;
1968 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1969 err = rt6_fill_node(skb, rt,
1970 &fl.fl6_dst, &fl.fl6_src,
1972 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1973 nlh->nlmsg_seq, 0, 0);
1979 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
/*
 * Broadcast an RTM_* notification for @rt to the RTNLGRP_IPV6_ROUTE
 * multicast group.  The sequence number is taken from the triggering
 * request's nlmsghdr when present.  Allocation or serialization
 * failures are reported to listeners via netlink_set_err().
 */
1989 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1990 struct netlink_skb_parms *req)
1992 struct sk_buff *skb;
/* Header plus generous room for route attributes. */
1993 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1994 u32 pid = current->pid;
2000 seq = nlh->nlmsg_seq;
2002 skb = alloc_skb(size, gfp_any());
2004 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2007 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2009 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2012 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2013 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2020 #ifdef CONFIG_PROC_FS
2022 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * /proc/net/ipv6_route formatter: emit one fixed-width (RT6_INFO_LEN)
 * line per route — hex dst address + plen, src address + plen (zeros
 * without CONFIG_IPV6_SUBTREES), next-hop address, then metric,
 * refcount, use count, flags and device name.  Entries before the
 * requested offset are skipped by whole lines.
 */
2033 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2035 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
/* Skip whole lines until we reach the read offset. */
2038 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2043 if (arg->len >= arg->length)
2046 for (i=0; i<16; i++) {
2047 sprintf(arg->buffer + arg->len, "%02x",
2048 rt->rt6i_dst.addr.s6_addr[i]);
2051 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2054 #ifdef CONFIG_IPV6_SUBTREES
2055 for (i=0; i<16; i++) {
2056 sprintf(arg->buffer + arg->len, "%02x",
2057 rt->rt6i_src.addr.s6_addr[i]);
2060 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
/* No subtrees: print an all-zero source prefix placeholder. */
2063 sprintf(arg->buffer + arg->len,
2064 "00000000000000000000000000000000 00 ");
2068 if (rt->rt6i_nexthop) {
2069 for (i=0; i<16; i++) {
2070 sprintf(arg->buffer + arg->len, "%02x",
2071 rt->rt6i_nexthop->primary_key[i]);
2075 sprintf(arg->buffer + arg->len,
2076 "00000000000000000000000000000000");
2079 arg->len += sprintf(arg->buffer + arg->len,
2080 " %08x %08x %08x %08x %8s\n",
2081 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2082 rt->u.dst.__use, rt->rt6i_flags,
2083 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * Legacy procfs read handler for /proc/net/ipv6_route: walk the
 * routing tree under rt6_lock, formatting entries with
 * rt6_info_route(), then trim the result to the caller's
 * offset/length window.
 */
2087 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2089 struct rt6_proc_arg arg;
2090 arg.buffer = buffer;
2091 arg.offset = offset;
2092 arg.length = length;
2096 read_lock_bh(&rt6_lock);
2097 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2098 read_unlock_bh(&rt6_lock);
/* Adjust for a read offset landing mid-line. */
2102 *start += offset % RT6_INFO_LEN;
2104 arg.len -= offset % RT6_INFO_LEN;
2106 if (arg.len > length)
/*
 * seq_file show hook for /proc/net/rt6_stats: one line of hex fib6
 * counters plus the current dst-entry count.
 */
2114 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2116 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2117 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2118 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2119 rt6_stats.fib_rt_cache,
2120 atomic_read(&ip6_dst_ops.entries),
2121 rt6_stats.fib_discarded_routes);
/* open() hook: single-shot seq_file wrapping rt6_stats_seq_show(). */
2126 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2128 return single_open(file, rt6_stats_seq_show, NULL);
/* File operations backing /proc/net/rt6_stats (single_open seq_file). */
2131 static struct file_operations rt6_stats_seq_fops = {
2132 .owner = THIS_MODULE,
2133 .open = rt6_stats_seq_open,
2135 .llseek = seq_lseek,
2136 .release = single_release,
2138 #endif /* CONFIG_PROC_FS */
2140 #ifdef CONFIG_SYSCTL
/* Scratch value written through the net.ipv6.route.flush sysctl;
 * interpreted by ipv6_sysctl_rtcache_flush(). */
2142 static int flush_delay;
/*
 * Handler for net.ipv6.route.flush: parse the written integer into
 * flush_delay, then run the fib6 garbage collector — a value <= 0
 * forces an immediate full flush (~0UL timeout).
 */
2145 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2146 void __user *buffer, size_t *lenp, loff_t *ppos)
2149 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2150 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * sysctl table for net.ipv6.route.*: the cache-flush trigger, GC
 * tuning knobs (threshold, max size, intervals, timeout, elasticity),
 * PMTU expiry and minimum advertised MSS.  Jiffies-valued entries use
 * the jiffies (or ms-jiffies) handler/strategy pairs for unit
 * conversion; gc_min_interval is exposed in both seconds and ms.
 */
2156 ctl_table ipv6_route_table[] = {
2158 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2159 .procname = "flush",
2160 .data = &flush_delay,
2161 .maxlen = sizeof(int),
2163 .proc_handler = &ipv6_sysctl_rtcache_flush
2166 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2167 .procname = "gc_thresh",
2168 .data = &ip6_dst_ops.gc_thresh,
2169 .maxlen = sizeof(int),
2171 .proc_handler = &proc_dointvec,
2174 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2175 .procname = "max_size",
2176 .data = &ip6_rt_max_size,
2177 .maxlen = sizeof(int),
2179 .proc_handler = &proc_dointvec,
2182 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2183 .procname = "gc_min_interval",
2184 .data = &ip6_rt_gc_min_interval,
2185 .maxlen = sizeof(int),
2187 .proc_handler = &proc_dointvec_jiffies,
2188 .strategy = &sysctl_jiffies,
2191 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2192 .procname = "gc_timeout",
2193 .data = &ip6_rt_gc_timeout,
2194 .maxlen = sizeof(int),
2196 .proc_handler = &proc_dointvec_jiffies,
2197 .strategy = &sysctl_jiffies,
2200 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2201 .procname = "gc_interval",
2202 .data = &ip6_rt_gc_interval,
2203 .maxlen = sizeof(int),
2205 .proc_handler = &proc_dointvec_jiffies,
2206 .strategy = &sysctl_jiffies,
2209 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2210 .procname = "gc_elasticity",
2211 .data = &ip6_rt_gc_elasticity,
2212 .maxlen = sizeof(int),
2214 .proc_handler = &proc_dointvec_jiffies,
2215 .strategy = &sysctl_jiffies,
2218 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2219 .procname = "mtu_expires",
2220 .data = &ip6_rt_mtu_expires,
2221 .maxlen = sizeof(int),
2223 .proc_handler = &proc_dointvec_jiffies,
2224 .strategy = &sysctl_jiffies,
2227 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2228 .procname = "min_adv_mss",
2229 .data = &ip6_rt_min_advmss,
2230 .maxlen = sizeof(int),
2232 .proc_handler = &proc_dointvec_jiffies,
2233 .strategy = &sysctl_jiffies,
2236 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2237 .procname = "gc_min_interval_ms",
2238 .data = &ip6_rt_gc_min_interval,
2239 .maxlen = sizeof(int),
2241 .proc_handler = &proc_dointvec_ms_jiffies,
2242 .strategy = &sysctl_ms_jiffies,
/*
 * Boot-time initialization of the IPv6 routing subsystem: create the
 * rt6_info slab cache (panic on failure — routing cannot work without
 * it) and register the /proc/net/ipv6_route and /proc/net/rt6_stats
 * entries when procfs is enabled.
 */
2249 void __init ip6_route_init(void)
2251 struct proc_dir_entry *p;
2253 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2254 sizeof(struct rt6_info),
2255 0, SLAB_HWCACHE_ALIGN,
2257 if (!ip6_dst_ops.kmem_cachep)
2258 panic("cannot create ip6_dst_cache");
2261 #ifdef CONFIG_PROC_FS
2262 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2264 p->owner = THIS_MODULE;
2266 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2273 void ip6_route_cleanup(void)
2275 #ifdef CONFIG_PROC_FS
2276 proc_net_remove("ipv6_route");
2277 proc_net_remove("rt6_stats");
2284 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);