Merge branch 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With a level below 3 the tracing
 * macros compile to nothing; set it to 3 (or higher) to get tracing
 * through printk().
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both macros swallow their arguments. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG takes a fully parenthesised printk argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
/*
 * Compile-time switch: when non-zero the route input/output paths clone
 * routes that already have a next hop (see the #if CLONE_OFFLINK_ROUTE
 * sections); when zero such routes are used as they are.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits passed as "strict" to rt6_select() / rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* reject routes on the wrong interface */
#define RT6_SELECT_F_REACHABLE	0x2	/* reject routes whose neighbour is not valid */
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for one route.
 * @rt: route whose next hop should be probed; NULL is accepted and ignored.
 *
 * If the route has a next-hop neighbour entry that is not in a NUD_VALID
 * state, and more than rtr_probe_interval (taken from the route's
 * inet6_dev configuration) has passed since the entry was last updated,
 * send a Neighbor Solicitation for the neighbour's address to the
 * corresponding solicited-node multicast address.
 *
 * Router Reachability Probes MUST be rate-limited to no more than one
 * per minute; the rate limit is kept by stamping neigh->updated before
 * the probe goes out.  Because that is a write to the neighbour entry,
 * the neighbour lock is taken for writing, not merely for reading.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;

	/* Unlocked fast path: nothing to do for a missing or valid neighbour. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	/* Re-check under the lock; the state may have changed meanwhile. */
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* Drop the lock before transmitting the solicitation. */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		write_unlock_bh(&neigh->lock);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (neigh) {
284                 read_lock_bh(&neigh->lock);
285                 if (neigh->nud_state & NUD_VALID)
286                         m = 1;
287                 read_unlock_bh(&neigh->lock);
288         }
289         return m;
290 }
291
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293                            int strict)
294 {
295         int m = rt6_check_dev(rt, oif);
296         if (!m && (strict & RT6_SELECT_F_IFACE))
297                 return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301         if (rt6_check_neigh(rt))
302                 m |= 16;
303         else if (strict & RT6_SELECT_F_REACHABLE)
304                 return -1;
305         return m;
306 }
307
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
309                                    int strict)
310 {
311         struct rt6_info *match = NULL, *last = NULL;
312         struct rt6_info *rt, *rt0 = *head;
313         u32 metric;
314         int mpri = -1;
315
316         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317                   __FUNCTION__, head, head ? *head : NULL, oif);
318
319         for (rt = rt0, metric = rt0->rt6i_metric;
320              rt && rt->rt6i_metric == metric;
321              rt = rt->u.next) {
322                 int m;
323
324                 if (rt6_check_expired(rt))
325                         continue;
326
327                 last = rt;
328
329                 m = rt6_score_route(rt, oif, strict);
330                 if (m < 0)
331                         continue;
332
333                 if (m > mpri) {
334                         rt6_probe(match);
335                         match = rt;
336                         mpri = m;
337                 } else {
338                         rt6_probe(rt);
339                 }
340         }
341
342         if (!match &&
343             (strict & RT6_SELECT_F_REACHABLE) &&
344             last && last != rt0) {
345                 /* no entries matched; do round-robin */
346                 *head = rt0->u.next;
347                 rt0->u.next = last->u.next;
348                 last->u.next = rt0;
349         }
350
351         RT6_TRACE("%s() => %p, score=%d\n",
352                   __FUNCTION__, match, mpri);
353
354         return (match ? match : &ip6_null_entry);
355 }
356
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process one Route Information option.
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then adds, refreshes or (for a zero lifetime)
 * deletes the matching route-info route via rt6_get_route_info(),
 * rt6_add_route_info() and ip6_del_rt().
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/*
	 * Compare as signed values so that a negative len is rejected
	 * instead of being promoted to a huge unsigned number.
	 */
	if (len < (int)sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length (RFC 4191, section 2.3).
	 * length counts 8-octet units including the fixed part of the
	 * option, so the prefix field holds 0, 8 or 16 bytes for a length
	 * of 1, 2 or 3.  A prefix longer than 64 bits therefore needs
	 * length 3 and any non-empty prefix needs at least length 2;
	 * anything shorter would make the prefix copy below read past the
	 * end of the option.
	 */
	if (rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		/* full 16-byte prefix is present: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Shorter prefix field: build a zero-padded copy.  Given
		 * the checks above, at most prefix_len bits are read.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		/* existing route: refresh the preference bits */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/*
		 * NOTE(review): presumably drops the reference returned by
		 * the get/add helper above -- confirm against those helpers.
		 */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
434
435 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
436                             int oif, int strict)
437 {
438         struct fib6_node *fn;
439         struct rt6_info *rt;
440
441         read_lock_bh(&rt6_lock);
442         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
443         rt = rt6_device_match(fn->leaf, oif, strict);
444         dst_hold(&rt->u.dst);
445         rt->u.dst.__use++;
446         read_unlock_bh(&rt6_lock);
447
448         rt->u.dst.lastuse = jiffies;
449         if (rt->u.dst.error == 0)
450                 return rt;
451         dst_release(&rt->u.dst);
452         return NULL;
453 }
454
455 /* ip6_ins_rt is called with FREE rt6_lock.
456    It takes new route entry, the addition fails by any reason the
457    route is freed. In any case, if caller does not hold it, it may
458    be destroyed.
459  */
460
461 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
462                 void *_rtattr, struct netlink_skb_parms *req)
463 {
464         int err;
465
466         write_lock_bh(&rt6_lock);
467         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
468         write_unlock_bh(&rt6_lock);
469
470         return err;
471 }
472
473 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
474                                       struct in6_addr *saddr)
475 {
476         struct rt6_info *rt;
477
478         /*
479          *      Clone the route.
480          */
481
482         rt = ip6_rt_copy(ort);
483
484         if (rt) {
485                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
486                         if (rt->rt6i_dst.plen != 128 &&
487                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
488                                 rt->rt6i_flags |= RTF_ANYCAST;
489                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
490                 }
491
492                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
493                 rt->rt6i_dst.plen = 128;
494                 rt->rt6i_flags |= RTF_CACHE;
495                 rt->u.dst.flags |= DST_HOST;
496
497 #ifdef CONFIG_IPV6_SUBTREES
498                 if (rt->rt6i_src.plen && saddr) {
499                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
500                         rt->rt6i_src.plen = 128;
501                 }
502 #endif
503
504                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
505
506         }
507
508         return rt;
509 }
510
511 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
512 {
513         struct rt6_info *rt = ip6_rt_copy(ort);
514         if (rt) {
515                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
516                 rt->rt6i_dst.plen = 128;
517                 rt->rt6i_flags |= RTF_CACHE;
518                 if (rt->rt6i_flags & RTF_REJECT)
519                         rt->u.dst.error = ort->u.dst.error;
520                 rt->u.dst.flags |= DST_HOST;
521                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
522         }
523         return rt;
524 }
525
/*
 * BACKTRACK() - climb towards the root after a failed selection.
 *
 * Expects the enclosing function to provide the variables "rt" and "fn"
 * and the labels "out" and "restart".  When rt is the null entry, walk
 * up the parent chain of fn: jump to "out" on reaching a node flagged
 * RTN_ROOT, or to "restart" on the first ancestor flagged RTN_RTINFO.
 * Falls through when rt is a real route or the parent chain ends.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single
 * statement and cannot capture a following "else".
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
536
537
538 void ip6_route_input(struct sk_buff *skb)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt, *nrt;
542         int strict;
543         int attempts = 3;
544         int err;
545         int reachable = RT6_SELECT_F_REACHABLE;
546
547         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
548
549 relookup:
550         read_lock_bh(&rt6_lock);
551
552 restart_2:
553         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
554                          &skb->nh.ipv6h->saddr);
555
556 restart:
557         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
558         BACKTRACK();
559         if (rt == &ip6_null_entry ||
560             rt->rt6i_flags & RTF_CACHE)
561                 goto out;
562
563         dst_hold(&rt->u.dst);
564         read_unlock_bh(&rt6_lock);
565
566         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
567                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
568         else {
569 #if CLONE_OFFLINK_ROUTE
570                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
571 #else
572                 goto out2;
573 #endif
574         }
575
576         dst_release(&rt->u.dst);
577         rt = nrt ? : &ip6_null_entry;
578
579         dst_hold(&rt->u.dst);
580         if (nrt) {
581                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
582                 if (!err)
583                         goto out2;
584         }
585
586         if (--attempts <= 0)
587                 goto out2;
588
589         /*
590          * Race condition! In the gap, when rt6_lock was
591          * released someone could insert this route.  Relookup.
592          */
593         dst_release(&rt->u.dst);
594         goto relookup;
595
596 out:
597         if (reachable) {
598                 reachable = 0;
599                 goto restart_2;
600         }
601         dst_hold(&rt->u.dst);
602         read_unlock_bh(&rt6_lock);
603 out2:
604         rt->u.dst.lastuse = jiffies;
605         rt->u.dst.__use++;
606         skb->dst = (struct dst_entry *) rt;
607         return;
608 }
609
610 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
611 {
612         struct fib6_node *fn;
613         struct rt6_info *rt, *nrt;
614         int strict;
615         int attempts = 3;
616         int err;
617         int reachable = RT6_SELECT_F_REACHABLE;
618
619         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
620
621 relookup:
622         read_lock_bh(&rt6_lock);
623
624 restart_2:
625         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
626
627 restart:
628         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
629         BACKTRACK();
630         if (rt == &ip6_null_entry ||
631             rt->rt6i_flags & RTF_CACHE)
632                 goto out;
633
634         dst_hold(&rt->u.dst);
635         read_unlock_bh(&rt6_lock);
636
637         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
638                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
639         else {
640 #if CLONE_OFFLINK_ROUTE
641                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
642 #else
643                 goto out2;
644 #endif
645         }
646
647         dst_release(&rt->u.dst);
648         rt = nrt ? : &ip6_null_entry;
649
650         dst_hold(&rt->u.dst);
651         if (nrt) {
652                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
653                 if (!err)
654                         goto out2;
655         }
656
657         if (--attempts <= 0)
658                 goto out2;
659
660         /*
661          * Race condition! In the gap, when rt6_lock was
662          * released someone could insert this route.  Relookup.
663          */
664         dst_release(&rt->u.dst);
665         goto relookup;
666
667 out:
668         if (reachable) {
669                 reachable = 0;
670                 goto restart_2;
671         }
672         dst_hold(&rt->u.dst);
673         read_unlock_bh(&rt6_lock);
674 out2:
675         rt->u.dst.lastuse = jiffies;
676         rt->u.dst.__use++;
677         return &rt->u.dst;
678 }
679
680
681 /*
682  *      Destination cache support functions
683  */
684
685 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
686 {
687         struct rt6_info *rt;
688
689         rt = (struct rt6_info *) dst;
690
691         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
692                 return dst;
693
694         return NULL;
695 }
696
697 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
698 {
699         struct rt6_info *rt = (struct rt6_info *) dst;
700
701         if (rt) {
702                 if (rt->rt6i_flags & RTF_CACHE)
703                         ip6_del_rt(rt, NULL, NULL, NULL);
704                 else
705                         dst_release(dst);
706         }
707         return NULL;
708 }
709
710 static void ip6_link_failure(struct sk_buff *skb)
711 {
712         struct rt6_info *rt;
713
714         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
715
716         rt = (struct rt6_info *) skb->dst;
717         if (rt) {
718                 if (rt->rt6i_flags&RTF_CACHE) {
719                         dst_set_expires(&rt->u.dst, 0);
720                         rt->rt6i_flags |= RTF_EXPIRES;
721                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
722                         rt->rt6i_node->fn_sernum = -1;
723         }
724 }
725
726 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
727 {
728         struct rt6_info *rt6 = (struct rt6_info*)dst;
729
730         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
731                 rt6->rt6i_flags |= RTF_MODIFIED;
732                 if (mtu < IPV6_MIN_MTU) {
733                         mtu = IPV6_MIN_MTU;
734                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
735                 }
736                 dst->metrics[RTAX_MTU-1] = mtu;
737         }
738 }
739
/* Prototype needed by ndisc_dst_alloc(), which appears before the definition. */
static int ipv6_get_mtu(struct net_device *dev);

/*
 * Singly linked list (through dst->next) of the dst entries handed out
 * by ndisc_dst_alloc() and reaped by ndisc_dst_gc().
 * Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;
743
744 static inline unsigned int ipv6_advmss(unsigned int mtu)
745 {
746         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
747
748         if (mtu < ip6_rt_min_advmss)
749                 mtu = ip6_rt_min_advmss;
750
751         /*
752          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
753          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
754          * IPV6_MAXPLEN is also valid and means: "any MSS, 
755          * rely only on pmtu discovery"
756          */
757         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
758                 mtu = IPV6_MAXPLEN;
759         return mtu;
760 }
761
762 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
763                                   struct neighbour *neigh,
764                                   struct in6_addr *addr,
765                                   int (*output)(struct sk_buff *))
766 {
767         struct rt6_info *rt;
768         struct inet6_dev *idev = in6_dev_get(dev);
769
770         if (unlikely(idev == NULL))
771                 return NULL;
772
773         rt = ip6_dst_alloc();
774         if (unlikely(rt == NULL)) {
775                 in6_dev_put(idev);
776                 goto out;
777         }
778
779         dev_hold(dev);
780         if (neigh)
781                 neigh_hold(neigh);
782         else
783                 neigh = ndisc_get_neigh(dev, addr);
784
785         rt->rt6i_dev      = dev;
786         rt->rt6i_idev     = idev;
787         rt->rt6i_nexthop  = neigh;
788         atomic_set(&rt->u.dst.__refcnt, 1);
789         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
790         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
791         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
792         rt->u.dst.output  = output;
793
794 #if 0   /* there's no chance to use these for ndisc */
795         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
796                                 ? DST_HOST 
797                                 : 0;
798         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
799         rt->rt6i_dst.plen = 128;
800 #endif
801
802         write_lock_bh(&rt6_lock);
803         rt->u.dst.next = ndisc_dst_gc_list;
804         ndisc_dst_gc_list = &rt->u.dst;
805         write_unlock_bh(&rt6_lock);
806
807         fib6_force_start_gc();
808
809 out:
810         return (struct dst_entry *)rt;
811 }
812
813 int ndisc_dst_gc(int *more)
814 {
815         struct dst_entry *dst, *next, **pprev;
816         int freed;
817
818         next = NULL;
819         pprev = &ndisc_dst_gc_list;
820         freed = 0;
821         while ((dst = *pprev) != NULL) {
822                 if (!atomic_read(&dst->__refcnt)) {
823                         *pprev = dst->next;
824                         dst_free(dst);
825                         freed++;
826                 } else {
827                         pprev = &dst->next;
828                         (*more)++;
829                 }
830         }
831
832         return freed;
833 }
834
835 static int ip6_dst_gc(void)
836 {
837         static unsigned expire = 30*HZ;
838         static unsigned long last_gc;
839         unsigned long now = jiffies;
840
841         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
842             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
843                 goto out;
844
845         expire++;
846         fib6_run_gc(expire);
847         last_gc = now;
848         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
849                 expire = ip6_rt_gc_timeout>>1;
850
851 out:
852         expire -= expire>>ip6_rt_gc_elasticity;
853         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
854 }
855
856 /* Clean host part of a prefix. Not necessary in radix tree,
857    but results in cleaner routing tables.
858
859    Remove it only when all the things will work!
860  */
861
862 static int ipv6_get_mtu(struct net_device *dev)
863 {
864         int mtu = IPV6_MIN_MTU;
865         struct inet6_dev *idev;
866
867         idev = in6_dev_get(dev);
868         if (idev) {
869                 mtu = idev->cnf.mtu6;
870                 in6_dev_put(idev);
871         }
872         return mtu;
873 }
874
875 int ipv6_get_hoplimit(struct net_device *dev)
876 {
877         int hoplimit = ipv6_devconf.hop_limit;
878         struct inet6_dev *idev;
879
880         idev = in6_dev_get(dev);
881         if (idev) {
882                 hoplimit = idev->cnf.hop_limit;
883                 in6_dev_put(idev);
884         }
885         return hoplimit;
886 }
887
888 /*
889  *
890  */
891
892 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
893                 void *_rtattr, struct netlink_skb_parms *req)
894 {
895         int err;
896         struct rtmsg *r;
897         struct rtattr **rta;
898         struct rt6_info *rt = NULL;
899         struct net_device *dev = NULL;
900         struct inet6_dev *idev = NULL;
901         int addr_type;
902
903         rta = (struct rtattr **) _rtattr;
904
905         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
906                 return -EINVAL;
907 #ifndef CONFIG_IPV6_SUBTREES
908         if (rtmsg->rtmsg_src_len)
909                 return -EINVAL;
910 #endif
911         if (rtmsg->rtmsg_ifindex) {
912                 err = -ENODEV;
913                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
914                 if (!dev)
915                         goto out;
916                 idev = in6_dev_get(dev);
917                 if (!idev)
918                         goto out;
919         }
920
921         if (rtmsg->rtmsg_metric == 0)
922                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
923
924         rt = ip6_dst_alloc();
925
926         if (rt == NULL) {
927                 err = -ENOMEM;
928                 goto out;
929         }
930
931         rt->u.dst.obsolete = -1;
932         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
933         if (nlh && (r = NLMSG_DATA(nlh))) {
934                 rt->rt6i_protocol = r->rtm_protocol;
935         } else {
936                 rt->rt6i_protocol = RTPROT_BOOT;
937         }
938
939         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
940
941         if (addr_type & IPV6_ADDR_MULTICAST)
942                 rt->u.dst.input = ip6_mc_input;
943         else
944                 rt->u.dst.input = ip6_forward;
945
946         rt->u.dst.output = ip6_output;
947
948         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
949                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
950         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
951         if (rt->rt6i_dst.plen == 128)
952                rt->u.dst.flags = DST_HOST;
953
954 #ifdef CONFIG_IPV6_SUBTREES
955         ipv6_addr_prefix(&rt->rt6i_src.addr, 
956                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
957         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
958 #endif
959
960         rt->rt6i_metric = rtmsg->rtmsg_metric;
961
962         /* We cannot add true routes via loopback here,
963            they would result in kernel looping; promote them to reject routes
964          */
965         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
966             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
967                 /* hold loopback dev/idev if we haven't done so. */
968                 if (dev != &loopback_dev) {
969                         if (dev) {
970                                 dev_put(dev);
971                                 in6_dev_put(idev);
972                         }
973                         dev = &loopback_dev;
974                         dev_hold(dev);
975                         idev = in6_dev_get(dev);
976                         if (!idev) {
977                                 err = -ENODEV;
978                                 goto out;
979                         }
980                 }
981                 rt->u.dst.output = ip6_pkt_discard_out;
982                 rt->u.dst.input = ip6_pkt_discard;
983                 rt->u.dst.error = -ENETUNREACH;
984                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
985                 goto install_route;
986         }
987
988         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
989                 struct in6_addr *gw_addr;
990                 int gwa_type;
991
992                 gw_addr = &rtmsg->rtmsg_gateway;
993                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
994                 gwa_type = ipv6_addr_type(gw_addr);
995
996                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
997                         struct rt6_info *grt;
998
999                         /* IPv6 strictly inhibits using not link-local
1000                            addresses as nexthop address.
1001                            Otherwise, router will not able to send redirects.
1002                            It is very good, but in some (rare!) circumstances
1003                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1004                            some exceptions. --ANK
1005                          */
1006                         err = -EINVAL;
1007                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1008                                 goto out;
1009
1010                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1011
1012                         err = -EHOSTUNREACH;
1013                         if (grt == NULL)
1014                                 goto out;
1015                         if (dev) {
1016                                 if (dev != grt->rt6i_dev) {
1017                                         dst_release(&grt->u.dst);
1018                                         goto out;
1019                                 }
1020                         } else {
1021                                 dev = grt->rt6i_dev;
1022                                 idev = grt->rt6i_idev;
1023                                 dev_hold(dev);
1024                                 in6_dev_hold(grt->rt6i_idev);
1025                         }
1026                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1027                                 err = 0;
1028                         dst_release(&grt->u.dst);
1029
1030                         if (err)
1031                                 goto out;
1032                 }
1033                 err = -EINVAL;
1034                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1035                         goto out;
1036         }
1037
1038         err = -ENODEV;
1039         if (dev == NULL)
1040                 goto out;
1041
1042         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1043                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1044                 if (IS_ERR(rt->rt6i_nexthop)) {
1045                         err = PTR_ERR(rt->rt6i_nexthop);
1046                         rt->rt6i_nexthop = NULL;
1047                         goto out;
1048                 }
1049         }
1050
1051         rt->rt6i_flags = rtmsg->rtmsg_flags;
1052
1053 install_route:
1054         if (rta && rta[RTA_METRICS-1]) {
1055                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1056                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1057
1058                 while (RTA_OK(attr, attrlen)) {
1059                         unsigned flavor = attr->rta_type;
1060                         if (flavor) {
1061                                 if (flavor > RTAX_MAX) {
1062                                         err = -EINVAL;
1063                                         goto out;
1064                                 }
1065                                 rt->u.dst.metrics[flavor-1] =
1066                                         *(u32 *)RTA_DATA(attr);
1067                         }
1068                         attr = RTA_NEXT(attr, attrlen);
1069                 }
1070         }
1071
1072         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1073                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1074         if (!rt->u.dst.metrics[RTAX_MTU-1])
1075                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1076         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1077                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1078         rt->u.dst.dev = dev;
1079         rt->rt6i_idev = idev;
1080         return ip6_ins_rt(rt, nlh, _rtattr, req);
1081
1082 out:
1083         if (dev)
1084                 dev_put(dev);
1085         if (idev)
1086                 in6_dev_put(idev);
1087         if (rt)
1088                 dst_free((struct dst_entry *) rt);
1089         return err;
1090 }
1091
1092 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1093 {
1094         int err;
1095
1096         write_lock_bh(&rt6_lock);
1097
1098         err = fib6_del(rt, nlh, _rtattr, req);
1099         dst_release(&rt->u.dst);
1100
1101         write_unlock_bh(&rt6_lock);
1102
1103         return err;
1104 }
1105
1106 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1107 {
1108         struct fib6_node *fn;
1109         struct rt6_info *rt;
1110         int err = -ESRCH;
1111
1112         read_lock_bh(&rt6_lock);
1113
1114         fn = fib6_locate(&ip6_routing_table,
1115                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1116                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1117         
1118         if (fn) {
1119                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1120                         if (rtmsg->rtmsg_ifindex &&
1121                             (rt->rt6i_dev == NULL ||
1122                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1123                                 continue;
1124                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1125                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1126                                 continue;
1127                         if (rtmsg->rtmsg_metric &&
1128                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1129                                 continue;
1130                         dst_hold(&rt->u.dst);
1131                         read_unlock_bh(&rt6_lock);
1132
1133                         return ip6_del_rt(rt, nlh, _rtattr, req);
1134                 }
1135         }
1136         read_unlock_bh(&rt6_lock);
1137
1138         return err;
1139 }
1140
1141 /*
1142  *      Handle redirects
1143  */
1144 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1145                   struct neighbour *neigh, u8 *lladdr, int on_link)
1146 {
1147         struct rt6_info *rt, *nrt = NULL;
1148         int strict;
1149         struct fib6_node *fn;
1150
1151         /*
1152          * Get the "current" route for this destination and
1153          * check if the redirect has come from approriate router.
1154          *
1155          * RFC 2461 specifies that redirects should only be
1156          * accepted if they come from the nexthop to the target.
1157          * Due to the way the routes are chosen, this notion
1158          * is a bit fuzzy and one might need to check all possible
1159          * routes.
1160          */
1161         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1162
1163         read_lock_bh(&rt6_lock);
1164         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1165 restart:
1166         for (rt = fn->leaf; rt; rt = rt->u.next) {
1167                 /*
1168                  * Current route is on-link; redirect is always invalid.
1169                  *
1170                  * Seems, previous statement is not true. It could
1171                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1172                  * But then router serving it might decide, that we should
1173                  * know truth 8)8) --ANK (980726).
1174                  */
1175                 if (rt6_check_expired(rt))
1176                         continue;
1177                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1178                         continue;
1179                 if (neigh->dev != rt->rt6i_dev)
1180                         continue;
1181                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1182                         continue;
1183                 break;
1184         }
1185         if (rt)
1186                 dst_hold(&rt->u.dst);
1187         else if (strict) {
1188                 while ((fn = fn->parent) != NULL) {
1189                         if (fn->fn_flags & RTN_ROOT)
1190                                 break;
1191                         if (fn->fn_flags & RTN_RTINFO)
1192                                 goto restart;
1193                 }
1194         }
1195         read_unlock_bh(&rt6_lock);
1196
1197         if (!rt) {
1198                 if (net_ratelimit())
1199                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1200                                "for redirect target\n");
1201                 return;
1202         }
1203
1204         /*
1205          *      We have finally decided to accept it.
1206          */
1207
1208         neigh_update(neigh, lladdr, NUD_STALE, 
1209                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1210                      NEIGH_UPDATE_F_OVERRIDE|
1211                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1212                                      NEIGH_UPDATE_F_ISROUTER))
1213                      );
1214
1215         /*
1216          * Redirect received -> path was valid.
1217          * Look, redirects are sent only in response to data packets,
1218          * so that this nexthop apparently is reachable. --ANK
1219          */
1220         dst_confirm(&rt->u.dst);
1221
1222         /* Duplicate redirect: silently ignore. */
1223         if (neigh == rt->u.dst.neighbour)
1224                 goto out;
1225
1226         nrt = ip6_rt_copy(rt);
1227         if (nrt == NULL)
1228                 goto out;
1229
1230         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1231         if (on_link)
1232                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1233
1234         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1235         nrt->rt6i_dst.plen = 128;
1236         nrt->u.dst.flags |= DST_HOST;
1237
1238         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1239         nrt->rt6i_nexthop = neigh_clone(neigh);
1240         /* Reset pmtu, it may be better */
1241         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1242         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1243
1244         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1245                 goto out;
1246
1247         if (rt->rt6i_flags&RTF_CACHE) {
1248                 ip6_del_rt(rt, NULL, NULL, NULL);
1249                 return;
1250         }
1251
1252 out:
1253         dst_release(&rt->u.dst);
1254         return;
1255 }
1256
1257 /*
1258  *      Handle ICMP "packet too big" messages
1259  *      i.e. Path MTU discovery
1260  */
1261
1262 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1263                         struct net_device *dev, u32 pmtu)
1264 {
1265         struct rt6_info *rt, *nrt;
1266         int allfrag = 0;
1267
1268         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1269         if (rt == NULL)
1270                 return;
1271
1272         if (pmtu >= dst_mtu(&rt->u.dst))
1273                 goto out;
1274
1275         if (pmtu < IPV6_MIN_MTU) {
1276                 /*
1277                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1278                  * MTU (1280) and a fragment header should always be included
1279                  * after a node receiving Too Big message reporting PMTU is
1280                  * less than the IPv6 Minimum Link MTU.
1281                  */
1282                 pmtu = IPV6_MIN_MTU;
1283                 allfrag = 1;
1284         }
1285
1286         /* New mtu received -> path was valid.
1287            They are sent only in response to data packets,
1288            so that this nexthop apparently is reachable. --ANK
1289          */
1290         dst_confirm(&rt->u.dst);
1291
1292         /* Host route. If it is static, it would be better
1293            not to override it, but add new one, so that
1294            when cache entry will expire old pmtu
1295            would return automatically.
1296          */
1297         if (rt->rt6i_flags & RTF_CACHE) {
1298                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1299                 if (allfrag)
1300                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1301                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1302                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1303                 goto out;
1304         }
1305
1306         /* Network route.
1307            Two cases are possible:
1308            1. It is connected route. Action: COW
1309            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1310          */
1311         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1312                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1313         else
1314                 nrt = rt6_alloc_clone(rt, daddr);
1315
1316         if (nrt) {
1317                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1318                 if (allfrag)
1319                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1320
1321                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1322                  * happened within 5 mins, the recommended timer is 10 mins.
1323                  * Here this route expiration time is set to ip6_rt_mtu_expires
1324                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1325                  * and detecting PMTU increase will be automatically happened.
1326                  */
1327                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1328                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1329
1330                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1331         }
1332 out:
1333         dst_release(&rt->u.dst);
1334 }
1335
1336 /*
1337  *      Misc support functions
1338  */
1339
1340 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1341 {
1342         struct rt6_info *rt = ip6_dst_alloc();
1343
1344         if (rt) {
1345                 rt->u.dst.input = ort->u.dst.input;
1346                 rt->u.dst.output = ort->u.dst.output;
1347
1348                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1349                 rt->u.dst.dev = ort->u.dst.dev;
1350                 if (rt->u.dst.dev)
1351                         dev_hold(rt->u.dst.dev);
1352                 rt->rt6i_idev = ort->rt6i_idev;
1353                 if (rt->rt6i_idev)
1354                         in6_dev_hold(rt->rt6i_idev);
1355                 rt->u.dst.lastuse = jiffies;
1356                 rt->rt6i_expires = 0;
1357
1358                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1359                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1360                 rt->rt6i_metric = 0;
1361
1362                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1363 #ifdef CONFIG_IPV6_SUBTREES
1364                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1365 #endif
1366         }
1367         return rt;
1368 }
1369
1370 #ifdef CONFIG_IPV6_ROUTE_INFO
1371 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1372                                            struct in6_addr *gwaddr, int ifindex)
1373 {
1374         struct fib6_node *fn;
1375         struct rt6_info *rt = NULL;
1376
1377         write_lock_bh(&rt6_lock);
1378         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1379         if (!fn)
1380                 goto out;
1381
1382         for (rt = fn->leaf; rt; rt = rt->u.next) {
1383                 if (rt->rt6i_dev->ifindex != ifindex)
1384                         continue;
1385                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1386                         continue;
1387                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1388                         continue;
1389                 dst_hold(&rt->u.dst);
1390                 break;
1391         }
1392 out:
1393         write_unlock_bh(&rt6_lock);
1394         return rt;
1395 }
1396
1397 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1398                                            struct in6_addr *gwaddr, int ifindex,
1399                                            unsigned pref)
1400 {
1401         struct in6_rtmsg rtmsg;
1402
1403         memset(&rtmsg, 0, sizeof(rtmsg));
1404         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1405         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1406         rtmsg.rtmsg_dst_len = prefixlen;
1407         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1408         rtmsg.rtmsg_metric = 1024;
1409         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1410         /* We should treat it as a default route if prefix length is 0. */
1411         if (!prefixlen)
1412                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1413         rtmsg.rtmsg_ifindex = ifindex;
1414
1415         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1416
1417         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1418 }
1419 #endif
1420
1421 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1422 {       
1423         struct rt6_info *rt;
1424         struct fib6_node *fn;
1425
1426         fn = &ip6_routing_table;
1427
1428         write_lock_bh(&rt6_lock);
1429         for (rt = fn->leaf; rt; rt=rt->u.next) {
1430                 if (dev == rt->rt6i_dev &&
1431                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1432                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1433                         break;
1434         }
1435         if (rt)
1436                 dst_hold(&rt->u.dst);
1437         write_unlock_bh(&rt6_lock);
1438         return rt;
1439 }
1440
1441 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1442                                      struct net_device *dev,
1443                                      unsigned int pref)
1444 {
1445         struct in6_rtmsg rtmsg;
1446
1447         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1448         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1449         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1450         rtmsg.rtmsg_metric = 1024;
1451         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1452                             RTF_PREF(pref);
1453
1454         rtmsg.rtmsg_ifindex = dev->ifindex;
1455
1456         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1457         return rt6_get_dflt_router(gwaddr, dev);
1458 }
1459
1460 void rt6_purge_dflt_routers(void)
1461 {
1462         struct rt6_info *rt;
1463
1464 restart:
1465         read_lock_bh(&rt6_lock);
1466         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1467                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1468                         dst_hold(&rt->u.dst);
1469
1470                         read_unlock_bh(&rt6_lock);
1471
1472                         ip6_del_rt(rt, NULL, NULL, NULL);
1473
1474                         goto restart;
1475                 }
1476         }
1477         read_unlock_bh(&rt6_lock);
1478 }
1479
1480 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1481 {
1482         struct in6_rtmsg rtmsg;
1483         int err;
1484
1485         switch(cmd) {
1486         case SIOCADDRT:         /* Add a route */
1487         case SIOCDELRT:         /* Delete a route */
1488                 if (!capable(CAP_NET_ADMIN))
1489                         return -EPERM;
1490                 err = copy_from_user(&rtmsg, arg,
1491                                      sizeof(struct in6_rtmsg));
1492                 if (err)
1493                         return -EFAULT;
1494                         
1495                 rtnl_lock();
1496                 switch (cmd) {
1497                 case SIOCADDRT:
1498                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1499                         break;
1500                 case SIOCDELRT:
1501                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1502                         break;
1503                 default:
1504                         err = -EINVAL;
1505                 }
1506                 rtnl_unlock();
1507
1508                 return err;
1509         };
1510
1511         return -EINVAL;
1512 }
1513
1514 /*
1515  *      Drop the packet on the floor
1516  */
1517
1518 static int ip6_pkt_discard(struct sk_buff *skb)
1519 {
1520         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1521         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1522         kfree_skb(skb);
1523         return 0;
1524 }
1525
1526 static int ip6_pkt_discard_out(struct sk_buff *skb)
1527 {
1528         skb->dev = skb->dst->dev;
1529         return ip6_pkt_discard(skb);
1530 }
1531
1532 /*
1533  *      Allocate a dst for local (unicast / anycast) address.
1534  */
1535
1536 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1537                                     const struct in6_addr *addr,
1538                                     int anycast)
1539 {
1540         struct rt6_info *rt = ip6_dst_alloc();
1541
1542         if (rt == NULL)
1543                 return ERR_PTR(-ENOMEM);
1544
1545         dev_hold(&loopback_dev);
1546         in6_dev_hold(idev);
1547
1548         rt->u.dst.flags = DST_HOST;
1549         rt->u.dst.input = ip6_input;
1550         rt->u.dst.output = ip6_output;
1551         rt->rt6i_dev = &loopback_dev;
1552         rt->rt6i_idev = idev;
1553         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1554         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1555         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1556         rt->u.dst.obsolete = -1;
1557
1558         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1559         if (anycast)
1560                 rt->rt6i_flags |= RTF_ANYCAST;
1561         else
1562                 rt->rt6i_flags |= RTF_LOCAL;
1563         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1564         if (rt->rt6i_nexthop == NULL) {
1565                 dst_free((struct dst_entry *) rt);
1566                 return ERR_PTR(-ENOMEM);
1567         }
1568
1569         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1570         rt->rt6i_dst.plen = 128;
1571
1572         atomic_set(&rt->u.dst.__refcnt, 1);
1573
1574         return rt;
1575 }
1576
1577 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1578 {
1579         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1580             rt != &ip6_null_entry) {
1581                 RT6_TRACE("deleted by ifdown %p\n", rt);
1582                 return -1;
1583         }
1584         return 0;
1585 }
1586
1587 void rt6_ifdown(struct net_device *dev)
1588 {
1589         write_lock_bh(&rt6_lock);
1590         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1591         write_unlock_bh(&rt6_lock);
1592 }
1593
/* Argument bundle passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1599
1600 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1601 {
1602         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1603         struct inet6_dev *idev;
1604
1605         /* In IPv6 pmtu discovery is not optional,
1606            so that RTAX_MTU lock cannot disable it.
1607            We still use this lock to block changes
1608            caused by addrconf/ndisc.
1609         */
1610
1611         idev = __in6_dev_get(arg->dev);
1612         if (idev == NULL)
1613                 return 0;
1614
1615         /* For administrative MTU increase, there is no way to discover
1616            IPv6 PMTU increase, so PMTU increase should be updated here.
1617            Since RFC 1981 doesn't include administrative MTU increase
1618            update PMTU increase is a MUST. (i.e. jumbo frame)
1619          */
1620         /*
1621            If new MTU is less than route PMTU, this new MTU will be the
1622            lowest MTU in the path, update the route PMTU to reflect PMTU
1623            decreases; if new MTU is greater than route PMTU, and the
1624            old MTU is the lowest MTU in the path, update the route PMTU
1625            to reflect the increase. In this case if the other nodes' MTU
1626            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1627            PMTU discouvery.
1628          */
1629         if (rt->rt6i_dev == arg->dev &&
1630             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1631             (dst_mtu(&rt->u.dst) > arg->mtu ||
1632              (dst_mtu(&rt->u.dst) < arg->mtu &&
1633               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1634                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1635         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1636         return 0;
1637 }
1638
1639 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1640 {
1641         struct rt6_mtu_change_arg arg;
1642
1643         arg.dev = dev;
1644         arg.mtu = mtu;
1645         read_lock_bh(&rt6_lock);
1646         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1647         read_unlock_bh(&rt6_lock);
1648 }
1649
1650 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1651                               struct in6_rtmsg *rtmsg)
1652 {
1653         memset(rtmsg, 0, sizeof(*rtmsg));
1654
1655         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1656         rtmsg->rtmsg_src_len = r->rtm_src_len;
1657         rtmsg->rtmsg_flags = RTF_UP;
1658         if (r->rtm_type == RTN_UNREACHABLE)
1659                 rtmsg->rtmsg_flags |= RTF_REJECT;
1660
1661         if (rta[RTA_GATEWAY-1]) {
1662                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1663                         return -EINVAL;
1664                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1665                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1666         }
1667         if (rta[RTA_DST-1]) {
1668                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1669                         return -EINVAL;
1670                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1671         }
1672         if (rta[RTA_SRC-1]) {
1673                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1674                         return -EINVAL;
1675                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1676         }
1677         if (rta[RTA_OIF-1]) {
1678                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1679                         return -EINVAL;
1680                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1681         }
1682         if (rta[RTA_PRIORITY-1]) {
1683                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1684                         return -EINVAL;
1685                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1686         }
1687         return 0;
1688 }
1689
1690 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1691 {
1692         struct rtmsg *r = NLMSG_DATA(nlh);
1693         struct in6_rtmsg rtmsg;
1694
1695         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1696                 return -EINVAL;
1697         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1698 }
1699
1700 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1701 {
1702         struct rtmsg *r = NLMSG_DATA(nlh);
1703         struct in6_rtmsg rtmsg;
1704
1705         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1706                 return -EINVAL;
1707         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1708 }
1709
/*
 * Argument bundle for an rtnetlink route dump.
 * NOTE(review): the users of this struct are outside this excerpt;
 * the field roles below are presumed from their types - confirm.
 */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;		/* presumably the buffer being filled */
	struct netlink_callback *cb;	/* presumably the dump callback state */
};
1715
1716 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1717                          struct in6_addr *dst, struct in6_addr *src,
1718                          int iif, int type, u32 pid, u32 seq,
1719                          int prefix, unsigned int flags)
1720 {
1721         struct rtmsg *rtm;
1722         struct nlmsghdr  *nlh;
1723         unsigned char    *b = skb->tail;
1724         struct rta_cacheinfo ci;
1725
1726         if (prefix) {   /* user wants prefix routes only */
1727                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1728                         /* success since this is not a prefix route */
1729                         return 1;
1730                 }
1731         }
1732
1733         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1734         rtm = NLMSG_DATA(nlh);
1735         rtm->rtm_family = AF_INET6;
1736         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1737         rtm->rtm_src_len = rt->rt6i_src.plen;
1738         rtm->rtm_tos = 0;
1739         rtm->rtm_table = RT_TABLE_MAIN;
1740         if (rt->rt6i_flags&RTF_REJECT)
1741                 rtm->rtm_type = RTN_UNREACHABLE;
1742         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1743                 rtm->rtm_type = RTN_LOCAL;
1744         else
1745                 rtm->rtm_type = RTN_UNICAST;
1746         rtm->rtm_flags = 0;
1747         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1748         rtm->rtm_protocol = rt->rt6i_protocol;
1749         if (rt->rt6i_flags&RTF_DYNAMIC)
1750                 rtm->rtm_protocol = RTPROT_REDIRECT;
1751         else if (rt->rt6i_flags & RTF_ADDRCONF)
1752                 rtm->rtm_protocol = RTPROT_KERNEL;
1753         else if (rt->rt6i_flags&RTF_DEFAULT)
1754                 rtm->rtm_protocol = RTPROT_RA;
1755
1756         if (rt->rt6i_flags&RTF_CACHE)
1757                 rtm->rtm_flags |= RTM_F_CLONED;
1758
1759         if (dst) {
1760                 RTA_PUT(skb, RTA_DST, 16, dst);
1761                 rtm->rtm_dst_len = 128;
1762         } else if (rtm->rtm_dst_len)
1763                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1764 #ifdef CONFIG_IPV6_SUBTREES
1765         if (src) {
1766                 RTA_PUT(skb, RTA_SRC, 16, src);
1767                 rtm->rtm_src_len = 128;
1768         } else if (rtm->rtm_src_len)
1769                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1770 #endif
1771         if (iif)
1772                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1773         else if (dst) {
1774                 struct in6_addr saddr_buf;
1775                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1776                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1777         }
1778         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1779                 goto rtattr_failure;
1780         if (rt->u.dst.neighbour)
1781                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1782         if (rt->u.dst.dev)
1783                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1784         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1785         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1786         if (rt->rt6i_expires)
1787                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1788         else
1789                 ci.rta_expires = 0;
1790         ci.rta_used = rt->u.dst.__use;
1791         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1792         ci.rta_error = rt->u.dst.error;
1793         ci.rta_id = 0;
1794         ci.rta_ts = 0;
1795         ci.rta_tsage = 0;
1796         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1797         nlh->nlmsg_len = skb->tail - b;
1798         return skb->len;
1799
1800 nlmsg_failure:
1801 rtattr_failure:
1802         skb_trim(skb, b - skb->data);
1803         return -1;
1804 }
1805
1806 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1807 {
1808         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1809         int prefix;
1810
1811         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1812                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1813                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1814         } else
1815                 prefix = 0;
1816
1817         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1818                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1819                      prefix, NLM_F_MULTI);
1820 }
1821
1822 static int fib6_dump_node(struct fib6_walker_t *w)
1823 {
1824         int res;
1825         struct rt6_info *rt;
1826
1827         for (rt = w->leaf; rt; rt = rt->u.next) {
1828                 res = rt6_dump_route(rt, w->args);
1829                 if (res < 0) {
1830                         /* Frame is full, suspend walking */
1831                         w->leaf = rt;
1832                         return 1;
1833                 }
1834                 BUG_TRAP(res!=0);
1835         }
1836         w->leaf = NULL;
1837         return 0;
1838 }
1839
1840 static void fib6_dump_end(struct netlink_callback *cb)
1841 {
1842         struct fib6_walker_t *w = (void*)cb->args[0];
1843
1844         if (w) {
1845                 cb->args[0] = 0;
1846                 fib6_walker_unlink(w);
1847                 kfree(w);
1848         }
1849         cb->done = (void*)cb->args[1];
1850         cb->args[1] = 0;
1851 }
1852
1853 static int fib6_dump_done(struct netlink_callback *cb)
1854 {
1855         fib6_dump_end(cb);
1856         return cb->done ? cb->done(cb) : 0;
1857 }
1858
1859 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1860 {
1861         struct rt6_rtnl_dump_arg arg;
1862         struct fib6_walker_t *w;
1863         int res;
1864
1865         arg.skb = skb;
1866         arg.cb = cb;
1867
1868         w = (void*)cb->args[0];
1869         if (w == NULL) {
1870                 /* New dump:
1871                  * 
1872                  * 1. hook callback destructor.
1873                  */
1874                 cb->args[1] = (long)cb->done;
1875                 cb->done = fib6_dump_done;
1876
1877                 /*
1878                  * 2. allocate and initialize walker.
1879                  */
1880                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1881                 if (w == NULL)
1882                         return -ENOMEM;
1883                 RT6_TRACE("dump<%p", w);
1884                 w->root = &ip6_routing_table;
1885                 w->func = fib6_dump_node;
1886                 w->args = &arg;
1887                 cb->args[0] = (long)w;
1888                 read_lock_bh(&rt6_lock);
1889                 res = fib6_walk(w);
1890                 read_unlock_bh(&rt6_lock);
1891         } else {
1892                 w->args = &arg;
1893                 read_lock_bh(&rt6_lock);
1894                 res = fib6_walk_continue(w);
1895                 read_unlock_bh(&rt6_lock);
1896         }
1897 #if RT6_DEBUG >= 3
1898         if (res <= 0 && skb->len == 0)
1899                 RT6_TRACE("%p>dump end\n", w);
1900 #endif
1901         res = res < 0 ? res : skb->len;
1902         /* res < 0 is an error. (really, impossible)
1903            res == 0 means that dump is complete, but skb still can contain data.
1904            res > 0 dump is not complete, but frame is full.
1905          */
1906         /* Destroy walker, if dump of this table is complete. */
1907         if (res <= 0)
1908                 fib6_dump_end(cb);
1909         return res;
1910 }
1911
1912 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1913 {
1914         struct rtattr **rta = arg;
1915         int iif = 0;
1916         int err = -ENOBUFS;
1917         struct sk_buff *skb;
1918         struct flowi fl;
1919         struct rt6_info *rt;
1920
1921         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1922         if (skb == NULL)
1923                 goto out;
1924
1925         /* Reserve room for dummy headers, this skb can pass
1926            through good chunk of routing engine.
1927          */
1928         skb->mac.raw = skb->data;
1929         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1930
1931         memset(&fl, 0, sizeof(fl));
1932         if (rta[RTA_SRC-1])
1933                 ipv6_addr_copy(&fl.fl6_src,
1934                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1935         if (rta[RTA_DST-1])
1936                 ipv6_addr_copy(&fl.fl6_dst,
1937                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1938
1939         if (rta[RTA_IIF-1])
1940                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1941
1942         if (iif) {
1943                 struct net_device *dev;
1944                 dev = __dev_get_by_index(iif);
1945                 if (!dev) {
1946                         err = -ENODEV;
1947                         goto out_free;
1948                 }
1949         }
1950
1951         fl.oif = 0;
1952         if (rta[RTA_OIF-1])
1953                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1954
1955         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1956
1957         skb->dst = &rt->u.dst;
1958
1959         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1960         err = rt6_fill_node(skb, rt, 
1961                             &fl.fl6_dst, &fl.fl6_src,
1962                             iif,
1963                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1964                             nlh->nlmsg_seq, 0, 0);
1965         if (err < 0) {
1966                 err = -EMSGSIZE;
1967                 goto out_free;
1968         }
1969
1970         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1971         if (err > 0)
1972                 err = 0;
1973 out:
1974         return err;
1975 out_free:
1976         kfree_skb(skb);
1977         goto out;       
1978 }
1979
1980 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1981                         struct netlink_skb_parms *req)
1982 {
1983         struct sk_buff *skb;
1984         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1985         u32 pid = current->pid;
1986         u32 seq = 0;
1987
1988         if (req)
1989                 pid = req->pid;
1990         if (nlh)
1991                 seq = nlh->nlmsg_seq;
1992         
1993         skb = alloc_skb(size, gfp_any());
1994         if (!skb) {
1995                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1996                 return;
1997         }
1998         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1999                 kfree_skb(skb);
2000                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2001                 return;
2002         }
2003         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2004         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2005 }
2006
2007 /*
2008  *      /proc
2009  */
2010
2011 #ifdef CONFIG_PROC_FS
2012
/*
 * Length used for one record of /proc/net/ipv6_route when translating a
 * file offset into a number of routes to skip (see rt6_info_route()):
 * two "address + prefix length" fields (32 + 4 each), the next hop (32)
 * and 46 characters for the rest of the line.
 */
#define RT6_INFO_LEN ((32 + 4) + (32 + 4) + 32 + (40 + 5 + 1))
2014
/*
 *	State shared between rt6_proc_info() and rt6_info_route() while the
 *	/proc route listing is generated.
 */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* file offset requested by the reader */
	int length;	/* number of bytes requested by the reader */
	int skip;	/* routes skipped so far to reach the offset */
	int len;	/* bytes written to buffer so far */
};
2023
2024 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2025 {
2026         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2027         int i;
2028
2029         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2030                 arg->skip++;
2031                 return 0;
2032         }
2033
2034         if (arg->len >= arg->length)
2035                 return 0;
2036
2037         for (i=0; i<16; i++) {
2038                 sprintf(arg->buffer + arg->len, "%02x",
2039                         rt->rt6i_dst.addr.s6_addr[i]);
2040                 arg->len += 2;
2041         }
2042         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2043                             rt->rt6i_dst.plen);
2044
2045 #ifdef CONFIG_IPV6_SUBTREES
2046         for (i=0; i<16; i++) {
2047                 sprintf(arg->buffer + arg->len, "%02x",
2048                         rt->rt6i_src.addr.s6_addr[i]);
2049                 arg->len += 2;
2050         }
2051         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2052                             rt->rt6i_src.plen);
2053 #else
2054         sprintf(arg->buffer + arg->len,
2055                 "00000000000000000000000000000000 00 ");
2056         arg->len += 36;
2057 #endif
2058
2059         if (rt->rt6i_nexthop) {
2060                 for (i=0; i<16; i++) {
2061                         sprintf(arg->buffer + arg->len, "%02x",
2062                                 rt->rt6i_nexthop->primary_key[i]);
2063                         arg->len += 2;
2064                 }
2065         } else {
2066                 sprintf(arg->buffer + arg->len,
2067                         "00000000000000000000000000000000");
2068                 arg->len += 32;
2069         }
2070         arg->len += sprintf(arg->buffer + arg->len,
2071                             " %08x %08x %08x %08x %8s\n",
2072                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2073                             rt->u.dst.__use, rt->rt6i_flags, 
2074                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2075         return 0;
2076 }
2077
2078 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2079 {
2080         struct rt6_proc_arg arg;
2081         arg.buffer = buffer;
2082         arg.offset = offset;
2083         arg.length = length;
2084         arg.skip = 0;
2085         arg.len = 0;
2086
2087         read_lock_bh(&rt6_lock);
2088         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2089         read_unlock_bh(&rt6_lock);
2090
2091         *start = buffer;
2092         if (offset)
2093                 *start += offset % RT6_INFO_LEN;
2094
2095         arg.len -= offset % RT6_INFO_LEN;
2096
2097         if (arg.len > length)
2098                 arg.len = length;
2099         if (arg.len < 0)
2100                 arg.len = 0;
2101
2102         return arg.len;
2103 }
2104
2105 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2106 {
2107         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2108                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2109                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2110                       rt6_stats.fib_rt_cache,
2111                       atomic_read(&ip6_dst_ops.entries),
2112                       rt6_stats.fib_discarded_routes);
2113
2114         return 0;
2115 }
2116
2117 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2118 {
2119         return single_open(file, rt6_stats_seq_show, NULL);
2120 }
2121
2122 static struct file_operations rt6_stats_seq_fops = {
2123         .owner   = THIS_MODULE,
2124         .open    = rt6_stats_seq_open,
2125         .read    = seq_read,
2126         .llseek  = seq_lseek,
2127         .release = single_release,
2128 };
2129 #endif  /* CONFIG_PROC_FS */
2130
2131 #ifdef CONFIG_SYSCTL
2132
2133 static int flush_delay;
2134
2135 static
2136 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2137                               void __user *buffer, size_t *lenp, loff_t *ppos)
2138 {
2139         if (write) {
2140                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2141                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2142                 return 0;
2143         } else
2144                 return -EINVAL;
2145 }
2146
2147 ctl_table ipv6_route_table[] = {
2148         {
2149                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2150                 .procname       =       "flush",
2151                 .data           =       &flush_delay,
2152                 .maxlen         =       sizeof(int),
2153                 .mode           =       0200,
2154                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2155         },
2156         {
2157                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2158                 .procname       =       "gc_thresh",
2159                 .data           =       &ip6_dst_ops.gc_thresh,
2160                 .maxlen         =       sizeof(int),
2161                 .mode           =       0644,
2162                 .proc_handler   =       &proc_dointvec,
2163         },
2164         {
2165                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2166                 .procname       =       "max_size",
2167                 .data           =       &ip6_rt_max_size,
2168                 .maxlen         =       sizeof(int),
2169                 .mode           =       0644,
2170                 .proc_handler   =       &proc_dointvec,
2171         },
2172         {
2173                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2174                 .procname       =       "gc_min_interval",
2175                 .data           =       &ip6_rt_gc_min_interval,
2176                 .maxlen         =       sizeof(int),
2177                 .mode           =       0644,
2178                 .proc_handler   =       &proc_dointvec_jiffies,
2179                 .strategy       =       &sysctl_jiffies,
2180         },
2181         {
2182                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2183                 .procname       =       "gc_timeout",
2184                 .data           =       &ip6_rt_gc_timeout,
2185                 .maxlen         =       sizeof(int),
2186                 .mode           =       0644,
2187                 .proc_handler   =       &proc_dointvec_jiffies,
2188                 .strategy       =       &sysctl_jiffies,
2189         },
2190         {
2191                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2192                 .procname       =       "gc_interval",
2193                 .data           =       &ip6_rt_gc_interval,
2194                 .maxlen         =       sizeof(int),
2195                 .mode           =       0644,
2196                 .proc_handler   =       &proc_dointvec_jiffies,
2197                 .strategy       =       &sysctl_jiffies,
2198         },
2199         {
2200                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2201                 .procname       =       "gc_elasticity",
2202                 .data           =       &ip6_rt_gc_elasticity,
2203                 .maxlen         =       sizeof(int),
2204                 .mode           =       0644,
2205                 .proc_handler   =       &proc_dointvec_jiffies,
2206                 .strategy       =       &sysctl_jiffies,
2207         },
2208         {
2209                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2210                 .procname       =       "mtu_expires",
2211                 .data           =       &ip6_rt_mtu_expires,
2212                 .maxlen         =       sizeof(int),
2213                 .mode           =       0644,
2214                 .proc_handler   =       &proc_dointvec_jiffies,
2215                 .strategy       =       &sysctl_jiffies,
2216         },
2217         {
2218                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2219                 .procname       =       "min_adv_mss",
2220                 .data           =       &ip6_rt_min_advmss,
2221                 .maxlen         =       sizeof(int),
2222                 .mode           =       0644,
2223                 .proc_handler   =       &proc_dointvec_jiffies,
2224                 .strategy       =       &sysctl_jiffies,
2225         },
2226         {
2227                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2228                 .procname       =       "gc_min_interval_ms",
2229                 .data           =       &ip6_rt_gc_min_interval,
2230                 .maxlen         =       sizeof(int),
2231                 .mode           =       0644,
2232                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2233                 .strategy       =       &sysctl_ms_jiffies,
2234         },
2235         { .ctl_name = 0 }
2236 };
2237
2238 #endif
2239
2240 void __init ip6_route_init(void)
2241 {
2242         struct proc_dir_entry *p;
2243
2244         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2245                                                      sizeof(struct rt6_info),
2246                                                      0, SLAB_HWCACHE_ALIGN,
2247                                                      NULL, NULL);
2248         if (!ip6_dst_ops.kmem_cachep)
2249                 panic("cannot create ip6_dst_cache");
2250
2251         fib6_init();
2252 #ifdef  CONFIG_PROC_FS
2253         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2254         if (p)
2255                 p->owner = THIS_MODULE;
2256
2257         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2258 #endif
2259 #ifdef CONFIG_XFRM
2260         xfrm6_init();
2261 #endif
2262 }
2263
2264 void ip6_route_cleanup(void)
2265 {
2266 #ifdef CONFIG_PROC_FS
2267         proc_net_remove("ipv6_route");
2268         proc_net_remove("rt6_stats");
2269 #endif
2270 #ifdef CONFIG_XFRM
2271         xfrm6_fini();
2272 #endif
2273         rt6_ifdown(NULL);
2274         fib6_gc_cleanup();
2275         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2276 }