Merge master.kernel.org:/home/rmk/linux-2.6-serial
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  A value of 3 or more turns RDBG() and
 * RT6_TRACE() into printk() calls; any lower value compiles them away.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a next hop (or are RTF_NONEXTHOP) via rt6_alloc_clone();
 * when zero such routes are used directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits passed in the "strict" argument of rt6_select()/rt6_score_route(). */
#define RT6_SELECT_F_IFACE      0x1
#define RT6_SELECT_F_REACHABLE  0x2
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
/*
 * Reader/writer lock protecting the whole IPv6 FIB (and, further down,
 * ndisc_dst_gc_list).
 */
DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a
 * NUD_VALID state, send it a Neighbour Solicitation, at most once per
 * rtr_probe_interval of the route's inet6_dev.
 *
 * @rt may be NULL (rt6_select() passes its not-yet-set best match); in
 * that case, or when the route has no next hop, nothing happens.
 *
 * The rate-limit check and the update of neigh->updated are done under
 * the neighbour's lock taken for WRITING.  Holding it only for reading
 * while storing to neigh->updated let concurrent callers all pass the
 * time check and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe = 0;

	/* Cheap unlocked test first; re-checked under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	/* Solicit the router on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (neigh) {
284                 read_lock_bh(&neigh->lock);
285                 if (neigh->nud_state & NUD_VALID)
286                         m = 1;
287                 read_unlock_bh(&neigh->lock);
288         }
289         return m;
290 }
291
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293                            int strict)
294 {
295         int m = rt6_check_dev(rt, oif);
296         if (!m && (strict & RT6_SELECT_F_IFACE))
297                 return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301         if (rt6_check_neigh(rt))
302                 m |= 16;
303         else if (strict & RT6_SELECT_F_REACHABLE)
304                 return -1;
305         return m;
306 }
307
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
309                                    int strict)
310 {
311         struct rt6_info *match = NULL, *last = NULL;
312         struct rt6_info *rt, *rt0 = *head;
313         u32 metric;
314         int mpri = -1;
315
316         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317                   __FUNCTION__, head, head ? *head : NULL, oif);
318
319         for (rt = rt0, metric = rt0->rt6i_metric;
320              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
321              rt = rt->u.next) {
322                 int m;
323
324                 if (rt6_check_expired(rt))
325                         continue;
326
327                 last = rt;
328
329                 m = rt6_score_route(rt, oif, strict);
330                 if (m < 0)
331                         continue;
332
333                 if (m > mpri) {
334                         rt6_probe(match);
335                         match = rt;
336                         mpri = m;
337                 } else {
338                         rt6_probe(rt);
339                 }
340         }
341
342         if (!match &&
343             (strict & RT6_SELECT_F_REACHABLE) &&
344             last && last != rt0) {
345                 /* no entries matched; do round-robin */
346                 static spinlock_t lock = SPIN_LOCK_UNLOCKED;
347                 spin_lock(&lock);
348                 *head = rt0->u.next;
349                 rt0->u.next = last->u.next;
350                 last->u.next = rt0;
351                 spin_unlock(&lock);
352         }
353
354         RT6_TRACE("%s() => %p, score=%d\n",
355                   __FUNCTION__, match, mpri);
356
357         return (match ? match : &ip6_null_entry);
358 }
359
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information Option received in a Router
 * Advertisement (RFC 4191).
 *
 * @dev:    interface the advertisement arrived on
 * @opt:    start of the option (viewed as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Adds, refreshes or (for a zero lifetime) deletes the matching route.
 * Returns 0 on success or -EINVAL if the option is malformed.
 *
 * Validation follows RFC 4191 section 2.3: the option Length (in units
 * of 8 octets, header included) must be 1..3, must be 3 when Prefix
 * Length > 64 and at least 2 when Prefix Length > 0.  The earlier, laxer
 * test accepted e.g. Length 2 with Prefix Length 128, which made
 * ipv6_addr_prefix() read prefix bytes beyond the end of the option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/* A negative len must not slip through the unsigned comparison. */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length < 1 || rinfo->length > 3)
		return -EINVAL;
	/* The option must not claim more bytes than were handed to us. */
	if ((rinfo->length << 3) > len)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* Wire format is big-endian: convert network -> host order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Fewer than 16 prefix bytes are present; build a full
		 * address from the prefix_len leading bits.  The checks
		 * above guarantee those bits lie inside the option.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
437
438 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
439                             int oif, int strict)
440 {
441         struct fib6_node *fn;
442         struct rt6_info *rt;
443
444         read_lock_bh(&rt6_lock);
445         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
446         rt = rt6_device_match(fn->leaf, oif, strict);
447         dst_hold(&rt->u.dst);
448         rt->u.dst.__use++;
449         read_unlock_bh(&rt6_lock);
450
451         rt->u.dst.lastuse = jiffies;
452         if (rt->u.dst.error == 0)
453                 return rt;
454         dst_release(&rt->u.dst);
455         return NULL;
456 }
457
458 /* ip6_ins_rt is called with FREE rt6_lock.
459    It takes new route entry, the addition fails by any reason the
460    route is freed. In any case, if caller does not hold it, it may
461    be destroyed.
462  */
463
464 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
465                 void *_rtattr, struct netlink_skb_parms *req)
466 {
467         int err;
468
469         write_lock_bh(&rt6_lock);
470         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
471         write_unlock_bh(&rt6_lock);
472
473         return err;
474 }
475
476 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
477                                       struct in6_addr *saddr)
478 {
479         struct rt6_info *rt;
480
481         /*
482          *      Clone the route.
483          */
484
485         rt = ip6_rt_copy(ort);
486
487         if (rt) {
488                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
489                         if (rt->rt6i_dst.plen != 128 &&
490                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
491                                 rt->rt6i_flags |= RTF_ANYCAST;
492                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
493                 }
494
495                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
496                 rt->rt6i_dst.plen = 128;
497                 rt->rt6i_flags |= RTF_CACHE;
498                 rt->u.dst.flags |= DST_HOST;
499
500 #ifdef CONFIG_IPV6_SUBTREES
501                 if (rt->rt6i_src.plen && saddr) {
502                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
503                         rt->rt6i_src.plen = 128;
504                 }
505 #endif
506
507                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
508
509         }
510
511         return rt;
512 }
513
514 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
515 {
516         struct rt6_info *rt = ip6_rt_copy(ort);
517         if (rt) {
518                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
519                 rt->rt6i_dst.plen = 128;
520                 rt->rt6i_flags |= RTF_CACHE;
521                 if (rt->rt6i_flags & RTF_REJECT)
522                         rt->u.dst.error = ort->u.dst.error;
523                 rt->u.dst.flags |= DST_HOST;
524                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
525         }
526         return rt;
527 }
528
/*
 * BACKTRACK() - retry a failed route selection higher up the FIB tree.
 *
 * Expects the enclosing function to have local variables "rt" and "fn"
 * and labels "out" and "restart".  When rt is the null entry it walks fn
 * up through its parents: reaching a root node jumps to "out", reaching
 * a node that carries route info jumps to "restart" with fn pointing at
 * that node.  If the walk runs off the top, fn is left NULL and
 * execution continues after the macro.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single
 * statement and cannot capture a following "else".
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
539
540
541 void ip6_route_input(struct sk_buff *skb)
542 {
543         struct fib6_node *fn;
544         struct rt6_info *rt, *nrt;
545         int strict;
546         int attempts = 3;
547         int err;
548         int reachable = RT6_SELECT_F_REACHABLE;
549
550         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
551
552 relookup:
553         read_lock_bh(&rt6_lock);
554
555 restart_2:
556         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
557                          &skb->nh.ipv6h->saddr);
558
559 restart:
560         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
561         BACKTRACK();
562         if (rt == &ip6_null_entry ||
563             rt->rt6i_flags & RTF_CACHE)
564                 goto out;
565
566         dst_hold(&rt->u.dst);
567         read_unlock_bh(&rt6_lock);
568
569         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
570                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
571         else {
572 #if CLONE_OFFLINK_ROUTE
573                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
574 #else
575                 goto out2;
576 #endif
577         }
578
579         dst_release(&rt->u.dst);
580         rt = nrt ? : &ip6_null_entry;
581
582         dst_hold(&rt->u.dst);
583         if (nrt) {
584                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
585                 if (!err)
586                         goto out2;
587         }
588
589         if (--attempts <= 0)
590                 goto out2;
591
592         /*
593          * Race condition! In the gap, when rt6_lock was
594          * released someone could insert this route.  Relookup.
595          */
596         dst_release(&rt->u.dst);
597         goto relookup;
598
599 out:
600         if (reachable) {
601                 reachable = 0;
602                 goto restart_2;
603         }
604         dst_hold(&rt->u.dst);
605         read_unlock_bh(&rt6_lock);
606 out2:
607         rt->u.dst.lastuse = jiffies;
608         rt->u.dst.__use++;
609         skb->dst = (struct dst_entry *) rt;
610         return;
611 }
612
613 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
614 {
615         struct fib6_node *fn;
616         struct rt6_info *rt, *nrt;
617         int strict;
618         int attempts = 3;
619         int err;
620         int reachable = RT6_SELECT_F_REACHABLE;
621
622         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
623
624 relookup:
625         read_lock_bh(&rt6_lock);
626
627 restart_2:
628         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
629
630 restart:
631         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
632         BACKTRACK();
633         if (rt == &ip6_null_entry ||
634             rt->rt6i_flags & RTF_CACHE)
635                 goto out;
636
637         dst_hold(&rt->u.dst);
638         read_unlock_bh(&rt6_lock);
639
640         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
641                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
642         else {
643 #if CLONE_OFFLINK_ROUTE
644                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
645 #else
646                 goto out2;
647 #endif
648         }
649
650         dst_release(&rt->u.dst);
651         rt = nrt ? : &ip6_null_entry;
652
653         dst_hold(&rt->u.dst);
654         if (nrt) {
655                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
656                 if (!err)
657                         goto out2;
658         }
659
660         if (--attempts <= 0)
661                 goto out2;
662
663         /*
664          * Race condition! In the gap, when rt6_lock was
665          * released someone could insert this route.  Relookup.
666          */
667         dst_release(&rt->u.dst);
668         goto relookup;
669
670 out:
671         if (reachable) {
672                 reachable = 0;
673                 goto restart_2;
674         }
675         dst_hold(&rt->u.dst);
676         read_unlock_bh(&rt6_lock);
677 out2:
678         rt->u.dst.lastuse = jiffies;
679         rt->u.dst.__use++;
680         return &rt->u.dst;
681 }
682
683
684 /*
685  *      Destination cache support functions
686  */
687
688 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
689 {
690         struct rt6_info *rt;
691
692         rt = (struct rt6_info *) dst;
693
694         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
695                 return dst;
696
697         return NULL;
698 }
699
700 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
701 {
702         struct rt6_info *rt = (struct rt6_info *) dst;
703
704         if (rt) {
705                 if (rt->rt6i_flags & RTF_CACHE)
706                         ip6_del_rt(rt, NULL, NULL, NULL);
707                 else
708                         dst_release(dst);
709         }
710         return NULL;
711 }
712
713 static void ip6_link_failure(struct sk_buff *skb)
714 {
715         struct rt6_info *rt;
716
717         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
718
719         rt = (struct rt6_info *) skb->dst;
720         if (rt) {
721                 if (rt->rt6i_flags&RTF_CACHE) {
722                         dst_set_expires(&rt->u.dst, 0);
723                         rt->rt6i_flags |= RTF_EXPIRES;
724                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
725                         rt->rt6i_node->fn_sernum = -1;
726         }
727 }
728
729 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
730 {
731         struct rt6_info *rt6 = (struct rt6_info*)dst;
732
733         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
734                 rt6->rt6i_flags |= RTF_MODIFIED;
735                 if (mtu < IPV6_MIN_MTU) {
736                         mtu = IPV6_MIN_MTU;
737                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
738                 }
739                 dst->metrics[RTAX_MTU-1] = mtu;
740         }
741 }
742
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc().  Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

/* Forward declaration; used by ndisc_dst_alloc() below. */
static int ipv6_get_mtu(struct net_device *dev);
746
747 static inline unsigned int ipv6_advmss(unsigned int mtu)
748 {
749         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
750
751         if (mtu < ip6_rt_min_advmss)
752                 mtu = ip6_rt_min_advmss;
753
754         /*
755          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
756          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
757          * IPV6_MAXPLEN is also valid and means: "any MSS, 
758          * rely only on pmtu discovery"
759          */
760         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
761                 mtu = IPV6_MAXPLEN;
762         return mtu;
763 }
764
765 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
766                                   struct neighbour *neigh,
767                                   struct in6_addr *addr,
768                                   int (*output)(struct sk_buff *))
769 {
770         struct rt6_info *rt;
771         struct inet6_dev *idev = in6_dev_get(dev);
772
773         if (unlikely(idev == NULL))
774                 return NULL;
775
776         rt = ip6_dst_alloc();
777         if (unlikely(rt == NULL)) {
778                 in6_dev_put(idev);
779                 goto out;
780         }
781
782         dev_hold(dev);
783         if (neigh)
784                 neigh_hold(neigh);
785         else
786                 neigh = ndisc_get_neigh(dev, addr);
787
788         rt->rt6i_dev      = dev;
789         rt->rt6i_idev     = idev;
790         rt->rt6i_nexthop  = neigh;
791         atomic_set(&rt->u.dst.__refcnt, 1);
792         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
793         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
794         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
795         rt->u.dst.output  = output;
796
797 #if 0   /* there's no chance to use these for ndisc */
798         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
799                                 ? DST_HOST 
800                                 : 0;
801         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
802         rt->rt6i_dst.plen = 128;
803 #endif
804
805         write_lock_bh(&rt6_lock);
806         rt->u.dst.next = ndisc_dst_gc_list;
807         ndisc_dst_gc_list = &rt->u.dst;
808         write_unlock_bh(&rt6_lock);
809
810         fib6_force_start_gc();
811
812 out:
813         return (struct dst_entry *)rt;
814 }
815
816 int ndisc_dst_gc(int *more)
817 {
818         struct dst_entry *dst, *next, **pprev;
819         int freed;
820
821         next = NULL;
822         pprev = &ndisc_dst_gc_list;
823         freed = 0;
824         while ((dst = *pprev) != NULL) {
825                 if (!atomic_read(&dst->__refcnt)) {
826                         *pprev = dst->next;
827                         dst_free(dst);
828                         freed++;
829                 } else {
830                         pprev = &dst->next;
831                         (*more)++;
832                 }
833         }
834
835         return freed;
836 }
837
838 static int ip6_dst_gc(void)
839 {
840         static unsigned expire = 30*HZ;
841         static unsigned long last_gc;
842         unsigned long now = jiffies;
843
844         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
845             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
846                 goto out;
847
848         expire++;
849         fib6_run_gc(expire);
850         last_gc = now;
851         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
852                 expire = ip6_rt_gc_timeout>>1;
853
854 out:
855         expire -= expire>>ip6_rt_gc_elasticity;
856         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
857 }
858
859 /* Clean host part of a prefix. Not necessary in radix tree,
860    but results in cleaner routing tables.
861
862    Remove it only when all the things will work!
863  */
864
865 static int ipv6_get_mtu(struct net_device *dev)
866 {
867         int mtu = IPV6_MIN_MTU;
868         struct inet6_dev *idev;
869
870         idev = in6_dev_get(dev);
871         if (idev) {
872                 mtu = idev->cnf.mtu6;
873                 in6_dev_put(idev);
874         }
875         return mtu;
876 }
877
878 int ipv6_get_hoplimit(struct net_device *dev)
879 {
880         int hoplimit = ipv6_devconf.hop_limit;
881         struct inet6_dev *idev;
882
883         idev = in6_dev_get(dev);
884         if (idev) {
885                 hoplimit = idev->cnf.hop_limit;
886                 in6_dev_put(idev);
887         }
888         return hoplimit;
889 }
890
891 /*
892  *
893  */
894
895 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
896                 void *_rtattr, struct netlink_skb_parms *req)
897 {
898         int err;
899         struct rtmsg *r;
900         struct rtattr **rta;
901         struct rt6_info *rt = NULL;
902         struct net_device *dev = NULL;
903         struct inet6_dev *idev = NULL;
904         int addr_type;
905
906         rta = (struct rtattr **) _rtattr;
907
908         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
909                 return -EINVAL;
910 #ifndef CONFIG_IPV6_SUBTREES
911         if (rtmsg->rtmsg_src_len)
912                 return -EINVAL;
913 #endif
914         if (rtmsg->rtmsg_ifindex) {
915                 err = -ENODEV;
916                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
917                 if (!dev)
918                         goto out;
919                 idev = in6_dev_get(dev);
920                 if (!idev)
921                         goto out;
922         }
923
924         if (rtmsg->rtmsg_metric == 0)
925                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
926
927         rt = ip6_dst_alloc();
928
929         if (rt == NULL) {
930                 err = -ENOMEM;
931                 goto out;
932         }
933
934         rt->u.dst.obsolete = -1;
935         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
936         if (nlh && (r = NLMSG_DATA(nlh))) {
937                 rt->rt6i_protocol = r->rtm_protocol;
938         } else {
939                 rt->rt6i_protocol = RTPROT_BOOT;
940         }
941
942         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
943
944         if (addr_type & IPV6_ADDR_MULTICAST)
945                 rt->u.dst.input = ip6_mc_input;
946         else
947                 rt->u.dst.input = ip6_forward;
948
949         rt->u.dst.output = ip6_output;
950
951         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
952                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
953         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
954         if (rt->rt6i_dst.plen == 128)
955                rt->u.dst.flags = DST_HOST;
956
957 #ifdef CONFIG_IPV6_SUBTREES
958         ipv6_addr_prefix(&rt->rt6i_src.addr, 
959                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
960         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
961 #endif
962
963         rt->rt6i_metric = rtmsg->rtmsg_metric;
964
965         /* We cannot add true routes via loopback here,
966            they would result in kernel looping; promote them to reject routes
967          */
968         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
969             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
970                 /* hold loopback dev/idev if we haven't done so. */
971                 if (dev != &loopback_dev) {
972                         if (dev) {
973                                 dev_put(dev);
974                                 in6_dev_put(idev);
975                         }
976                         dev = &loopback_dev;
977                         dev_hold(dev);
978                         idev = in6_dev_get(dev);
979                         if (!idev) {
980                                 err = -ENODEV;
981                                 goto out;
982                         }
983                 }
984                 rt->u.dst.output = ip6_pkt_discard_out;
985                 rt->u.dst.input = ip6_pkt_discard;
986                 rt->u.dst.error = -ENETUNREACH;
987                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
988                 goto install_route;
989         }
990
991         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
992                 struct in6_addr *gw_addr;
993                 int gwa_type;
994
995                 gw_addr = &rtmsg->rtmsg_gateway;
996                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
997                 gwa_type = ipv6_addr_type(gw_addr);
998
999                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1000                         struct rt6_info *grt;
1001
1002                         /* IPv6 strictly inhibits using not link-local
1003                            addresses as nexthop address.
1004                            Otherwise, router will not able to send redirects.
1005                            It is very good, but in some (rare!) circumstances
1006                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1007                            some exceptions. --ANK
1008                          */
1009                         err = -EINVAL;
1010                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1011                                 goto out;
1012
1013                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1014
1015                         err = -EHOSTUNREACH;
1016                         if (grt == NULL)
1017                                 goto out;
1018                         if (dev) {
1019                                 if (dev != grt->rt6i_dev) {
1020                                         dst_release(&grt->u.dst);
1021                                         goto out;
1022                                 }
1023                         } else {
1024                                 dev = grt->rt6i_dev;
1025                                 idev = grt->rt6i_idev;
1026                                 dev_hold(dev);
1027                                 in6_dev_hold(grt->rt6i_idev);
1028                         }
1029                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1030                                 err = 0;
1031                         dst_release(&grt->u.dst);
1032
1033                         if (err)
1034                                 goto out;
1035                 }
1036                 err = -EINVAL;
1037                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1038                         goto out;
1039         }
1040
1041         err = -ENODEV;
1042         if (dev == NULL)
1043                 goto out;
1044
1045         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1046                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1047                 if (IS_ERR(rt->rt6i_nexthop)) {
1048                         err = PTR_ERR(rt->rt6i_nexthop);
1049                         rt->rt6i_nexthop = NULL;
1050                         goto out;
1051                 }
1052         }
1053
1054         rt->rt6i_flags = rtmsg->rtmsg_flags;
1055
1056 install_route:
1057         if (rta && rta[RTA_METRICS-1]) {
1058                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1059                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1060
1061                 while (RTA_OK(attr, attrlen)) {
1062                         unsigned flavor = attr->rta_type;
1063                         if (flavor) {
1064                                 if (flavor > RTAX_MAX) {
1065                                         err = -EINVAL;
1066                                         goto out;
1067                                 }
1068                                 rt->u.dst.metrics[flavor-1] =
1069                                         *(u32 *)RTA_DATA(attr);
1070                         }
1071                         attr = RTA_NEXT(attr, attrlen);
1072                 }
1073         }
1074
1075         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1076                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1077         if (!rt->u.dst.metrics[RTAX_MTU-1])
1078                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1079         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1080                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1081         rt->u.dst.dev = dev;
1082         rt->rt6i_idev = idev;
1083         return ip6_ins_rt(rt, nlh, _rtattr, req);
1084
1085 out:
1086         if (dev)
1087                 dev_put(dev);
1088         if (idev)
1089                 in6_dev_put(idev);
1090         if (rt)
1091                 dst_free((struct dst_entry *) rt);
1092         return err;
1093 }
1094
1095 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1096 {
1097         int err;
1098
1099         write_lock_bh(&rt6_lock);
1100
1101         err = fib6_del(rt, nlh, _rtattr, req);
1102         dst_release(&rt->u.dst);
1103
1104         write_unlock_bh(&rt6_lock);
1105
1106         return err;
1107 }
1108
1109 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1110 {
1111         struct fib6_node *fn;
1112         struct rt6_info *rt;
1113         int err = -ESRCH;
1114
1115         read_lock_bh(&rt6_lock);
1116
1117         fn = fib6_locate(&ip6_routing_table,
1118                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1119                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1120         
1121         if (fn) {
1122                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1123                         if (rtmsg->rtmsg_ifindex &&
1124                             (rt->rt6i_dev == NULL ||
1125                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1126                                 continue;
1127                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1128                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1129                                 continue;
1130                         if (rtmsg->rtmsg_metric &&
1131                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1132                                 continue;
1133                         dst_hold(&rt->u.dst);
1134                         read_unlock_bh(&rt6_lock);
1135
1136                         return ip6_del_rt(rt, nlh, _rtattr, req);
1137                 }
1138         }
1139         read_unlock_bh(&rt6_lock);
1140
1141         return err;
1142 }
1143
1144 /*
1145  *      Handle redirects
1146  */
1147 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1148                   struct neighbour *neigh, u8 *lladdr, int on_link)
1149 {
1150         struct rt6_info *rt, *nrt = NULL;
1151         int strict;
1152         struct fib6_node *fn;
1153
1154         /*
1155          * Get the "current" route for this destination and
1156          * check if the redirect has come from approriate router.
1157          *
1158          * RFC 2461 specifies that redirects should only be
1159          * accepted if they come from the nexthop to the target.
1160          * Due to the way the routes are chosen, this notion
1161          * is a bit fuzzy and one might need to check all possible
1162          * routes.
1163          */
1164         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1165
1166         read_lock_bh(&rt6_lock);
1167         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1168 restart:
1169         for (rt = fn->leaf; rt; rt = rt->u.next) {
1170                 /*
1171                  * Current route is on-link; redirect is always invalid.
1172                  *
1173                  * Seems, previous statement is not true. It could
1174                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1175                  * But then router serving it might decide, that we should
1176                  * know truth 8)8) --ANK (980726).
1177                  */
1178                 if (rt6_check_expired(rt))
1179                         continue;
1180                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1181                         continue;
1182                 if (neigh->dev != rt->rt6i_dev)
1183                         continue;
1184                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1185                         continue;
1186                 break;
1187         }
1188         if (rt)
1189                 dst_hold(&rt->u.dst);
1190         else if (strict) {
1191                 while ((fn = fn->parent) != NULL) {
1192                         if (fn->fn_flags & RTN_ROOT)
1193                                 break;
1194                         if (fn->fn_flags & RTN_RTINFO)
1195                                 goto restart;
1196                 }
1197         }
1198         read_unlock_bh(&rt6_lock);
1199
1200         if (!rt) {
1201                 if (net_ratelimit())
1202                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1203                                "for redirect target\n");
1204                 return;
1205         }
1206
1207         /*
1208          *      We have finally decided to accept it.
1209          */
1210
1211         neigh_update(neigh, lladdr, NUD_STALE, 
1212                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1213                      NEIGH_UPDATE_F_OVERRIDE|
1214                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1215                                      NEIGH_UPDATE_F_ISROUTER))
1216                      );
1217
1218         /*
1219          * Redirect received -> path was valid.
1220          * Look, redirects are sent only in response to data packets,
1221          * so that this nexthop apparently is reachable. --ANK
1222          */
1223         dst_confirm(&rt->u.dst);
1224
1225         /* Duplicate redirect: silently ignore. */
1226         if (neigh == rt->u.dst.neighbour)
1227                 goto out;
1228
1229         nrt = ip6_rt_copy(rt);
1230         if (nrt == NULL)
1231                 goto out;
1232
1233         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1234         if (on_link)
1235                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1236
1237         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1238         nrt->rt6i_dst.plen = 128;
1239         nrt->u.dst.flags |= DST_HOST;
1240
1241         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1242         nrt->rt6i_nexthop = neigh_clone(neigh);
1243         /* Reset pmtu, it may be better */
1244         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1245         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1246
1247         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1248                 goto out;
1249
1250         if (rt->rt6i_flags&RTF_CACHE) {
1251                 ip6_del_rt(rt, NULL, NULL, NULL);
1252                 return;
1253         }
1254
1255 out:
1256         dst_release(&rt->u.dst);
1257         return;
1258 }
1259
1260 /*
1261  *      Handle ICMP "packet too big" messages
1262  *      i.e. Path MTU discovery
1263  */
1264
1265 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1266                         struct net_device *dev, u32 pmtu)
1267 {
1268         struct rt6_info *rt, *nrt;
1269         int allfrag = 0;
1270
1271         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1272         if (rt == NULL)
1273                 return;
1274
1275         if (pmtu >= dst_mtu(&rt->u.dst))
1276                 goto out;
1277
1278         if (pmtu < IPV6_MIN_MTU) {
1279                 /*
1280                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1281                  * MTU (1280) and a fragment header should always be included
1282                  * after a node receiving Too Big message reporting PMTU is
1283                  * less than the IPv6 Minimum Link MTU.
1284                  */
1285                 pmtu = IPV6_MIN_MTU;
1286                 allfrag = 1;
1287         }
1288
1289         /* New mtu received -> path was valid.
1290            They are sent only in response to data packets,
1291            so that this nexthop apparently is reachable. --ANK
1292          */
1293         dst_confirm(&rt->u.dst);
1294
1295         /* Host route. If it is static, it would be better
1296            not to override it, but add new one, so that
1297            when cache entry will expire old pmtu
1298            would return automatically.
1299          */
1300         if (rt->rt6i_flags & RTF_CACHE) {
1301                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1302                 if (allfrag)
1303                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1304                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1305                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1306                 goto out;
1307         }
1308
1309         /* Network route.
1310            Two cases are possible:
1311            1. It is connected route. Action: COW
1312            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1313          */
1314         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1315                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1316         else
1317                 nrt = rt6_alloc_clone(rt, daddr);
1318
1319         if (nrt) {
1320                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1321                 if (allfrag)
1322                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1323
1324                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1325                  * happened within 5 mins, the recommended timer is 10 mins.
1326                  * Here this route expiration time is set to ip6_rt_mtu_expires
1327                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1328                  * and detecting PMTU increase will be automatically happened.
1329                  */
1330                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1331                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1332
1333                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1334         }
1335 out:
1336         dst_release(&rt->u.dst);
1337 }
1338
1339 /*
1340  *      Misc support functions
1341  */
1342
1343 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1344 {
1345         struct rt6_info *rt = ip6_dst_alloc();
1346
1347         if (rt) {
1348                 rt->u.dst.input = ort->u.dst.input;
1349                 rt->u.dst.output = ort->u.dst.output;
1350
1351                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1352                 rt->u.dst.dev = ort->u.dst.dev;
1353                 if (rt->u.dst.dev)
1354                         dev_hold(rt->u.dst.dev);
1355                 rt->rt6i_idev = ort->rt6i_idev;
1356                 if (rt->rt6i_idev)
1357                         in6_dev_hold(rt->rt6i_idev);
1358                 rt->u.dst.lastuse = jiffies;
1359                 rt->rt6i_expires = 0;
1360
1361                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1362                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1363                 rt->rt6i_metric = 0;
1364
1365                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1366 #ifdef CONFIG_IPV6_SUBTREES
1367                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1368 #endif
1369         }
1370         return rt;
1371 }
1372
1373 #ifdef CONFIG_IPV6_ROUTE_INFO
1374 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1375                                            struct in6_addr *gwaddr, int ifindex)
1376 {
1377         struct fib6_node *fn;
1378         struct rt6_info *rt = NULL;
1379
1380         write_lock_bh(&rt6_lock);
1381         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1382         if (!fn)
1383                 goto out;
1384
1385         for (rt = fn->leaf; rt; rt = rt->u.next) {
1386                 if (rt->rt6i_dev->ifindex != ifindex)
1387                         continue;
1388                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1389                         continue;
1390                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1391                         continue;
1392                 dst_hold(&rt->u.dst);
1393                 break;
1394         }
1395 out:
1396         write_unlock_bh(&rt6_lock);
1397         return rt;
1398 }
1399
1400 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1401                                            struct in6_addr *gwaddr, int ifindex,
1402                                            unsigned pref)
1403 {
1404         struct in6_rtmsg rtmsg;
1405
1406         memset(&rtmsg, 0, sizeof(rtmsg));
1407         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1408         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1409         rtmsg.rtmsg_dst_len = prefixlen;
1410         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1411         rtmsg.rtmsg_metric = 1024;
1412         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1413         /* We should treat it as a default route if prefix length is 0. */
1414         if (!prefixlen)
1415                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1416         rtmsg.rtmsg_ifindex = ifindex;
1417
1418         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1419
1420         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1421 }
1422 #endif
1423
1424 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1425 {       
1426         struct rt6_info *rt;
1427         struct fib6_node *fn;
1428
1429         fn = &ip6_routing_table;
1430
1431         write_lock_bh(&rt6_lock);
1432         for (rt = fn->leaf; rt; rt=rt->u.next) {
1433                 if (dev == rt->rt6i_dev &&
1434                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1435                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1436                         break;
1437         }
1438         if (rt)
1439                 dst_hold(&rt->u.dst);
1440         write_unlock_bh(&rt6_lock);
1441         return rt;
1442 }
1443
1444 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1445                                      struct net_device *dev,
1446                                      unsigned int pref)
1447 {
1448         struct in6_rtmsg rtmsg;
1449
1450         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1451         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1452         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1453         rtmsg.rtmsg_metric = 1024;
1454         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1455                             RTF_PREF(pref);
1456
1457         rtmsg.rtmsg_ifindex = dev->ifindex;
1458
1459         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1460         return rt6_get_dflt_router(gwaddr, dev);
1461 }
1462
1463 void rt6_purge_dflt_routers(void)
1464 {
1465         struct rt6_info *rt;
1466
1467 restart:
1468         read_lock_bh(&rt6_lock);
1469         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1470                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1471                         dst_hold(&rt->u.dst);
1472
1473                         read_unlock_bh(&rt6_lock);
1474
1475                         ip6_del_rt(rt, NULL, NULL, NULL);
1476
1477                         goto restart;
1478                 }
1479         }
1480         read_unlock_bh(&rt6_lock);
1481 }
1482
1483 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1484 {
1485         struct in6_rtmsg rtmsg;
1486         int err;
1487
1488         switch(cmd) {
1489         case SIOCADDRT:         /* Add a route */
1490         case SIOCDELRT:         /* Delete a route */
1491                 if (!capable(CAP_NET_ADMIN))
1492                         return -EPERM;
1493                 err = copy_from_user(&rtmsg, arg,
1494                                      sizeof(struct in6_rtmsg));
1495                 if (err)
1496                         return -EFAULT;
1497                         
1498                 rtnl_lock();
1499                 switch (cmd) {
1500                 case SIOCADDRT:
1501                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1502                         break;
1503                 case SIOCDELRT:
1504                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1505                         break;
1506                 default:
1507                         err = -EINVAL;
1508                 }
1509                 rtnl_unlock();
1510
1511                 return err;
1512         };
1513
1514         return -EINVAL;
1515 }
1516
1517 /*
1518  *      Drop the packet on the floor
1519  */
1520
1521 static int ip6_pkt_discard(struct sk_buff *skb)
1522 {
1523         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1524         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1525         kfree_skb(skb);
1526         return 0;
1527 }
1528
1529 static int ip6_pkt_discard_out(struct sk_buff *skb)
1530 {
1531         skb->dev = skb->dst->dev;
1532         return ip6_pkt_discard(skb);
1533 }
1534
1535 /*
1536  *      Allocate a dst for local (unicast / anycast) address.
1537  */
1538
1539 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1540                                     const struct in6_addr *addr,
1541                                     int anycast)
1542 {
1543         struct rt6_info *rt = ip6_dst_alloc();
1544
1545         if (rt == NULL)
1546                 return ERR_PTR(-ENOMEM);
1547
1548         dev_hold(&loopback_dev);
1549         in6_dev_hold(idev);
1550
1551         rt->u.dst.flags = DST_HOST;
1552         rt->u.dst.input = ip6_input;
1553         rt->u.dst.output = ip6_output;
1554         rt->rt6i_dev = &loopback_dev;
1555         rt->rt6i_idev = idev;
1556         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1557         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1558         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1559         rt->u.dst.obsolete = -1;
1560
1561         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1562         if (anycast)
1563                 rt->rt6i_flags |= RTF_ANYCAST;
1564         else
1565                 rt->rt6i_flags |= RTF_LOCAL;
1566         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1567         if (rt->rt6i_nexthop == NULL) {
1568                 dst_free((struct dst_entry *) rt);
1569                 return ERR_PTR(-ENOMEM);
1570         }
1571
1572         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1573         rt->rt6i_dst.plen = 128;
1574
1575         atomic_set(&rt->u.dst.__refcnt, 1);
1576
1577         return rt;
1578 }
1579
1580 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1581 {
1582         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1583             rt != &ip6_null_entry) {
1584                 RT6_TRACE("deleted by ifdown %p\n", rt);
1585                 return -1;
1586         }
1587         return 0;
1588 }
1589
1590 void rt6_ifdown(struct net_device *dev)
1591 {
1592         write_lock_bh(&rt6_lock);
1593         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1594         write_unlock_bh(&rt6_lock);
1595 }
1596
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1602
1603 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1604 {
1605         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1606         struct inet6_dev *idev;
1607
1608         /* In IPv6 pmtu discovery is not optional,
1609            so that RTAX_MTU lock cannot disable it.
1610            We still use this lock to block changes
1611            caused by addrconf/ndisc.
1612         */
1613
1614         idev = __in6_dev_get(arg->dev);
1615         if (idev == NULL)
1616                 return 0;
1617
1618         /* For administrative MTU increase, there is no way to discover
1619            IPv6 PMTU increase, so PMTU increase should be updated here.
1620            Since RFC 1981 doesn't include administrative MTU increase
1621            update PMTU increase is a MUST. (i.e. jumbo frame)
1622          */
1623         /*
1624            If new MTU is less than route PMTU, this new MTU will be the
1625            lowest MTU in the path, update the route PMTU to reflect PMTU
1626            decreases; if new MTU is greater than route PMTU, and the
1627            old MTU is the lowest MTU in the path, update the route PMTU
1628            to reflect the increase. In this case if the other nodes' MTU
1629            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1630            PMTU discouvery.
1631          */
1632         if (rt->rt6i_dev == arg->dev &&
1633             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1634             (dst_mtu(&rt->u.dst) > arg->mtu ||
1635              (dst_mtu(&rt->u.dst) < arg->mtu &&
1636               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1637                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1638         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1639         return 0;
1640 }
1641
1642 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1643 {
1644         struct rt6_mtu_change_arg arg;
1645
1646         arg.dev = dev;
1647         arg.mtu = mtu;
1648         read_lock_bh(&rt6_lock);
1649         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1650         read_unlock_bh(&rt6_lock);
1651 }
1652
1653 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1654                               struct in6_rtmsg *rtmsg)
1655 {
1656         memset(rtmsg, 0, sizeof(*rtmsg));
1657
1658         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1659         rtmsg->rtmsg_src_len = r->rtm_src_len;
1660         rtmsg->rtmsg_flags = RTF_UP;
1661         if (r->rtm_type == RTN_UNREACHABLE)
1662                 rtmsg->rtmsg_flags |= RTF_REJECT;
1663
1664         if (rta[RTA_GATEWAY-1]) {
1665                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1666                         return -EINVAL;
1667                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1668                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1669         }
1670         if (rta[RTA_DST-1]) {
1671                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1672                         return -EINVAL;
1673                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1674         }
1675         if (rta[RTA_SRC-1]) {
1676                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1677                         return -EINVAL;
1678                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1679         }
1680         if (rta[RTA_OIF-1]) {
1681                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1682                         return -EINVAL;
1683                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1684         }
1685         if (rta[RTA_PRIORITY-1]) {
1686                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1687                         return -EINVAL;
1688                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1689         }
1690         return 0;
1691 }
1692
1693 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1694 {
1695         struct rtmsg *r = NLMSG_DATA(nlh);
1696         struct in6_rtmsg rtmsg;
1697
1698         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1699                 return -EINVAL;
1700         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1701 }
1702
1703 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1704 {
1705         struct rtmsg *r = NLMSG_DATA(nlh);
1706         struct in6_rtmsg rtmsg;
1707
1708         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1709                 return -EINVAL;
1710         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1711 }
1712
/* Pairs a socket buffer with a netlink callback; presumably the state for a route dump -- the users are outside this chunk. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1718
1719 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1720                          struct in6_addr *dst, struct in6_addr *src,
1721                          int iif, int type, u32 pid, u32 seq,
1722                          int prefix, unsigned int flags)
1723 {
1724         struct rtmsg *rtm;
1725         struct nlmsghdr  *nlh;
1726         unsigned char    *b = skb->tail;
1727         struct rta_cacheinfo ci;
1728
1729         if (prefix) {   /* user wants prefix routes only */
1730                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1731                         /* success since this is not a prefix route */
1732                         return 1;
1733                 }
1734         }
1735
1736         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1737         rtm = NLMSG_DATA(nlh);
1738         rtm->rtm_family = AF_INET6;
1739         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1740         rtm->rtm_src_len = rt->rt6i_src.plen;
1741         rtm->rtm_tos = 0;
1742         rtm->rtm_table = RT_TABLE_MAIN;
1743         if (rt->rt6i_flags&RTF_REJECT)
1744                 rtm->rtm_type = RTN_UNREACHABLE;
1745         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1746                 rtm->rtm_type = RTN_LOCAL;
1747         else
1748                 rtm->rtm_type = RTN_UNICAST;
1749         rtm->rtm_flags = 0;
1750         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1751         rtm->rtm_protocol = rt->rt6i_protocol;
1752         if (rt->rt6i_flags&RTF_DYNAMIC)
1753                 rtm->rtm_protocol = RTPROT_REDIRECT;
1754         else if (rt->rt6i_flags & RTF_ADDRCONF)
1755                 rtm->rtm_protocol = RTPROT_KERNEL;
1756         else if (rt->rt6i_flags&RTF_DEFAULT)
1757                 rtm->rtm_protocol = RTPROT_RA;
1758
1759         if (rt->rt6i_flags&RTF_CACHE)
1760                 rtm->rtm_flags |= RTM_F_CLONED;
1761
1762         if (dst) {
1763                 RTA_PUT(skb, RTA_DST, 16, dst);
1764                 rtm->rtm_dst_len = 128;
1765         } else if (rtm->rtm_dst_len)
1766                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1767 #ifdef CONFIG_IPV6_SUBTREES
1768         if (src) {
1769                 RTA_PUT(skb, RTA_SRC, 16, src);
1770                 rtm->rtm_src_len = 128;
1771         } else if (rtm->rtm_src_len)
1772                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1773 #endif
1774         if (iif)
1775                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1776         else if (dst) {
1777                 struct in6_addr saddr_buf;
1778                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1779                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1780         }
1781         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1782                 goto rtattr_failure;
1783         if (rt->u.dst.neighbour)
1784                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1785         if (rt->u.dst.dev)
1786                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1787         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1788         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1789         if (rt->rt6i_expires)
1790                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1791         else
1792                 ci.rta_expires = 0;
1793         ci.rta_used = rt->u.dst.__use;
1794         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1795         ci.rta_error = rt->u.dst.error;
1796         ci.rta_id = 0;
1797         ci.rta_ts = 0;
1798         ci.rta_tsage = 0;
1799         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1800         nlh->nlmsg_len = skb->tail - b;
1801         return skb->len;
1802
1803 nlmsg_failure:
1804 rtattr_failure:
1805         skb_trim(skb, b - skb->data);
1806         return -1;
1807 }
1808
1809 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1810 {
1811         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1812         int prefix;
1813
1814         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1815                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1816                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1817         } else
1818                 prefix = 0;
1819
1820         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1821                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1822                      prefix, NLM_F_MULTI);
1823 }
1824
1825 static int fib6_dump_node(struct fib6_walker_t *w)
1826 {
1827         int res;
1828         struct rt6_info *rt;
1829
1830         for (rt = w->leaf; rt; rt = rt->u.next) {
1831                 res = rt6_dump_route(rt, w->args);
1832                 if (res < 0) {
1833                         /* Frame is full, suspend walking */
1834                         w->leaf = rt;
1835                         return 1;
1836                 }
1837                 BUG_TRAP(res!=0);
1838         }
1839         w->leaf = NULL;
1840         return 0;
1841 }
1842
1843 static void fib6_dump_end(struct netlink_callback *cb)
1844 {
1845         struct fib6_walker_t *w = (void*)cb->args[0];
1846
1847         if (w) {
1848                 cb->args[0] = 0;
1849                 fib6_walker_unlink(w);
1850                 kfree(w);
1851         }
1852         cb->done = (void*)cb->args[1];
1853         cb->args[1] = 0;
1854 }
1855
1856 static int fib6_dump_done(struct netlink_callback *cb)
1857 {
1858         fib6_dump_end(cb);
1859         return cb->done ? cb->done(cb) : 0;
1860 }
1861
1862 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1863 {
1864         struct rt6_rtnl_dump_arg arg;
1865         struct fib6_walker_t *w;
1866         int res;
1867
1868         arg.skb = skb;
1869         arg.cb = cb;
1870
1871         w = (void*)cb->args[0];
1872         if (w == NULL) {
1873                 /* New dump:
1874                  * 
1875                  * 1. hook callback destructor.
1876                  */
1877                 cb->args[1] = (long)cb->done;
1878                 cb->done = fib6_dump_done;
1879
1880                 /*
1881                  * 2. allocate and initialize walker.
1882                  */
1883                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1884                 if (w == NULL)
1885                         return -ENOMEM;
1886                 RT6_TRACE("dump<%p", w);
1887                 w->root = &ip6_routing_table;
1888                 w->func = fib6_dump_node;
1889                 w->args = &arg;
1890                 cb->args[0] = (long)w;
1891                 read_lock_bh(&rt6_lock);
1892                 res = fib6_walk(w);
1893                 read_unlock_bh(&rt6_lock);
1894         } else {
1895                 w->args = &arg;
1896                 read_lock_bh(&rt6_lock);
1897                 res = fib6_walk_continue(w);
1898                 read_unlock_bh(&rt6_lock);
1899         }
1900 #if RT6_DEBUG >= 3
1901         if (res <= 0 && skb->len == 0)
1902                 RT6_TRACE("%p>dump end\n", w);
1903 #endif
1904         res = res < 0 ? res : skb->len;
1905         /* res < 0 is an error. (really, impossible)
1906            res == 0 means that dump is complete, but skb still can contain data.
1907            res > 0 dump is not complete, but frame is full.
1908          */
1909         /* Destroy walker, if dump of this table is complete. */
1910         if (res <= 0)
1911                 fib6_dump_end(cb);
1912         return res;
1913 }
1914
1915 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1916 {
1917         struct rtattr **rta = arg;
1918         int iif = 0;
1919         int err = -ENOBUFS;
1920         struct sk_buff *skb;
1921         struct flowi fl;
1922         struct rt6_info *rt;
1923
1924         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1925         if (skb == NULL)
1926                 goto out;
1927
1928         /* Reserve room for dummy headers, this skb can pass
1929            through good chunk of routing engine.
1930          */
1931         skb->mac.raw = skb->data;
1932         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1933
1934         memset(&fl, 0, sizeof(fl));
1935         if (rta[RTA_SRC-1])
1936                 ipv6_addr_copy(&fl.fl6_src,
1937                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1938         if (rta[RTA_DST-1])
1939                 ipv6_addr_copy(&fl.fl6_dst,
1940                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1941
1942         if (rta[RTA_IIF-1])
1943                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1944
1945         if (iif) {
1946                 struct net_device *dev;
1947                 dev = __dev_get_by_index(iif);
1948                 if (!dev) {
1949                         err = -ENODEV;
1950                         goto out_free;
1951                 }
1952         }
1953
1954         fl.oif = 0;
1955         if (rta[RTA_OIF-1])
1956                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1957
1958         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1959
1960         skb->dst = &rt->u.dst;
1961
1962         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1963         err = rt6_fill_node(skb, rt, 
1964                             &fl.fl6_dst, &fl.fl6_src,
1965                             iif,
1966                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1967                             nlh->nlmsg_seq, 0, 0);
1968         if (err < 0) {
1969                 err = -EMSGSIZE;
1970                 goto out_free;
1971         }
1972
1973         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1974         if (err > 0)
1975                 err = 0;
1976 out:
1977         return err;
1978 out_free:
1979         kfree_skb(skb);
1980         goto out;       
1981 }
1982
1983 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1984                         struct netlink_skb_parms *req)
1985 {
1986         struct sk_buff *skb;
1987         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1988         u32 pid = current->pid;
1989         u32 seq = 0;
1990
1991         if (req)
1992                 pid = req->pid;
1993         if (nlh)
1994                 seq = nlh->nlmsg_seq;
1995         
1996         skb = alloc_skb(size, gfp_any());
1997         if (!skb) {
1998                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1999                 return;
2000         }
2001         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2002                 kfree_skb(skb);
2003                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2004                 return;
2005         }
2006         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2007         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2008 }
2009
2010 /*
2011  *      /proc
2012  */
2013
2014 #ifdef CONFIG_PROC_FS
2015
/*
 * Length in bytes of one route record as formatted by rt6_info_route():
 * three 32-digit hex addresses, a 4-byte " xx " prefix length after each
 * of the first two, and 46 bytes for the four " %08x" columns, the " %8s"
 * device name and the newline.  rt6_proc_info() relies on it to convert a
 * file offset into a record count.
 * NOTE(review): %8s is a minimum width, so this presumably assumes device
 * names of at most 8 characters -- confirm.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg {
	char *buffer;	/* output buffer the records are formatted into */
	int offset;	/* file offset requested by the reader */
	int length;	/* byte count requested by the reader */
	int skip;	/* number of records skipped so far */
	int len;	/* number of bytes written to buffer so far */
};
2026
2027 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2028 {
2029         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2030         int i;
2031
2032         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2033                 arg->skip++;
2034                 return 0;
2035         }
2036
2037         if (arg->len >= arg->length)
2038                 return 0;
2039
2040         for (i=0; i<16; i++) {
2041                 sprintf(arg->buffer + arg->len, "%02x",
2042                         rt->rt6i_dst.addr.s6_addr[i]);
2043                 arg->len += 2;
2044         }
2045         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2046                             rt->rt6i_dst.plen);
2047
2048 #ifdef CONFIG_IPV6_SUBTREES
2049         for (i=0; i<16; i++) {
2050                 sprintf(arg->buffer + arg->len, "%02x",
2051                         rt->rt6i_src.addr.s6_addr[i]);
2052                 arg->len += 2;
2053         }
2054         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2055                             rt->rt6i_src.plen);
2056 #else
2057         sprintf(arg->buffer + arg->len,
2058                 "00000000000000000000000000000000 00 ");
2059         arg->len += 36;
2060 #endif
2061
2062         if (rt->rt6i_nexthop) {
2063                 for (i=0; i<16; i++) {
2064                         sprintf(arg->buffer + arg->len, "%02x",
2065                                 rt->rt6i_nexthop->primary_key[i]);
2066                         arg->len += 2;
2067                 }
2068         } else {
2069                 sprintf(arg->buffer + arg->len,
2070                         "00000000000000000000000000000000");
2071                 arg->len += 32;
2072         }
2073         arg->len += sprintf(arg->buffer + arg->len,
2074                             " %08x %08x %08x %08x %8s\n",
2075                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2076                             rt->u.dst.__use, rt->rt6i_flags, 
2077                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2078         return 0;
2079 }
2080
2081 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2082 {
2083         struct rt6_proc_arg arg;
2084         arg.buffer = buffer;
2085         arg.offset = offset;
2086         arg.length = length;
2087         arg.skip = 0;
2088         arg.len = 0;
2089
2090         read_lock_bh(&rt6_lock);
2091         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2092         read_unlock_bh(&rt6_lock);
2093
2094         *start = buffer;
2095         if (offset)
2096                 *start += offset % RT6_INFO_LEN;
2097
2098         arg.len -= offset % RT6_INFO_LEN;
2099
2100         if (arg.len > length)
2101                 arg.len = length;
2102         if (arg.len < 0)
2103                 arg.len = 0;
2104
2105         return arg.len;
2106 }
2107
2108 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2109 {
2110         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2111                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2112                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2113                       rt6_stats.fib_rt_cache,
2114                       atomic_read(&ip6_dst_ops.entries),
2115                       rt6_stats.fib_discarded_routes);
2116
2117         return 0;
2118 }
2119
2120 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2121 {
2122         return single_open(file, rt6_stats_seq_show, NULL);
2123 }
2124
2125 static struct file_operations rt6_stats_seq_fops = {
2126         .owner   = THIS_MODULE,
2127         .open    = rt6_stats_seq_open,
2128         .read    = seq_read,
2129         .llseek  = seq_lseek,
2130         .release = single_release,
2131 };
2132 #endif  /* CONFIG_PROC_FS */
2133
2134 #ifdef CONFIG_SYSCTL
2135
2136 static int flush_delay;
2137
2138 static
2139 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2140                               void __user *buffer, size_t *lenp, loff_t *ppos)
2141 {
2142         if (write) {
2143                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2144                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2145                 return 0;
2146         } else
2147                 return -EINVAL;
2148 }
2149
2150 ctl_table ipv6_route_table[] = {
2151         {
2152                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2153                 .procname       =       "flush",
2154                 .data           =       &flush_delay,
2155                 .maxlen         =       sizeof(int),
2156                 .mode           =       0200,
2157                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2158         },
2159         {
2160                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2161                 .procname       =       "gc_thresh",
2162                 .data           =       &ip6_dst_ops.gc_thresh,
2163                 .maxlen         =       sizeof(int),
2164                 .mode           =       0644,
2165                 .proc_handler   =       &proc_dointvec,
2166         },
2167         {
2168                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2169                 .procname       =       "max_size",
2170                 .data           =       &ip6_rt_max_size,
2171                 .maxlen         =       sizeof(int),
2172                 .mode           =       0644,
2173                 .proc_handler   =       &proc_dointvec,
2174         },
2175         {
2176                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2177                 .procname       =       "gc_min_interval",
2178                 .data           =       &ip6_rt_gc_min_interval,
2179                 .maxlen         =       sizeof(int),
2180                 .mode           =       0644,
2181                 .proc_handler   =       &proc_dointvec_jiffies,
2182                 .strategy       =       &sysctl_jiffies,
2183         },
2184         {
2185                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2186                 .procname       =       "gc_timeout",
2187                 .data           =       &ip6_rt_gc_timeout,
2188                 .maxlen         =       sizeof(int),
2189                 .mode           =       0644,
2190                 .proc_handler   =       &proc_dointvec_jiffies,
2191                 .strategy       =       &sysctl_jiffies,
2192         },
2193         {
2194                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2195                 .procname       =       "gc_interval",
2196                 .data           =       &ip6_rt_gc_interval,
2197                 .maxlen         =       sizeof(int),
2198                 .mode           =       0644,
2199                 .proc_handler   =       &proc_dointvec_jiffies,
2200                 .strategy       =       &sysctl_jiffies,
2201         },
2202         {
2203                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2204                 .procname       =       "gc_elasticity",
2205                 .data           =       &ip6_rt_gc_elasticity,
2206                 .maxlen         =       sizeof(int),
2207                 .mode           =       0644,
2208                 .proc_handler   =       &proc_dointvec_jiffies,
2209                 .strategy       =       &sysctl_jiffies,
2210         },
2211         {
2212                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2213                 .procname       =       "mtu_expires",
2214                 .data           =       &ip6_rt_mtu_expires,
2215                 .maxlen         =       sizeof(int),
2216                 .mode           =       0644,
2217                 .proc_handler   =       &proc_dointvec_jiffies,
2218                 .strategy       =       &sysctl_jiffies,
2219         },
2220         {
2221                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2222                 .procname       =       "min_adv_mss",
2223                 .data           =       &ip6_rt_min_advmss,
2224                 .maxlen         =       sizeof(int),
2225                 .mode           =       0644,
2226                 .proc_handler   =       &proc_dointvec_jiffies,
2227                 .strategy       =       &sysctl_jiffies,
2228         },
2229         {
2230                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2231                 .procname       =       "gc_min_interval_ms",
2232                 .data           =       &ip6_rt_gc_min_interval,
2233                 .maxlen         =       sizeof(int),
2234                 .mode           =       0644,
2235                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2236                 .strategy       =       &sysctl_ms_jiffies,
2237         },
2238         { .ctl_name = 0 }
2239 };
2240
2241 #endif
2242
2243 void __init ip6_route_init(void)
2244 {
2245         struct proc_dir_entry *p;
2246
2247         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2248                                                      sizeof(struct rt6_info),
2249                                                      0, SLAB_HWCACHE_ALIGN,
2250                                                      NULL, NULL);
2251         if (!ip6_dst_ops.kmem_cachep)
2252                 panic("cannot create ip6_dst_cache");
2253
2254         fib6_init();
2255 #ifdef  CONFIG_PROC_FS
2256         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2257         if (p)
2258                 p->owner = THIS_MODULE;
2259
2260         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2261 #endif
2262 #ifdef CONFIG_XFRM
2263         xfrm6_init();
2264 #endif
2265 }
2266
2267 void ip6_route_cleanup(void)
2268 {
2269 #ifdef CONFIG_PROC_FS
2270         proc_net_remove("ipv6_route");
2271         proc_net_remove("rt6_stats");
2272 #endif
2273 #ifdef CONFIG_XFRM
2274         xfrm6_fini();
2275 #endif
2276         rt6_ifdown(NULL);
2277         fib6_gc_cleanup();
2278         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2279 }