Pull novell-bugzilla-156426 into release branch
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug level for this file.  Set to 3 to get tracing: at that level
 * RDBG() and RT6_TRACE() expand to printk calls, otherwise both expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a nexthop (or are RTF_NONEXTHOP) via rt6_alloc_clone();
 * when zero they use such routes directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/*
 * Flags passed in the "strict" argument of rt6_select()/rt6_score_route():
 *  - IFACE:     reject routes whose device does not match the requested oif.
 *  - REACHABLE: reject routes for which rt6_check_neigh() returns 0.
 */
#define RT6_SELECT_F_IFACE      0x1
#define RT6_SELECT_F_REACHABLE  0x2
79
/* Entry count above which ip6_dst_gc() always runs and reports pressure. */
static int ip6_rt_max_size = 4096;
/* Minimum spacing (jiffies) between GC runs while below ip6_rt_max_size. */
static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once below gc_thresh. */
static int ip6_rt_gc_timeout = 60*HZ;
/* Not referenced in this part of the file; presumably the periodic GC timer interval. */
int ip6_rt_gc_interval = 30*HZ;
/* Shift applied in ip6_dst_gc() to decay the expiry on every call. */
static int ip6_rt_gc_elasticity = 9;
/* Not referenced in this part of the file; presumably the PMTU entry lifetime. */
static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied to the advertised MSS in ipv6_advmss(). */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the route's nexthop neighbour is not in
 * a NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, stamp the neighbour and send a Neighbour
 * Solicitation to the solicited-node multicast address of its key.
 *
 * The original note still applies: this may not be the right place for
 * it, and the probe MUST be rate-limited to no more than one per minute.
 *
 * A NULL @rt or a route without a nexthop is ignored.
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held - confirm this is intended.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int due;

        if (!rt)
                return;

        neigh = rt->rt6i_nexthop;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        read_lock_bh(&neigh->lock);
        /* re-check the state under the lock before deciding to probe */
        due = !(neigh->nud_state & NUD_VALID) &&
              time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
        if (due)
                neigh->updated = jiffies;
        read_unlock_bh(&neigh->lock);

        if (!due)
                return;

        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (rt->rt6i_flags & RTF_NONEXTHOP ||
284             !(rt->rt6i_flags & RTF_GATEWAY))
285                 m = 1;
286         else if (neigh) {
287                 read_lock_bh(&neigh->lock);
288                 if (neigh->nud_state & NUD_VALID)
289                         m = 2;
290                 read_unlock_bh(&neigh->lock);
291         }
292         return m;
293 }
294
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296                            int strict)
297 {
298         int m, n;
299                 
300         m = rt6_check_dev(rt, oif);
301         if (!m && (strict & RT6_SELECT_F_IFACE))
302                 return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306         n = rt6_check_neigh(rt);
307         if (n > 1)
308                 m |= 16;
309         else if (!n && strict & RT6_SELECT_F_REACHABLE)
310                 return -1;
311         return m;
312 }
313
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315                                    int strict)
316 {
317         struct rt6_info *match = NULL, *last = NULL;
318         struct rt6_info *rt, *rt0 = *head;
319         u32 metric;
320         int mpri = -1;
321
322         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323                   __FUNCTION__, head, head ? *head : NULL, oif);
324
325         for (rt = rt0, metric = rt0->rt6i_metric;
326              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327              rt = rt->u.next) {
328                 int m;
329
330                 if (rt6_check_expired(rt))
331                         continue;
332
333                 last = rt;
334
335                 m = rt6_score_route(rt, oif, strict);
336                 if (m < 0)
337                         continue;
338
339                 if (m > mpri) {
340                         rt6_probe(match);
341                         match = rt;
342                         mpri = m;
343                 } else {
344                         rt6_probe(rt);
345                 }
346         }
347
348         if (!match &&
349             (strict & RT6_SELECT_F_REACHABLE) &&
350             last && last != rt0) {
351                 /* no entries matched; do round-robin */
352                 static spinlock_t lock = SPIN_LOCK_UNLOCKED;
353                 spin_lock(&lock);
354                 *head = rt0->u.next;
355                 rt0->u.next = last->u.next;
356                 last->u.next = rt0;
357                 spin_unlock(&lock);
358         }
359
360         RT6_TRACE("%s() => %p, score=%d\n",
361                   __FUNCTION__, match, mpri);
362
363         return (match ? match : &ip6_null_entry);
364 }
365
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received from a router.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * The option is validated (length field at most 3, prefix length at most
 * 128, and a length field large enough for the prefix length).  A zero
 * lifetime removes an existing matching route; a non-zero lifetime adds
 * the route or refreshes its preference.  A lifetime of 0xffffffff means
 * "never expires"; any other value sets RTF_EXPIRES and the expiry time.
 *
 * Returns 0 on success (including "nothing to do"), -EINVAL for a
 * malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /* an invalid preference value is treated as "medium" */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /* the lifetime arrives in network byte order */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        if (rt && !lifetime) {
                ip6_del_rt(rt, NULL, NULL, NULL);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
443
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445                             int oif, int strict)
446 {
447         struct fib6_node *fn;
448         struct rt6_info *rt;
449
450         read_lock_bh(&rt6_lock);
451         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452         rt = rt6_device_match(fn->leaf, oif, strict);
453         dst_hold(&rt->u.dst);
454         rt->u.dst.__use++;
455         read_unlock_bh(&rt6_lock);
456
457         rt->u.dst.lastuse = jiffies;
458         if (rt->u.dst.error == 0)
459                 return rt;
460         dst_release(&rt->u.dst);
461         return NULL;
462 }
463
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471                 void *_rtattr, struct netlink_skb_parms *req)
472 {
473         int err;
474
475         write_lock_bh(&rt6_lock);
476         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477         write_unlock_bh(&rt6_lock);
478
479         return err;
480 }
481
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483                                       struct in6_addr *saddr)
484 {
485         struct rt6_info *rt;
486
487         /*
488          *      Clone the route.
489          */
490
491         rt = ip6_rt_copy(ort);
492
493         if (rt) {
494                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495                         if (rt->rt6i_dst.plen != 128 &&
496                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497                                 rt->rt6i_flags |= RTF_ANYCAST;
498                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499                 }
500
501                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502                 rt->rt6i_dst.plen = 128;
503                 rt->rt6i_flags |= RTF_CACHE;
504                 rt->u.dst.flags |= DST_HOST;
505
506 #ifdef CONFIG_IPV6_SUBTREES
507                 if (rt->rt6i_src.plen && saddr) {
508                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509                         rt->rt6i_src.plen = 128;
510                 }
511 #endif
512
513                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514
515         }
516
517         return rt;
518 }
519
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522         struct rt6_info *rt = ip6_rt_copy(ort);
523         if (rt) {
524                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525                 rt->rt6i_dst.plen = 128;
526                 rt->rt6i_flags |= RTF_CACHE;
527                 if (rt->rt6i_flags & RTF_REJECT)
528                         rt->u.dst.error = ort->u.dst.error;
529                 rt->u.dst.flags |= DST_HOST;
530                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531         }
532         return rt;
533 }
534
/*
 * Helper for the route lookup loops.  It relies on the caller's local
 * variables "rt" and "fn" and on the labels "out" and "restart".
 *
 * When the selection yielded the null entry, climb towards the root of
 * the tree: reaching a node flagged RTN_ROOT jumps to "out"; reaching a
 * node that carries route information (RTN_RTINFO) jumps to "restart" so
 * the selection is retried there.  Wrapped in do { } while (0) so that it
 * behaves as a single statement.
 */
#define BACKTRACK() \
do { \
        if (rt == &ip6_null_entry) { \
                while ((fn = fn->parent) != NULL) { \
                        if (fn->fn_flags & RTN_ROOT) \
                                goto out; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
545
546
547 void ip6_route_input(struct sk_buff *skb)
548 {
549         struct fib6_node *fn;
550         struct rt6_info *rt, *nrt;
551         int strict;
552         int attempts = 3;
553         int err;
554         int reachable = RT6_SELECT_F_REACHABLE;
555
556         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557
558 relookup:
559         read_lock_bh(&rt6_lock);
560
561 restart_2:
562         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563                          &skb->nh.ipv6h->saddr);
564
565 restart:
566         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567         BACKTRACK();
568         if (rt == &ip6_null_entry ||
569             rt->rt6i_flags & RTF_CACHE)
570                 goto out;
571
572         dst_hold(&rt->u.dst);
573         read_unlock_bh(&rt6_lock);
574
575         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577         else {
578 #if CLONE_OFFLINK_ROUTE
579                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581                 goto out2;
582 #endif
583         }
584
585         dst_release(&rt->u.dst);
586         rt = nrt ? : &ip6_null_entry;
587
588         dst_hold(&rt->u.dst);
589         if (nrt) {
590                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591                 if (!err)
592                         goto out2;
593         }
594
595         if (--attempts <= 0)
596                 goto out2;
597
598         /*
599          * Race condition! In the gap, when rt6_lock was
600          * released someone could insert this route.  Relookup.
601          */
602         dst_release(&rt->u.dst);
603         goto relookup;
604
605 out:
606         if (reachable) {
607                 reachable = 0;
608                 goto restart_2;
609         }
610         dst_hold(&rt->u.dst);
611         read_unlock_bh(&rt6_lock);
612 out2:
613         rt->u.dst.lastuse = jiffies;
614         rt->u.dst.__use++;
615         skb->dst = (struct dst_entry *) rt;
616         return;
617 }
618
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621         struct fib6_node *fn;
622         struct rt6_info *rt, *nrt;
623         int strict;
624         int attempts = 3;
625         int err;
626         int reachable = RT6_SELECT_F_REACHABLE;
627
628         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629
630 relookup:
631         read_lock_bh(&rt6_lock);
632
633 restart_2:
634         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635
636 restart:
637         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638         BACKTRACK();
639         if (rt == &ip6_null_entry ||
640             rt->rt6i_flags & RTF_CACHE)
641                 goto out;
642
643         dst_hold(&rt->u.dst);
644         read_unlock_bh(&rt6_lock);
645
646         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648         else {
649 #if CLONE_OFFLINK_ROUTE
650                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652                 goto out2;
653 #endif
654         }
655
656         dst_release(&rt->u.dst);
657         rt = nrt ? : &ip6_null_entry;
658
659         dst_hold(&rt->u.dst);
660         if (nrt) {
661                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662                 if (!err)
663                         goto out2;
664         }
665
666         if (--attempts <= 0)
667                 goto out2;
668
669         /*
670          * Race condition! In the gap, when rt6_lock was
671          * released someone could insert this route.  Relookup.
672          */
673         dst_release(&rt->u.dst);
674         goto relookup;
675
676 out:
677         if (reachable) {
678                 reachable = 0;
679                 goto restart_2;
680         }
681         dst_hold(&rt->u.dst);
682         read_unlock_bh(&rt6_lock);
683 out2:
684         rt->u.dst.lastuse = jiffies;
685         rt->u.dst.__use++;
686         return &rt->u.dst;
687 }
688
689
690 /*
691  *      Destination cache support functions
692  */
693
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696         struct rt6_info *rt;
697
698         rt = (struct rt6_info *) dst;
699
700         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701                 return dst;
702
703         return NULL;
704 }
705
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708         struct rt6_info *rt = (struct rt6_info *) dst;
709
710         if (rt) {
711                 if (rt->rt6i_flags & RTF_CACHE)
712                         ip6_del_rt(rt, NULL, NULL, NULL);
713                 else
714                         dst_release(dst);
715         }
716         return NULL;
717 }
718
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721         struct rt6_info *rt;
722
723         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724
725         rt = (struct rt6_info *) skb->dst;
726         if (rt) {
727                 if (rt->rt6i_flags&RTF_CACHE) {
728                         dst_set_expires(&rt->u.dst, 0);
729                         rt->rt6i_flags |= RTF_EXPIRES;
730                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731                         rt->rt6i_node->fn_sernum = -1;
732         }
733 }
734
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737         struct rt6_info *rt6 = (struct rt6_info*)dst;
738
739         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740                 rt6->rt6i_flags |= RTF_MODIFIED;
741                 if (mtu < IPV6_MIN_MTU) {
742                         mtu = IPV6_MIN_MTU;
743                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744                 }
745                 dst->metrics[RTAX_MTU-1] = mtu;
746         }
747 }
748
749 /* Protected by rt6_lock.  */
750 static struct dst_entry *ndisc_dst_gc_list;
751 static int ipv6_get_mtu(struct net_device *dev);
752
753 static inline unsigned int ipv6_advmss(unsigned int mtu)
754 {
755         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
756
757         if (mtu < ip6_rt_min_advmss)
758                 mtu = ip6_rt_min_advmss;
759
760         /*
761          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
762          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
763          * IPV6_MAXPLEN is also valid and means: "any MSS, 
764          * rely only on pmtu discovery"
765          */
766         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
767                 mtu = IPV6_MAXPLEN;
768         return mtu;
769 }
770
771 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
772                                   struct neighbour *neigh,
773                                   struct in6_addr *addr,
774                                   int (*output)(struct sk_buff *))
775 {
776         struct rt6_info *rt;
777         struct inet6_dev *idev = in6_dev_get(dev);
778
779         if (unlikely(idev == NULL))
780                 return NULL;
781
782         rt = ip6_dst_alloc();
783         if (unlikely(rt == NULL)) {
784                 in6_dev_put(idev);
785                 goto out;
786         }
787
788         dev_hold(dev);
789         if (neigh)
790                 neigh_hold(neigh);
791         else
792                 neigh = ndisc_get_neigh(dev, addr);
793
794         rt->rt6i_dev      = dev;
795         rt->rt6i_idev     = idev;
796         rt->rt6i_nexthop  = neigh;
797         atomic_set(&rt->u.dst.__refcnt, 1);
798         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
799         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
800         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
801         rt->u.dst.output  = output;
802
803 #if 0   /* there's no chance to use these for ndisc */
804         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
805                                 ? DST_HOST 
806                                 : 0;
807         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
808         rt->rt6i_dst.plen = 128;
809 #endif
810
811         write_lock_bh(&rt6_lock);
812         rt->u.dst.next = ndisc_dst_gc_list;
813         ndisc_dst_gc_list = &rt->u.dst;
814         write_unlock_bh(&rt6_lock);
815
816         fib6_force_start_gc();
817
818 out:
819         return (struct dst_entry *)rt;
820 }
821
822 int ndisc_dst_gc(int *more)
823 {
824         struct dst_entry *dst, *next, **pprev;
825         int freed;
826
827         next = NULL;
828         pprev = &ndisc_dst_gc_list;
829         freed = 0;
830         while ((dst = *pprev) != NULL) {
831                 if (!atomic_read(&dst->__refcnt)) {
832                         *pprev = dst->next;
833                         dst_free(dst);
834                         freed++;
835                 } else {
836                         pprev = &dst->next;
837                         (*more)++;
838                 }
839         }
840
841         return freed;
842 }
843
844 static int ip6_dst_gc(void)
845 {
846         static unsigned expire = 30*HZ;
847         static unsigned long last_gc;
848         unsigned long now = jiffies;
849
850         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
851             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
852                 goto out;
853
854         expire++;
855         fib6_run_gc(expire);
856         last_gc = now;
857         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
858                 expire = ip6_rt_gc_timeout>>1;
859
860 out:
861         expire -= expire>>ip6_rt_gc_elasticity;
862         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
863 }
864
865 /* Clean host part of a prefix. Not necessary in radix tree,
866    but results in cleaner routing tables.
867
868    Remove it only when all the things will work!
869  */
870
871 static int ipv6_get_mtu(struct net_device *dev)
872 {
873         int mtu = IPV6_MIN_MTU;
874         struct inet6_dev *idev;
875
876         idev = in6_dev_get(dev);
877         if (idev) {
878                 mtu = idev->cnf.mtu6;
879                 in6_dev_put(idev);
880         }
881         return mtu;
882 }
883
884 int ipv6_get_hoplimit(struct net_device *dev)
885 {
886         int hoplimit = ipv6_devconf.hop_limit;
887         struct inet6_dev *idev;
888
889         idev = in6_dev_get(dev);
890         if (idev) {
891                 hoplimit = idev->cnf.hop_limit;
892                 in6_dev_put(idev);
893         }
894         return hoplimit;
895 }
896
897 /*
898  *
899  */
900
901 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
902                 void *_rtattr, struct netlink_skb_parms *req)
903 {
904         int err;
905         struct rtmsg *r;
906         struct rtattr **rta;
907         struct rt6_info *rt = NULL;
908         struct net_device *dev = NULL;
909         struct inet6_dev *idev = NULL;
910         int addr_type;
911
912         rta = (struct rtattr **) _rtattr;
913
914         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
915                 return -EINVAL;
916 #ifndef CONFIG_IPV6_SUBTREES
917         if (rtmsg->rtmsg_src_len)
918                 return -EINVAL;
919 #endif
920         if (rtmsg->rtmsg_ifindex) {
921                 err = -ENODEV;
922                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
923                 if (!dev)
924                         goto out;
925                 idev = in6_dev_get(dev);
926                 if (!idev)
927                         goto out;
928         }
929
930         if (rtmsg->rtmsg_metric == 0)
931                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
932
933         rt = ip6_dst_alloc();
934
935         if (rt == NULL) {
936                 err = -ENOMEM;
937                 goto out;
938         }
939
940         rt->u.dst.obsolete = -1;
941         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
942         if (nlh && (r = NLMSG_DATA(nlh))) {
943                 rt->rt6i_protocol = r->rtm_protocol;
944         } else {
945                 rt->rt6i_protocol = RTPROT_BOOT;
946         }
947
948         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
949
950         if (addr_type & IPV6_ADDR_MULTICAST)
951                 rt->u.dst.input = ip6_mc_input;
952         else
953                 rt->u.dst.input = ip6_forward;
954
955         rt->u.dst.output = ip6_output;
956
957         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
958                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
959         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
960         if (rt->rt6i_dst.plen == 128)
961                rt->u.dst.flags = DST_HOST;
962
963 #ifdef CONFIG_IPV6_SUBTREES
964         ipv6_addr_prefix(&rt->rt6i_src.addr, 
965                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
966         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
967 #endif
968
969         rt->rt6i_metric = rtmsg->rtmsg_metric;
970
971         /* We cannot add true routes via loopback here,
972            they would result in kernel looping; promote them to reject routes
973          */
974         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
975             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
976                 /* hold loopback dev/idev if we haven't done so. */
977                 if (dev != &loopback_dev) {
978                         if (dev) {
979                                 dev_put(dev);
980                                 in6_dev_put(idev);
981                         }
982                         dev = &loopback_dev;
983                         dev_hold(dev);
984                         idev = in6_dev_get(dev);
985                         if (!idev) {
986                                 err = -ENODEV;
987                                 goto out;
988                         }
989                 }
990                 rt->u.dst.output = ip6_pkt_discard_out;
991                 rt->u.dst.input = ip6_pkt_discard;
992                 rt->u.dst.error = -ENETUNREACH;
993                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
994                 goto install_route;
995         }
996
997         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
998                 struct in6_addr *gw_addr;
999                 int gwa_type;
1000
1001                 gw_addr = &rtmsg->rtmsg_gateway;
1002                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1003                 gwa_type = ipv6_addr_type(gw_addr);
1004
1005                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1006                         struct rt6_info *grt;
1007
1008                         /* IPv6 strictly inhibits using not link-local
1009                            addresses as nexthop address.
1010                            Otherwise, router will not able to send redirects.
1011                            It is very good, but in some (rare!) circumstances
1012                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1013                            some exceptions. --ANK
1014                          */
1015                         err = -EINVAL;
1016                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1017                                 goto out;
1018
1019                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1020
1021                         err = -EHOSTUNREACH;
1022                         if (grt == NULL)
1023                                 goto out;
1024                         if (dev) {
1025                                 if (dev != grt->rt6i_dev) {
1026                                         dst_release(&grt->u.dst);
1027                                         goto out;
1028                                 }
1029                         } else {
1030                                 dev = grt->rt6i_dev;
1031                                 idev = grt->rt6i_idev;
1032                                 dev_hold(dev);
1033                                 in6_dev_hold(grt->rt6i_idev);
1034                         }
1035                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1036                                 err = 0;
1037                         dst_release(&grt->u.dst);
1038
1039                         if (err)
1040                                 goto out;
1041                 }
1042                 err = -EINVAL;
1043                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1044                         goto out;
1045         }
1046
1047         err = -ENODEV;
1048         if (dev == NULL)
1049                 goto out;
1050
1051         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1052                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1053                 if (IS_ERR(rt->rt6i_nexthop)) {
1054                         err = PTR_ERR(rt->rt6i_nexthop);
1055                         rt->rt6i_nexthop = NULL;
1056                         goto out;
1057                 }
1058         }
1059
1060         rt->rt6i_flags = rtmsg->rtmsg_flags;
1061
1062 install_route:
1063         if (rta && rta[RTA_METRICS-1]) {
1064                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1065                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1066
1067                 while (RTA_OK(attr, attrlen)) {
1068                         unsigned flavor = attr->rta_type;
1069                         if (flavor) {
1070                                 if (flavor > RTAX_MAX) {
1071                                         err = -EINVAL;
1072                                         goto out;
1073                                 }
1074                                 rt->u.dst.metrics[flavor-1] =
1075                                         *(u32 *)RTA_DATA(attr);
1076                         }
1077                         attr = RTA_NEXT(attr, attrlen);
1078                 }
1079         }
1080
1081         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1082                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1083         if (!rt->u.dst.metrics[RTAX_MTU-1])
1084                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1085         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1086                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1087         rt->u.dst.dev = dev;
1088         rt->rt6i_idev = idev;
1089         return ip6_ins_rt(rt, nlh, _rtattr, req);
1090
1091 out:
1092         if (dev)
1093                 dev_put(dev);
1094         if (idev)
1095                 in6_dev_put(idev);
1096         if (rt)
1097                 dst_free((struct dst_entry *) rt);
1098         return err;
1099 }
1100
1101 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1102 {
1103         int err;
1104
1105         write_lock_bh(&rt6_lock);
1106
1107         err = fib6_del(rt, nlh, _rtattr, req);
1108         dst_release(&rt->u.dst);
1109
1110         write_unlock_bh(&rt6_lock);
1111
1112         return err;
1113 }
1114
1115 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1116 {
1117         struct fib6_node *fn;
1118         struct rt6_info *rt;
1119         int err = -ESRCH;
1120
1121         read_lock_bh(&rt6_lock);
1122
1123         fn = fib6_locate(&ip6_routing_table,
1124                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1125                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1126         
1127         if (fn) {
1128                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1129                         if (rtmsg->rtmsg_ifindex &&
1130                             (rt->rt6i_dev == NULL ||
1131                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1132                                 continue;
1133                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1134                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1135                                 continue;
1136                         if (rtmsg->rtmsg_metric &&
1137                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1138                                 continue;
1139                         dst_hold(&rt->u.dst);
1140                         read_unlock_bh(&rt6_lock);
1141
1142                         return ip6_del_rt(rt, nlh, _rtattr, req);
1143                 }
1144         }
1145         read_unlock_bh(&rt6_lock);
1146
1147         return err;
1148 }
1149
1150 /*
1151  *      Handle redirects
1152  */
/*
 * rt6_redirect - process a redirect for @dest received from @saddr.
 * @dest:    destination the redirect applies to.
 * @saddr:   source address of the redirect; it must equal the gateway of
 *           a current gateway route towards @dest on neigh->dev.
 * @neigh:   neighbour entry for the new next hop.
 * @lladdr:  link-layer address passed to neigh_update().
 * @on_link: non-zero when the destination is on-link; the new route is
 *           then installed without RTF_GATEWAY.
 *
 * If a matching route is found, the neighbour is updated and a host
 * (/128) RTF_DYNAMIC|RTF_CACHE route through @neigh is inserted.  If the
 * matched route was itself a cache entry it is deleted.
 */
void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
                  struct neighbour *neigh, u8 *lladdr, int on_link)
{
        struct rt6_info *rt, *nrt = NULL;
        int strict;
        struct fib6_node *fn;

        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */
        strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);

        read_lock_bh(&rt6_lock);
        fn = fib6_lookup(&ip6_routing_table, dest, NULL);
restart:
        for (rt = fn->leaf; rt; rt = rt->u.next) {
                /*
                 * Current route is on-link; redirect is always invalid.
                 *
                 * Seems, previous statement is not true. It could
                 * be node, which looks for us as on-link (f.e. proxy ndisc)
                 * But then router serving it might decide, that we should
                 * know truth 8)8) --ANK (980726).
                 */
                if (rt6_check_expired(rt))
                        continue;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (neigh->dev != rt->rt6i_dev)
                        continue;
                if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
                        continue;
                break;
        }
        if (rt)
                dst_hold(&rt->u.dst);
        else if (strict) {
                /*
                 * Nothing matched in this node: walk up towards the root
                 * and retry at the next ancestor that carries routes
                 * (RTN_RTINFO), stopping at the root.
                 */
                while ((fn = fn->parent) != NULL) {
                        if (fn->fn_flags & RTN_ROOT)
                                break;
                        if (fn->fn_flags & RTN_RTINFO)
                                goto restart;
                }
        }
        read_unlock_bh(&rt6_lock);

        if (!rt) {
                if (net_ratelimit())
                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
                               "for redirect target\n");
                return;
        }

        /*
         *      We have finally decided to accept it.
         */

        neigh_update(neigh, lladdr, NUD_STALE, 
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER))
                     );

        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->u.dst);

        /* Duplicate redirect: silently ignore. */
        if (neigh == rt->u.dst.neighbour)
                goto out;

        nrt = ip6_rt_copy(rt);
        if (nrt == NULL)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        /* The new entry is a host route for exactly @dest. */
        ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
        nrt->rt6i_dst.plen = 128;
        nrt->u.dst.flags |= DST_HOST;

        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
        nrt->rt6i_nexthop = neigh_clone(neigh);
        /* Reset pmtu, it may be better */
        nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
        nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));

        if (ip6_ins_rt(nrt, NULL, NULL, NULL))
                goto out;

        /*
         * The old route was a cache entry: remove it.  ip6_del_rt() also
         * drops the reference taken with dst_hold() above, hence the
         * direct return instead of falling through to "out".
         */
        if (rt->rt6i_flags&RTF_CACHE) {
                ip6_del_rt(rt, NULL, NULL, NULL);
                return;
        }

out:
        dst_release(&rt->u.dst);
        return;
}
1265
1266 /*
1267  *      Handle ICMP "packet too big" messages
1268  *      i.e. Path MTU discovery
1269  */
1270
1271 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1272                         struct net_device *dev, u32 pmtu)
1273 {
1274         struct rt6_info *rt, *nrt;
1275         int allfrag = 0;
1276
1277         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1278         if (rt == NULL)
1279                 return;
1280
1281         if (pmtu >= dst_mtu(&rt->u.dst))
1282                 goto out;
1283
1284         if (pmtu < IPV6_MIN_MTU) {
1285                 /*
1286                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1287                  * MTU (1280) and a fragment header should always be included
1288                  * after a node receiving Too Big message reporting PMTU is
1289                  * less than the IPv6 Minimum Link MTU.
1290                  */
1291                 pmtu = IPV6_MIN_MTU;
1292                 allfrag = 1;
1293         }
1294
1295         /* New mtu received -> path was valid.
1296            They are sent only in response to data packets,
1297            so that this nexthop apparently is reachable. --ANK
1298          */
1299         dst_confirm(&rt->u.dst);
1300
1301         /* Host route. If it is static, it would be better
1302            not to override it, but add new one, so that
1303            when cache entry will expire old pmtu
1304            would return automatically.
1305          */
1306         if (rt->rt6i_flags & RTF_CACHE) {
1307                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1308                 if (allfrag)
1309                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1310                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1311                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1312                 goto out;
1313         }
1314
1315         /* Network route.
1316            Two cases are possible:
1317            1. It is connected route. Action: COW
1318            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1319          */
1320         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1321                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1322         else
1323                 nrt = rt6_alloc_clone(rt, daddr);
1324
1325         if (nrt) {
1326                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1327                 if (allfrag)
1328                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1329
1330                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1331                  * happened within 5 mins, the recommended timer is 10 mins.
1332                  * Here this route expiration time is set to ip6_rt_mtu_expires
1333                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1334                  * and detecting PMTU increase will be automatically happened.
1335                  */
1336                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1337                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1338
1339                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1340         }
1341 out:
1342         dst_release(&rt->u.dst);
1343 }
1344
1345 /*
1346  *      Misc support functions
1347  */
1348
1349 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1350 {
1351         struct rt6_info *rt = ip6_dst_alloc();
1352
1353         if (rt) {
1354                 rt->u.dst.input = ort->u.dst.input;
1355                 rt->u.dst.output = ort->u.dst.output;
1356
1357                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1358                 rt->u.dst.dev = ort->u.dst.dev;
1359                 if (rt->u.dst.dev)
1360                         dev_hold(rt->u.dst.dev);
1361                 rt->rt6i_idev = ort->rt6i_idev;
1362                 if (rt->rt6i_idev)
1363                         in6_dev_hold(rt->rt6i_idev);
1364                 rt->u.dst.lastuse = jiffies;
1365                 rt->rt6i_expires = 0;
1366
1367                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1368                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1369                 rt->rt6i_metric = 0;
1370
1371                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1372 #ifdef CONFIG_IPV6_SUBTREES
1373                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1374 #endif
1375         }
1376         return rt;
1377 }
1378
1379 #ifdef CONFIG_IPV6_ROUTE_INFO
1380 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1381                                            struct in6_addr *gwaddr, int ifindex)
1382 {
1383         struct fib6_node *fn;
1384         struct rt6_info *rt = NULL;
1385
1386         write_lock_bh(&rt6_lock);
1387         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1388         if (!fn)
1389                 goto out;
1390
1391         for (rt = fn->leaf; rt; rt = rt->u.next) {
1392                 if (rt->rt6i_dev->ifindex != ifindex)
1393                         continue;
1394                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1395                         continue;
1396                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1397                         continue;
1398                 dst_hold(&rt->u.dst);
1399                 break;
1400         }
1401 out:
1402         write_unlock_bh(&rt6_lock);
1403         return rt;
1404 }
1405
1406 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1407                                            struct in6_addr *gwaddr, int ifindex,
1408                                            unsigned pref)
1409 {
1410         struct in6_rtmsg rtmsg;
1411
1412         memset(&rtmsg, 0, sizeof(rtmsg));
1413         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1414         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1415         rtmsg.rtmsg_dst_len = prefixlen;
1416         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1417         rtmsg.rtmsg_metric = 1024;
1418         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1419         /* We should treat it as a default route if prefix length is 0. */
1420         if (!prefixlen)
1421                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1422         rtmsg.rtmsg_ifindex = ifindex;
1423
1424         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1425
1426         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1427 }
1428 #endif
1429
1430 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1431 {       
1432         struct rt6_info *rt;
1433         struct fib6_node *fn;
1434
1435         fn = &ip6_routing_table;
1436
1437         write_lock_bh(&rt6_lock);
1438         for (rt = fn->leaf; rt; rt=rt->u.next) {
1439                 if (dev == rt->rt6i_dev &&
1440                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1441                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1442                         break;
1443         }
1444         if (rt)
1445                 dst_hold(&rt->u.dst);
1446         write_unlock_bh(&rt6_lock);
1447         return rt;
1448 }
1449
1450 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1451                                      struct net_device *dev,
1452                                      unsigned int pref)
1453 {
1454         struct in6_rtmsg rtmsg;
1455
1456         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1457         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1458         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1459         rtmsg.rtmsg_metric = 1024;
1460         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1461                             RTF_PREF(pref);
1462
1463         rtmsg.rtmsg_ifindex = dev->ifindex;
1464
1465         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1466         return rt6_get_dflt_router(gwaddr, dev);
1467 }
1468
1469 void rt6_purge_dflt_routers(void)
1470 {
1471         struct rt6_info *rt;
1472
1473 restart:
1474         read_lock_bh(&rt6_lock);
1475         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1476                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1477                         dst_hold(&rt->u.dst);
1478
1479                         read_unlock_bh(&rt6_lock);
1480
1481                         ip6_del_rt(rt, NULL, NULL, NULL);
1482
1483                         goto restart;
1484                 }
1485         }
1486         read_unlock_bh(&rt6_lock);
1487 }
1488
1489 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1490 {
1491         struct in6_rtmsg rtmsg;
1492         int err;
1493
1494         switch(cmd) {
1495         case SIOCADDRT:         /* Add a route */
1496         case SIOCDELRT:         /* Delete a route */
1497                 if (!capable(CAP_NET_ADMIN))
1498                         return -EPERM;
1499                 err = copy_from_user(&rtmsg, arg,
1500                                      sizeof(struct in6_rtmsg));
1501                 if (err)
1502                         return -EFAULT;
1503                         
1504                 rtnl_lock();
1505                 switch (cmd) {
1506                 case SIOCADDRT:
1507                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1508                         break;
1509                 case SIOCDELRT:
1510                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1511                         break;
1512                 default:
1513                         err = -EINVAL;
1514                 }
1515                 rtnl_unlock();
1516
1517                 return err;
1518         };
1519
1520         return -EINVAL;
1521 }
1522
1523 /*
1524  *      Drop the packet on the floor
1525  */
1526
1527 static int ip6_pkt_discard(struct sk_buff *skb)
1528 {
1529         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1530         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1531         kfree_skb(skb);
1532         return 0;
1533 }
1534
1535 static int ip6_pkt_discard_out(struct sk_buff *skb)
1536 {
1537         skb->dev = skb->dst->dev;
1538         return ip6_pkt_discard(skb);
1539 }
1540
1541 /*
1542  *      Allocate a dst for local (unicast / anycast) address.
1543  */
1544
1545 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1546                                     const struct in6_addr *addr,
1547                                     int anycast)
1548 {
1549         struct rt6_info *rt = ip6_dst_alloc();
1550
1551         if (rt == NULL)
1552                 return ERR_PTR(-ENOMEM);
1553
1554         dev_hold(&loopback_dev);
1555         in6_dev_hold(idev);
1556
1557         rt->u.dst.flags = DST_HOST;
1558         rt->u.dst.input = ip6_input;
1559         rt->u.dst.output = ip6_output;
1560         rt->rt6i_dev = &loopback_dev;
1561         rt->rt6i_idev = idev;
1562         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1563         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1564         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1565         rt->u.dst.obsolete = -1;
1566
1567         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1568         if (anycast)
1569                 rt->rt6i_flags |= RTF_ANYCAST;
1570         else
1571                 rt->rt6i_flags |= RTF_LOCAL;
1572         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1573         if (rt->rt6i_nexthop == NULL) {
1574                 dst_free((struct dst_entry *) rt);
1575                 return ERR_PTR(-ENOMEM);
1576         }
1577
1578         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1579         rt->rt6i_dst.plen = 128;
1580
1581         atomic_set(&rt->u.dst.__refcnt, 1);
1582
1583         return rt;
1584 }
1585
1586 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1587 {
1588         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1589             rt != &ip6_null_entry) {
1590                 RT6_TRACE("deleted by ifdown %p\n", rt);
1591                 return -1;
1592         }
1593         return 0;
1594 }
1595
1596 void rt6_ifdown(struct net_device *dev)
1597 {
1598         write_lock_bh(&rt6_lock);
1599         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1600         write_unlock_bh(&rt6_lock);
1601 }
1602
/* Argument bundle passed to rt6_mtu_change_route() through fib6_clean_tree(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
1608
1609 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1610 {
1611         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1612         struct inet6_dev *idev;
1613
1614         /* In IPv6 pmtu discovery is not optional,
1615            so that RTAX_MTU lock cannot disable it.
1616            We still use this lock to block changes
1617            caused by addrconf/ndisc.
1618         */
1619
1620         idev = __in6_dev_get(arg->dev);
1621         if (idev == NULL)
1622                 return 0;
1623
1624         /* For administrative MTU increase, there is no way to discover
1625            IPv6 PMTU increase, so PMTU increase should be updated here.
1626            Since RFC 1981 doesn't include administrative MTU increase
1627            update PMTU increase is a MUST. (i.e. jumbo frame)
1628          */
1629         /*
1630            If new MTU is less than route PMTU, this new MTU will be the
1631            lowest MTU in the path, update the route PMTU to reflect PMTU
1632            decreases; if new MTU is greater than route PMTU, and the
1633            old MTU is the lowest MTU in the path, update the route PMTU
1634            to reflect the increase. In this case if the other nodes' MTU
1635            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1636            PMTU discouvery.
1637          */
1638         if (rt->rt6i_dev == arg->dev &&
1639             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1640             (dst_mtu(&rt->u.dst) > arg->mtu ||
1641              (dst_mtu(&rt->u.dst) < arg->mtu &&
1642               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1643                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1644         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1645         return 0;
1646 }
1647
1648 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1649 {
1650         struct rt6_mtu_change_arg arg;
1651
1652         arg.dev = dev;
1653         arg.mtu = mtu;
1654         read_lock_bh(&rt6_lock);
1655         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1656         read_unlock_bh(&rt6_lock);
1657 }
1658
1659 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1660                               struct in6_rtmsg *rtmsg)
1661 {
1662         memset(rtmsg, 0, sizeof(*rtmsg));
1663
1664         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1665         rtmsg->rtmsg_src_len = r->rtm_src_len;
1666         rtmsg->rtmsg_flags = RTF_UP;
1667         if (r->rtm_type == RTN_UNREACHABLE)
1668                 rtmsg->rtmsg_flags |= RTF_REJECT;
1669
1670         if (rta[RTA_GATEWAY-1]) {
1671                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1672                         return -EINVAL;
1673                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1674                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1675         }
1676         if (rta[RTA_DST-1]) {
1677                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1678                         return -EINVAL;
1679                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1680         }
1681         if (rta[RTA_SRC-1]) {
1682                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1683                         return -EINVAL;
1684                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1685         }
1686         if (rta[RTA_OIF-1]) {
1687                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1688                         return -EINVAL;
1689                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1690         }
1691         if (rta[RTA_PRIORITY-1]) {
1692                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1693                         return -EINVAL;
1694                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1695         }
1696         return 0;
1697 }
1698
1699 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1700 {
1701         struct rtmsg *r = NLMSG_DATA(nlh);
1702         struct in6_rtmsg rtmsg;
1703
1704         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1705                 return -EINVAL;
1706         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1707 }
1708
1709 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1710 {
1711         struct rtmsg *r = NLMSG_DATA(nlh);
1712         struct in6_rtmsg rtmsg;
1713
1714         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1715                 return -EINVAL;
1716         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1717 }
1718
/*
 * Pairs an skb with a netlink callback.
 * NOTE(review): presumably the state carried through a route dump; the
 * users of this struct are outside this part of the file - confirm.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;
        struct netlink_callback *cb;
};
1724
1725 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1726                          struct in6_addr *dst, struct in6_addr *src,
1727                          int iif, int type, u32 pid, u32 seq,
1728                          int prefix, unsigned int flags)
1729 {
1730         struct rtmsg *rtm;
1731         struct nlmsghdr  *nlh;
1732         unsigned char    *b = skb->tail;
1733         struct rta_cacheinfo ci;
1734
1735         if (prefix) {   /* user wants prefix routes only */
1736                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1737                         /* success since this is not a prefix route */
1738                         return 1;
1739                 }
1740         }
1741
1742         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1743         rtm = NLMSG_DATA(nlh);
1744         rtm->rtm_family = AF_INET6;
1745         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1746         rtm->rtm_src_len = rt->rt6i_src.plen;
1747         rtm->rtm_tos = 0;
1748         rtm->rtm_table = RT_TABLE_MAIN;
1749         if (rt->rt6i_flags&RTF_REJECT)
1750                 rtm->rtm_type = RTN_UNREACHABLE;
1751         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1752                 rtm->rtm_type = RTN_LOCAL;
1753         else
1754                 rtm->rtm_type = RTN_UNICAST;
1755         rtm->rtm_flags = 0;
1756         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1757         rtm->rtm_protocol = rt->rt6i_protocol;
1758         if (rt->rt6i_flags&RTF_DYNAMIC)
1759                 rtm->rtm_protocol = RTPROT_REDIRECT;
1760         else if (rt->rt6i_flags & RTF_ADDRCONF)
1761                 rtm->rtm_protocol = RTPROT_KERNEL;
1762         else if (rt->rt6i_flags&RTF_DEFAULT)
1763                 rtm->rtm_protocol = RTPROT_RA;
1764
1765         if (rt->rt6i_flags&RTF_CACHE)
1766                 rtm->rtm_flags |= RTM_F_CLONED;
1767
1768         if (dst) {
1769                 RTA_PUT(skb, RTA_DST, 16, dst);
1770                 rtm->rtm_dst_len = 128;
1771         } else if (rtm->rtm_dst_len)
1772                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1773 #ifdef CONFIG_IPV6_SUBTREES
1774         if (src) {
1775                 RTA_PUT(skb, RTA_SRC, 16, src);
1776                 rtm->rtm_src_len = 128;
1777         } else if (rtm->rtm_src_len)
1778                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1779 #endif
1780         if (iif)
1781                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1782         else if (dst) {
1783                 struct in6_addr saddr_buf;
1784                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1785                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1786         }
1787         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1788                 goto rtattr_failure;
1789         if (rt->u.dst.neighbour)
1790                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1791         if (rt->u.dst.dev)
1792                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1793         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1794         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1795         if (rt->rt6i_expires)
1796                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1797         else
1798                 ci.rta_expires = 0;
1799         ci.rta_used = rt->u.dst.__use;
1800         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1801         ci.rta_error = rt->u.dst.error;
1802         ci.rta_id = 0;
1803         ci.rta_ts = 0;
1804         ci.rta_tsage = 0;
1805         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1806         nlh->nlmsg_len = skb->tail - b;
1807         return skb->len;
1808
1809 nlmsg_failure:
1810 rtattr_failure:
1811         skb_trim(skb, b - skb->data);
1812         return -1;
1813 }
1814
1815 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1816 {
1817         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1818         int prefix;
1819
1820         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1821                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1822                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1823         } else
1824                 prefix = 0;
1825
1826         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1827                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1828                      prefix, NLM_F_MULTI);
1829 }
1830
1831 static int fib6_dump_node(struct fib6_walker_t *w)
1832 {
1833         int res;
1834         struct rt6_info *rt;
1835
1836         for (rt = w->leaf; rt; rt = rt->u.next) {
1837                 res = rt6_dump_route(rt, w->args);
1838                 if (res < 0) {
1839                         /* Frame is full, suspend walking */
1840                         w->leaf = rt;
1841                         return 1;
1842                 }
1843                 BUG_TRAP(res!=0);
1844         }
1845         w->leaf = NULL;
1846         return 0;
1847 }
1848
1849 static void fib6_dump_end(struct netlink_callback *cb)
1850 {
1851         struct fib6_walker_t *w = (void*)cb->args[0];
1852
1853         if (w) {
1854                 cb->args[0] = 0;
1855                 fib6_walker_unlink(w);
1856                 kfree(w);
1857         }
1858         cb->done = (void*)cb->args[1];
1859         cb->args[1] = 0;
1860 }
1861
1862 static int fib6_dump_done(struct netlink_callback *cb)
1863 {
1864         fib6_dump_end(cb);
1865         return cb->done ? cb->done(cb) : 0;
1866 }
1867
1868 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1869 {
1870         struct rt6_rtnl_dump_arg arg;
1871         struct fib6_walker_t *w;
1872         int res;
1873
1874         arg.skb = skb;
1875         arg.cb = cb;
1876
1877         w = (void*)cb->args[0];
1878         if (w == NULL) {
1879                 /* New dump:
1880                  * 
1881                  * 1. hook callback destructor.
1882                  */
1883                 cb->args[1] = (long)cb->done;
1884                 cb->done = fib6_dump_done;
1885
1886                 /*
1887                  * 2. allocate and initialize walker.
1888                  */
1889                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1890                 if (w == NULL)
1891                         return -ENOMEM;
1892                 RT6_TRACE("dump<%p", w);
1893                 w->root = &ip6_routing_table;
1894                 w->func = fib6_dump_node;
1895                 w->args = &arg;
1896                 cb->args[0] = (long)w;
1897                 read_lock_bh(&rt6_lock);
1898                 res = fib6_walk(w);
1899                 read_unlock_bh(&rt6_lock);
1900         } else {
1901                 w->args = &arg;
1902                 read_lock_bh(&rt6_lock);
1903                 res = fib6_walk_continue(w);
1904                 read_unlock_bh(&rt6_lock);
1905         }
1906 #if RT6_DEBUG >= 3
1907         if (res <= 0 && skb->len == 0)
1908                 RT6_TRACE("%p>dump end\n", w);
1909 #endif
1910         res = res < 0 ? res : skb->len;
1911         /* res < 0 is an error. (really, impossible)
1912            res == 0 means that dump is complete, but skb still can contain data.
1913            res > 0 dump is not complete, but frame is full.
1914          */
1915         /* Destroy walker, if dump of this table is complete. */
1916         if (res <= 0)
1917                 fib6_dump_end(cb);
1918         return res;
1919 }
1920
1921 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1922 {
1923         struct rtattr **rta = arg;
1924         int iif = 0;
1925         int err = -ENOBUFS;
1926         struct sk_buff *skb;
1927         struct flowi fl;
1928         struct rt6_info *rt;
1929
1930         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1931         if (skb == NULL)
1932                 goto out;
1933
1934         /* Reserve room for dummy headers, this skb can pass
1935            through good chunk of routing engine.
1936          */
1937         skb->mac.raw = skb->data;
1938         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1939
1940         memset(&fl, 0, sizeof(fl));
1941         if (rta[RTA_SRC-1])
1942                 ipv6_addr_copy(&fl.fl6_src,
1943                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1944         if (rta[RTA_DST-1])
1945                 ipv6_addr_copy(&fl.fl6_dst,
1946                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1947
1948         if (rta[RTA_IIF-1])
1949                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1950
1951         if (iif) {
1952                 struct net_device *dev;
1953                 dev = __dev_get_by_index(iif);
1954                 if (!dev) {
1955                         err = -ENODEV;
1956                         goto out_free;
1957                 }
1958         }
1959
1960         fl.oif = 0;
1961         if (rta[RTA_OIF-1])
1962                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1963
1964         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1965
1966         skb->dst = &rt->u.dst;
1967
1968         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1969         err = rt6_fill_node(skb, rt, 
1970                             &fl.fl6_dst, &fl.fl6_src,
1971                             iif,
1972                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1973                             nlh->nlmsg_seq, 0, 0);
1974         if (err < 0) {
1975                 err = -EMSGSIZE;
1976                 goto out_free;
1977         }
1978
1979         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1980         if (err > 0)
1981                 err = 0;
1982 out:
1983         return err;
1984 out_free:
1985         kfree_skb(skb);
1986         goto out;       
1987 }
1988
1989 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1990                         struct netlink_skb_parms *req)
1991 {
1992         struct sk_buff *skb;
1993         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1994         u32 pid = current->pid;
1995         u32 seq = 0;
1996
1997         if (req)
1998                 pid = req->pid;
1999         if (nlh)
2000                 seq = nlh->nlmsg_seq;
2001         
2002         skb = alloc_skb(size, gfp_any());
2003         if (!skb) {
2004                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2005                 return;
2006         }
2007         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2008                 kfree_skb(skb);
2009                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2010                 return;
2011         }
2012         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2013         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2014 }
2015
2016 /*
2017  *      /proc
2018  */
2019
2020 #ifdef CONFIG_PROC_FS
2021
/* Length of one line of /proc/net/ipv6_route, as assumed by the readers below. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* requested read offset */
	int length;	/* requested read length */
	int skip;	/* routes skipped so far to reach the offset */
	int len;	/* bytes written to buffer so far */
};
2032
2033 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2034 {
2035         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2036         int i;
2037
2038         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2039                 arg->skip++;
2040                 return 0;
2041         }
2042
2043         if (arg->len >= arg->length)
2044                 return 0;
2045
2046         for (i=0; i<16; i++) {
2047                 sprintf(arg->buffer + arg->len, "%02x",
2048                         rt->rt6i_dst.addr.s6_addr[i]);
2049                 arg->len += 2;
2050         }
2051         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2052                             rt->rt6i_dst.plen);
2053
2054 #ifdef CONFIG_IPV6_SUBTREES
2055         for (i=0; i<16; i++) {
2056                 sprintf(arg->buffer + arg->len, "%02x",
2057                         rt->rt6i_src.addr.s6_addr[i]);
2058                 arg->len += 2;
2059         }
2060         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2061                             rt->rt6i_src.plen);
2062 #else
2063         sprintf(arg->buffer + arg->len,
2064                 "00000000000000000000000000000000 00 ");
2065         arg->len += 36;
2066 #endif
2067
2068         if (rt->rt6i_nexthop) {
2069                 for (i=0; i<16; i++) {
2070                         sprintf(arg->buffer + arg->len, "%02x",
2071                                 rt->rt6i_nexthop->primary_key[i]);
2072                         arg->len += 2;
2073                 }
2074         } else {
2075                 sprintf(arg->buffer + arg->len,
2076                         "00000000000000000000000000000000");
2077                 arg->len += 32;
2078         }
2079         arg->len += sprintf(arg->buffer + arg->len,
2080                             " %08x %08x %08x %08x %8s\n",
2081                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2082                             rt->u.dst.__use, rt->rt6i_flags, 
2083                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2084         return 0;
2085 }
2086
2087 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2088 {
2089         struct rt6_proc_arg arg;
2090         arg.buffer = buffer;
2091         arg.offset = offset;
2092         arg.length = length;
2093         arg.skip = 0;
2094         arg.len = 0;
2095
2096         read_lock_bh(&rt6_lock);
2097         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2098         read_unlock_bh(&rt6_lock);
2099
2100         *start = buffer;
2101         if (offset)
2102                 *start += offset % RT6_INFO_LEN;
2103
2104         arg.len -= offset % RT6_INFO_LEN;
2105
2106         if (arg.len > length)
2107                 arg.len = length;
2108         if (arg.len < 0)
2109                 arg.len = 0;
2110
2111         return arg.len;
2112 }
2113
2114 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2115 {
2116         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2117                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2118                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2119                       rt6_stats.fib_rt_cache,
2120                       atomic_read(&ip6_dst_ops.entries),
2121                       rt6_stats.fib_discarded_routes);
2122
2123         return 0;
2124 }
2125
2126 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2127 {
2128         return single_open(file, rt6_stats_seq_show, NULL);
2129 }
2130
2131 static struct file_operations rt6_stats_seq_fops = {
2132         .owner   = THIS_MODULE,
2133         .open    = rt6_stats_seq_open,
2134         .read    = seq_read,
2135         .llseek  = seq_lseek,
2136         .release = single_release,
2137 };
2138 #endif  /* CONFIG_PROC_FS */
2139
2140 #ifdef CONFIG_SYSCTL
2141
2142 static int flush_delay;
2143
2144 static
2145 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2146                               void __user *buffer, size_t *lenp, loff_t *ppos)
2147 {
2148         if (write) {
2149                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2150                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2151                 return 0;
2152         } else
2153                 return -EINVAL;
2154 }
2155
2156 ctl_table ipv6_route_table[] = {
2157         {
2158                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2159                 .procname       =       "flush",
2160                 .data           =       &flush_delay,
2161                 .maxlen         =       sizeof(int),
2162                 .mode           =       0200,
2163                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2164         },
2165         {
2166                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2167                 .procname       =       "gc_thresh",
2168                 .data           =       &ip6_dst_ops.gc_thresh,
2169                 .maxlen         =       sizeof(int),
2170                 .mode           =       0644,
2171                 .proc_handler   =       &proc_dointvec,
2172         },
2173         {
2174                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2175                 .procname       =       "max_size",
2176                 .data           =       &ip6_rt_max_size,
2177                 .maxlen         =       sizeof(int),
2178                 .mode           =       0644,
2179                 .proc_handler   =       &proc_dointvec,
2180         },
2181         {
2182                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2183                 .procname       =       "gc_min_interval",
2184                 .data           =       &ip6_rt_gc_min_interval,
2185                 .maxlen         =       sizeof(int),
2186                 .mode           =       0644,
2187                 .proc_handler   =       &proc_dointvec_jiffies,
2188                 .strategy       =       &sysctl_jiffies,
2189         },
2190         {
2191                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2192                 .procname       =       "gc_timeout",
2193                 .data           =       &ip6_rt_gc_timeout,
2194                 .maxlen         =       sizeof(int),
2195                 .mode           =       0644,
2196                 .proc_handler   =       &proc_dointvec_jiffies,
2197                 .strategy       =       &sysctl_jiffies,
2198         },
2199         {
2200                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2201                 .procname       =       "gc_interval",
2202                 .data           =       &ip6_rt_gc_interval,
2203                 .maxlen         =       sizeof(int),
2204                 .mode           =       0644,
2205                 .proc_handler   =       &proc_dointvec_jiffies,
2206                 .strategy       =       &sysctl_jiffies,
2207         },
2208         {
2209                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2210                 .procname       =       "gc_elasticity",
2211                 .data           =       &ip6_rt_gc_elasticity,
2212                 .maxlen         =       sizeof(int),
2213                 .mode           =       0644,
2214                 .proc_handler   =       &proc_dointvec_jiffies,
2215                 .strategy       =       &sysctl_jiffies,
2216         },
2217         {
2218                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2219                 .procname       =       "mtu_expires",
2220                 .data           =       &ip6_rt_mtu_expires,
2221                 .maxlen         =       sizeof(int),
2222                 .mode           =       0644,
2223                 .proc_handler   =       &proc_dointvec_jiffies,
2224                 .strategy       =       &sysctl_jiffies,
2225         },
2226         {
2227                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2228                 .procname       =       "min_adv_mss",
2229                 .data           =       &ip6_rt_min_advmss,
2230                 .maxlen         =       sizeof(int),
2231                 .mode           =       0644,
2232                 .proc_handler   =       &proc_dointvec_jiffies,
2233                 .strategy       =       &sysctl_jiffies,
2234         },
2235         {
2236                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2237                 .procname       =       "gc_min_interval_ms",
2238                 .data           =       &ip6_rt_gc_min_interval,
2239                 .maxlen         =       sizeof(int),
2240                 .mode           =       0644,
2241                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2242                 .strategy       =       &sysctl_ms_jiffies,
2243         },
2244         { .ctl_name = 0 }
2245 };
2246
2247 #endif
2248
2249 void __init ip6_route_init(void)
2250 {
2251         struct proc_dir_entry *p;
2252
2253         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2254                                                      sizeof(struct rt6_info),
2255                                                      0, SLAB_HWCACHE_ALIGN,
2256                                                      NULL, NULL);
2257         if (!ip6_dst_ops.kmem_cachep)
2258                 panic("cannot create ip6_dst_cache");
2259
2260         fib6_init();
2261 #ifdef  CONFIG_PROC_FS
2262         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2263         if (p)
2264                 p->owner = THIS_MODULE;
2265
2266         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2267 #endif
2268 #ifdef CONFIG_XFRM
2269         xfrm6_init();
2270 #endif
2271 }
2272
2273 void ip6_route_cleanup(void)
2274 {
2275 #ifdef CONFIG_PROC_FS
2276         proc_net_remove("ipv6_route");
2277         proc_net_remove("rt6_stats");
2278 #endif
2279 #ifdef CONFIG_XFRM
2280         xfrm6_fini();
2281 #endif
2282         rt6_ifdown(NULL);
2283         fib6_gc_cleanup();
2284         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2285 }