Merge ../linux-2.6-watchdog-mm
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/* Debug level; tracing output is only compiled in at level 3 or above. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both helpers expand to nothing. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* Tracing enabled: RDBG() takes a parenthesised printk() argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
76
77 #define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
144 static int ip6_pkt_prohibit(struct sk_buff *skb);
145 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
146 static int ip6_pkt_blk_hole(struct sk_buff *skb);
147
148 struct rt6_info ip6_prohibit_entry = {
149         .u = {
150                 .dst = {
151                         .__refcnt       = ATOMIC_INIT(1),
152                         .__use          = 1,
153                         .dev            = &loopback_dev,
154                         .obsolete       = -1,
155                         .error          = -EACCES,
156                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
157                         .input          = ip6_pkt_prohibit,
158                         .output         = ip6_pkt_prohibit_out,
159                         .ops            = &ip6_dst_ops,
160                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
161                 }
162         },
163         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
164         .rt6i_metric    = ~(u32) 0,
165         .rt6i_ref       = ATOMIC_INIT(1),
166 };
167
168 struct rt6_info ip6_blk_hole_entry = {
169         .u = {
170                 .dst = {
171                         .__refcnt       = ATOMIC_INIT(1),
172                         .__use          = 1,
173                         .dev            = &loopback_dev,
174                         .obsolete       = -1,
175                         .error          = -EINVAL,
176                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
177                         .input          = ip6_pkt_blk_hole,
178                         .output         = ip6_pkt_blk_hole,
179                         .ops            = &ip6_dst_ops,
180                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
181                 }
182         },
183         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
184         .rt6i_metric    = ~(u32) 0,
185         .rt6i_ref       = ATOMIC_INIT(1),
186 };
187
188 #endif
189
190 /* allocate dst with ip6_dst_ops */
191 static __inline__ struct rt6_info *ip6_dst_alloc(void)
192 {
193         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
194 }
195
196 static void ip6_dst_destroy(struct dst_entry *dst)
197 {
198         struct rt6_info *rt = (struct rt6_info *)dst;
199         struct inet6_dev *idev = rt->rt6i_idev;
200
201         if (idev != NULL) {
202                 rt->rt6i_idev = NULL;
203                 in6_dev_put(idev);
204         }       
205 }
206
207 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
208                            int how)
209 {
210         struct rt6_info *rt = (struct rt6_info *)dst;
211         struct inet6_dev *idev = rt->rt6i_idev;
212
213         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
214                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239                                                     int oif,
240                                                     int strict)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (oif) {
246                 for (sprt = rt; sprt; sprt = sprt->u.next) {
247                         struct net_device *dev = sprt->rt6i_dev;
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (strict && oif)
254                                                 continue;
255                                         if (local && (!oif || 
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 }
262
263                 if (local)
264                         return local;
265
266                 if (strict)
267                         return &ip6_null_entry;
268         }
269         return rt;
270 }
271
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation to the nexthop
 * of @rt when its neighbour entry is not NUD_VALID and the last update is
 * older than the interface's rtr_probe_interval.  A NULL @rt, or a route
 * without a nexthop, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state under the lock and stamp the probe time. */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* Solicit the neighbour via its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
307
308 /*
309  * Default Router Selection (RFC 2461 6.3.6)
310  */
311 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
312 {
313         struct net_device *dev = rt->rt6i_dev;
314         if (!oif || dev->ifindex == oif)
315                 return 2;
316         if ((dev->flags & IFF_LOOPBACK) &&
317             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
318                 return 1;
319         return 0;
320 }
321
322 static int inline rt6_check_neigh(struct rt6_info *rt)
323 {
324         struct neighbour *neigh = rt->rt6i_nexthop;
325         int m = 0;
326         if (rt->rt6i_flags & RTF_NONEXTHOP ||
327             !(rt->rt6i_flags & RTF_GATEWAY))
328                 m = 1;
329         else if (neigh) {
330                 read_lock_bh(&neigh->lock);
331                 if (neigh->nud_state & NUD_VALID)
332                         m = 2;
333                 else if (!(neigh->nud_state & NUD_FAILED))
334                         m = 1;
335                 read_unlock_bh(&neigh->lock);
336         }
337         return m;
338 }
339
340 static int rt6_score_route(struct rt6_info *rt, int oif,
341                            int strict)
342 {
343         int m, n;
344                 
345         m = rt6_check_dev(rt, oif);
346         if (!m && (strict & RT6_LOOKUP_F_IFACE))
347                 return -1;
348 #ifdef CONFIG_IPV6_ROUTER_PREF
349         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
350 #endif
351         n = rt6_check_neigh(rt);
352         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
353                 return -1;
354         return m;
355 }
356
357 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
358                                    int strict)
359 {
360         struct rt6_info *match = NULL, *last = NULL;
361         struct rt6_info *rt, *rt0 = *head;
362         u32 metric;
363         int mpri = -1;
364
365         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
366                   __FUNCTION__, head, head ? *head : NULL, oif);
367
368         for (rt = rt0, metric = rt0->rt6i_metric;
369              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370              rt = rt->u.next) {
371                 int m;
372
373                 if (rt6_check_expired(rt))
374                         continue;
375
376                 last = rt;
377
378                 m = rt6_score_route(rt, oif, strict);
379                 if (m < 0)
380                         continue;
381
382                 if (m > mpri) {
383                         if (strict & RT6_LOOKUP_F_REACHABLE)
384                                 rt6_probe(match);
385                         match = rt;
386                         mpri = m;
387                 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
388                         rt6_probe(rt);
389                 }
390         }
391
392         if (!match &&
393             (strict & RT6_LOOKUP_F_REACHABLE) &&
394             last && last != rt0) {
395                 /* no entries matched; do round-robin */
396                 static DEFINE_SPINLOCK(lock);
397                 spin_lock(&lock);
398                 *head = rt0->u.next;
399                 rt0->u.next = last->u.next;
400                 last->u.next = rt0;
401                 spin_unlock(&lock);
402         }
403
404         RT6_TRACE("%s() => %p, score=%d\n",
405                   __FUNCTION__, match, mpri);
406
407         return (match ? match : &ip6_null_entry);
408 }
409
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one received route information option.
 *
 * @dev:    device the option arrived on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to use for the route
 *
 * A lifetime of zero deletes an existing matching route; a non-zero
 * lifetime creates the route if needed or refreshes its preference, and
 * 0xffffffff means "never expires".
 *
 * Returns 0 on success or -EINVAL when the option fails validation.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	struct rt6_info *rt;
	unsigned int pref;
	unsigned int min_length;
	u32 lifetime;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	/* Longer prefixes need a longer option to hold them. */
	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* 0xffffffff means infinity; anything else is clamped to avoid
	 * arithmetic overflow when multiplied by HZ below. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
		lifetime = 0x7fffffff/HZ - 1;

	if (rinfo->length == 3) {
		/* full prefix present in the option: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (lifetime == 0xffffffff) {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	} else {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
487
/*
 * BACKTRACK(saddr): if the current result is &ip6_null_entry, climb the fib
 * tree looking for another node that carries route info.
 *
 * This macro is not self-contained.  The enclosing function must provide
 *   - a local 'struct rt6_info *rt' holding the current result,
 *   - a local 'struct fib6_node *fn' holding the current node,
 *   - a label 'out', jumped to when the top-level root is reached,
 *   - a label 'restart', jumped to when a node with RTN_RTINFO is found.
 * When a parent has a subtree other than the current node, the subtree is
 * searched with @saddr; otherwise the walk moves to the parent itself.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			fn = (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) ? \
				fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)) : \
				pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
505
506 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
507                                              struct flowi *fl, int flags)
508 {
509         struct fib6_node *fn;
510         struct rt6_info *rt;
511
512         read_lock_bh(&table->tb6_lock);
513         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
514 restart:
515         rt = fn->leaf;
516         rt = rt6_device_match(rt, fl->oif, flags);
517         BACKTRACK(&fl->fl6_src);
518 out:
519         dst_hold(&rt->u.dst);
520         read_unlock_bh(&table->tb6_lock);
521
522         rt->u.dst.lastuse = jiffies;
523         rt->u.dst.__use++;
524
525         return rt;
526
527 }
528
529 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
530                             int oif, int strict)
531 {
532         struct flowi fl = {
533                 .oif = oif,
534                 .nl_u = {
535                         .ip6_u = {
536                                 .daddr = *daddr,
537                         },
538                 },
539         };
540         struct dst_entry *dst;
541         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
542
543         if (saddr) {
544                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
545                 flags |= RT6_LOOKUP_F_HAS_SADDR;
546         }
547
548         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
549         if (dst->error == 0)
550                 return (struct rt6_info *) dst;
551
552         dst_release(dst);
553
554         return NULL;
555 }
556
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to it, the route may be destroyed.
 */
562
563 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
564 {
565         int err;
566         struct fib6_table *table;
567
568         table = rt->rt6i_table;
569         write_lock_bh(&table->tb6_lock);
570         err = fib6_add(&table->tb6_root, rt, info);
571         write_unlock_bh(&table->tb6_lock);
572
573         return err;
574 }
575
576 int ip6_ins_rt(struct rt6_info *rt)
577 {
578         return __ip6_ins_rt(rt, NULL);
579 }
580
581 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
582                                       struct in6_addr *saddr)
583 {
584         struct rt6_info *rt;
585
586         /*
587          *      Clone the route.
588          */
589
590         rt = ip6_rt_copy(ort);
591
592         if (rt) {
593                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
594                         if (rt->rt6i_dst.plen != 128 &&
595                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
596                                 rt->rt6i_flags |= RTF_ANYCAST;
597                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
598                 }
599
600                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
601                 rt->rt6i_dst.plen = 128;
602                 rt->rt6i_flags |= RTF_CACHE;
603                 rt->u.dst.flags |= DST_HOST;
604
605 #ifdef CONFIG_IPV6_SUBTREES
606                 if (rt->rt6i_src.plen && saddr) {
607                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
608                         rt->rt6i_src.plen = 128;
609                 }
610 #endif
611
612                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
613
614         }
615
616         return rt;
617 }
618
619 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
620 {
621         struct rt6_info *rt = ip6_rt_copy(ort);
622         if (rt) {
623                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
624                 rt->rt6i_dst.plen = 128;
625                 rt->rt6i_flags |= RTF_CACHE;
626                 rt->u.dst.flags |= DST_HOST;
627                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
628         }
629         return rt;
630 }
631
632 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
633                                             struct flowi *fl, int flags)
634 {
635         struct fib6_node *fn;
636         struct rt6_info *rt, *nrt;
637         int strict = 0;
638         int attempts = 3;
639         int err;
640         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
641
642         strict |= flags & RT6_LOOKUP_F_IFACE;
643
644 relookup:
645         read_lock_bh(&table->tb6_lock);
646
647 restart_2:
648         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
649
650 restart:
651         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
652         BACKTRACK(&fl->fl6_src);
653         if (rt == &ip6_null_entry ||
654             rt->rt6i_flags & RTF_CACHE)
655                 goto out;
656
657         dst_hold(&rt->u.dst);
658         read_unlock_bh(&table->tb6_lock);
659
660         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
661                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
662         else {
663 #if CLONE_OFFLINK_ROUTE
664                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
665 #else
666                 goto out2;
667 #endif
668         }
669
670         dst_release(&rt->u.dst);
671         rt = nrt ? : &ip6_null_entry;
672
673         dst_hold(&rt->u.dst);
674         if (nrt) {
675                 err = ip6_ins_rt(nrt);
676                 if (!err)
677                         goto out2;
678         }
679
680         if (--attempts <= 0)
681                 goto out2;
682
683         /*
684          * Race condition! In the gap, when table->tb6_lock was
685          * released someone could insert this route.  Relookup.
686          */
687         dst_release(&rt->u.dst);
688         goto relookup;
689
690 out:
691         if (reachable) {
692                 reachable = 0;
693                 goto restart_2;
694         }
695         dst_hold(&rt->u.dst);
696         read_unlock_bh(&table->tb6_lock);
697 out2:
698         rt->u.dst.lastuse = jiffies;
699         rt->u.dst.__use++;
700
701         return rt;
702 }
703
704 void ip6_route_input(struct sk_buff *skb)
705 {
706         struct ipv6hdr *iph = skb->nh.ipv6h;
707         int flags = RT6_LOOKUP_F_HAS_SADDR;
708         struct flowi fl = {
709                 .iif = skb->dev->ifindex,
710                 .nl_u = {
711                         .ip6_u = {
712                                 .daddr = iph->daddr,
713                                 .saddr = iph->saddr,
714                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
715                         },
716                 },
717                 .mark = skb->mark,
718                 .proto = iph->nexthdr,
719         };
720
721         if (rt6_need_strict(&iph->daddr))
722                 flags |= RT6_LOOKUP_F_IFACE;
723
724         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
725 }
726
727 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
728                                              struct flowi *fl, int flags)
729 {
730         struct fib6_node *fn;
731         struct rt6_info *rt, *nrt;
732         int strict = 0;
733         int attempts = 3;
734         int err;
735         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
736
737         strict |= flags & RT6_LOOKUP_F_IFACE;
738
739 relookup:
740         read_lock_bh(&table->tb6_lock);
741
742 restart_2:
743         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
744
745 restart:
746         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
747         BACKTRACK(&fl->fl6_src);
748         if (rt == &ip6_null_entry ||
749             rt->rt6i_flags & RTF_CACHE)
750                 goto out;
751
752         dst_hold(&rt->u.dst);
753         read_unlock_bh(&table->tb6_lock);
754
755         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
756                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
757         else {
758 #if CLONE_OFFLINK_ROUTE
759                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
760 #else
761                 goto out2;
762 #endif
763         }
764
765         dst_release(&rt->u.dst);
766         rt = nrt ? : &ip6_null_entry;
767
768         dst_hold(&rt->u.dst);
769         if (nrt) {
770                 err = ip6_ins_rt(nrt);
771                 if (!err)
772                         goto out2;
773         }
774
775         if (--attempts <= 0)
776                 goto out2;
777
778         /*
779          * Race condition! In the gap, when table->tb6_lock was
780          * released someone could insert this route.  Relookup.
781          */
782         dst_release(&rt->u.dst);
783         goto relookup;
784
785 out:
786         if (reachable) {
787                 reachable = 0;
788                 goto restart_2;
789         }
790         dst_hold(&rt->u.dst);
791         read_unlock_bh(&table->tb6_lock);
792 out2:
793         rt->u.dst.lastuse = jiffies;
794         rt->u.dst.__use++;
795         return rt;
796 }
797
798 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
799 {
800         int flags = 0;
801
802         if (rt6_need_strict(&fl->fl6_dst))
803                 flags |= RT6_LOOKUP_F_IFACE;
804
805         if (!ipv6_addr_any(&fl->fl6_src))
806                 flags |= RT6_LOOKUP_F_HAS_SADDR;
807
808         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
809 }
810
811
812 /*
813  *      Destination cache support functions
814  */
815
816 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
817 {
818         struct rt6_info *rt;
819
820         rt = (struct rt6_info *) dst;
821
822         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
823                 return dst;
824
825         return NULL;
826 }
827
828 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
829 {
830         struct rt6_info *rt = (struct rt6_info *) dst;
831
832         if (rt) {
833                 if (rt->rt6i_flags & RTF_CACHE)
834                         ip6_del_rt(rt);
835                 else
836                         dst_release(dst);
837         }
838         return NULL;
839 }
840
841 static void ip6_link_failure(struct sk_buff *skb)
842 {
843         struct rt6_info *rt;
844
845         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
846
847         rt = (struct rt6_info *) skb->dst;
848         if (rt) {
849                 if (rt->rt6i_flags&RTF_CACHE) {
850                         dst_set_expires(&rt->u.dst, 0);
851                         rt->rt6i_flags |= RTF_EXPIRES;
852                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
853                         rt->rt6i_node->fn_sernum = -1;
854         }
855 }
856
857 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
858 {
859         struct rt6_info *rt6 = (struct rt6_info*)dst;
860
861         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
862                 rt6->rt6i_flags |= RTF_MODIFIED;
863                 if (mtu < IPV6_MIN_MTU) {
864                         mtu = IPV6_MIN_MTU;
865                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
866                 }
867                 dst->metrics[RTAX_MTU-1] = mtu;
868                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
869         }
870 }
871
872 static int ipv6_get_mtu(struct net_device *dev);
873
874 static inline unsigned int ipv6_advmss(unsigned int mtu)
875 {
876         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
877
878         if (mtu < ip6_rt_min_advmss)
879                 mtu = ip6_rt_min_advmss;
880
881         /*
882          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
883          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
884          * IPV6_MAXPLEN is also valid and means: "any MSS, 
885          * rely only on pmtu discovery"
886          */
887         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
888                 mtu = IPV6_MAXPLEN;
889         return mtu;
890 }
891
892 static struct dst_entry *ndisc_dst_gc_list;
893 static DEFINE_SPINLOCK(ndisc_lock);
894
895 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
896                                   struct neighbour *neigh,
897                                   struct in6_addr *addr,
898                                   int (*output)(struct sk_buff *))
899 {
900         struct rt6_info *rt;
901         struct inet6_dev *idev = in6_dev_get(dev);
902
903         if (unlikely(idev == NULL))
904                 return NULL;
905
906         rt = ip6_dst_alloc();
907         if (unlikely(rt == NULL)) {
908                 in6_dev_put(idev);
909                 goto out;
910         }
911
912         dev_hold(dev);
913         if (neigh)
914                 neigh_hold(neigh);
915         else
916                 neigh = ndisc_get_neigh(dev, addr);
917
918         rt->rt6i_dev      = dev;
919         rt->rt6i_idev     = idev;
920         rt->rt6i_nexthop  = neigh;
921         atomic_set(&rt->u.dst.__refcnt, 1);
922         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
923         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
924         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
925         rt->u.dst.output  = output;
926
927 #if 0   /* there's no chance to use these for ndisc */
928         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
929                                 ? DST_HOST 
930                                 : 0;
931         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
932         rt->rt6i_dst.plen = 128;
933 #endif
934
935         spin_lock_bh(&ndisc_lock);
936         rt->u.dst.next = ndisc_dst_gc_list;
937         ndisc_dst_gc_list = &rt->u.dst;
938         spin_unlock_bh(&ndisc_lock);
939
940         fib6_force_start_gc();
941
942 out:
943         return &rt->u.dst;
944 }
945
946 int ndisc_dst_gc(int *more)
947 {
948         struct dst_entry *dst, *next, **pprev;
949         int freed;
950
951         next = NULL;
952         freed = 0;
953
954         spin_lock_bh(&ndisc_lock);
955         pprev = &ndisc_dst_gc_list;
956
957         while ((dst = *pprev) != NULL) {
958                 if (!atomic_read(&dst->__refcnt)) {
959                         *pprev = dst->next;
960                         dst_free(dst);
961                         freed++;
962                 } else {
963                         pprev = &dst->next;
964                         (*more)++;
965                 }
966         }
967
968         spin_unlock_bh(&ndisc_lock);
969
970         return freed;
971 }
972
973 static int ip6_dst_gc(void)
974 {
975         static unsigned expire = 30*HZ;
976         static unsigned long last_gc;
977         unsigned long now = jiffies;
978
979         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
980             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
981                 goto out;
982
983         expire++;
984         fib6_run_gc(expire);
985         last_gc = now;
986         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
987                 expire = ip6_rt_gc_timeout>>1;
988
989 out:
990         expire -= expire>>ip6_rt_gc_elasticity;
991         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
992 }
993
994 /* Clean host part of a prefix. Not necessary in radix tree,
995    but results in cleaner routing tables.
996
997    Remove it only when all the things will work!
998  */
999
1000 static int ipv6_get_mtu(struct net_device *dev)
1001 {
1002         int mtu = IPV6_MIN_MTU;
1003         struct inet6_dev *idev;
1004
1005         idev = in6_dev_get(dev);
1006         if (idev) {
1007                 mtu = idev->cnf.mtu6;
1008                 in6_dev_put(idev);
1009         }
1010         return mtu;
1011 }
1012
1013 int ipv6_get_hoplimit(struct net_device *dev)
1014 {
1015         int hoplimit = ipv6_devconf.hop_limit;
1016         struct inet6_dev *idev;
1017
1018         idev = in6_dev_get(dev);
1019         if (idev) {
1020                 hoplimit = idev->cnf.hop_limit;
1021                 in6_dev_put(idev);
1022         }
1023         return hoplimit;
1024 }
1025
1026 /*
1027  *
1028  */
1029
1030 int ip6_route_add(struct fib6_config *cfg)
1031 {
1032         int err;
1033         struct rt6_info *rt = NULL;
1034         struct net_device *dev = NULL;
1035         struct inet6_dev *idev = NULL;
1036         struct fib6_table *table;
1037         int addr_type;
1038
1039         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1040                 return -EINVAL;
1041 #ifndef CONFIG_IPV6_SUBTREES
1042         if (cfg->fc_src_len)
1043                 return -EINVAL;
1044 #endif
1045         if (cfg->fc_ifindex) {
1046                 err = -ENODEV;
1047                 dev = dev_get_by_index(cfg->fc_ifindex);
1048                 if (!dev)
1049                         goto out;
1050                 idev = in6_dev_get(dev);
1051                 if (!idev)
1052                         goto out;
1053         }
1054
1055         if (cfg->fc_metric == 0)
1056                 cfg->fc_metric = IP6_RT_PRIO_USER;
1057
1058         table = fib6_new_table(cfg->fc_table);
1059         if (table == NULL) {
1060                 err = -ENOBUFS;
1061                 goto out;
1062         }
1063
1064         rt = ip6_dst_alloc();
1065
1066         if (rt == NULL) {
1067                 err = -ENOMEM;
1068                 goto out;
1069         }
1070
1071         rt->u.dst.obsolete = -1;
1072         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1073
1074         if (cfg->fc_protocol == RTPROT_UNSPEC)
1075                 cfg->fc_protocol = RTPROT_BOOT;
1076         rt->rt6i_protocol = cfg->fc_protocol;
1077
1078         addr_type = ipv6_addr_type(&cfg->fc_dst);
1079
1080         if (addr_type & IPV6_ADDR_MULTICAST)
1081                 rt->u.dst.input = ip6_mc_input;
1082         else
1083                 rt->u.dst.input = ip6_forward;
1084
1085         rt->u.dst.output = ip6_output;
1086
1087         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1088         rt->rt6i_dst.plen = cfg->fc_dst_len;
1089         if (rt->rt6i_dst.plen == 128)
1090                rt->u.dst.flags = DST_HOST;
1091
1092 #ifdef CONFIG_IPV6_SUBTREES
1093         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1094         rt->rt6i_src.plen = cfg->fc_src_len;
1095 #endif
1096
1097         rt->rt6i_metric = cfg->fc_metric;
1098
1099         /* We cannot add true routes via loopback here,
1100            they would result in kernel looping; promote them to reject routes
1101          */
1102         if ((cfg->fc_flags & RTF_REJECT) ||
1103             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1104                 /* hold loopback dev/idev if we haven't done so. */
1105                 if (dev != &loopback_dev) {
1106                         if (dev) {
1107                                 dev_put(dev);
1108                                 in6_dev_put(idev);
1109                         }
1110                         dev = &loopback_dev;
1111                         dev_hold(dev);
1112                         idev = in6_dev_get(dev);
1113                         if (!idev) {
1114                                 err = -ENODEV;
1115                                 goto out;
1116                         }
1117                 }
1118                 rt->u.dst.output = ip6_pkt_discard_out;
1119                 rt->u.dst.input = ip6_pkt_discard;
1120                 rt->u.dst.error = -ENETUNREACH;
1121                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1122                 goto install_route;
1123         }
1124
1125         if (cfg->fc_flags & RTF_GATEWAY) {
1126                 struct in6_addr *gw_addr;
1127                 int gwa_type;
1128
1129                 gw_addr = &cfg->fc_gateway;
1130                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1131                 gwa_type = ipv6_addr_type(gw_addr);
1132
1133                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1134                         struct rt6_info *grt;
1135
1136                         /* IPv6 strictly inhibits using not link-local
1137                            addresses as nexthop address.
1138                            Otherwise, router will not able to send redirects.
1139                            It is very good, but in some (rare!) circumstances
1140                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1141                            some exceptions. --ANK
1142                          */
1143                         err = -EINVAL;
1144                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1145                                 goto out;
1146
1147                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1148
1149                         err = -EHOSTUNREACH;
1150                         if (grt == NULL)
1151                                 goto out;
1152                         if (dev) {
1153                                 if (dev != grt->rt6i_dev) {
1154                                         dst_release(&grt->u.dst);
1155                                         goto out;
1156                                 }
1157                         } else {
1158                                 dev = grt->rt6i_dev;
1159                                 idev = grt->rt6i_idev;
1160                                 dev_hold(dev);
1161                                 in6_dev_hold(grt->rt6i_idev);
1162                         }
1163                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1164                                 err = 0;
1165                         dst_release(&grt->u.dst);
1166
1167                         if (err)
1168                                 goto out;
1169                 }
1170                 err = -EINVAL;
1171                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1172                         goto out;
1173         }
1174
1175         err = -ENODEV;
1176         if (dev == NULL)
1177                 goto out;
1178
1179         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1180                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1181                 if (IS_ERR(rt->rt6i_nexthop)) {
1182                         err = PTR_ERR(rt->rt6i_nexthop);
1183                         rt->rt6i_nexthop = NULL;
1184                         goto out;
1185                 }
1186         }
1187
1188         rt->rt6i_flags = cfg->fc_flags;
1189
1190 install_route:
1191         if (cfg->fc_mx) {
1192                 struct nlattr *nla;
1193                 int remaining;
1194
1195                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1196                         int type = nla->nla_type;
1197
1198                         if (type) {
1199                                 if (type > RTAX_MAX) {
1200                                         err = -EINVAL;
1201                                         goto out;
1202                                 }
1203
1204                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1205                         }
1206                 }
1207         }
1208
1209         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1210                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1211         if (!rt->u.dst.metrics[RTAX_MTU-1])
1212                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1213         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1214                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1215         rt->u.dst.dev = dev;
1216         rt->rt6i_idev = idev;
1217         rt->rt6i_table = table;
1218         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1219
1220 out:
1221         if (dev)
1222                 dev_put(dev);
1223         if (idev)
1224                 in6_dev_put(idev);
1225         if (rt)
1226                 dst_free(&rt->u.dst);
1227         return err;
1228 }
1229
1230 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1231 {
1232         int err;
1233         struct fib6_table *table;
1234
1235         if (rt == &ip6_null_entry)
1236                 return -ENOENT;
1237
1238         table = rt->rt6i_table;
1239         write_lock_bh(&table->tb6_lock);
1240
1241         err = fib6_del(rt, info);
1242         dst_release(&rt->u.dst);
1243
1244         write_unlock_bh(&table->tb6_lock);
1245
1246         return err;
1247 }
1248
1249 int ip6_del_rt(struct rt6_info *rt)
1250 {
1251         return __ip6_del_rt(rt, NULL);
1252 }
1253
1254 static int ip6_route_del(struct fib6_config *cfg)
1255 {
1256         struct fib6_table *table;
1257         struct fib6_node *fn;
1258         struct rt6_info *rt;
1259         int err = -ESRCH;
1260
1261         table = fib6_get_table(cfg->fc_table);
1262         if (table == NULL)
1263                 return err;
1264
1265         read_lock_bh(&table->tb6_lock);
1266
1267         fn = fib6_locate(&table->tb6_root,
1268                          &cfg->fc_dst, cfg->fc_dst_len,
1269                          &cfg->fc_src, cfg->fc_src_len);
1270         
1271         if (fn) {
1272                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1273                         if (cfg->fc_ifindex &&
1274                             (rt->rt6i_dev == NULL ||
1275                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1276                                 continue;
1277                         if (cfg->fc_flags & RTF_GATEWAY &&
1278                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1279                                 continue;
1280                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1281                                 continue;
1282                         dst_hold(&rt->u.dst);
1283                         read_unlock_bh(&table->tb6_lock);
1284
1285                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1286                 }
1287         }
1288         read_unlock_bh(&table->tb6_lock);
1289
1290         return err;
1291 }
1292
1293 /*
1294  *      Handle redirects
1295  */
1296 struct ip6rd_flowi {
1297         struct flowi fl;
1298         struct in6_addr gateway;
1299 };
1300
1301 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1302                                              struct flowi *fl,
1303                                              int flags)
1304 {
1305         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1306         struct rt6_info *rt;
1307         struct fib6_node *fn;
1308
1309         /*
1310          * Get the "current" route for this destination and
1311          * check if the redirect has come from approriate router.
1312          *
1313          * RFC 2461 specifies that redirects should only be
1314          * accepted if they come from the nexthop to the target.
1315          * Due to the way the routes are chosen, this notion
1316          * is a bit fuzzy and one might need to check all possible
1317          * routes.
1318          */
1319
1320         read_lock_bh(&table->tb6_lock);
1321         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1322 restart:
1323         for (rt = fn->leaf; rt; rt = rt->u.next) {
1324                 /*
1325                  * Current route is on-link; redirect is always invalid.
1326                  *
1327                  * Seems, previous statement is not true. It could
1328                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1329                  * But then router serving it might decide, that we should
1330                  * know truth 8)8) --ANK (980726).
1331                  */
1332                 if (rt6_check_expired(rt))
1333                         continue;
1334                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1335                         continue;
1336                 if (fl->oif != rt->rt6i_dev->ifindex)
1337                         continue;
1338                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1339                         continue;
1340                 break;
1341         }
1342
1343         if (!rt)
1344                 rt = &ip6_null_entry;
1345         BACKTRACK(&fl->fl6_src);
1346 out:
1347         dst_hold(&rt->u.dst);
1348
1349         read_unlock_bh(&table->tb6_lock);
1350
1351         return rt;
1352 };
1353
1354 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1355                                            struct in6_addr *src,
1356                                            struct in6_addr *gateway,
1357                                            struct net_device *dev)
1358 {
1359         int flags = RT6_LOOKUP_F_HAS_SADDR;
1360         struct ip6rd_flowi rdfl = {
1361                 .fl = {
1362                         .oif = dev->ifindex,
1363                         .nl_u = {
1364                                 .ip6_u = {
1365                                         .daddr = *dest,
1366                                         .saddr = *src,
1367                                 },
1368                         },
1369                 },
1370                 .gateway = *gateway,
1371         };
1372
1373         if (rt6_need_strict(dest))
1374                 flags |= RT6_LOOKUP_F_IFACE;
1375
1376         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1377 }
1378
1379 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1380                   struct in6_addr *saddr,
1381                   struct neighbour *neigh, u8 *lladdr, int on_link)
1382 {
1383         struct rt6_info *rt, *nrt = NULL;
1384         struct netevent_redirect netevent;
1385
1386         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1387
1388         if (rt == &ip6_null_entry) {
1389                 if (net_ratelimit())
1390                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1391                                "for redirect target\n");
1392                 goto out;
1393         }
1394
1395         /*
1396          *      We have finally decided to accept it.
1397          */
1398
1399         neigh_update(neigh, lladdr, NUD_STALE, 
1400                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1401                      NEIGH_UPDATE_F_OVERRIDE|
1402                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1403                                      NEIGH_UPDATE_F_ISROUTER))
1404                      );
1405
1406         /*
1407          * Redirect received -> path was valid.
1408          * Look, redirects are sent only in response to data packets,
1409          * so that this nexthop apparently is reachable. --ANK
1410          */
1411         dst_confirm(&rt->u.dst);
1412
1413         /* Duplicate redirect: silently ignore. */
1414         if (neigh == rt->u.dst.neighbour)
1415                 goto out;
1416
1417         nrt = ip6_rt_copy(rt);
1418         if (nrt == NULL)
1419                 goto out;
1420
1421         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1422         if (on_link)
1423                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1424
1425         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1426         nrt->rt6i_dst.plen = 128;
1427         nrt->u.dst.flags |= DST_HOST;
1428
1429         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1430         nrt->rt6i_nexthop = neigh_clone(neigh);
1431         /* Reset pmtu, it may be better */
1432         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1433         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1434
1435         if (ip6_ins_rt(nrt))
1436                 goto out;
1437
1438         netevent.old = &rt->u.dst;
1439         netevent.new = &nrt->u.dst;
1440         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1441
1442         if (rt->rt6i_flags&RTF_CACHE) {
1443                 ip6_del_rt(rt);
1444                 return;
1445         }
1446
1447 out:
1448         dst_release(&rt->u.dst);
1449         return;
1450 }
1451
1452 /*
1453  *      Handle ICMP "packet too big" messages
1454  *      i.e. Path MTU discovery
1455  */
1456
1457 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1458                         struct net_device *dev, u32 pmtu)
1459 {
1460         struct rt6_info *rt, *nrt;
1461         int allfrag = 0;
1462
1463         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1464         if (rt == NULL)
1465                 return;
1466
1467         if (pmtu >= dst_mtu(&rt->u.dst))
1468                 goto out;
1469
1470         if (pmtu < IPV6_MIN_MTU) {
1471                 /*
1472                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1473                  * MTU (1280) and a fragment header should always be included
1474                  * after a node receiving Too Big message reporting PMTU is
1475                  * less than the IPv6 Minimum Link MTU.
1476                  */
1477                 pmtu = IPV6_MIN_MTU;
1478                 allfrag = 1;
1479         }
1480
1481         /* New mtu received -> path was valid.
1482            They are sent only in response to data packets,
1483            so that this nexthop apparently is reachable. --ANK
1484          */
1485         dst_confirm(&rt->u.dst);
1486
1487         /* Host route. If it is static, it would be better
1488            not to override it, but add new one, so that
1489            when cache entry will expire old pmtu
1490            would return automatically.
1491          */
1492         if (rt->rt6i_flags & RTF_CACHE) {
1493                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1494                 if (allfrag)
1495                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1496                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1497                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1498                 goto out;
1499         }
1500
1501         /* Network route.
1502            Two cases are possible:
1503            1. It is connected route. Action: COW
1504            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1505          */
1506         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1507                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1508         else
1509                 nrt = rt6_alloc_clone(rt, daddr);
1510
1511         if (nrt) {
1512                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1513                 if (allfrag)
1514                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1515
1516                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1517                  * happened within 5 mins, the recommended timer is 10 mins.
1518                  * Here this route expiration time is set to ip6_rt_mtu_expires
1519                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1520                  * and detecting PMTU increase will be automatically happened.
1521                  */
1522                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1523                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1524
1525                 ip6_ins_rt(nrt);
1526         }
1527 out:
1528         dst_release(&rt->u.dst);
1529 }
1530
1531 /*
1532  *      Misc support functions
1533  */
1534
1535 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1536 {
1537         struct rt6_info *rt = ip6_dst_alloc();
1538
1539         if (rt) {
1540                 rt->u.dst.input = ort->u.dst.input;
1541                 rt->u.dst.output = ort->u.dst.output;
1542
1543                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1544                 rt->u.dst.error = ort->u.dst.error;
1545                 rt->u.dst.dev = ort->u.dst.dev;
1546                 if (rt->u.dst.dev)
1547                         dev_hold(rt->u.dst.dev);
1548                 rt->rt6i_idev = ort->rt6i_idev;
1549                 if (rt->rt6i_idev)
1550                         in6_dev_hold(rt->rt6i_idev);
1551                 rt->u.dst.lastuse = jiffies;
1552                 rt->rt6i_expires = 0;
1553
1554                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1555                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1556                 rt->rt6i_metric = 0;
1557
1558                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1559 #ifdef CONFIG_IPV6_SUBTREES
1560                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1561 #endif
1562                 rt->rt6i_table = ort->rt6i_table;
1563         }
1564         return rt;
1565 }
1566
1567 #ifdef CONFIG_IPV6_ROUTE_INFO
1568 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1569                                            struct in6_addr *gwaddr, int ifindex)
1570 {
1571         struct fib6_node *fn;
1572         struct rt6_info *rt = NULL;
1573         struct fib6_table *table;
1574
1575         table = fib6_get_table(RT6_TABLE_INFO);
1576         if (table == NULL)
1577                 return NULL;
1578
1579         write_lock_bh(&table->tb6_lock);
1580         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1581         if (!fn)
1582                 goto out;
1583
1584         for (rt = fn->leaf; rt; rt = rt->u.next) {
1585                 if (rt->rt6i_dev->ifindex != ifindex)
1586                         continue;
1587                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1588                         continue;
1589                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1590                         continue;
1591                 dst_hold(&rt->u.dst);
1592                 break;
1593         }
1594 out:
1595         write_unlock_bh(&table->tb6_lock);
1596         return rt;
1597 }
1598
1599 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1600                                            struct in6_addr *gwaddr, int ifindex,
1601                                            unsigned pref)
1602 {
1603         struct fib6_config cfg = {
1604                 .fc_table       = RT6_TABLE_INFO,
1605                 .fc_metric      = 1024,
1606                 .fc_ifindex     = ifindex,
1607                 .fc_dst_len     = prefixlen,
1608                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1609                                   RTF_UP | RTF_PREF(pref),
1610         };
1611
1612         ipv6_addr_copy(&cfg.fc_dst, prefix);
1613         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1614
1615         /* We should treat it as a default route if prefix length is 0. */
1616         if (!prefixlen)
1617                 cfg.fc_flags |= RTF_DEFAULT;
1618
1619         ip6_route_add(&cfg);
1620
1621         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1622 }
1623 #endif
1624
1625 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1626 {       
1627         struct rt6_info *rt;
1628         struct fib6_table *table;
1629
1630         table = fib6_get_table(RT6_TABLE_DFLT);
1631         if (table == NULL)
1632                 return NULL;
1633
1634         write_lock_bh(&table->tb6_lock);
1635         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1636                 if (dev == rt->rt6i_dev &&
1637                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1638                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1639                         break;
1640         }
1641         if (rt)
1642                 dst_hold(&rt->u.dst);
1643         write_unlock_bh(&table->tb6_lock);
1644         return rt;
1645 }
1646
1647 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1648                                      struct net_device *dev,
1649                                      unsigned int pref)
1650 {
1651         struct fib6_config cfg = {
1652                 .fc_table       = RT6_TABLE_DFLT,
1653                 .fc_metric      = 1024,
1654                 .fc_ifindex     = dev->ifindex,
1655                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1656                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1657         };
1658
1659         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1660
1661         ip6_route_add(&cfg);
1662
1663         return rt6_get_dflt_router(gwaddr, dev);
1664 }
1665
1666 void rt6_purge_dflt_routers(void)
1667 {
1668         struct rt6_info *rt;
1669         struct fib6_table *table;
1670
1671         /* NOTE: Keep consistent with rt6_get_dflt_router */
1672         table = fib6_get_table(RT6_TABLE_DFLT);
1673         if (table == NULL)
1674                 return;
1675
1676 restart:
1677         read_lock_bh(&table->tb6_lock);
1678         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1679                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1680                         dst_hold(&rt->u.dst);
1681                         read_unlock_bh(&table->tb6_lock);
1682                         ip6_del_rt(rt);
1683                         goto restart;
1684                 }
1685         }
1686         read_unlock_bh(&table->tb6_lock);
1687 }
1688
1689 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1690                                  struct fib6_config *cfg)
1691 {
1692         memset(cfg, 0, sizeof(*cfg));
1693
1694         cfg->fc_table = RT6_TABLE_MAIN;
1695         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1696         cfg->fc_metric = rtmsg->rtmsg_metric;
1697         cfg->fc_expires = rtmsg->rtmsg_info;
1698         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1699         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1700         cfg->fc_flags = rtmsg->rtmsg_flags;
1701
1702         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1703         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1704         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1705 }
1706
1707 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1708 {
1709         struct fib6_config cfg;
1710         struct in6_rtmsg rtmsg;
1711         int err;
1712
1713         switch(cmd) {
1714         case SIOCADDRT:         /* Add a route */
1715         case SIOCDELRT:         /* Delete a route */
1716                 if (!capable(CAP_NET_ADMIN))
1717                         return -EPERM;
1718                 err = copy_from_user(&rtmsg, arg,
1719                                      sizeof(struct in6_rtmsg));
1720                 if (err)
1721                         return -EFAULT;
1722
1723                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1724
1725                 rtnl_lock();
1726                 switch (cmd) {
1727                 case SIOCADDRT:
1728                         err = ip6_route_add(&cfg);
1729                         break;
1730                 case SIOCDELRT:
1731                         err = ip6_route_del(&cfg);
1732                         break;
1733                 default:
1734                         err = -EINVAL;
1735                 }
1736                 rtnl_unlock();
1737
1738                 return err;
1739         };
1740
1741         return -EINVAL;
1742 }
1743
1744 /*
1745  *      Drop the packet on the floor
1746  */
1747
1748 static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
1749 {
1750         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1751         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1752                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1753
1754         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTNOROUTES);
1755         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1756         kfree_skb(skb);
1757         return 0;
1758 }
1759
1760 static int ip6_pkt_discard(struct sk_buff *skb)
1761 {
1762         return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
1763 }
1764
1765 static int ip6_pkt_discard_out(struct sk_buff *skb)
1766 {
1767         skb->dev = skb->dst->dev;
1768         return ip6_pkt_discard(skb);
1769 }
1770
1771 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1772
1773 static int ip6_pkt_prohibit(struct sk_buff *skb)
1774 {
1775         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
1776 }
1777
1778 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1779 {
1780         skb->dev = skb->dst->dev;
1781         return ip6_pkt_prohibit(skb);
1782 }
1783
/*
 * Blackhole handler: free @skb silently, without sending any ICMPv6
 * error.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	int ret = 0;

	kfree_skb(skb);
	return ret;
}
1789
1790 #endif
1791
1792 /*
1793  *      Allocate a dst for local (unicast / anycast) address.
1794  */
1795
1796 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1797                                     const struct in6_addr *addr,
1798                                     int anycast)
1799 {
1800         struct rt6_info *rt = ip6_dst_alloc();
1801
1802         if (rt == NULL)
1803                 return ERR_PTR(-ENOMEM);
1804
1805         dev_hold(&loopback_dev);
1806         in6_dev_hold(idev);
1807
1808         rt->u.dst.flags = DST_HOST;
1809         rt->u.dst.input = ip6_input;
1810         rt->u.dst.output = ip6_output;
1811         rt->rt6i_dev = &loopback_dev;
1812         rt->rt6i_idev = idev;
1813         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1814         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1815         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1816         rt->u.dst.obsolete = -1;
1817
1818         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1819         if (anycast)
1820                 rt->rt6i_flags |= RTF_ANYCAST;
1821         else
1822                 rt->rt6i_flags |= RTF_LOCAL;
1823         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1824         if (rt->rt6i_nexthop == NULL) {
1825                 dst_free(&rt->u.dst);
1826                 return ERR_PTR(-ENOMEM);
1827         }
1828
1829         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1830         rt->rt6i_dst.plen = 128;
1831         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1832
1833         atomic_set(&rt->u.dst.__refcnt, 1);
1834
1835         return rt;
1836 }
1837
1838 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1839 {
1840         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1841             rt != &ip6_null_entry) {
1842                 RT6_TRACE("deleted by ifdown %p\n", rt);
1843                 return -1;
1844         }
1845         return 0;
1846 }
1847
1848 void rt6_ifdown(struct net_device *dev)
1849 {
1850         fib6_clean_all(fib6_ifdown, 0, dev);
1851 }
1852
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
1858
1859 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1860 {
1861         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1862         struct inet6_dev *idev;
1863
1864         /* In IPv6 pmtu discovery is not optional,
1865            so that RTAX_MTU lock cannot disable it.
1866            We still use this lock to block changes
1867            caused by addrconf/ndisc.
1868         */
1869
1870         idev = __in6_dev_get(arg->dev);
1871         if (idev == NULL)
1872                 return 0;
1873
1874         /* For administrative MTU increase, there is no way to discover
1875            IPv6 PMTU increase, so PMTU increase should be updated here.
1876            Since RFC 1981 doesn't include administrative MTU increase
1877            update PMTU increase is a MUST. (i.e. jumbo frame)
1878          */
1879         /*
1880            If new MTU is less than route PMTU, this new MTU will be the
1881            lowest MTU in the path, update the route PMTU to reflect PMTU
1882            decreases; if new MTU is greater than route PMTU, and the
1883            old MTU is the lowest MTU in the path, update the route PMTU
1884            to reflect the increase. In this case if the other nodes' MTU
1885            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1886            PMTU discouvery.
1887          */
1888         if (rt->rt6i_dev == arg->dev &&
1889             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1890             (dst_mtu(&rt->u.dst) > arg->mtu ||
1891              (dst_mtu(&rt->u.dst) < arg->mtu &&
1892               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1893                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1894         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1895         return 0;
1896 }
1897
1898 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1899 {
1900         struct rt6_mtu_change_arg arg = {
1901                 .dev = dev,
1902                 .mtu = mtu,
1903         };
1904
1905         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1906 }
1907
1908 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1909         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1910         [RTA_OIF]               = { .type = NLA_U32 },
1911         [RTA_IIF]               = { .type = NLA_U32 },
1912         [RTA_PRIORITY]          = { .type = NLA_U32 },
1913         [RTA_METRICS]           = { .type = NLA_NESTED },
1914 };
1915
1916 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1917                               struct fib6_config *cfg)
1918 {
1919         struct rtmsg *rtm;
1920         struct nlattr *tb[RTA_MAX+1];
1921         int err;
1922
1923         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1924         if (err < 0)
1925                 goto errout;
1926
1927         err = -EINVAL;
1928         rtm = nlmsg_data(nlh);
1929         memset(cfg, 0, sizeof(*cfg));
1930
1931         cfg->fc_table = rtm->rtm_table;
1932         cfg->fc_dst_len = rtm->rtm_dst_len;
1933         cfg->fc_src_len = rtm->rtm_src_len;
1934         cfg->fc_flags = RTF_UP;
1935         cfg->fc_protocol = rtm->rtm_protocol;
1936
1937         if (rtm->rtm_type == RTN_UNREACHABLE)
1938                 cfg->fc_flags |= RTF_REJECT;
1939
1940         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1941         cfg->fc_nlinfo.nlh = nlh;
1942
1943         if (tb[RTA_GATEWAY]) {
1944                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1945                 cfg->fc_flags |= RTF_GATEWAY;
1946         }
1947
1948         if (tb[RTA_DST]) {
1949                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1950
1951                 if (nla_len(tb[RTA_DST]) < plen)
1952                         goto errout;
1953
1954                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1955         }
1956
1957         if (tb[RTA_SRC]) {
1958                 int plen = (rtm->rtm_src_len + 7) >> 3;
1959
1960                 if (nla_len(tb[RTA_SRC]) < plen)
1961                         goto errout;
1962
1963                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1964         }
1965
1966         if (tb[RTA_OIF])
1967                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1968
1969         if (tb[RTA_PRIORITY])
1970                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1971
1972         if (tb[RTA_METRICS]) {
1973                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1974                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1975         }
1976
1977         if (tb[RTA_TABLE])
1978                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1979
1980         err = 0;
1981 errout:
1982         return err;
1983 }
1984
1985 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1986 {
1987         struct fib6_config cfg;
1988         int err;
1989
1990         err = rtm_to_fib6_config(skb, nlh, &cfg);
1991         if (err < 0)
1992                 return err;
1993
1994         return ip6_route_del(&cfg);
1995 }
1996
1997 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1998 {
1999         struct fib6_config cfg;
2000         int err;
2001
2002         err = rtm_to_fib6_config(skb, nlh, &cfg);
2003         if (err < 0)
2004                 return err;
2005
2006         return ip6_route_add(&cfg);
2007 }
2008
2009 static inline size_t rt6_nlmsg_size(void)
2010 {
2011         return NLMSG_ALIGN(sizeof(struct rtmsg))
2012                + nla_total_size(16) /* RTA_SRC */
2013                + nla_total_size(16) /* RTA_DST */
2014                + nla_total_size(16) /* RTA_GATEWAY */
2015                + nla_total_size(16) /* RTA_PREFSRC */
2016                + nla_total_size(4) /* RTA_TABLE */
2017                + nla_total_size(4) /* RTA_IIF */
2018                + nla_total_size(4) /* RTA_OIF */
2019                + nla_total_size(4) /* RTA_PRIORITY */
2020                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2021                + nla_total_size(sizeof(struct rta_cacheinfo));
2022 }
2023
2024 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2025                          struct in6_addr *dst, struct in6_addr *src,
2026                          int iif, int type, u32 pid, u32 seq,
2027                          int prefix, unsigned int flags)
2028 {
2029         struct rtmsg *rtm;
2030         struct nlmsghdr *nlh;
2031         long expires;
2032         u32 table;
2033
2034         if (prefix) {   /* user wants prefix routes only */
2035                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2036                         /* success since this is not a prefix route */
2037                         return 1;
2038                 }
2039         }
2040
2041         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2042         if (nlh == NULL)
2043                 return -ENOBUFS;
2044
2045         rtm = nlmsg_data(nlh);
2046         rtm->rtm_family = AF_INET6;
2047         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2048         rtm->rtm_src_len = rt->rt6i_src.plen;
2049         rtm->rtm_tos = 0;
2050         if (rt->rt6i_table)
2051                 table = rt->rt6i_table->tb6_id;
2052         else
2053                 table = RT6_TABLE_UNSPEC;
2054         rtm->rtm_table = table;
2055         NLA_PUT_U32(skb, RTA_TABLE, table);
2056         if (rt->rt6i_flags&RTF_REJECT)
2057                 rtm->rtm_type = RTN_UNREACHABLE;
2058         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2059                 rtm->rtm_type = RTN_LOCAL;
2060         else
2061                 rtm->rtm_type = RTN_UNICAST;
2062         rtm->rtm_flags = 0;
2063         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2064         rtm->rtm_protocol = rt->rt6i_protocol;
2065         if (rt->rt6i_flags&RTF_DYNAMIC)
2066                 rtm->rtm_protocol = RTPROT_REDIRECT;
2067         else if (rt->rt6i_flags & RTF_ADDRCONF)
2068                 rtm->rtm_protocol = RTPROT_KERNEL;
2069         else if (rt->rt6i_flags&RTF_DEFAULT)
2070                 rtm->rtm_protocol = RTPROT_RA;
2071
2072         if (rt->rt6i_flags&RTF_CACHE)
2073                 rtm->rtm_flags |= RTM_F_CLONED;
2074
2075         if (dst) {
2076                 NLA_PUT(skb, RTA_DST, 16, dst);
2077                 rtm->rtm_dst_len = 128;
2078         } else if (rtm->rtm_dst_len)
2079                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2080 #ifdef CONFIG_IPV6_SUBTREES
2081         if (src) {
2082                 NLA_PUT(skb, RTA_SRC, 16, src);
2083                 rtm->rtm_src_len = 128;
2084         } else if (rtm->rtm_src_len)
2085                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2086 #endif
2087         if (iif)
2088                 NLA_PUT_U32(skb, RTA_IIF, iif);
2089         else if (dst) {
2090                 struct in6_addr saddr_buf;
2091                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2092                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2093         }
2094
2095         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2096                 goto nla_put_failure;
2097
2098         if (rt->u.dst.neighbour)
2099                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2100
2101         if (rt->u.dst.dev)
2102                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2103
2104         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2105
2106         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2107         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2108                                expires, rt->u.dst.error) < 0)
2109                 goto nla_put_failure;
2110
2111         return nlmsg_end(skb, nlh);
2112
2113 nla_put_failure:
2114         return nlmsg_cancel(skb, nlh);
2115 }
2116
2117 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2118 {
2119         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2120         int prefix;
2121
2122         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2123                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2124                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2125         } else
2126                 prefix = 0;
2127
2128         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2129                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2130                      prefix, NLM_F_MULTI);
2131 }
2132
2133 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2134 {
2135         struct nlattr *tb[RTA_MAX+1];
2136         struct rt6_info *rt;
2137         struct sk_buff *skb;
2138         struct rtmsg *rtm;
2139         struct flowi fl;
2140         int err, iif = 0;
2141
2142         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2143         if (err < 0)
2144                 goto errout;
2145
2146         err = -EINVAL;
2147         memset(&fl, 0, sizeof(fl));
2148
2149         if (tb[RTA_SRC]) {
2150                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2151                         goto errout;
2152
2153                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2154         }
2155
2156         if (tb[RTA_DST]) {
2157                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2158                         goto errout;
2159
2160                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2161         }
2162
2163         if (tb[RTA_IIF])
2164                 iif = nla_get_u32(tb[RTA_IIF]);
2165
2166         if (tb[RTA_OIF])
2167                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2168
2169         if (iif) {
2170                 struct net_device *dev;
2171                 dev = __dev_get_by_index(iif);
2172                 if (!dev) {
2173                         err = -ENODEV;
2174                         goto errout;
2175                 }
2176         }
2177
2178         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2179         if (skb == NULL) {
2180                 err = -ENOBUFS;
2181                 goto errout;
2182         }
2183
2184         /* Reserve room for dummy headers, this skb can pass
2185            through good chunk of routing engine.
2186          */
2187         skb->mac.raw = skb->data;
2188         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2189
2190         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2191         skb->dst = &rt->u.dst;
2192
2193         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2194                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2195                             nlh->nlmsg_seq, 0, 0);
2196         if (err < 0) {
2197                 kfree_skb(skb);
2198                 goto errout;
2199         }
2200
2201         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2202 errout:
2203         return err;
2204 }
2205
2206 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2207 {
2208         struct sk_buff *skb;
2209         u32 pid = 0, seq = 0;
2210         struct nlmsghdr *nlh = NULL;
2211         int err = -ENOBUFS;
2212
2213         if (info) {
2214                 pid = info->pid;
2215                 nlh = info->nlh;
2216                 if (nlh)
2217                         seq = nlh->nlmsg_seq;
2218         }
2219
2220         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2221         if (skb == NULL)
2222                 goto errout;
2223
2224         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2225         /* failure implies BUG in rt6_nlmsg_size() */
2226         BUG_ON(err < 0);
2227
2228         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2229 errout:
2230         if (err < 0)
2231                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2232 }
2233
2234 /*
2235  *      /proc
2236  */
2237
2238 #ifdef CONFIG_PROC_FS
2239
/*
 * Record size that rt6_info_route() and rt6_proc_info() use to convert
 * between byte offsets and route indices.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
        char *buffer;   /* output buffer */
        int offset;     /* byte offset the reader asked for */
        int length;     /* byte count the reader asked for */
        int skip;       /* routes skipped so far */
        int len;        /* bytes written to buffer so far */
};
2250
2251 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2252 {
2253         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2254
2255         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2256                 arg->skip++;
2257                 return 0;
2258         }
2259
2260         if (arg->len >= arg->length)
2261                 return 0;
2262
2263         arg->len += sprintf(arg->buffer + arg->len,
2264                             NIP6_SEQFMT " %02x ",
2265                             NIP6(rt->rt6i_dst.addr),
2266                             rt->rt6i_dst.plen);
2267
2268 #ifdef CONFIG_IPV6_SUBTREES
2269         arg->len += sprintf(arg->buffer + arg->len,
2270                             NIP6_SEQFMT " %02x ",
2271                             NIP6(rt->rt6i_src.addr),
2272                             rt->rt6i_src.plen);
2273 #else
2274         arg->len += sprintf(arg->buffer + arg->len,
2275                             "00000000000000000000000000000000 00 ");
2276 #endif
2277
2278         if (rt->rt6i_nexthop) {
2279                 arg->len += sprintf(arg->buffer + arg->len,
2280                                     NIP6_SEQFMT,
2281                                     NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2282         } else {
2283                 arg->len += sprintf(arg->buffer + arg->len,
2284                                     "00000000000000000000000000000000");
2285         }
2286         arg->len += sprintf(arg->buffer + arg->len,
2287                             " %08x %08x %08x %08x %8s\n",
2288                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2289                             rt->u.dst.__use, rt->rt6i_flags, 
2290                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2291         return 0;
2292 }
2293
2294 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2295 {
2296         struct rt6_proc_arg arg = {
2297                 .buffer = buffer,
2298                 .offset = offset,
2299                 .length = length,
2300         };
2301
2302         fib6_clean_all(rt6_info_route, 0, &arg);
2303
2304         *start = buffer;
2305         if (offset)
2306                 *start += offset % RT6_INFO_LEN;
2307
2308         arg.len -= offset % RT6_INFO_LEN;
2309
2310         if (arg.len > length)
2311                 arg.len = length;
2312         if (arg.len < 0)
2313                 arg.len = 0;
2314
2315         return arg.len;
2316 }
2317
2318 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2319 {
2320         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2321                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2322                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2323                       rt6_stats.fib_rt_cache,
2324                       atomic_read(&ip6_dst_ops.entries),
2325                       rt6_stats.fib_discarded_routes);
2326
2327         return 0;
2328 }
2329
2330 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2331 {
2332         return single_open(file, rt6_stats_seq_show, NULL);
2333 }
2334
2335 static struct file_operations rt6_stats_seq_fops = {
2336         .owner   = THIS_MODULE,
2337         .open    = rt6_stats_seq_open,
2338         .read    = seq_read,
2339         .llseek  = seq_lseek,
2340         .release = single_release,
2341 };
2342 #endif  /* CONFIG_PROC_FS */
2343
2344 #ifdef CONFIG_SYSCTL
2345
2346 static int flush_delay;
2347
2348 static
2349 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2350                               void __user *buffer, size_t *lenp, loff_t *ppos)
2351 {
2352         if (write) {
2353                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2354                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2355                 return 0;
2356         } else
2357                 return -EINVAL;
2358 }
2359
2360 ctl_table ipv6_route_table[] = {
2361         {
2362                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2363                 .procname       =       "flush",
2364                 .data           =       &flush_delay,
2365                 .maxlen         =       sizeof(int),
2366                 .mode           =       0200,
2367                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2368         },
2369         {
2370                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2371                 .procname       =       "gc_thresh",
2372                 .data           =       &ip6_dst_ops.gc_thresh,
2373                 .maxlen         =       sizeof(int),
2374                 .mode           =       0644,
2375                 .proc_handler   =       &proc_dointvec,
2376         },
2377         {
2378                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2379                 .procname       =       "max_size",
2380                 .data           =       &ip6_rt_max_size,
2381                 .maxlen         =       sizeof(int),
2382                 .mode           =       0644,
2383                 .proc_handler   =       &proc_dointvec,
2384         },
2385         {
2386                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2387                 .procname       =       "gc_min_interval",
2388                 .data           =       &ip6_rt_gc_min_interval,
2389                 .maxlen         =       sizeof(int),
2390                 .mode           =       0644,
2391                 .proc_handler   =       &proc_dointvec_jiffies,
2392                 .strategy       =       &sysctl_jiffies,
2393         },
2394         {
2395                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2396                 .procname       =       "gc_timeout",
2397                 .data           =       &ip6_rt_gc_timeout,
2398                 .maxlen         =       sizeof(int),
2399                 .mode           =       0644,
2400                 .proc_handler   =       &proc_dointvec_jiffies,
2401                 .strategy       =       &sysctl_jiffies,
2402         },
2403         {
2404                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2405                 .procname       =       "gc_interval",
2406                 .data           =       &ip6_rt_gc_interval,
2407                 .maxlen         =       sizeof(int),
2408                 .mode           =       0644,
2409                 .proc_handler   =       &proc_dointvec_jiffies,
2410                 .strategy       =       &sysctl_jiffies,
2411         },
2412         {
2413                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2414                 .procname       =       "gc_elasticity",
2415                 .data           =       &ip6_rt_gc_elasticity,
2416                 .maxlen         =       sizeof(int),
2417                 .mode           =       0644,
2418                 .proc_handler   =       &proc_dointvec_jiffies,
2419                 .strategy       =       &sysctl_jiffies,
2420         },
2421         {
2422                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2423                 .procname       =       "mtu_expires",
2424                 .data           =       &ip6_rt_mtu_expires,
2425                 .maxlen         =       sizeof(int),
2426                 .mode           =       0644,
2427                 .proc_handler   =       &proc_dointvec_jiffies,
2428                 .strategy       =       &sysctl_jiffies,
2429         },
2430         {
2431                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2432                 .procname       =       "min_adv_mss",
2433                 .data           =       &ip6_rt_min_advmss,
2434                 .maxlen         =       sizeof(int),
2435                 .mode           =       0644,
2436                 .proc_handler   =       &proc_dointvec_jiffies,
2437                 .strategy       =       &sysctl_jiffies,
2438         },
2439         {
2440                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2441                 .procname       =       "gc_min_interval_ms",
2442                 .data           =       &ip6_rt_gc_min_interval,
2443                 .maxlen         =       sizeof(int),
2444                 .mode           =       0644,
2445                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2446                 .strategy       =       &sysctl_ms_jiffies,
2447         },
2448         { .ctl_name = 0 }
2449 };
2450
2451 #endif
2452
2453 void __init ip6_route_init(void)
2454 {
2455         struct proc_dir_entry *p;
2456
2457         ip6_dst_ops.kmem_cachep =
2458                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2459                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2460         fib6_init();
2461 #ifdef  CONFIG_PROC_FS
2462         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2463         if (p)
2464                 p->owner = THIS_MODULE;
2465
2466         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2467 #endif
2468 #ifdef CONFIG_XFRM
2469         xfrm6_init();
2470 #endif
2471 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2472         fib6_rules_init();
2473 #endif
2474 }
2475
2476 void ip6_route_cleanup(void)
2477 {
2478 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2479         fib6_rules_cleanup();
2480 #endif
2481 #ifdef CONFIG_PROC_FS
2482         proc_net_remove("ipv6_route");
2483         proc_net_remove("rt6_stats");
2484 #endif
2485 #ifdef CONFIG_XFRM
2486         xfrm6_fini();
2487 #endif
2488         rt6_ifdown(NULL);
2489         fib6_gc_cleanup();
2490         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2491 }