Merge branch 'upstream-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/linvil...
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Compile-time debug level for this file.  With a level of 3 or more,
 * RDBG() and RT6_TRACE() turn into printk() calls; otherwise RDBG()
 * expands to nothing and RT6_TRACE() to an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
76
77 #define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
static int ip6_pkt_blk_hole(struct sk_buff *skb);

/*
 * Static reject route whose dst error is -EACCES; traffic is handed to
 * ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.__refcnt = ATOMIC_INIT(1),
			.__use = 1,
			.dev = &loopback_dev,
			.obsolete = -1,
			.error = -EACCES,
			.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
			.input = ip6_pkt_prohibit,
			.output = ip6_pkt_prohibit_out,
			.ops = &ip6_dst_ops,
			.path = (struct dst_entry*)&ip6_prohibit_entry,
		}
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

/*
 * Static reject route whose dst error is -EINVAL; both directions are
 * handed to ip6_pkt_blk_hole().
 */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.__refcnt = ATOMIC_INIT(1),
			.__use = 1,
			.dev = &loopback_dev,
			.obsolete = -1,
			.error = -EINVAL,
			.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
			.input = ip6_pkt_blk_hole,
			.output = ip6_pkt_blk_hole,
			.ops = &ip6_dst_ops,
			.path = (struct dst_entry*)&ip6_blk_hole_entry,
		}
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

#endif
189
190 /* allocate dst with ip6_dst_ops */
191 static __inline__ struct rt6_info *ip6_dst_alloc(void)
192 {
193         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
194 }
195
196 static void ip6_dst_destroy(struct dst_entry *dst)
197 {
198         struct rt6_info *rt = (struct rt6_info *)dst;
199         struct inet6_dev *idev = rt->rt6i_idev;
200
201         if (idev != NULL) {
202                 rt->rt6i_idev = NULL;
203                 in6_dev_put(idev);
204         }       
205 }
206
207 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
208                            int how)
209 {
210         struct rt6_info *rt = (struct rt6_info *)dst;
211         struct inet6_dev *idev = rt->rt6i_idev;
212
213         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
214                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239                                                     int oif,
240                                                     int strict)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (oif) {
246                 for (sprt = rt; sprt; sprt = sprt->u.next) {
247                         struct net_device *dev = sprt->rt6i_dev;
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (strict && oif)
254                                                 continue;
255                                         if (local && (!oif || 
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 }
262
263                 if (local)
264                         return local;
265
266                 if (strict)
267                         return &ip6_null_entry;
268         }
269         return rt;
270 }
271
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a
 * NUD_VALID state, send a Neighbour Solicitation to its solicited-node
 * multicast address.  Probes are rate limited: one is sent only when more
 * than cnf.rtr_probe_interval jiffies have passed since neigh->updated.
 * A NULL @rt, or a route without a next hop, is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state under the lock and stamp the probe time. */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
307
308 /*
309  * Default Router Selection (RFC 2461 6.3.6)
310  */
311 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
312 {
313         struct net_device *dev = rt->rt6i_dev;
314         if (!oif || dev->ifindex == oif)
315                 return 2;
316         if ((dev->flags & IFF_LOOPBACK) &&
317             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
318                 return 1;
319         return 0;
320 }
321
322 static int inline rt6_check_neigh(struct rt6_info *rt)
323 {
324         struct neighbour *neigh = rt->rt6i_nexthop;
325         int m = 0;
326         if (rt->rt6i_flags & RTF_NONEXTHOP ||
327             !(rt->rt6i_flags & RTF_GATEWAY))
328                 m = 1;
329         else if (neigh) {
330                 read_lock_bh(&neigh->lock);
331                 if (neigh->nud_state & NUD_VALID)
332                         m = 2;
333                 read_unlock_bh(&neigh->lock);
334         }
335         return m;
336 }
337
338 static int rt6_score_route(struct rt6_info *rt, int oif,
339                            int strict)
340 {
341         int m, n;
342                 
343         m = rt6_check_dev(rt, oif);
344         if (!m && (strict & RT6_LOOKUP_F_IFACE))
345                 return -1;
346 #ifdef CONFIG_IPV6_ROUTER_PREF
347         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
348 #endif
349         n = rt6_check_neigh(rt);
350         if (n > 1)
351                 m |= 16;
352         else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
353                 return -1;
354         return m;
355 }
356
357 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
358                                    int strict)
359 {
360         struct rt6_info *match = NULL, *last = NULL;
361         struct rt6_info *rt, *rt0 = *head;
362         u32 metric;
363         int mpri = -1;
364
365         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
366                   __FUNCTION__, head, head ? *head : NULL, oif);
367
368         for (rt = rt0, metric = rt0->rt6i_metric;
369              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370              rt = rt->u.next) {
371                 int m;
372
373                 if (rt6_check_expired(rt))
374                         continue;
375
376                 last = rt;
377
378                 m = rt6_score_route(rt, oif, strict);
379                 if (m < 0)
380                         continue;
381
382                 if (m > mpri) {
383                         rt6_probe(match);
384                         match = rt;
385                         mpri = m;
386                 } else {
387                         rt6_probe(rt);
388                 }
389         }
390
391         if (!match &&
392             (strict & RT6_LOOKUP_F_REACHABLE) &&
393             last && last != rt0) {
394                 /* no entries matched; do round-robin */
395                 static DEFINE_SPINLOCK(lock);
396                 spin_lock(&lock);
397                 *head = rt0->u.next;
398                 rt0->u.next = last->u.next;
399                 last->u.next = rt0;
400                 spin_unlock(&lock);
401         }
402
403         RT6_TRACE("%s() => %p, score=%d\n",
404                   __FUNCTION__, match, mpri);
405
406         return (match ? match : &ip6_null_entry);
407 }
408
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a received Route Information option.
 *
 * @dev:    device the option arrived on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * The option is rejected when it is shorter than struct route_info or when
 * its length/prefix_len fields fail the sanity checks below.  A lifetime
 * of zero deletes a matching route; otherwise the route is created or its
 * preference refreshed.  A lifetime of 0xffffffff clears RTF_EXPIRES, any
 * other value sets the expiry to that many seconds from now.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/* Guard against a negative length before the unsigned comparison. */
	if (len < 0 || (size_t)len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* The lifetime arrives in network byte order: convert to host order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* drop the reference obtained from the get/add helper */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
486
/*
 * BACKTRACK(saddr) - retry a failed lookup further up the FIB tree.
 *
 * Must be expanded inside a function that has local variables `rt` (the
 * lookup result) and `fn` (the current fib6_node) and labels `restart`
 * and `out`.  When `rt` is &ip6_null_entry it walks from `fn` towards the
 * root: at each step it moves to the parent, or, if the parent has a
 * source subtree other than `fn`, looks @saddr up in that subtree.  It
 * jumps to `restart` at the first node carrying route info (RTN_RTINFO)
 * and to `out` once the top-level root is reached.
 *
 * The subtree is always reached through FIB6_SUBTREE() so that the test
 * and the lookup use the same accessor.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
504
505 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
506                                              struct flowi *fl, int flags)
507 {
508         struct fib6_node *fn;
509         struct rt6_info *rt;
510
511         read_lock_bh(&table->tb6_lock);
512         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
513 restart:
514         rt = fn->leaf;
515         rt = rt6_device_match(rt, fl->oif, flags);
516         BACKTRACK(&fl->fl6_src);
517 out:
518         dst_hold(&rt->u.dst);
519         read_unlock_bh(&table->tb6_lock);
520
521         rt->u.dst.lastuse = jiffies;
522         rt->u.dst.__use++;
523
524         return rt;
525
526 }
527
528 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
529                             int oif, int strict)
530 {
531         struct flowi fl = {
532                 .oif = oif,
533                 .nl_u = {
534                         .ip6_u = {
535                                 .daddr = *daddr,
536                         },
537                 },
538         };
539         struct dst_entry *dst;
540         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
541
542         if (saddr) {
543                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
544                 flags |= RT6_LOOKUP_F_HAS_SADDR;
545         }
546
547         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
548         if (dst->error == 0)
549                 return (struct rt6_info *) dst;
550
551         dst_release(dst);
552
553         return NULL;
554 }
555
556 /* ip6_ins_rt is called with FREE table->tb6_lock.
557    It takes new route entry, the addition fails by any reason the
558    route is freed. In any case, if caller does not hold it, it may
559    be destroyed.
560  */
561
562 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
563 {
564         int err;
565         struct fib6_table *table;
566
567         table = rt->rt6i_table;
568         write_lock_bh(&table->tb6_lock);
569         err = fib6_add(&table->tb6_root, rt, info);
570         write_unlock_bh(&table->tb6_lock);
571
572         return err;
573 }
574
575 int ip6_ins_rt(struct rt6_info *rt)
576 {
577         return __ip6_ins_rt(rt, NULL);
578 }
579
580 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
581                                       struct in6_addr *saddr)
582 {
583         struct rt6_info *rt;
584
585         /*
586          *      Clone the route.
587          */
588
589         rt = ip6_rt_copy(ort);
590
591         if (rt) {
592                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
593                         if (rt->rt6i_dst.plen != 128 &&
594                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
595                                 rt->rt6i_flags |= RTF_ANYCAST;
596                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
597                 }
598
599                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
600                 rt->rt6i_dst.plen = 128;
601                 rt->rt6i_flags |= RTF_CACHE;
602                 rt->u.dst.flags |= DST_HOST;
603
604 #ifdef CONFIG_IPV6_SUBTREES
605                 if (rt->rt6i_src.plen && saddr) {
606                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
607                         rt->rt6i_src.plen = 128;
608                 }
609 #endif
610
611                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
612
613         }
614
615         return rt;
616 }
617
618 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
619 {
620         struct rt6_info *rt = ip6_rt_copy(ort);
621         if (rt) {
622                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
623                 rt->rt6i_dst.plen = 128;
624                 rt->rt6i_flags |= RTF_CACHE;
625                 rt->u.dst.flags |= DST_HOST;
626                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
627         }
628         return rt;
629 }
630
631 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
632                                             struct flowi *fl, int flags)
633 {
634         struct fib6_node *fn;
635         struct rt6_info *rt, *nrt;
636         int strict = 0;
637         int attempts = 3;
638         int err;
639         int reachable = RT6_LOOKUP_F_REACHABLE;
640
641         strict |= flags & RT6_LOOKUP_F_IFACE;
642
643 relookup:
644         read_lock_bh(&table->tb6_lock);
645
646 restart_2:
647         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
648
649 restart:
650         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
651         BACKTRACK(&fl->fl6_src);
652         if (rt == &ip6_null_entry ||
653             rt->rt6i_flags & RTF_CACHE)
654                 goto out;
655
656         dst_hold(&rt->u.dst);
657         read_unlock_bh(&table->tb6_lock);
658
659         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
660                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
661         else {
662 #if CLONE_OFFLINK_ROUTE
663                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
664 #else
665                 goto out2;
666 #endif
667         }
668
669         dst_release(&rt->u.dst);
670         rt = nrt ? : &ip6_null_entry;
671
672         dst_hold(&rt->u.dst);
673         if (nrt) {
674                 err = ip6_ins_rt(nrt);
675                 if (!err)
676                         goto out2;
677         }
678
679         if (--attempts <= 0)
680                 goto out2;
681
682         /*
683          * Race condition! In the gap, when table->tb6_lock was
684          * released someone could insert this route.  Relookup.
685          */
686         dst_release(&rt->u.dst);
687         goto relookup;
688
689 out:
690         if (reachable) {
691                 reachable = 0;
692                 goto restart_2;
693         }
694         dst_hold(&rt->u.dst);
695         read_unlock_bh(&table->tb6_lock);
696 out2:
697         rt->u.dst.lastuse = jiffies;
698         rt->u.dst.__use++;
699
700         return rt;
701 }
702
703 void ip6_route_input(struct sk_buff *skb)
704 {
705         struct ipv6hdr *iph = skb->nh.ipv6h;
706         int flags = RT6_LOOKUP_F_HAS_SADDR;
707         struct flowi fl = {
708                 .iif = skb->dev->ifindex,
709                 .nl_u = {
710                         .ip6_u = {
711                                 .daddr = iph->daddr,
712                                 .saddr = iph->saddr,
713 #ifdef CONFIG_IPV6_ROUTE_FWMARK
714                                 .fwmark = skb->nfmark,
715 #endif
716                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
717                         },
718                 },
719                 .proto = iph->nexthdr,
720         };
721
722         if (rt6_need_strict(&iph->daddr))
723                 flags |= RT6_LOOKUP_F_IFACE;
724
725         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
726 }
727
728 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
729                                              struct flowi *fl, int flags)
730 {
731         struct fib6_node *fn;
732         struct rt6_info *rt, *nrt;
733         int strict = 0;
734         int attempts = 3;
735         int err;
736         int reachable = RT6_LOOKUP_F_REACHABLE;
737
738         strict |= flags & RT6_LOOKUP_F_IFACE;
739
740 relookup:
741         read_lock_bh(&table->tb6_lock);
742
743 restart_2:
744         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
745
746 restart:
747         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
748         BACKTRACK(&fl->fl6_src);
749         if (rt == &ip6_null_entry ||
750             rt->rt6i_flags & RTF_CACHE)
751                 goto out;
752
753         dst_hold(&rt->u.dst);
754         read_unlock_bh(&table->tb6_lock);
755
756         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
757                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
758         else {
759 #if CLONE_OFFLINK_ROUTE
760                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
761 #else
762                 goto out2;
763 #endif
764         }
765
766         dst_release(&rt->u.dst);
767         rt = nrt ? : &ip6_null_entry;
768
769         dst_hold(&rt->u.dst);
770         if (nrt) {
771                 err = ip6_ins_rt(nrt);
772                 if (!err)
773                         goto out2;
774         }
775
776         if (--attempts <= 0)
777                 goto out2;
778
779         /*
780          * Race condition! In the gap, when table->tb6_lock was
781          * released someone could insert this route.  Relookup.
782          */
783         dst_release(&rt->u.dst);
784         goto relookup;
785
786 out:
787         if (reachable) {
788                 reachable = 0;
789                 goto restart_2;
790         }
791         dst_hold(&rt->u.dst);
792         read_unlock_bh(&table->tb6_lock);
793 out2:
794         rt->u.dst.lastuse = jiffies;
795         rt->u.dst.__use++;
796         return rt;
797 }
798
799 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
800 {
801         int flags = 0;
802
803         if (rt6_need_strict(&fl->fl6_dst))
804                 flags |= RT6_LOOKUP_F_IFACE;
805
806         if (!ipv6_addr_any(&fl->fl6_src))
807                 flags |= RT6_LOOKUP_F_HAS_SADDR;
808
809         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
810 }
811
812
813 /*
814  *      Destination cache support functions
815  */
816
817 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
818 {
819         struct rt6_info *rt;
820
821         rt = (struct rt6_info *) dst;
822
823         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
824                 return dst;
825
826         return NULL;
827 }
828
829 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
830 {
831         struct rt6_info *rt = (struct rt6_info *) dst;
832
833         if (rt) {
834                 if (rt->rt6i_flags & RTF_CACHE)
835                         ip6_del_rt(rt);
836                 else
837                         dst_release(dst);
838         }
839         return NULL;
840 }
841
842 static void ip6_link_failure(struct sk_buff *skb)
843 {
844         struct rt6_info *rt;
845
846         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
847
848         rt = (struct rt6_info *) skb->dst;
849         if (rt) {
850                 if (rt->rt6i_flags&RTF_CACHE) {
851                         dst_set_expires(&rt->u.dst, 0);
852                         rt->rt6i_flags |= RTF_EXPIRES;
853                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
854                         rt->rt6i_node->fn_sernum = -1;
855         }
856 }
857
858 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
859 {
860         struct rt6_info *rt6 = (struct rt6_info*)dst;
861
862         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
863                 rt6->rt6i_flags |= RTF_MODIFIED;
864                 if (mtu < IPV6_MIN_MTU) {
865                         mtu = IPV6_MIN_MTU;
866                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
867                 }
868                 dst->metrics[RTAX_MTU-1] = mtu;
869                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
870         }
871 }
872
873 static int ipv6_get_mtu(struct net_device *dev);
874
875 static inline unsigned int ipv6_advmss(unsigned int mtu)
876 {
877         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
878
879         if (mtu < ip6_rt_min_advmss)
880                 mtu = ip6_rt_min_advmss;
881
882         /*
883          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
884          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
885          * IPV6_MAXPLEN is also valid and means: "any MSS, 
886          * rely only on pmtu discovery"
887          */
888         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
889                 mtu = IPV6_MAXPLEN;
890         return mtu;
891 }
892
893 static struct dst_entry *ndisc_dst_gc_list;
894 static DEFINE_SPINLOCK(ndisc_lock);
895
896 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
897                                   struct neighbour *neigh,
898                                   struct in6_addr *addr,
899                                   int (*output)(struct sk_buff *))
900 {
901         struct rt6_info *rt;
902         struct inet6_dev *idev = in6_dev_get(dev);
903
904         if (unlikely(idev == NULL))
905                 return NULL;
906
907         rt = ip6_dst_alloc();
908         if (unlikely(rt == NULL)) {
909                 in6_dev_put(idev);
910                 goto out;
911         }
912
913         dev_hold(dev);
914         if (neigh)
915                 neigh_hold(neigh);
916         else
917                 neigh = ndisc_get_neigh(dev, addr);
918
919         rt->rt6i_dev      = dev;
920         rt->rt6i_idev     = idev;
921         rt->rt6i_nexthop  = neigh;
922         atomic_set(&rt->u.dst.__refcnt, 1);
923         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
924         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
925         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
926         rt->u.dst.output  = output;
927
928 #if 0   /* there's no chance to use these for ndisc */
929         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
930                                 ? DST_HOST 
931                                 : 0;
932         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
933         rt->rt6i_dst.plen = 128;
934 #endif
935
936         spin_lock_bh(&ndisc_lock);
937         rt->u.dst.next = ndisc_dst_gc_list;
938         ndisc_dst_gc_list = &rt->u.dst;
939         spin_unlock_bh(&ndisc_lock);
940
941         fib6_force_start_gc();
942
943 out:
944         return (struct dst_entry *)rt;
945 }
946
947 int ndisc_dst_gc(int *more)
948 {
949         struct dst_entry *dst, *next, **pprev;
950         int freed;
951
952         next = NULL;
953         freed = 0;
954
955         spin_lock_bh(&ndisc_lock);
956         pprev = &ndisc_dst_gc_list;
957
958         while ((dst = *pprev) != NULL) {
959                 if (!atomic_read(&dst->__refcnt)) {
960                         *pprev = dst->next;
961                         dst_free(dst);
962                         freed++;
963                 } else {
964                         pprev = &dst->next;
965                         (*more)++;
966                 }
967         }
968
969         spin_unlock_bh(&ndisc_lock);
970
971         return freed;
972 }
973
974 static int ip6_dst_gc(void)
975 {
976         static unsigned expire = 30*HZ;
977         static unsigned long last_gc;
978         unsigned long now = jiffies;
979
980         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
981             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
982                 goto out;
983
984         expire++;
985         fib6_run_gc(expire);
986         last_gc = now;
987         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
988                 expire = ip6_rt_gc_timeout>>1;
989
990 out:
991         expire -= expire>>ip6_rt_gc_elasticity;
992         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
993 }
994
995 /* Clean host part of a prefix. Not necessary in radix tree,
996    but results in cleaner routing tables.
997
998    Remove it only when all the things will work!
999  */
1000
1001 static int ipv6_get_mtu(struct net_device *dev)
1002 {
1003         int mtu = IPV6_MIN_MTU;
1004         struct inet6_dev *idev;
1005
1006         idev = in6_dev_get(dev);
1007         if (idev) {
1008                 mtu = idev->cnf.mtu6;
1009                 in6_dev_put(idev);
1010         }
1011         return mtu;
1012 }
1013
1014 int ipv6_get_hoplimit(struct net_device *dev)
1015 {
1016         int hoplimit = ipv6_devconf.hop_limit;
1017         struct inet6_dev *idev;
1018
1019         idev = in6_dev_get(dev);
1020         if (idev) {
1021                 hoplimit = idev->cnf.hop_limit;
1022                 in6_dev_put(idev);
1023         }
1024         return hoplimit;
1025 }
1026
1027 /*
1028  *
1029  */
1030
1031 int ip6_route_add(struct fib6_config *cfg)
1032 {
1033         int err;
1034         struct rt6_info *rt = NULL;
1035         struct net_device *dev = NULL;
1036         struct inet6_dev *idev = NULL;
1037         struct fib6_table *table;
1038         int addr_type;
1039
1040         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1041                 return -EINVAL;
1042 #ifndef CONFIG_IPV6_SUBTREES
1043         if (cfg->fc_src_len)
1044                 return -EINVAL;
1045 #endif
1046         if (cfg->fc_ifindex) {
1047                 err = -ENODEV;
1048                 dev = dev_get_by_index(cfg->fc_ifindex);
1049                 if (!dev)
1050                         goto out;
1051                 idev = in6_dev_get(dev);
1052                 if (!idev)
1053                         goto out;
1054         }
1055
1056         if (cfg->fc_metric == 0)
1057                 cfg->fc_metric = IP6_RT_PRIO_USER;
1058
1059         table = fib6_new_table(cfg->fc_table);
1060         if (table == NULL) {
1061                 err = -ENOBUFS;
1062                 goto out;
1063         }
1064
1065         rt = ip6_dst_alloc();
1066
1067         if (rt == NULL) {
1068                 err = -ENOMEM;
1069                 goto out;
1070         }
1071
1072         rt->u.dst.obsolete = -1;
1073         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1074
1075         if (cfg->fc_protocol == RTPROT_UNSPEC)
1076                 cfg->fc_protocol = RTPROT_BOOT;
1077         rt->rt6i_protocol = cfg->fc_protocol;
1078
1079         addr_type = ipv6_addr_type(&cfg->fc_dst);
1080
1081         if (addr_type & IPV6_ADDR_MULTICAST)
1082                 rt->u.dst.input = ip6_mc_input;
1083         else
1084                 rt->u.dst.input = ip6_forward;
1085
1086         rt->u.dst.output = ip6_output;
1087
1088         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1089         rt->rt6i_dst.plen = cfg->fc_dst_len;
1090         if (rt->rt6i_dst.plen == 128)
1091                rt->u.dst.flags = DST_HOST;
1092
1093 #ifdef CONFIG_IPV6_SUBTREES
1094         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1095         rt->rt6i_src.plen = cfg->fc_src_len;
1096 #endif
1097
1098         rt->rt6i_metric = cfg->fc_metric;
1099
1100         /* We cannot add true routes via loopback here,
1101            they would result in kernel looping; promote them to reject routes
1102          */
1103         if ((cfg->fc_flags & RTF_REJECT) ||
1104             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1105                 /* hold loopback dev/idev if we haven't done so. */
1106                 if (dev != &loopback_dev) {
1107                         if (dev) {
1108                                 dev_put(dev);
1109                                 in6_dev_put(idev);
1110                         }
1111                         dev = &loopback_dev;
1112                         dev_hold(dev);
1113                         idev = in6_dev_get(dev);
1114                         if (!idev) {
1115                                 err = -ENODEV;
1116                                 goto out;
1117                         }
1118                 }
1119                 rt->u.dst.output = ip6_pkt_discard_out;
1120                 rt->u.dst.input = ip6_pkt_discard;
1121                 rt->u.dst.error = -ENETUNREACH;
1122                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1123                 goto install_route;
1124         }
1125
1126         if (cfg->fc_flags & RTF_GATEWAY) {
1127                 struct in6_addr *gw_addr;
1128                 int gwa_type;
1129
1130                 gw_addr = &cfg->fc_gateway;
1131                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1132                 gwa_type = ipv6_addr_type(gw_addr);
1133
1134                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1135                         struct rt6_info *grt;
1136
1137                         /* IPv6 strictly inhibits using not link-local
1138                            addresses as nexthop address.
1139                            Otherwise, router will not able to send redirects.
1140                            It is very good, but in some (rare!) circumstances
1141                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1142                            some exceptions. --ANK
1143                          */
1144                         err = -EINVAL;
1145                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1146                                 goto out;
1147
1148                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1149
1150                         err = -EHOSTUNREACH;
1151                         if (grt == NULL)
1152                                 goto out;
1153                         if (dev) {
1154                                 if (dev != grt->rt6i_dev) {
1155                                         dst_release(&grt->u.dst);
1156                                         goto out;
1157                                 }
1158                         } else {
1159                                 dev = grt->rt6i_dev;
1160                                 idev = grt->rt6i_idev;
1161                                 dev_hold(dev);
1162                                 in6_dev_hold(grt->rt6i_idev);
1163                         }
1164                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1165                                 err = 0;
1166                         dst_release(&grt->u.dst);
1167
1168                         if (err)
1169                                 goto out;
1170                 }
1171                 err = -EINVAL;
1172                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1173                         goto out;
1174         }
1175
1176         err = -ENODEV;
1177         if (dev == NULL)
1178                 goto out;
1179
1180         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1181                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1182                 if (IS_ERR(rt->rt6i_nexthop)) {
1183                         err = PTR_ERR(rt->rt6i_nexthop);
1184                         rt->rt6i_nexthop = NULL;
1185                         goto out;
1186                 }
1187         }
1188
1189         rt->rt6i_flags = cfg->fc_flags;
1190
1191 install_route:
1192         if (cfg->fc_mx) {
1193                 struct nlattr *nla;
1194                 int remaining;
1195
1196                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1197                         int type = nla->nla_type;
1198
1199                         if (type) {
1200                                 if (type > RTAX_MAX) {
1201                                         err = -EINVAL;
1202                                         goto out;
1203                                 }
1204
1205                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1206                         }
1207                 }
1208         }
1209
1210         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1211                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1212         if (!rt->u.dst.metrics[RTAX_MTU-1])
1213                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1214         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1215                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1216         rt->u.dst.dev = dev;
1217         rt->rt6i_idev = idev;
1218         rt->rt6i_table = table;
1219         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1220
1221 out:
1222         if (dev)
1223                 dev_put(dev);
1224         if (idev)
1225                 in6_dev_put(idev);
1226         if (rt)
1227                 dst_free((struct dst_entry *) rt);
1228         return err;
1229 }
1230
1231 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1232 {
1233         int err;
1234         struct fib6_table *table;
1235
1236         if (rt == &ip6_null_entry)
1237                 return -ENOENT;
1238
1239         table = rt->rt6i_table;
1240         write_lock_bh(&table->tb6_lock);
1241
1242         err = fib6_del(rt, info);
1243         dst_release(&rt->u.dst);
1244
1245         write_unlock_bh(&table->tb6_lock);
1246
1247         return err;
1248 }
1249
/*
 *      Delete @rt via __ip6_del_rt() without an nl_info (NULL is passed).
 *      Returns whatever __ip6_del_rt() returns.
 */
int ip6_del_rt(struct rt6_info *rt)
{
        return __ip6_del_rt(rt, NULL);
}
1254
1255 static int ip6_route_del(struct fib6_config *cfg)
1256 {
1257         struct fib6_table *table;
1258         struct fib6_node *fn;
1259         struct rt6_info *rt;
1260         int err = -ESRCH;
1261
1262         table = fib6_get_table(cfg->fc_table);
1263         if (table == NULL)
1264                 return err;
1265
1266         read_lock_bh(&table->tb6_lock);
1267
1268         fn = fib6_locate(&table->tb6_root,
1269                          &cfg->fc_dst, cfg->fc_dst_len,
1270                          &cfg->fc_src, cfg->fc_src_len);
1271         
1272         if (fn) {
1273                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1274                         if (cfg->fc_ifindex &&
1275                             (rt->rt6i_dev == NULL ||
1276                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1277                                 continue;
1278                         if (cfg->fc_flags & RTF_GATEWAY &&
1279                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1280                                 continue;
1281                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1282                                 continue;
1283                         dst_hold(&rt->u.dst);
1284                         read_unlock_bh(&table->tb6_lock);
1285
1286                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1287                 }
1288         }
1289         read_unlock_bh(&table->tb6_lock);
1290
1291         return err;
1292 }
1293
1294 /*
1295  *      Handle redirects
1296  */
/*
 * Flow key for redirect lookups: a struct flowi extended with the address of
 * the router the redirect came from.  ip6_route_redirect() passes a pointer
 * to this structure as a struct flowi * and __ip6_route_redirect() casts it
 * back, so @fl has to stay the first member.
 */
struct ip6rd_flowi {
        struct flowi fl;
        struct in6_addr gateway;        /* compared against rt6i_gateway */
};
1301
1302 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1303                                              struct flowi *fl,
1304                                              int flags)
1305 {
1306         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1307         struct rt6_info *rt;
1308         struct fib6_node *fn;
1309
1310         /*
1311          * Get the "current" route for this destination and
1312          * check if the redirect has come from approriate router.
1313          *
1314          * RFC 2461 specifies that redirects should only be
1315          * accepted if they come from the nexthop to the target.
1316          * Due to the way the routes are chosen, this notion
1317          * is a bit fuzzy and one might need to check all possible
1318          * routes.
1319          */
1320
1321         read_lock_bh(&table->tb6_lock);
1322         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1323 restart:
1324         for (rt = fn->leaf; rt; rt = rt->u.next) {
1325                 /*
1326                  * Current route is on-link; redirect is always invalid.
1327                  *
1328                  * Seems, previous statement is not true. It could
1329                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1330                  * But then router serving it might decide, that we should
1331                  * know truth 8)8) --ANK (980726).
1332                  */
1333                 if (rt6_check_expired(rt))
1334                         continue;
1335                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1336                         continue;
1337                 if (fl->oif != rt->rt6i_dev->ifindex)
1338                         continue;
1339                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1340                         continue;
1341                 break;
1342         }
1343
1344         if (!rt)
1345                 rt = &ip6_null_entry;
1346         BACKTRACK(&fl->fl6_src);
1347 out:
1348         dst_hold(&rt->u.dst);
1349
1350         read_unlock_bh(&table->tb6_lock);
1351
1352         return rt;
1353 };
1354
1355 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1356                                            struct in6_addr *src,
1357                                            struct in6_addr *gateway,
1358                                            struct net_device *dev)
1359 {
1360         int flags = RT6_LOOKUP_F_HAS_SADDR;
1361         struct ip6rd_flowi rdfl = {
1362                 .fl = {
1363                         .oif = dev->ifindex,
1364                         .nl_u = {
1365                                 .ip6_u = {
1366                                         .daddr = *dest,
1367                                         .saddr = *src,
1368                                 },
1369                         },
1370                 },
1371                 .gateway = *gateway,
1372         };
1373
1374         if (rt6_need_strict(dest))
1375                 flags |= RT6_LOOKUP_F_IFACE;
1376
1377         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1378 }
1379
1380 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1381                   struct in6_addr *saddr,
1382                   struct neighbour *neigh, u8 *lladdr, int on_link)
1383 {
1384         struct rt6_info *rt, *nrt = NULL;
1385         struct netevent_redirect netevent;
1386
1387         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1388
1389         if (rt == &ip6_null_entry) {
1390                 if (net_ratelimit())
1391                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1392                                "for redirect target\n");
1393                 goto out;
1394         }
1395
1396         /*
1397          *      We have finally decided to accept it.
1398          */
1399
1400         neigh_update(neigh, lladdr, NUD_STALE, 
1401                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1402                      NEIGH_UPDATE_F_OVERRIDE|
1403                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1404                                      NEIGH_UPDATE_F_ISROUTER))
1405                      );
1406
1407         /*
1408          * Redirect received -> path was valid.
1409          * Look, redirects are sent only in response to data packets,
1410          * so that this nexthop apparently is reachable. --ANK
1411          */
1412         dst_confirm(&rt->u.dst);
1413
1414         /* Duplicate redirect: silently ignore. */
1415         if (neigh == rt->u.dst.neighbour)
1416                 goto out;
1417
1418         nrt = ip6_rt_copy(rt);
1419         if (nrt == NULL)
1420                 goto out;
1421
1422         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1423         if (on_link)
1424                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1425
1426         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1427         nrt->rt6i_dst.plen = 128;
1428         nrt->u.dst.flags |= DST_HOST;
1429
1430         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1431         nrt->rt6i_nexthop = neigh_clone(neigh);
1432         /* Reset pmtu, it may be better */
1433         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1434         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1435
1436         if (ip6_ins_rt(nrt))
1437                 goto out;
1438
1439         netevent.old = &rt->u.dst;
1440         netevent.new = &nrt->u.dst;
1441         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1442
1443         if (rt->rt6i_flags&RTF_CACHE) {
1444                 ip6_del_rt(rt);
1445                 return;
1446         }
1447
1448 out:
1449         dst_release(&rt->u.dst);
1450         return;
1451 }
1452
1453 /*
1454  *      Handle ICMP "packet too big" messages
1455  *      i.e. Path MTU discovery
1456  */
1457
1458 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1459                         struct net_device *dev, u32 pmtu)
1460 {
1461         struct rt6_info *rt, *nrt;
1462         int allfrag = 0;
1463
1464         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1465         if (rt == NULL)
1466                 return;
1467
1468         if (pmtu >= dst_mtu(&rt->u.dst))
1469                 goto out;
1470
1471         if (pmtu < IPV6_MIN_MTU) {
1472                 /*
1473                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1474                  * MTU (1280) and a fragment header should always be included
1475                  * after a node receiving Too Big message reporting PMTU is
1476                  * less than the IPv6 Minimum Link MTU.
1477                  */
1478                 pmtu = IPV6_MIN_MTU;
1479                 allfrag = 1;
1480         }
1481
1482         /* New mtu received -> path was valid.
1483            They are sent only in response to data packets,
1484            so that this nexthop apparently is reachable. --ANK
1485          */
1486         dst_confirm(&rt->u.dst);
1487
1488         /* Host route. If it is static, it would be better
1489            not to override it, but add new one, so that
1490            when cache entry will expire old pmtu
1491            would return automatically.
1492          */
1493         if (rt->rt6i_flags & RTF_CACHE) {
1494                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1495                 if (allfrag)
1496                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1497                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1498                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1499                 goto out;
1500         }
1501
1502         /* Network route.
1503            Two cases are possible:
1504            1. It is connected route. Action: COW
1505            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1506          */
1507         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1508                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1509         else
1510                 nrt = rt6_alloc_clone(rt, daddr);
1511
1512         if (nrt) {
1513                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1514                 if (allfrag)
1515                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1516
1517                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1518                  * happened within 5 mins, the recommended timer is 10 mins.
1519                  * Here this route expiration time is set to ip6_rt_mtu_expires
1520                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1521                  * and detecting PMTU increase will be automatically happened.
1522                  */
1523                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1524                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1525
1526                 ip6_ins_rt(nrt);
1527         }
1528 out:
1529         dst_release(&rt->u.dst);
1530 }
1531
1532 /*
1533  *      Misc support functions
1534  */
1535
1536 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1537 {
1538         struct rt6_info *rt = ip6_dst_alloc();
1539
1540         if (rt) {
1541                 rt->u.dst.input = ort->u.dst.input;
1542                 rt->u.dst.output = ort->u.dst.output;
1543
1544                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1545                 rt->u.dst.error = ort->u.dst.error;
1546                 rt->u.dst.dev = ort->u.dst.dev;
1547                 if (rt->u.dst.dev)
1548                         dev_hold(rt->u.dst.dev);
1549                 rt->rt6i_idev = ort->rt6i_idev;
1550                 if (rt->rt6i_idev)
1551                         in6_dev_hold(rt->rt6i_idev);
1552                 rt->u.dst.lastuse = jiffies;
1553                 rt->rt6i_expires = 0;
1554
1555                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1556                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1557                 rt->rt6i_metric = 0;
1558
1559                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1560 #ifdef CONFIG_IPV6_SUBTREES
1561                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1562 #endif
1563                 rt->rt6i_table = ort->rt6i_table;
1564         }
1565         return rt;
1566 }
1567
1568 #ifdef CONFIG_IPV6_ROUTE_INFO
1569 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1570                                            struct in6_addr *gwaddr, int ifindex)
1571 {
1572         struct fib6_node *fn;
1573         struct rt6_info *rt = NULL;
1574         struct fib6_table *table;
1575
1576         table = fib6_get_table(RT6_TABLE_INFO);
1577         if (table == NULL)
1578                 return NULL;
1579
1580         write_lock_bh(&table->tb6_lock);
1581         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1582         if (!fn)
1583                 goto out;
1584
1585         for (rt = fn->leaf; rt; rt = rt->u.next) {
1586                 if (rt->rt6i_dev->ifindex != ifindex)
1587                         continue;
1588                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1589                         continue;
1590                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1591                         continue;
1592                 dst_hold(&rt->u.dst);
1593                 break;
1594         }
1595 out:
1596         write_unlock_bh(&table->tb6_lock);
1597         return rt;
1598 }
1599
1600 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1601                                            struct in6_addr *gwaddr, int ifindex,
1602                                            unsigned pref)
1603 {
1604         struct fib6_config cfg = {
1605                 .fc_table       = RT6_TABLE_INFO,
1606                 .fc_metric      = 1024,
1607                 .fc_ifindex     = ifindex,
1608                 .fc_dst_len     = prefixlen,
1609                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1610                                   RTF_UP | RTF_PREF(pref),
1611         };
1612
1613         ipv6_addr_copy(&cfg.fc_dst, prefix);
1614         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1615
1616         /* We should treat it as a default route if prefix length is 0. */
1617         if (!prefixlen)
1618                 cfg.fc_flags |= RTF_DEFAULT;
1619
1620         ip6_route_add(&cfg);
1621
1622         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1623 }
1624 #endif
1625
1626 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1627 {       
1628         struct rt6_info *rt;
1629         struct fib6_table *table;
1630
1631         table = fib6_get_table(RT6_TABLE_DFLT);
1632         if (table == NULL)
1633                 return NULL;
1634
1635         write_lock_bh(&table->tb6_lock);
1636         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1637                 if (dev == rt->rt6i_dev &&
1638                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1639                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1640                         break;
1641         }
1642         if (rt)
1643                 dst_hold(&rt->u.dst);
1644         write_unlock_bh(&table->tb6_lock);
1645         return rt;
1646 }
1647
1648 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1649                                      struct net_device *dev,
1650                                      unsigned int pref)
1651 {
1652         struct fib6_config cfg = {
1653                 .fc_table       = RT6_TABLE_DFLT,
1654                 .fc_metric      = 1024,
1655                 .fc_ifindex     = dev->ifindex,
1656                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1657                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1658         };
1659
1660         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1661
1662         ip6_route_add(&cfg);
1663
1664         return rt6_get_dflt_router(gwaddr, dev);
1665 }
1666
1667 void rt6_purge_dflt_routers(void)
1668 {
1669         struct rt6_info *rt;
1670         struct fib6_table *table;
1671
1672         /* NOTE: Keep consistent with rt6_get_dflt_router */
1673         table = fib6_get_table(RT6_TABLE_DFLT);
1674         if (table == NULL)
1675                 return;
1676
1677 restart:
1678         read_lock_bh(&table->tb6_lock);
1679         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1680                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1681                         dst_hold(&rt->u.dst);
1682                         read_unlock_bh(&table->tb6_lock);
1683                         ip6_del_rt(rt);
1684                         goto restart;
1685                 }
1686         }
1687         read_unlock_bh(&table->tb6_lock);
1688 }
1689
1690 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1691                                  struct fib6_config *cfg)
1692 {
1693         memset(cfg, 0, sizeof(*cfg));
1694
1695         cfg->fc_table = RT6_TABLE_MAIN;
1696         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1697         cfg->fc_metric = rtmsg->rtmsg_metric;
1698         cfg->fc_expires = rtmsg->rtmsg_info;
1699         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1700         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1701         cfg->fc_flags = rtmsg->rtmsg_flags;
1702
1703         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1704         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1705         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1706 }
1707
1708 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1709 {
1710         struct fib6_config cfg;
1711         struct in6_rtmsg rtmsg;
1712         int err;
1713
1714         switch(cmd) {
1715         case SIOCADDRT:         /* Add a route */
1716         case SIOCDELRT:         /* Delete a route */
1717                 if (!capable(CAP_NET_ADMIN))
1718                         return -EPERM;
1719                 err = copy_from_user(&rtmsg, arg,
1720                                      sizeof(struct in6_rtmsg));
1721                 if (err)
1722                         return -EFAULT;
1723
1724                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1725
1726                 rtnl_lock();
1727                 switch (cmd) {
1728                 case SIOCADDRT:
1729                         err = ip6_route_add(&cfg);
1730                         break;
1731                 case SIOCDELRT:
1732                         err = ip6_route_del(&cfg);
1733                         break;
1734                 default:
1735                         err = -EINVAL;
1736                 }
1737                 rtnl_unlock();
1738
1739                 return err;
1740         };
1741
1742         return -EINVAL;
1743 }
1744
1745 /*
1746  *      Drop the packet on the floor
1747  */
1748
1749 static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
1750 {
1751         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1752         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1753                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1754
1755         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1756         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1757         kfree_skb(skb);
1758         return 0;
1759 }
1760
/*
 *      Drop @skb via ip6_pkt_drop() with code ICMPV6_NOROUTE.
 *      ip6_route_add() installs this as the input handler of reject routes.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
}
1765
1766 static int ip6_pkt_discard_out(struct sk_buff *skb)
1767 {
1768         skb->dev = skb->dst->dev;
1769         return ip6_pkt_discard(skb);
1770 }
1771
1772 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1773
/*
 *      Drop @skb via ip6_pkt_drop() with code ICMPV6_ADM_PROHIBITED.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
}
1778
1779 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1780 {
1781         skb->dev = skb->dst->dev;
1782         return ip6_pkt_prohibit(skb);
1783 }
1784
/*
 *      Free @skb without sending any ICMPv6 error and without touching any
 *      counter.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
        kfree_skb(skb);
        return 0;
}
1790
1791 #endif
1792
1793 /*
1794  *      Allocate a dst for local (unicast / anycast) address.
1795  */
1796
1797 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1798                                     const struct in6_addr *addr,
1799                                     int anycast)
1800 {
1801         struct rt6_info *rt = ip6_dst_alloc();
1802
1803         if (rt == NULL)
1804                 return ERR_PTR(-ENOMEM);
1805
1806         dev_hold(&loopback_dev);
1807         in6_dev_hold(idev);
1808
1809         rt->u.dst.flags = DST_HOST;
1810         rt->u.dst.input = ip6_input;
1811         rt->u.dst.output = ip6_output;
1812         rt->rt6i_dev = &loopback_dev;
1813         rt->rt6i_idev = idev;
1814         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1815         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1816         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1817         rt->u.dst.obsolete = -1;
1818
1819         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1820         if (anycast)
1821                 rt->rt6i_flags |= RTF_ANYCAST;
1822         else
1823                 rt->rt6i_flags |= RTF_LOCAL;
1824         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1825         if (rt->rt6i_nexthop == NULL) {
1826                 dst_free((struct dst_entry *) rt);
1827                 return ERR_PTR(-ENOMEM);
1828         }
1829
1830         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1831         rt->rt6i_dst.plen = 128;
1832         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1833
1834         atomic_set(&rt->u.dst.__refcnt, 1);
1835
1836         return rt;
1837 }
1838
1839 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1840 {
1841         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1842             rt != &ip6_null_entry) {
1843                 RT6_TRACE("deleted by ifdown %p\n", rt);
1844                 return -1;
1845         }
1846         return 0;
1847 }
1848
1849 void rt6_ifdown(struct net_device *dev)
1850 {
1851         fib6_clean_all(fib6_ifdown, 0, dev);
1852 }
1853
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1859
1860 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1861 {
1862         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1863         struct inet6_dev *idev;
1864
1865         /* In IPv6 pmtu discovery is not optional,
1866            so that RTAX_MTU lock cannot disable it.
1867            We still use this lock to block changes
1868            caused by addrconf/ndisc.
1869         */
1870
1871         idev = __in6_dev_get(arg->dev);
1872         if (idev == NULL)
1873                 return 0;
1874
1875         /* For administrative MTU increase, there is no way to discover
1876            IPv6 PMTU increase, so PMTU increase should be updated here.
1877            Since RFC 1981 doesn't include administrative MTU increase
1878            update PMTU increase is a MUST. (i.e. jumbo frame)
1879          */
1880         /*
1881            If new MTU is less than route PMTU, this new MTU will be the
1882            lowest MTU in the path, update the route PMTU to reflect PMTU
1883            decreases; if new MTU is greater than route PMTU, and the
1884            old MTU is the lowest MTU in the path, update the route PMTU
1885            to reflect the increase. In this case if the other nodes' MTU
1886            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1887            PMTU discouvery.
1888          */
1889         if (rt->rt6i_dev == arg->dev &&
1890             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1891             (dst_mtu(&rt->u.dst) > arg->mtu ||
1892              (dst_mtu(&rt->u.dst) < arg->mtu &&
1893               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1894                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1895         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1896         return 0;
1897 }
1898
1899 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1900 {
1901         struct rt6_mtu_change_arg arg = {
1902                 .dev = dev,
1903                 .mtu = mtu,
1904         };
1905
1906         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1907 }
1908
1909 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1910         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1911         [RTA_OIF]               = { .type = NLA_U32 },
1912         [RTA_IIF]               = { .type = NLA_U32 },
1913         [RTA_PRIORITY]          = { .type = NLA_U32 },
1914         [RTA_METRICS]           = { .type = NLA_NESTED },
1915 };
1916
1917 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1918                               struct fib6_config *cfg)
1919 {
1920         struct rtmsg *rtm;
1921         struct nlattr *tb[RTA_MAX+1];
1922         int err;
1923
1924         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1925         if (err < 0)
1926                 goto errout;
1927
1928         err = -EINVAL;
1929         rtm = nlmsg_data(nlh);
1930         memset(cfg, 0, sizeof(*cfg));
1931
1932         cfg->fc_table = rtm->rtm_table;
1933         cfg->fc_dst_len = rtm->rtm_dst_len;
1934         cfg->fc_src_len = rtm->rtm_src_len;
1935         cfg->fc_flags = RTF_UP;
1936         cfg->fc_protocol = rtm->rtm_protocol;
1937
1938         if (rtm->rtm_type == RTN_UNREACHABLE)
1939                 cfg->fc_flags |= RTF_REJECT;
1940
1941         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1942         cfg->fc_nlinfo.nlh = nlh;
1943
1944         if (tb[RTA_GATEWAY]) {
1945                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1946                 cfg->fc_flags |= RTF_GATEWAY;
1947         }
1948
1949         if (tb[RTA_DST]) {
1950                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1951
1952                 if (nla_len(tb[RTA_DST]) < plen)
1953                         goto errout;
1954
1955                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1956         }
1957
1958         if (tb[RTA_SRC]) {
1959                 int plen = (rtm->rtm_src_len + 7) >> 3;
1960
1961                 if (nla_len(tb[RTA_SRC]) < plen)
1962                         goto errout;
1963
1964                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1965         }
1966
1967         if (tb[RTA_OIF])
1968                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1969
1970         if (tb[RTA_PRIORITY])
1971                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1972
1973         if (tb[RTA_METRICS]) {
1974                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1975                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1976         }
1977
1978         if (tb[RTA_TABLE])
1979                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1980
1981         err = 0;
1982 errout:
1983         return err;
1984 }
1985
1986 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1987 {
1988         struct fib6_config cfg;
1989         int err;
1990
1991         err = rtm_to_fib6_config(skb, nlh, &cfg);
1992         if (err < 0)
1993                 return err;
1994
1995         return ip6_route_del(&cfg);
1996 }
1997
1998 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1999 {
2000         struct fib6_config cfg;
2001         int err;
2002
2003         err = rtm_to_fib6_config(skb, nlh, &cfg);
2004         if (err < 0)
2005                 return err;
2006
2007         return ip6_route_add(&cfg);
2008 }
2009
2010 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2011                          struct in6_addr *dst, struct in6_addr *src,
2012                          int iif, int type, u32 pid, u32 seq,
2013                          int prefix, unsigned int flags)
2014 {
2015         struct rtmsg *rtm;
2016         struct nlmsghdr *nlh;
2017         struct rta_cacheinfo ci;
2018         u32 table;
2019
2020         if (prefix) {   /* user wants prefix routes only */
2021                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2022                         /* success since this is not a prefix route */
2023                         return 1;
2024                 }
2025         }
2026
2027         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2028         if (nlh == NULL)
2029                 return -ENOBUFS;
2030
2031         rtm = nlmsg_data(nlh);
2032         rtm->rtm_family = AF_INET6;
2033         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2034         rtm->rtm_src_len = rt->rt6i_src.plen;
2035         rtm->rtm_tos = 0;
2036         if (rt->rt6i_table)
2037                 table = rt->rt6i_table->tb6_id;
2038         else
2039                 table = RT6_TABLE_UNSPEC;
2040         rtm->rtm_table = table;
2041         NLA_PUT_U32(skb, RTA_TABLE, table);
2042         if (rt->rt6i_flags&RTF_REJECT)
2043                 rtm->rtm_type = RTN_UNREACHABLE;
2044         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2045                 rtm->rtm_type = RTN_LOCAL;
2046         else
2047                 rtm->rtm_type = RTN_UNICAST;
2048         rtm->rtm_flags = 0;
2049         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2050         rtm->rtm_protocol = rt->rt6i_protocol;
2051         if (rt->rt6i_flags&RTF_DYNAMIC)
2052                 rtm->rtm_protocol = RTPROT_REDIRECT;
2053         else if (rt->rt6i_flags & RTF_ADDRCONF)
2054                 rtm->rtm_protocol = RTPROT_KERNEL;
2055         else if (rt->rt6i_flags&RTF_DEFAULT)
2056                 rtm->rtm_protocol = RTPROT_RA;
2057
2058         if (rt->rt6i_flags&RTF_CACHE)
2059                 rtm->rtm_flags |= RTM_F_CLONED;
2060
2061         if (dst) {
2062                 NLA_PUT(skb, RTA_DST, 16, dst);
2063                 rtm->rtm_dst_len = 128;
2064         } else if (rtm->rtm_dst_len)
2065                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2066 #ifdef CONFIG_IPV6_SUBTREES
2067         if (src) {
2068                 NLA_PUT(skb, RTA_SRC, 16, src);
2069                 rtm->rtm_src_len = 128;
2070         } else if (rtm->rtm_src_len)
2071                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2072 #endif
2073         if (iif)
2074                 NLA_PUT_U32(skb, RTA_IIF, iif);
2075         else if (dst) {
2076                 struct in6_addr saddr_buf;
2077                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2078                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2079         }
2080
2081         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2082                 goto nla_put_failure;
2083
2084         if (rt->u.dst.neighbour)
2085                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2086
2087         if (rt->u.dst.dev)
2088                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2089
2090         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2091         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2092         if (rt->rt6i_expires)
2093                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2094         else
2095                 ci.rta_expires = 0;
2096         ci.rta_used = rt->u.dst.__use;
2097         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2098         ci.rta_error = rt->u.dst.error;
2099         ci.rta_id = 0;
2100         ci.rta_ts = 0;
2101         ci.rta_tsage = 0;
2102         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2103
2104         return nlmsg_end(skb, nlh);
2105
2106 nla_put_failure:
2107         return nlmsg_cancel(skb, nlh);
2108 }
2109
2110 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2111 {
2112         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2113         int prefix;
2114
2115         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2116                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2117                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2118         } else
2119                 prefix = 0;
2120
2121         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2122                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2123                      prefix, NLM_F_MULTI);
2124 }
2125
2126 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2127 {
2128         struct nlattr *tb[RTA_MAX+1];
2129         struct rt6_info *rt;
2130         struct sk_buff *skb;
2131         struct rtmsg *rtm;
2132         struct flowi fl;
2133         int err, iif = 0;
2134
2135         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2136         if (err < 0)
2137                 goto errout;
2138
2139         err = -EINVAL;
2140         memset(&fl, 0, sizeof(fl));
2141
2142         if (tb[RTA_SRC]) {
2143                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2144                         goto errout;
2145
2146                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2147         }
2148
2149         if (tb[RTA_DST]) {
2150                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2151                         goto errout;
2152
2153                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2154         }
2155
2156         if (tb[RTA_IIF])
2157                 iif = nla_get_u32(tb[RTA_IIF]);
2158
2159         if (tb[RTA_OIF])
2160                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2161
2162         if (iif) {
2163                 struct net_device *dev;
2164                 dev = __dev_get_by_index(iif);
2165                 if (!dev) {
2166                         err = -ENODEV;
2167                         goto errout;
2168                 }
2169         }
2170
2171         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2172         if (skb == NULL) {
2173                 err = -ENOBUFS;
2174                 goto errout;
2175         }
2176
2177         /* Reserve room for dummy headers, this skb can pass
2178            through good chunk of routing engine.
2179          */
2180         skb->mac.raw = skb->data;
2181         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2182
2183         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2184         skb->dst = &rt->u.dst;
2185
2186         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2187                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2188                             nlh->nlmsg_seq, 0, 0);
2189         if (err < 0) {
2190                 kfree_skb(skb);
2191                 goto errout;
2192         }
2193
2194         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2195 errout:
2196         return err;
2197 }
2198
2199 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2200 {
2201         struct sk_buff *skb;
2202         u32 pid = 0, seq = 0;
2203         struct nlmsghdr *nlh = NULL;
2204         int payload = sizeof(struct rtmsg) + 256;
2205         int err = -ENOBUFS;
2206
2207         if (info) {
2208                 pid = info->pid;
2209                 nlh = info->nlh;
2210                 if (nlh)
2211                         seq = nlh->nlmsg_seq;
2212         }
2213
2214         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2215         if (skb == NULL)
2216                 goto errout;
2217
2218         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2219         if (err < 0) {
2220                 kfree_skb(skb);
2221                 goto errout;
2222         }
2223
2224         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2225 errout:
2226         if (err < 0)
2227                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2228 }
2229
2230 /*
2231  *      /proc
2232  */
2233
2234 #ifdef CONFIG_PROC_FS
2235
/*
 * Length in bytes (150) of one route line as formatted by
 * rt6_info_route(), assuming the device name fits its 8-column field.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested */
	int skip;	/* routes skipped so far to reach @offset */
	int len;	/* bytes written to @buffer so far */
};
2246
2247 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2248 {
2249         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2250         int i;
2251
2252         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2253                 arg->skip++;
2254                 return 0;
2255         }
2256
2257         if (arg->len >= arg->length)
2258                 return 0;
2259
2260         for (i=0; i<16; i++) {
2261                 sprintf(arg->buffer + arg->len, "%02x",
2262                         rt->rt6i_dst.addr.s6_addr[i]);
2263                 arg->len += 2;
2264         }
2265         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2266                             rt->rt6i_dst.plen);
2267
2268 #ifdef CONFIG_IPV6_SUBTREES
2269         for (i=0; i<16; i++) {
2270                 sprintf(arg->buffer + arg->len, "%02x",
2271                         rt->rt6i_src.addr.s6_addr[i]);
2272                 arg->len += 2;
2273         }
2274         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2275                             rt->rt6i_src.plen);
2276 #else
2277         sprintf(arg->buffer + arg->len,
2278                 "00000000000000000000000000000000 00 ");
2279         arg->len += 36;
2280 #endif
2281
2282         if (rt->rt6i_nexthop) {
2283                 for (i=0; i<16; i++) {
2284                         sprintf(arg->buffer + arg->len, "%02x",
2285                                 rt->rt6i_nexthop->primary_key[i]);
2286                         arg->len += 2;
2287                 }
2288         } else {
2289                 sprintf(arg->buffer + arg->len,
2290                         "00000000000000000000000000000000");
2291                 arg->len += 32;
2292         }
2293         arg->len += sprintf(arg->buffer + arg->len,
2294                             " %08x %08x %08x %08x %8s\n",
2295                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2296                             rt->u.dst.__use, rt->rt6i_flags, 
2297                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2298         return 0;
2299 }
2300
2301 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2302 {
2303         struct rt6_proc_arg arg = {
2304                 .buffer = buffer,
2305                 .offset = offset,
2306                 .length = length,
2307         };
2308
2309         fib6_clean_all(rt6_info_route, 0, &arg);
2310
2311         *start = buffer;
2312         if (offset)
2313                 *start += offset % RT6_INFO_LEN;
2314
2315         arg.len -= offset % RT6_INFO_LEN;
2316
2317         if (arg.len > length)
2318                 arg.len = length;
2319         if (arg.len < 0)
2320                 arg.len = 0;
2321
2322         return arg.len;
2323 }
2324
2325 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2326 {
2327         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2328                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2329                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2330                       rt6_stats.fib_rt_cache,
2331                       atomic_read(&ip6_dst_ops.entries),
2332                       rt6_stats.fib_discarded_routes);
2333
2334         return 0;
2335 }
2336
2337 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2338 {
2339         return single_open(file, rt6_stats_seq_show, NULL);
2340 }
2341
2342 static struct file_operations rt6_stats_seq_fops = {
2343         .owner   = THIS_MODULE,
2344         .open    = rt6_stats_seq_open,
2345         .read    = seq_read,
2346         .llseek  = seq_lseek,
2347         .release = single_release,
2348 };
2349 #endif  /* CONFIG_PROC_FS */
2350
2351 #ifdef CONFIG_SYSCTL
2352
2353 static int flush_delay;
2354
2355 static
2356 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2357                               void __user *buffer, size_t *lenp, loff_t *ppos)
2358 {
2359         if (write) {
2360                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2361                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2362                 return 0;
2363         } else
2364                 return -EINVAL;
2365 }
2366
2367 ctl_table ipv6_route_table[] = {
2368         {
2369                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2370                 .procname       =       "flush",
2371                 .data           =       &flush_delay,
2372                 .maxlen         =       sizeof(int),
2373                 .mode           =       0200,
2374                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2375         },
2376         {
2377                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2378                 .procname       =       "gc_thresh",
2379                 .data           =       &ip6_dst_ops.gc_thresh,
2380                 .maxlen         =       sizeof(int),
2381                 .mode           =       0644,
2382                 .proc_handler   =       &proc_dointvec,
2383         },
2384         {
2385                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2386                 .procname       =       "max_size",
2387                 .data           =       &ip6_rt_max_size,
2388                 .maxlen         =       sizeof(int),
2389                 .mode           =       0644,
2390                 .proc_handler   =       &proc_dointvec,
2391         },
2392         {
2393                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2394                 .procname       =       "gc_min_interval",
2395                 .data           =       &ip6_rt_gc_min_interval,
2396                 .maxlen         =       sizeof(int),
2397                 .mode           =       0644,
2398                 .proc_handler   =       &proc_dointvec_jiffies,
2399                 .strategy       =       &sysctl_jiffies,
2400         },
2401         {
2402                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2403                 .procname       =       "gc_timeout",
2404                 .data           =       &ip6_rt_gc_timeout,
2405                 .maxlen         =       sizeof(int),
2406                 .mode           =       0644,
2407                 .proc_handler   =       &proc_dointvec_jiffies,
2408                 .strategy       =       &sysctl_jiffies,
2409         },
2410         {
2411                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2412                 .procname       =       "gc_interval",
2413                 .data           =       &ip6_rt_gc_interval,
2414                 .maxlen         =       sizeof(int),
2415                 .mode           =       0644,
2416                 .proc_handler   =       &proc_dointvec_jiffies,
2417                 .strategy       =       &sysctl_jiffies,
2418         },
2419         {
2420                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2421                 .procname       =       "gc_elasticity",
2422                 .data           =       &ip6_rt_gc_elasticity,
2423                 .maxlen         =       sizeof(int),
2424                 .mode           =       0644,
2425                 .proc_handler   =       &proc_dointvec_jiffies,
2426                 .strategy       =       &sysctl_jiffies,
2427         },
2428         {
2429                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2430                 .procname       =       "mtu_expires",
2431                 .data           =       &ip6_rt_mtu_expires,
2432                 .maxlen         =       sizeof(int),
2433                 .mode           =       0644,
2434                 .proc_handler   =       &proc_dointvec_jiffies,
2435                 .strategy       =       &sysctl_jiffies,
2436         },
2437         {
2438                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2439                 .procname       =       "min_adv_mss",
2440                 .data           =       &ip6_rt_min_advmss,
2441                 .maxlen         =       sizeof(int),
2442                 .mode           =       0644,
2443                 .proc_handler   =       &proc_dointvec_jiffies,
2444                 .strategy       =       &sysctl_jiffies,
2445         },
2446         {
2447                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2448                 .procname       =       "gc_min_interval_ms",
2449                 .data           =       &ip6_rt_gc_min_interval,
2450                 .maxlen         =       sizeof(int),
2451                 .mode           =       0644,
2452                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2453                 .strategy       =       &sysctl_ms_jiffies,
2454         },
2455         { .ctl_name = 0 }
2456 };
2457
2458 #endif
2459
2460 void __init ip6_route_init(void)
2461 {
2462         struct proc_dir_entry *p;
2463
2464         ip6_dst_ops.kmem_cachep =
2465                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2466                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2467         fib6_init();
2468 #ifdef  CONFIG_PROC_FS
2469         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2470         if (p)
2471                 p->owner = THIS_MODULE;
2472
2473         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2474 #endif
2475 #ifdef CONFIG_XFRM
2476         xfrm6_init();
2477 #endif
2478 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2479         fib6_rules_init();
2480 #endif
2481 }
2482
2483 void ip6_route_cleanup(void)
2484 {
2485 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2486         fib6_rules_cleanup();
2487 #endif
2488 #ifdef CONFIG_PROC_FS
2489         proc_net_remove("ipv6_route");
2490         proc_net_remove("rt6_stats");
2491 #endif
2492 #ifdef CONFIG_XFRM
2493         xfrm6_fini();
2494 #endif
2495         rt6_ifdown(NULL);
2496         fib6_gc_cleanup();
2497         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2498 }