[IPV6] ROUTE: Do not route packets to link-local address on other device.
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only expand to
 * printk() calls when RT6_DEBUG is 3 or higher; at lower levels they
 * expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, the policy lookup paths clone routes that already have a
 * next hop (or need none) with rt6_alloc_clone() instead of returning the
 * matched route as-is.
 */
#define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
144 static int ip6_pkt_prohibit(struct sk_buff *skb);
145 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
146 static int ip6_pkt_blk_hole(struct sk_buff *skb);
147
148 struct rt6_info ip6_prohibit_entry = {
149         .u = {
150                 .dst = {
151                         .__refcnt       = ATOMIC_INIT(1),
152                         .__use          = 1,
153                         .dev            = &loopback_dev,
154                         .obsolete       = -1,
155                         .error          = -EACCES,
156                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
157                         .input          = ip6_pkt_prohibit,
158                         .output         = ip6_pkt_prohibit_out,
159                         .ops            = &ip6_dst_ops,
160                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
161                 }
162         },
163         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
164         .rt6i_metric    = ~(u32) 0,
165         .rt6i_ref       = ATOMIC_INIT(1),
166 };
167
168 struct rt6_info ip6_blk_hole_entry = {
169         .u = {
170                 .dst = {
171                         .__refcnt       = ATOMIC_INIT(1),
172                         .__use          = 1,
173                         .dev            = &loopback_dev,
174                         .obsolete       = -1,
175                         .error          = -EINVAL,
176                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
177                         .input          = ip6_pkt_blk_hole,
178                         .output         = ip6_pkt_blk_hole,
179                         .ops            = &ip6_dst_ops,
180                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
181                 }
182         },
183         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
184         .rt6i_metric    = ~(u32) 0,
185         .rt6i_ref       = ATOMIC_INIT(1),
186 };
187
188 #endif
189
190 /* allocate dst with ip6_dst_ops */
191 static __inline__ struct rt6_info *ip6_dst_alloc(void)
192 {
193         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
194 }
195
196 static void ip6_dst_destroy(struct dst_entry *dst)
197 {
198         struct rt6_info *rt = (struct rt6_info *)dst;
199         struct inet6_dev *idev = rt->rt6i_idev;
200
201         if (idev != NULL) {
202                 rt->rt6i_idev = NULL;
203                 in6_dev_put(idev);
204         }       
205 }
206
207 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
208                            int how)
209 {
210         struct rt6_info *rt = (struct rt6_info *)dst;
211         struct inet6_dev *idev = rt->rt6i_idev;
212
213         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
214                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239                                                     int oif,
240                                                     int strict)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (oif) {
246                 for (sprt = rt; sprt; sprt = sprt->u.next) {
247                         struct net_device *dev = sprt->rt6i_dev;
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (strict && oif)
254                                                 continue;
255                                         if (local && (!oif || 
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 }
262
263                 if (local)
264                         return local;
265
266                 if (strict)
267                         return &ip6_null_entry;
268         }
269         return rt;
270 }
271
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a next-hop neighbour that is not in a NUD_VALID state and the
 * per-device rtr_probe_interval has elapsed since the neighbour entry was
 * last updated, send a Neighbour Solicitation to the solicited-node
 * multicast address of that neighbour.  A NULL @rt is accepted and ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int probe_due;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	/* re-check the state now that the neighbour lock is held */
	probe_due = !(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval);
	if (!probe_due) {
		read_unlock_bh(&neigh->lock);
		return;
	}

	/* stamp the entry so the next probe waits a full interval */
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
307
308 /*
309  * Default Router Selection (RFC 2461 6.3.6)
310  */
311 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
312 {
313         struct net_device *dev = rt->rt6i_dev;
314         int ret = 0;
315
316         if (!oif)
317                 return 2;
318         if (dev->flags & IFF_LOOPBACK) {
319                 if (!WARN_ON(rt->rt6i_idev == NULL) &&
320                     rt->rt6i_idev->dev->ifindex == oif)
321                         ret = 1;
322                 else
323                         return 0;
324         }
325         if (dev->ifindex == oif)
326                 return 2;
327
328         return ret;
329 }
330
331 static int inline rt6_check_neigh(struct rt6_info *rt)
332 {
333         struct neighbour *neigh = rt->rt6i_nexthop;
334         int m = 0;
335         if (rt->rt6i_flags & RTF_NONEXTHOP ||
336             !(rt->rt6i_flags & RTF_GATEWAY))
337                 m = 1;
338         else if (neigh) {
339                 read_lock_bh(&neigh->lock);
340                 if (neigh->nud_state & NUD_VALID)
341                         m = 2;
342                 else if (!(neigh->nud_state & NUD_FAILED))
343                         m = 1;
344                 read_unlock_bh(&neigh->lock);
345         }
346         return m;
347 }
348
349 static int rt6_score_route(struct rt6_info *rt, int oif,
350                            int strict)
351 {
352         int m, n;
353                 
354         m = rt6_check_dev(rt, oif);
355         if (!m && (strict & RT6_LOOKUP_F_IFACE))
356                 return -1;
357 #ifdef CONFIG_IPV6_ROUTER_PREF
358         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
359 #endif
360         n = rt6_check_neigh(rt);
361         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
362                 return -1;
363         return m;
364 }
365
366 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
367                                    int strict)
368 {
369         struct rt6_info *match = NULL, *last = NULL;
370         struct rt6_info *rt, *rt0 = *head;
371         u32 metric;
372         int mpri = -1;
373
374         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
375                   __FUNCTION__, head, head ? *head : NULL, oif);
376
377         for (rt = rt0, metric = rt0->rt6i_metric;
378              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
379              rt = rt->u.next) {
380                 int m;
381
382                 if (rt6_check_expired(rt))
383                         continue;
384
385                 last = rt;
386
387                 m = rt6_score_route(rt, oif, strict);
388                 if (m < 0)
389                         continue;
390
391                 if (m > mpri) {
392                         if (strict & RT6_LOOKUP_F_REACHABLE)
393                                 rt6_probe(match);
394                         match = rt;
395                         mpri = m;
396                 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
397                         rt6_probe(rt);
398                 }
399         }
400
401         if (!match &&
402             (strict & RT6_LOOKUP_F_REACHABLE) &&
403             last && last != rt0) {
404                 /* no entries matched; do round-robin */
405                 static DEFINE_SPINLOCK(lock);
406                 spin_lock(&lock);
407                 *head = rt0->u.next;
408                 rt0->u.next = last->u.next;
409                 last->u.next = rt0;
410                 spin_unlock(&lock);
411         }
412
413         RT6_TRACE("%s() => %p, score=%d\n",
414                   __FUNCTION__, match, mpri);
415
416         return (match ? match : &ip6_null_entry);
417 }
418
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option received on @dev from router
 * @gwaddr.
 *
 * @opt/@len describe the raw option.  The option is validated, then the
 * matching route-info route is deleted (lifetime 0), created, or has its
 * preference refreshed; finally its expiry is set from the lifetime
 * (0xffffffff means "never expires").
 *
 * Returns 0 on success or -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	struct rt6_info *rt;
	unsigned int pref;
	u32 lifetime;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	lifetime = ntohl(rinfo->lifetime);
	/* 0xffffffff is "infinity"; clamp anything else that would overflow */
	if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
		lifetime = 0x7fffffff/HZ - 1;

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* a zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime) {
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	} else if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	}

	if (!rt)
		return 0;

	if (lifetime == 0xffffffff) {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	} else {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
496
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed match.
 *
 * Expects the enclosing function to provide the variables `rt` and `fn`
 * and the labels `restart` and `out`.  When `rt` is &ip6_null_entry it
 * moves `fn` to its parent (or looks @saddr up in the parent's subtree
 * when the parent has one that is not the current node) until it reaches
 * a node carrying route info (jumps to `restart`) or the tree root (jumps
 * to `out`).  When `rt` is anything else the macro does nothing.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			fn = (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) ? \
				fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr) : \
				pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
514
515 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
516                                              struct flowi *fl, int flags)
517 {
518         struct fib6_node *fn;
519         struct rt6_info *rt;
520
521         read_lock_bh(&table->tb6_lock);
522         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
523 restart:
524         rt = fn->leaf;
525         rt = rt6_device_match(rt, fl->oif, flags);
526         BACKTRACK(&fl->fl6_src);
527 out:
528         dst_hold(&rt->u.dst);
529         read_unlock_bh(&table->tb6_lock);
530
531         rt->u.dst.lastuse = jiffies;
532         rt->u.dst.__use++;
533
534         return rt;
535
536 }
537
538 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
539                             int oif, int strict)
540 {
541         struct flowi fl = {
542                 .oif = oif,
543                 .nl_u = {
544                         .ip6_u = {
545                                 .daddr = *daddr,
546                         },
547                 },
548         };
549         struct dst_entry *dst;
550         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
551
552         if (saddr) {
553                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
554                 flags |= RT6_LOOKUP_F_HAS_SADDR;
555         }
556
557         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
558         if (dst->error == 0)
559                 return (struct rt6_info *) dst;
560
561         dst_release(dst);
562
563         return NULL;
564 }
565
566 /* ip6_ins_rt is called with FREE table->tb6_lock.
567    It takes new route entry, the addition fails by any reason the
568    route is freed. In any case, if caller does not hold it, it may
569    be destroyed.
570  */
571
572 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
573 {
574         int err;
575         struct fib6_table *table;
576
577         table = rt->rt6i_table;
578         write_lock_bh(&table->tb6_lock);
579         err = fib6_add(&table->tb6_root, rt, info);
580         write_unlock_bh(&table->tb6_lock);
581
582         return err;
583 }
584
585 int ip6_ins_rt(struct rt6_info *rt)
586 {
587         return __ip6_ins_rt(rt, NULL);
588 }
589
590 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
591                                       struct in6_addr *saddr)
592 {
593         struct rt6_info *rt;
594
595         /*
596          *      Clone the route.
597          */
598
599         rt = ip6_rt_copy(ort);
600
601         if (rt) {
602                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
603                         if (rt->rt6i_dst.plen != 128 &&
604                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
605                                 rt->rt6i_flags |= RTF_ANYCAST;
606                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
607                 }
608
609                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
610                 rt->rt6i_dst.plen = 128;
611                 rt->rt6i_flags |= RTF_CACHE;
612                 rt->u.dst.flags |= DST_HOST;
613
614 #ifdef CONFIG_IPV6_SUBTREES
615                 if (rt->rt6i_src.plen && saddr) {
616                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
617                         rt->rt6i_src.plen = 128;
618                 }
619 #endif
620
621                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
622
623         }
624
625         return rt;
626 }
627
628 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
629 {
630         struct rt6_info *rt = ip6_rt_copy(ort);
631         if (rt) {
632                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
633                 rt->rt6i_dst.plen = 128;
634                 rt->rt6i_flags |= RTF_CACHE;
635                 rt->u.dst.flags |= DST_HOST;
636                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
637         }
638         return rt;
639 }
640
641 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
642                                             struct flowi *fl, int flags)
643 {
644         struct fib6_node *fn;
645         struct rt6_info *rt, *nrt;
646         int strict = 0;
647         int attempts = 3;
648         int err;
649         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
650
651         strict |= flags & RT6_LOOKUP_F_IFACE;
652
653 relookup:
654         read_lock_bh(&table->tb6_lock);
655
656 restart_2:
657         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
658
659 restart:
660         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
661         BACKTRACK(&fl->fl6_src);
662         if (rt == &ip6_null_entry ||
663             rt->rt6i_flags & RTF_CACHE)
664                 goto out;
665
666         dst_hold(&rt->u.dst);
667         read_unlock_bh(&table->tb6_lock);
668
669         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
670                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
671         else {
672 #if CLONE_OFFLINK_ROUTE
673                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
674 #else
675                 goto out2;
676 #endif
677         }
678
679         dst_release(&rt->u.dst);
680         rt = nrt ? : &ip6_null_entry;
681
682         dst_hold(&rt->u.dst);
683         if (nrt) {
684                 err = ip6_ins_rt(nrt);
685                 if (!err)
686                         goto out2;
687         }
688
689         if (--attempts <= 0)
690                 goto out2;
691
692         /*
693          * Race condition! In the gap, when table->tb6_lock was
694          * released someone could insert this route.  Relookup.
695          */
696         dst_release(&rt->u.dst);
697         goto relookup;
698
699 out:
700         if (reachable) {
701                 reachable = 0;
702                 goto restart_2;
703         }
704         dst_hold(&rt->u.dst);
705         read_unlock_bh(&table->tb6_lock);
706 out2:
707         rt->u.dst.lastuse = jiffies;
708         rt->u.dst.__use++;
709
710         return rt;
711 }
712
713 void ip6_route_input(struct sk_buff *skb)
714 {
715         struct ipv6hdr *iph = skb->nh.ipv6h;
716         int flags = RT6_LOOKUP_F_HAS_SADDR;
717         struct flowi fl = {
718                 .iif = skb->dev->ifindex,
719                 .nl_u = {
720                         .ip6_u = {
721                                 .daddr = iph->daddr,
722                                 .saddr = iph->saddr,
723                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
724                         },
725                 },
726                 .mark = skb->mark,
727                 .proto = iph->nexthdr,
728         };
729
730         if (rt6_need_strict(&iph->daddr))
731                 flags |= RT6_LOOKUP_F_IFACE;
732
733         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
734 }
735
736 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
737                                              struct flowi *fl, int flags)
738 {
739         struct fib6_node *fn;
740         struct rt6_info *rt, *nrt;
741         int strict = 0;
742         int attempts = 3;
743         int err;
744         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
745
746         strict |= flags & RT6_LOOKUP_F_IFACE;
747
748 relookup:
749         read_lock_bh(&table->tb6_lock);
750
751 restart_2:
752         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
753
754 restart:
755         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
756         BACKTRACK(&fl->fl6_src);
757         if (rt == &ip6_null_entry ||
758             rt->rt6i_flags & RTF_CACHE)
759                 goto out;
760
761         dst_hold(&rt->u.dst);
762         read_unlock_bh(&table->tb6_lock);
763
764         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
765                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
766         else {
767 #if CLONE_OFFLINK_ROUTE
768                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
769 #else
770                 goto out2;
771 #endif
772         }
773
774         dst_release(&rt->u.dst);
775         rt = nrt ? : &ip6_null_entry;
776
777         dst_hold(&rt->u.dst);
778         if (nrt) {
779                 err = ip6_ins_rt(nrt);
780                 if (!err)
781                         goto out2;
782         }
783
784         if (--attempts <= 0)
785                 goto out2;
786
787         /*
788          * Race condition! In the gap, when table->tb6_lock was
789          * released someone could insert this route.  Relookup.
790          */
791         dst_release(&rt->u.dst);
792         goto relookup;
793
794 out:
795         if (reachable) {
796                 reachable = 0;
797                 goto restart_2;
798         }
799         dst_hold(&rt->u.dst);
800         read_unlock_bh(&table->tb6_lock);
801 out2:
802         rt->u.dst.lastuse = jiffies;
803         rt->u.dst.__use++;
804         return rt;
805 }
806
807 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
808 {
809         int flags = 0;
810
811         if (rt6_need_strict(&fl->fl6_dst))
812                 flags |= RT6_LOOKUP_F_IFACE;
813
814         if (!ipv6_addr_any(&fl->fl6_src))
815                 flags |= RT6_LOOKUP_F_HAS_SADDR;
816
817         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
818 }
819
820
821 /*
822  *      Destination cache support functions
823  */
824
825 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
826 {
827         struct rt6_info *rt;
828
829         rt = (struct rt6_info *) dst;
830
831         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
832                 return dst;
833
834         return NULL;
835 }
836
837 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
838 {
839         struct rt6_info *rt = (struct rt6_info *) dst;
840
841         if (rt) {
842                 if (rt->rt6i_flags & RTF_CACHE)
843                         ip6_del_rt(rt);
844                 else
845                         dst_release(dst);
846         }
847         return NULL;
848 }
849
850 static void ip6_link_failure(struct sk_buff *skb)
851 {
852         struct rt6_info *rt;
853
854         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
855
856         rt = (struct rt6_info *) skb->dst;
857         if (rt) {
858                 if (rt->rt6i_flags&RTF_CACHE) {
859                         dst_set_expires(&rt->u.dst, 0);
860                         rt->rt6i_flags |= RTF_EXPIRES;
861                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
862                         rt->rt6i_node->fn_sernum = -1;
863         }
864 }
865
866 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
867 {
868         struct rt6_info *rt6 = (struct rt6_info*)dst;
869
870         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
871                 rt6->rt6i_flags |= RTF_MODIFIED;
872                 if (mtu < IPV6_MIN_MTU) {
873                         mtu = IPV6_MIN_MTU;
874                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
875                 }
876                 dst->metrics[RTAX_MTU-1] = mtu;
877                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
878         }
879 }
880
881 static int ipv6_get_mtu(struct net_device *dev);
882
883 static inline unsigned int ipv6_advmss(unsigned int mtu)
884 {
885         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
886
887         if (mtu < ip6_rt_min_advmss)
888                 mtu = ip6_rt_min_advmss;
889
890         /*
891          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
892          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
893          * IPV6_MAXPLEN is also valid and means: "any MSS, 
894          * rely only on pmtu discovery"
895          */
896         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
897                 mtu = IPV6_MAXPLEN;
898         return mtu;
899 }
900
901 static struct dst_entry *ndisc_dst_gc_list;
902 static DEFINE_SPINLOCK(ndisc_lock);
903
904 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
905                                   struct neighbour *neigh,
906                                   struct in6_addr *addr,
907                                   int (*output)(struct sk_buff *))
908 {
909         struct rt6_info *rt;
910         struct inet6_dev *idev = in6_dev_get(dev);
911
912         if (unlikely(idev == NULL))
913                 return NULL;
914
915         rt = ip6_dst_alloc();
916         if (unlikely(rt == NULL)) {
917                 in6_dev_put(idev);
918                 goto out;
919         }
920
921         dev_hold(dev);
922         if (neigh)
923                 neigh_hold(neigh);
924         else
925                 neigh = ndisc_get_neigh(dev, addr);
926
927         rt->rt6i_dev      = dev;
928         rt->rt6i_idev     = idev;
929         rt->rt6i_nexthop  = neigh;
930         atomic_set(&rt->u.dst.__refcnt, 1);
931         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
932         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
933         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
934         rt->u.dst.output  = output;
935
936 #if 0   /* there's no chance to use these for ndisc */
937         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
938                                 ? DST_HOST 
939                                 : 0;
940         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
941         rt->rt6i_dst.plen = 128;
942 #endif
943
944         spin_lock_bh(&ndisc_lock);
945         rt->u.dst.next = ndisc_dst_gc_list;
946         ndisc_dst_gc_list = &rt->u.dst;
947         spin_unlock_bh(&ndisc_lock);
948
949         fib6_force_start_gc();
950
951 out:
952         return &rt->u.dst;
953 }
954
955 int ndisc_dst_gc(int *more)
956 {
957         struct dst_entry *dst, *next, **pprev;
958         int freed;
959
960         next = NULL;
961         freed = 0;
962
963         spin_lock_bh(&ndisc_lock);
964         pprev = &ndisc_dst_gc_list;
965
966         while ((dst = *pprev) != NULL) {
967                 if (!atomic_read(&dst->__refcnt)) {
968                         *pprev = dst->next;
969                         dst_free(dst);
970                         freed++;
971                 } else {
972                         pprev = &dst->next;
973                         (*more)++;
974                 }
975         }
976
977         spin_unlock_bh(&ndisc_lock);
978
979         return freed;
980 }
981
982 static int ip6_dst_gc(void)
983 {
984         static unsigned expire = 30*HZ;
985         static unsigned long last_gc;
986         unsigned long now = jiffies;
987
988         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
989             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
990                 goto out;
991
992         expire++;
993         fib6_run_gc(expire);
994         last_gc = now;
995         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
996                 expire = ip6_rt_gc_timeout>>1;
997
998 out:
999         expire -= expire>>ip6_rt_gc_elasticity;
1000         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1001 }
1002
1003 /* Clean host part of a prefix. Not necessary in radix tree,
1004    but results in cleaner routing tables.
1005
1006    Remove it only when all the things will work!
1007  */
1008
1009 static int ipv6_get_mtu(struct net_device *dev)
1010 {
1011         int mtu = IPV6_MIN_MTU;
1012         struct inet6_dev *idev;
1013
1014         idev = in6_dev_get(dev);
1015         if (idev) {
1016                 mtu = idev->cnf.mtu6;
1017                 in6_dev_put(idev);
1018         }
1019         return mtu;
1020 }
1021
1022 int ipv6_get_hoplimit(struct net_device *dev)
1023 {
1024         int hoplimit = ipv6_devconf.hop_limit;
1025         struct inet6_dev *idev;
1026
1027         idev = in6_dev_get(dev);
1028         if (idev) {
1029                 hoplimit = idev->cnf.hop_limit;
1030                 in6_dev_put(idev);
1031         }
1032         return hoplimit;
1033 }
1034
1035 /*
1036  *
1037  */
1038
1039 int ip6_route_add(struct fib6_config *cfg)
1040 {
1041         int err;
1042         struct rt6_info *rt = NULL;
1043         struct net_device *dev = NULL;
1044         struct inet6_dev *idev = NULL;
1045         struct fib6_table *table;
1046         int addr_type;
1047
1048         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1049                 return -EINVAL;
1050 #ifndef CONFIG_IPV6_SUBTREES
1051         if (cfg->fc_src_len)
1052                 return -EINVAL;
1053 #endif
1054         if (cfg->fc_ifindex) {
1055                 err = -ENODEV;
1056                 dev = dev_get_by_index(cfg->fc_ifindex);
1057                 if (!dev)
1058                         goto out;
1059                 idev = in6_dev_get(dev);
1060                 if (!idev)
1061                         goto out;
1062         }
1063
1064         if (cfg->fc_metric == 0)
1065                 cfg->fc_metric = IP6_RT_PRIO_USER;
1066
1067         table = fib6_new_table(cfg->fc_table);
1068         if (table == NULL) {
1069                 err = -ENOBUFS;
1070                 goto out;
1071         }
1072
1073         rt = ip6_dst_alloc();
1074
1075         if (rt == NULL) {
1076                 err = -ENOMEM;
1077                 goto out;
1078         }
1079
1080         rt->u.dst.obsolete = -1;
1081         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1082
1083         if (cfg->fc_protocol == RTPROT_UNSPEC)
1084                 cfg->fc_protocol = RTPROT_BOOT;
1085         rt->rt6i_protocol = cfg->fc_protocol;
1086
1087         addr_type = ipv6_addr_type(&cfg->fc_dst);
1088
1089         if (addr_type & IPV6_ADDR_MULTICAST)
1090                 rt->u.dst.input = ip6_mc_input;
1091         else
1092                 rt->u.dst.input = ip6_forward;
1093
1094         rt->u.dst.output = ip6_output;
1095
1096         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1097         rt->rt6i_dst.plen = cfg->fc_dst_len;
1098         if (rt->rt6i_dst.plen == 128)
1099                rt->u.dst.flags = DST_HOST;
1100
1101 #ifdef CONFIG_IPV6_SUBTREES
1102         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1103         rt->rt6i_src.plen = cfg->fc_src_len;
1104 #endif
1105
1106         rt->rt6i_metric = cfg->fc_metric;
1107
1108         /* We cannot add true routes via loopback here,
1109            they would result in kernel looping; promote them to reject routes
1110          */
1111         if ((cfg->fc_flags & RTF_REJECT) ||
1112             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1113                 /* hold loopback dev/idev if we haven't done so. */
1114                 if (dev != &loopback_dev) {
1115                         if (dev) {
1116                                 dev_put(dev);
1117                                 in6_dev_put(idev);
1118                         }
1119                         dev = &loopback_dev;
1120                         dev_hold(dev);
1121                         idev = in6_dev_get(dev);
1122                         if (!idev) {
1123                                 err = -ENODEV;
1124                                 goto out;
1125                         }
1126                 }
1127                 rt->u.dst.output = ip6_pkt_discard_out;
1128                 rt->u.dst.input = ip6_pkt_discard;
1129                 rt->u.dst.error = -ENETUNREACH;
1130                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1131                 goto install_route;
1132         }
1133
1134         if (cfg->fc_flags & RTF_GATEWAY) {
1135                 struct in6_addr *gw_addr;
1136                 int gwa_type;
1137
1138                 gw_addr = &cfg->fc_gateway;
1139                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1140                 gwa_type = ipv6_addr_type(gw_addr);
1141
1142                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1143                         struct rt6_info *grt;
1144
1145                         /* IPv6 strictly inhibits using not link-local
1146                            addresses as nexthop address.
1147                            Otherwise, router will not able to send redirects.
1148                            It is very good, but in some (rare!) circumstances
1149                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1150                            some exceptions. --ANK
1151                          */
1152                         err = -EINVAL;
1153                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1154                                 goto out;
1155
1156                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1157
1158                         err = -EHOSTUNREACH;
1159                         if (grt == NULL)
1160                                 goto out;
1161                         if (dev) {
1162                                 if (dev != grt->rt6i_dev) {
1163                                         dst_release(&grt->u.dst);
1164                                         goto out;
1165                                 }
1166                         } else {
1167                                 dev = grt->rt6i_dev;
1168                                 idev = grt->rt6i_idev;
1169                                 dev_hold(dev);
1170                                 in6_dev_hold(grt->rt6i_idev);
1171                         }
1172                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1173                                 err = 0;
1174                         dst_release(&grt->u.dst);
1175
1176                         if (err)
1177                                 goto out;
1178                 }
1179                 err = -EINVAL;
1180                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1181                         goto out;
1182         }
1183
1184         err = -ENODEV;
1185         if (dev == NULL)
1186                 goto out;
1187
1188         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1189                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1190                 if (IS_ERR(rt->rt6i_nexthop)) {
1191                         err = PTR_ERR(rt->rt6i_nexthop);
1192                         rt->rt6i_nexthop = NULL;
1193                         goto out;
1194                 }
1195         }
1196
1197         rt->rt6i_flags = cfg->fc_flags;
1198
1199 install_route:
1200         if (cfg->fc_mx) {
1201                 struct nlattr *nla;
1202                 int remaining;
1203
1204                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1205                         int type = nla->nla_type;
1206
1207                         if (type) {
1208                                 if (type > RTAX_MAX) {
1209                                         err = -EINVAL;
1210                                         goto out;
1211                                 }
1212
1213                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1214                         }
1215                 }
1216         }
1217
1218         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1219                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1220         if (!rt->u.dst.metrics[RTAX_MTU-1])
1221                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1222         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1223                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1224         rt->u.dst.dev = dev;
1225         rt->rt6i_idev = idev;
1226         rt->rt6i_table = table;
1227         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1228
1229 out:
1230         if (dev)
1231                 dev_put(dev);
1232         if (idev)
1233                 in6_dev_put(idev);
1234         if (rt)
1235                 dst_free(&rt->u.dst);
1236         return err;
1237 }
1238
1239 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1240 {
1241         int err;
1242         struct fib6_table *table;
1243
1244         if (rt == &ip6_null_entry)
1245                 return -ENOENT;
1246
1247         table = rt->rt6i_table;
1248         write_lock_bh(&table->tb6_lock);
1249
1250         err = fib6_del(rt, info);
1251         dst_release(&rt->u.dst);
1252
1253         write_unlock_bh(&table->tb6_lock);
1254
1255         return err;
1256 }
1257
1258 int ip6_del_rt(struct rt6_info *rt)
1259 {
1260         return __ip6_del_rt(rt, NULL);
1261 }
1262
1263 static int ip6_route_del(struct fib6_config *cfg)
1264 {
1265         struct fib6_table *table;
1266         struct fib6_node *fn;
1267         struct rt6_info *rt;
1268         int err = -ESRCH;
1269
1270         table = fib6_get_table(cfg->fc_table);
1271         if (table == NULL)
1272                 return err;
1273
1274         read_lock_bh(&table->tb6_lock);
1275
1276         fn = fib6_locate(&table->tb6_root,
1277                          &cfg->fc_dst, cfg->fc_dst_len,
1278                          &cfg->fc_src, cfg->fc_src_len);
1279         
1280         if (fn) {
1281                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1282                         if (cfg->fc_ifindex &&
1283                             (rt->rt6i_dev == NULL ||
1284                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1285                                 continue;
1286                         if (cfg->fc_flags & RTF_GATEWAY &&
1287                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1288                                 continue;
1289                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1290                                 continue;
1291                         dst_hold(&rt->u.dst);
1292                         read_unlock_bh(&table->tb6_lock);
1293
1294                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1295                 }
1296         }
1297         read_unlock_bh(&table->tb6_lock);
1298
1299         return err;
1300 }
1301
1302 /*
1303  *      Handle redirects
1304  */
1305 struct ip6rd_flowi {
1306         struct flowi fl;
1307         struct in6_addr gateway;
1308 };
1309
1310 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1311                                              struct flowi *fl,
1312                                              int flags)
1313 {
1314         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1315         struct rt6_info *rt;
1316         struct fib6_node *fn;
1317
1318         /*
1319          * Get the "current" route for this destination and
1320          * check if the redirect has come from approriate router.
1321          *
1322          * RFC 2461 specifies that redirects should only be
1323          * accepted if they come from the nexthop to the target.
1324          * Due to the way the routes are chosen, this notion
1325          * is a bit fuzzy and one might need to check all possible
1326          * routes.
1327          */
1328
1329         read_lock_bh(&table->tb6_lock);
1330         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1331 restart:
1332         for (rt = fn->leaf; rt; rt = rt->u.next) {
1333                 /*
1334                  * Current route is on-link; redirect is always invalid.
1335                  *
1336                  * Seems, previous statement is not true. It could
1337                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1338                  * But then router serving it might decide, that we should
1339                  * know truth 8)8) --ANK (980726).
1340                  */
1341                 if (rt6_check_expired(rt))
1342                         continue;
1343                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1344                         continue;
1345                 if (fl->oif != rt->rt6i_dev->ifindex)
1346                         continue;
1347                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1348                         continue;
1349                 break;
1350         }
1351
1352         if (!rt)
1353                 rt = &ip6_null_entry;
1354         BACKTRACK(&fl->fl6_src);
1355 out:
1356         dst_hold(&rt->u.dst);
1357
1358         read_unlock_bh(&table->tb6_lock);
1359
1360         return rt;
1361 };
1362
1363 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1364                                            struct in6_addr *src,
1365                                            struct in6_addr *gateway,
1366                                            struct net_device *dev)
1367 {
1368         int flags = RT6_LOOKUP_F_HAS_SADDR;
1369         struct ip6rd_flowi rdfl = {
1370                 .fl = {
1371                         .oif = dev->ifindex,
1372                         .nl_u = {
1373                                 .ip6_u = {
1374                                         .daddr = *dest,
1375                                         .saddr = *src,
1376                                 },
1377                         },
1378                 },
1379                 .gateway = *gateway,
1380         };
1381
1382         if (rt6_need_strict(dest))
1383                 flags |= RT6_LOOKUP_F_IFACE;
1384
1385         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1386 }
1387
1388 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1389                   struct in6_addr *saddr,
1390                   struct neighbour *neigh, u8 *lladdr, int on_link)
1391 {
1392         struct rt6_info *rt, *nrt = NULL;
1393         struct netevent_redirect netevent;
1394
1395         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1396
1397         if (rt == &ip6_null_entry) {
1398                 if (net_ratelimit())
1399                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1400                                "for redirect target\n");
1401                 goto out;
1402         }
1403
1404         /*
1405          *      We have finally decided to accept it.
1406          */
1407
1408         neigh_update(neigh, lladdr, NUD_STALE, 
1409                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1410                      NEIGH_UPDATE_F_OVERRIDE|
1411                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1412                                      NEIGH_UPDATE_F_ISROUTER))
1413                      );
1414
1415         /*
1416          * Redirect received -> path was valid.
1417          * Look, redirects are sent only in response to data packets,
1418          * so that this nexthop apparently is reachable. --ANK
1419          */
1420         dst_confirm(&rt->u.dst);
1421
1422         /* Duplicate redirect: silently ignore. */
1423         if (neigh == rt->u.dst.neighbour)
1424                 goto out;
1425
1426         nrt = ip6_rt_copy(rt);
1427         if (nrt == NULL)
1428                 goto out;
1429
1430         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1431         if (on_link)
1432                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1433
1434         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1435         nrt->rt6i_dst.plen = 128;
1436         nrt->u.dst.flags |= DST_HOST;
1437
1438         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1439         nrt->rt6i_nexthop = neigh_clone(neigh);
1440         /* Reset pmtu, it may be better */
1441         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1442         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1443
1444         if (ip6_ins_rt(nrt))
1445                 goto out;
1446
1447         netevent.old = &rt->u.dst;
1448         netevent.new = &nrt->u.dst;
1449         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1450
1451         if (rt->rt6i_flags&RTF_CACHE) {
1452                 ip6_del_rt(rt);
1453                 return;
1454         }
1455
1456 out:
1457         dst_release(&rt->u.dst);
1458         return;
1459 }
1460
1461 /*
1462  *      Handle ICMP "packet too big" messages
1463  *      i.e. Path MTU discovery
1464  */
1465
1466 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1467                         struct net_device *dev, u32 pmtu)
1468 {
1469         struct rt6_info *rt, *nrt;
1470         int allfrag = 0;
1471
1472         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1473         if (rt == NULL)
1474                 return;
1475
1476         if (pmtu >= dst_mtu(&rt->u.dst))
1477                 goto out;
1478
1479         if (pmtu < IPV6_MIN_MTU) {
1480                 /*
1481                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1482                  * MTU (1280) and a fragment header should always be included
1483                  * after a node receiving Too Big message reporting PMTU is
1484                  * less than the IPv6 Minimum Link MTU.
1485                  */
1486                 pmtu = IPV6_MIN_MTU;
1487                 allfrag = 1;
1488         }
1489
1490         /* New mtu received -> path was valid.
1491            They are sent only in response to data packets,
1492            so that this nexthop apparently is reachable. --ANK
1493          */
1494         dst_confirm(&rt->u.dst);
1495
1496         /* Host route. If it is static, it would be better
1497            not to override it, but add new one, so that
1498            when cache entry will expire old pmtu
1499            would return automatically.
1500          */
1501         if (rt->rt6i_flags & RTF_CACHE) {
1502                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1503                 if (allfrag)
1504                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1505                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1506                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1507                 goto out;
1508         }
1509
1510         /* Network route.
1511            Two cases are possible:
1512            1. It is connected route. Action: COW
1513            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1514          */
1515         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1516                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1517         else
1518                 nrt = rt6_alloc_clone(rt, daddr);
1519
1520         if (nrt) {
1521                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1522                 if (allfrag)
1523                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1524
1525                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1526                  * happened within 5 mins, the recommended timer is 10 mins.
1527                  * Here this route expiration time is set to ip6_rt_mtu_expires
1528                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1529                  * and detecting PMTU increase will be automatically happened.
1530                  */
1531                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1532                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1533
1534                 ip6_ins_rt(nrt);
1535         }
1536 out:
1537         dst_release(&rt->u.dst);
1538 }
1539
1540 /*
1541  *      Misc support functions
1542  */
1543
1544 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1545 {
1546         struct rt6_info *rt = ip6_dst_alloc();
1547
1548         if (rt) {
1549                 rt->u.dst.input = ort->u.dst.input;
1550                 rt->u.dst.output = ort->u.dst.output;
1551
1552                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1553                 rt->u.dst.error = ort->u.dst.error;
1554                 rt->u.dst.dev = ort->u.dst.dev;
1555                 if (rt->u.dst.dev)
1556                         dev_hold(rt->u.dst.dev);
1557                 rt->rt6i_idev = ort->rt6i_idev;
1558                 if (rt->rt6i_idev)
1559                         in6_dev_hold(rt->rt6i_idev);
1560                 rt->u.dst.lastuse = jiffies;
1561                 rt->rt6i_expires = 0;
1562
1563                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1564                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1565                 rt->rt6i_metric = 0;
1566
1567                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1568 #ifdef CONFIG_IPV6_SUBTREES
1569                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1570 #endif
1571                 rt->rt6i_table = ort->rt6i_table;
1572         }
1573         return rt;
1574 }
1575
1576 #ifdef CONFIG_IPV6_ROUTE_INFO
1577 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1578                                            struct in6_addr *gwaddr, int ifindex)
1579 {
1580         struct fib6_node *fn;
1581         struct rt6_info *rt = NULL;
1582         struct fib6_table *table;
1583
1584         table = fib6_get_table(RT6_TABLE_INFO);
1585         if (table == NULL)
1586                 return NULL;
1587
1588         write_lock_bh(&table->tb6_lock);
1589         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1590         if (!fn)
1591                 goto out;
1592
1593         for (rt = fn->leaf; rt; rt = rt->u.next) {
1594                 if (rt->rt6i_dev->ifindex != ifindex)
1595                         continue;
1596                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1597                         continue;
1598                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1599                         continue;
1600                 dst_hold(&rt->u.dst);
1601                 break;
1602         }
1603 out:
1604         write_unlock_bh(&table->tb6_lock);
1605         return rt;
1606 }
1607
1608 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1609                                            struct in6_addr *gwaddr, int ifindex,
1610                                            unsigned pref)
1611 {
1612         struct fib6_config cfg = {
1613                 .fc_table       = RT6_TABLE_INFO,
1614                 .fc_metric      = 1024,
1615                 .fc_ifindex     = ifindex,
1616                 .fc_dst_len     = prefixlen,
1617                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1618                                   RTF_UP | RTF_PREF(pref),
1619         };
1620
1621         ipv6_addr_copy(&cfg.fc_dst, prefix);
1622         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1623
1624         /* We should treat it as a default route if prefix length is 0. */
1625         if (!prefixlen)
1626                 cfg.fc_flags |= RTF_DEFAULT;
1627
1628         ip6_route_add(&cfg);
1629
1630         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1631 }
1632 #endif
1633
1634 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1635 {       
1636         struct rt6_info *rt;
1637         struct fib6_table *table;
1638
1639         table = fib6_get_table(RT6_TABLE_DFLT);
1640         if (table == NULL)
1641                 return NULL;
1642
1643         write_lock_bh(&table->tb6_lock);
1644         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1645                 if (dev == rt->rt6i_dev &&
1646                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1647                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1648                         break;
1649         }
1650         if (rt)
1651                 dst_hold(&rt->u.dst);
1652         write_unlock_bh(&table->tb6_lock);
1653         return rt;
1654 }
1655
1656 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1657                                      struct net_device *dev,
1658                                      unsigned int pref)
1659 {
1660         struct fib6_config cfg = {
1661                 .fc_table       = RT6_TABLE_DFLT,
1662                 .fc_metric      = 1024,
1663                 .fc_ifindex     = dev->ifindex,
1664                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1665                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1666         };
1667
1668         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1669
1670         ip6_route_add(&cfg);
1671
1672         return rt6_get_dflt_router(gwaddr, dev);
1673 }
1674
1675 void rt6_purge_dflt_routers(void)
1676 {
1677         struct rt6_info *rt;
1678         struct fib6_table *table;
1679
1680         /* NOTE: Keep consistent with rt6_get_dflt_router */
1681         table = fib6_get_table(RT6_TABLE_DFLT);
1682         if (table == NULL)
1683                 return;
1684
1685 restart:
1686         read_lock_bh(&table->tb6_lock);
1687         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1688                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1689                         dst_hold(&rt->u.dst);
1690                         read_unlock_bh(&table->tb6_lock);
1691                         ip6_del_rt(rt);
1692                         goto restart;
1693                 }
1694         }
1695         read_unlock_bh(&table->tb6_lock);
1696 }
1697
1698 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1699                                  struct fib6_config *cfg)
1700 {
1701         memset(cfg, 0, sizeof(*cfg));
1702
1703         cfg->fc_table = RT6_TABLE_MAIN;
1704         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1705         cfg->fc_metric = rtmsg->rtmsg_metric;
1706         cfg->fc_expires = rtmsg->rtmsg_info;
1707         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1708         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1709         cfg->fc_flags = rtmsg->rtmsg_flags;
1710
1711         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1712         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1713         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1714 }
1715
1716 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1717 {
1718         struct fib6_config cfg;
1719         struct in6_rtmsg rtmsg;
1720         int err;
1721
1722         switch(cmd) {
1723         case SIOCADDRT:         /* Add a route */
1724         case SIOCDELRT:         /* Delete a route */
1725                 if (!capable(CAP_NET_ADMIN))
1726                         return -EPERM;
1727                 err = copy_from_user(&rtmsg, arg,
1728                                      sizeof(struct in6_rtmsg));
1729                 if (err)
1730                         return -EFAULT;
1731
1732                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1733
1734                 rtnl_lock();
1735                 switch (cmd) {
1736                 case SIOCADDRT:
1737                         err = ip6_route_add(&cfg);
1738                         break;
1739                 case SIOCDELRT:
1740                         err = ip6_route_del(&cfg);
1741                         break;
1742                 default:
1743                         err = -EINVAL;
1744                 }
1745                 rtnl_unlock();
1746
1747                 return err;
1748         };
1749
1750         return -EINVAL;
1751 }
1752
1753 /*
1754  *      Drop the packet on the floor
1755  */
1756
1757 static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
1758 {
1759         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1760         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1761                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1762
1763         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTNOROUTES);
1764         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1765         kfree_skb(skb);
1766         return 0;
1767 }
1768
1769 static int ip6_pkt_discard(struct sk_buff *skb)
1770 {
1771         return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
1772 }
1773
1774 static int ip6_pkt_discard_out(struct sk_buff *skb)
1775 {
1776         skb->dev = skb->dst->dev;
1777         return ip6_pkt_discard(skb);
1778 }
1779
1780 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1781
1782 static int ip6_pkt_prohibit(struct sk_buff *skb)
1783 {
1784         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
1785 }
1786
1787 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1788 {
1789         skb->dev = skb->dst->dev;
1790         return ip6_pkt_prohibit(skb);
1791 }
1792
/* Free @skb without counting it or sending any ICMPv6 error; returns 0. */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	const int rc = 0;

	kfree_skb(skb);
	return rc;
}
1798
1799 #endif
1800
1801 /*
1802  *      Allocate a dst for local (unicast / anycast) address.
1803  */
1804
1805 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1806                                     const struct in6_addr *addr,
1807                                     int anycast)
1808 {
1809         struct rt6_info *rt = ip6_dst_alloc();
1810
1811         if (rt == NULL)
1812                 return ERR_PTR(-ENOMEM);
1813
1814         dev_hold(&loopback_dev);
1815         in6_dev_hold(idev);
1816
1817         rt->u.dst.flags = DST_HOST;
1818         rt->u.dst.input = ip6_input;
1819         rt->u.dst.output = ip6_output;
1820         rt->rt6i_dev = &loopback_dev;
1821         rt->rt6i_idev = idev;
1822         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1823         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1824         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1825         rt->u.dst.obsolete = -1;
1826
1827         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1828         if (anycast)
1829                 rt->rt6i_flags |= RTF_ANYCAST;
1830         else
1831                 rt->rt6i_flags |= RTF_LOCAL;
1832         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1833         if (rt->rt6i_nexthop == NULL) {
1834                 dst_free(&rt->u.dst);
1835                 return ERR_PTR(-ENOMEM);
1836         }
1837
1838         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1839         rt->rt6i_dst.plen = 128;
1840         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1841
1842         atomic_set(&rt->u.dst.__refcnt, 1);
1843
1844         return rt;
1845 }
1846
1847 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1848 {
1849         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1850             rt != &ip6_null_entry) {
1851                 RT6_TRACE("deleted by ifdown %p\n", rt);
1852                 return -1;
1853         }
1854         return 0;
1855 }
1856
1857 void rt6_ifdown(struct net_device *dev)
1858 {
1859         fib6_clean_all(fib6_ifdown, 0, dev);
1860 }
1861
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
1867
1868 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1869 {
1870         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1871         struct inet6_dev *idev;
1872
1873         /* In IPv6 pmtu discovery is not optional,
1874            so that RTAX_MTU lock cannot disable it.
1875            We still use this lock to block changes
1876            caused by addrconf/ndisc.
1877         */
1878
1879         idev = __in6_dev_get(arg->dev);
1880         if (idev == NULL)
1881                 return 0;
1882
1883         /* For administrative MTU increase, there is no way to discover
1884            IPv6 PMTU increase, so PMTU increase should be updated here.
1885            Since RFC 1981 doesn't include administrative MTU increase
1886            update PMTU increase is a MUST. (i.e. jumbo frame)
1887          */
1888         /*
1889            If new MTU is less than route PMTU, this new MTU will be the
1890            lowest MTU in the path, update the route PMTU to reflect PMTU
1891            decreases; if new MTU is greater than route PMTU, and the
1892            old MTU is the lowest MTU in the path, update the route PMTU
1893            to reflect the increase. In this case if the other nodes' MTU
1894            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1895            PMTU discouvery.
1896          */
1897         if (rt->rt6i_dev == arg->dev &&
1898             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1899             (dst_mtu(&rt->u.dst) > arg->mtu ||
1900              (dst_mtu(&rt->u.dst) < arg->mtu &&
1901               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1902                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1903         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1904         return 0;
1905 }
1906
1907 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1908 {
1909         struct rt6_mtu_change_arg arg = {
1910                 .dev = dev,
1911                 .mtu = mtu,
1912         };
1913
1914         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1915 }
1916
1917 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1918         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1919         [RTA_OIF]               = { .type = NLA_U32 },
1920         [RTA_IIF]               = { .type = NLA_U32 },
1921         [RTA_PRIORITY]          = { .type = NLA_U32 },
1922         [RTA_METRICS]           = { .type = NLA_NESTED },
1923 };
1924
1925 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1926                               struct fib6_config *cfg)
1927 {
1928         struct rtmsg *rtm;
1929         struct nlattr *tb[RTA_MAX+1];
1930         int err;
1931
1932         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1933         if (err < 0)
1934                 goto errout;
1935
1936         err = -EINVAL;
1937         rtm = nlmsg_data(nlh);
1938         memset(cfg, 0, sizeof(*cfg));
1939
1940         cfg->fc_table = rtm->rtm_table;
1941         cfg->fc_dst_len = rtm->rtm_dst_len;
1942         cfg->fc_src_len = rtm->rtm_src_len;
1943         cfg->fc_flags = RTF_UP;
1944         cfg->fc_protocol = rtm->rtm_protocol;
1945
1946         if (rtm->rtm_type == RTN_UNREACHABLE)
1947                 cfg->fc_flags |= RTF_REJECT;
1948
1949         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1950         cfg->fc_nlinfo.nlh = nlh;
1951
1952         if (tb[RTA_GATEWAY]) {
1953                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1954                 cfg->fc_flags |= RTF_GATEWAY;
1955         }
1956
1957         if (tb[RTA_DST]) {
1958                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1959
1960                 if (nla_len(tb[RTA_DST]) < plen)
1961                         goto errout;
1962
1963                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1964         }
1965
1966         if (tb[RTA_SRC]) {
1967                 int plen = (rtm->rtm_src_len + 7) >> 3;
1968
1969                 if (nla_len(tb[RTA_SRC]) < plen)
1970                         goto errout;
1971
1972                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1973         }
1974
1975         if (tb[RTA_OIF])
1976                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1977
1978         if (tb[RTA_PRIORITY])
1979                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1980
1981         if (tb[RTA_METRICS]) {
1982                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1983                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1984         }
1985
1986         if (tb[RTA_TABLE])
1987                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1988
1989         err = 0;
1990 errout:
1991         return err;
1992 }
1993
1994 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1995 {
1996         struct fib6_config cfg;
1997         int err;
1998
1999         err = rtm_to_fib6_config(skb, nlh, &cfg);
2000         if (err < 0)
2001                 return err;
2002
2003         return ip6_route_del(&cfg);
2004 }
2005
2006 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2007 {
2008         struct fib6_config cfg;
2009         int err;
2010
2011         err = rtm_to_fib6_config(skb, nlh, &cfg);
2012         if (err < 0)
2013                 return err;
2014
2015         return ip6_route_add(&cfg);
2016 }
2017
2018 static inline size_t rt6_nlmsg_size(void)
2019 {
2020         return NLMSG_ALIGN(sizeof(struct rtmsg))
2021                + nla_total_size(16) /* RTA_SRC */
2022                + nla_total_size(16) /* RTA_DST */
2023                + nla_total_size(16) /* RTA_GATEWAY */
2024                + nla_total_size(16) /* RTA_PREFSRC */
2025                + nla_total_size(4) /* RTA_TABLE */
2026                + nla_total_size(4) /* RTA_IIF */
2027                + nla_total_size(4) /* RTA_OIF */
2028                + nla_total_size(4) /* RTA_PRIORITY */
2029                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2030                + nla_total_size(sizeof(struct rta_cacheinfo));
2031 }
2032
2033 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2034                          struct in6_addr *dst, struct in6_addr *src,
2035                          int iif, int type, u32 pid, u32 seq,
2036                          int prefix, unsigned int flags)
2037 {
2038         struct rtmsg *rtm;
2039         struct nlmsghdr *nlh;
2040         long expires;
2041         u32 table;
2042
2043         if (prefix) {   /* user wants prefix routes only */
2044                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2045                         /* success since this is not a prefix route */
2046                         return 1;
2047                 }
2048         }
2049
2050         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2051         if (nlh == NULL)
2052                 return -EMSGSIZE;
2053
2054         rtm = nlmsg_data(nlh);
2055         rtm->rtm_family = AF_INET6;
2056         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2057         rtm->rtm_src_len = rt->rt6i_src.plen;
2058         rtm->rtm_tos = 0;
2059         if (rt->rt6i_table)
2060                 table = rt->rt6i_table->tb6_id;
2061         else
2062                 table = RT6_TABLE_UNSPEC;
2063         rtm->rtm_table = table;
2064         NLA_PUT_U32(skb, RTA_TABLE, table);
2065         if (rt->rt6i_flags&RTF_REJECT)
2066                 rtm->rtm_type = RTN_UNREACHABLE;
2067         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2068                 rtm->rtm_type = RTN_LOCAL;
2069         else
2070                 rtm->rtm_type = RTN_UNICAST;
2071         rtm->rtm_flags = 0;
2072         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2073         rtm->rtm_protocol = rt->rt6i_protocol;
2074         if (rt->rt6i_flags&RTF_DYNAMIC)
2075                 rtm->rtm_protocol = RTPROT_REDIRECT;
2076         else if (rt->rt6i_flags & RTF_ADDRCONF)
2077                 rtm->rtm_protocol = RTPROT_KERNEL;
2078         else if (rt->rt6i_flags&RTF_DEFAULT)
2079                 rtm->rtm_protocol = RTPROT_RA;
2080
2081         if (rt->rt6i_flags&RTF_CACHE)
2082                 rtm->rtm_flags |= RTM_F_CLONED;
2083
2084         if (dst) {
2085                 NLA_PUT(skb, RTA_DST, 16, dst);
2086                 rtm->rtm_dst_len = 128;
2087         } else if (rtm->rtm_dst_len)
2088                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2089 #ifdef CONFIG_IPV6_SUBTREES
2090         if (src) {
2091                 NLA_PUT(skb, RTA_SRC, 16, src);
2092                 rtm->rtm_src_len = 128;
2093         } else if (rtm->rtm_src_len)
2094                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2095 #endif
2096         if (iif)
2097                 NLA_PUT_U32(skb, RTA_IIF, iif);
2098         else if (dst) {
2099                 struct in6_addr saddr_buf;
2100                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2101                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2102         }
2103
2104         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2105                 goto nla_put_failure;
2106
2107         if (rt->u.dst.neighbour)
2108                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2109
2110         if (rt->u.dst.dev)
2111                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2112
2113         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2114
2115         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2116         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2117                                expires, rt->u.dst.error) < 0)
2118                 goto nla_put_failure;
2119
2120         return nlmsg_end(skb, nlh);
2121
2122 nla_put_failure:
2123         nlmsg_cancel(skb, nlh);
2124         return -EMSGSIZE;
2125 }
2126
2127 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2128 {
2129         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2130         int prefix;
2131
2132         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2133                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2134                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2135         } else
2136                 prefix = 0;
2137
2138         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2139                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2140                      prefix, NLM_F_MULTI);
2141 }
2142
2143 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2144 {
2145         struct nlattr *tb[RTA_MAX+1];
2146         struct rt6_info *rt;
2147         struct sk_buff *skb;
2148         struct rtmsg *rtm;
2149         struct flowi fl;
2150         int err, iif = 0;
2151
2152         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2153         if (err < 0)
2154                 goto errout;
2155
2156         err = -EINVAL;
2157         memset(&fl, 0, sizeof(fl));
2158
2159         if (tb[RTA_SRC]) {
2160                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2161                         goto errout;
2162
2163                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2164         }
2165
2166         if (tb[RTA_DST]) {
2167                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2168                         goto errout;
2169
2170                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2171         }
2172
2173         if (tb[RTA_IIF])
2174                 iif = nla_get_u32(tb[RTA_IIF]);
2175
2176         if (tb[RTA_OIF])
2177                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2178
2179         if (iif) {
2180                 struct net_device *dev;
2181                 dev = __dev_get_by_index(iif);
2182                 if (!dev) {
2183                         err = -ENODEV;
2184                         goto errout;
2185                 }
2186         }
2187
2188         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2189         if (skb == NULL) {
2190                 err = -ENOBUFS;
2191                 goto errout;
2192         }
2193
2194         /* Reserve room for dummy headers, this skb can pass
2195            through good chunk of routing engine.
2196          */
2197         skb->mac.raw = skb->data;
2198         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2199
2200         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2201         skb->dst = &rt->u.dst;
2202
2203         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2204                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2205                             nlh->nlmsg_seq, 0, 0);
2206         if (err < 0) {
2207                 kfree_skb(skb);
2208                 goto errout;
2209         }
2210
2211         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2212 errout:
2213         return err;
2214 }
2215
2216 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2217 {
2218         struct sk_buff *skb;
2219         u32 pid = 0, seq = 0;
2220         struct nlmsghdr *nlh = NULL;
2221         int err = -ENOBUFS;
2222
2223         if (info) {
2224                 pid = info->pid;
2225                 nlh = info->nlh;
2226                 if (nlh)
2227                         seq = nlh->nlmsg_seq;
2228         }
2229
2230         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2231         if (skb == NULL)
2232                 goto errout;
2233
2234         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2235         if (err < 0) {
2236                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2237                 WARN_ON(err == -EMSGSIZE);
2238                 kfree_skb(skb);
2239                 goto errout;
2240         }
2241         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2242 errout:
2243         if (err < 0)
2244                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2245 }
2246
2247 /*
2248  *      /proc
2249  */
2250
2251 #ifdef CONFIG_PROC_FS
2252
/*
 * Length of one line written by rt6_info_route():
 * three 32-digit hex addresses, two " %02x " prefix lengths (4 each)
 * and the trailing " %08x %08x %08x %08x %8s\n" (40 + 5 + 1).
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* offset requested by the reader */
	int length;	/* number of bytes requested by the reader */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written into buffer so far */
};
2263
2264 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2265 {
2266         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2267
2268         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2269                 arg->skip++;
2270                 return 0;
2271         }
2272
2273         if (arg->len >= arg->length)
2274                 return 0;
2275
2276         arg->len += sprintf(arg->buffer + arg->len,
2277                             NIP6_SEQFMT " %02x ",
2278                             NIP6(rt->rt6i_dst.addr),
2279                             rt->rt6i_dst.plen);
2280
2281 #ifdef CONFIG_IPV6_SUBTREES
2282         arg->len += sprintf(arg->buffer + arg->len,
2283                             NIP6_SEQFMT " %02x ",
2284                             NIP6(rt->rt6i_src.addr),
2285                             rt->rt6i_src.plen);
2286 #else
2287         arg->len += sprintf(arg->buffer + arg->len,
2288                             "00000000000000000000000000000000 00 ");
2289 #endif
2290
2291         if (rt->rt6i_nexthop) {
2292                 arg->len += sprintf(arg->buffer + arg->len,
2293                                     NIP6_SEQFMT,
2294                                     NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2295         } else {
2296                 arg->len += sprintf(arg->buffer + arg->len,
2297                                     "00000000000000000000000000000000");
2298         }
2299         arg->len += sprintf(arg->buffer + arg->len,
2300                             " %08x %08x %08x %08x %8s\n",
2301                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2302                             rt->u.dst.__use, rt->rt6i_flags, 
2303                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2304         return 0;
2305 }
2306
2307 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2308 {
2309         struct rt6_proc_arg arg = {
2310                 .buffer = buffer,
2311                 .offset = offset,
2312                 .length = length,
2313         };
2314
2315         fib6_clean_all(rt6_info_route, 0, &arg);
2316
2317         *start = buffer;
2318         if (offset)
2319                 *start += offset % RT6_INFO_LEN;
2320
2321         arg.len -= offset % RT6_INFO_LEN;
2322
2323         if (arg.len > length)
2324                 arg.len = length;
2325         if (arg.len < 0)
2326                 arg.len = 0;
2327
2328         return arg.len;
2329 }
2330
2331 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2332 {
2333         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2334                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2335                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2336                       rt6_stats.fib_rt_cache,
2337                       atomic_read(&ip6_dst_ops.entries),
2338                       rt6_stats.fib_discarded_routes);
2339
2340         return 0;
2341 }
2342
2343 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2344 {
2345         return single_open(file, rt6_stats_seq_show, NULL);
2346 }
2347
2348 static struct file_operations rt6_stats_seq_fops = {
2349         .owner   = THIS_MODULE,
2350         .open    = rt6_stats_seq_open,
2351         .read    = seq_read,
2352         .llseek  = seq_lseek,
2353         .release = single_release,
2354 };
2355 #endif  /* CONFIG_PROC_FS */
2356
2357 #ifdef CONFIG_SYSCTL
2358
2359 static int flush_delay;
2360
2361 static
2362 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2363                               void __user *buffer, size_t *lenp, loff_t *ppos)
2364 {
2365         if (write) {
2366                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2367                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2368                 return 0;
2369         } else
2370                 return -EINVAL;
2371 }
2372
2373 ctl_table ipv6_route_table[] = {
2374         {
2375                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2376                 .procname       =       "flush",
2377                 .data           =       &flush_delay,
2378                 .maxlen         =       sizeof(int),
2379                 .mode           =       0200,
2380                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2381         },
2382         {
2383                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2384                 .procname       =       "gc_thresh",
2385                 .data           =       &ip6_dst_ops.gc_thresh,
2386                 .maxlen         =       sizeof(int),
2387                 .mode           =       0644,
2388                 .proc_handler   =       &proc_dointvec,
2389         },
2390         {
2391                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2392                 .procname       =       "max_size",
2393                 .data           =       &ip6_rt_max_size,
2394                 .maxlen         =       sizeof(int),
2395                 .mode           =       0644,
2396                 .proc_handler   =       &proc_dointvec,
2397         },
2398         {
2399                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2400                 .procname       =       "gc_min_interval",
2401                 .data           =       &ip6_rt_gc_min_interval,
2402                 .maxlen         =       sizeof(int),
2403                 .mode           =       0644,
2404                 .proc_handler   =       &proc_dointvec_jiffies,
2405                 .strategy       =       &sysctl_jiffies,
2406         },
2407         {
2408                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2409                 .procname       =       "gc_timeout",
2410                 .data           =       &ip6_rt_gc_timeout,
2411                 .maxlen         =       sizeof(int),
2412                 .mode           =       0644,
2413                 .proc_handler   =       &proc_dointvec_jiffies,
2414                 .strategy       =       &sysctl_jiffies,
2415         },
2416         {
2417                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2418                 .procname       =       "gc_interval",
2419                 .data           =       &ip6_rt_gc_interval,
2420                 .maxlen         =       sizeof(int),
2421                 .mode           =       0644,
2422                 .proc_handler   =       &proc_dointvec_jiffies,
2423                 .strategy       =       &sysctl_jiffies,
2424         },
2425         {
2426                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2427                 .procname       =       "gc_elasticity",
2428                 .data           =       &ip6_rt_gc_elasticity,
2429                 .maxlen         =       sizeof(int),
2430                 .mode           =       0644,
2431                 .proc_handler   =       &proc_dointvec_jiffies,
2432                 .strategy       =       &sysctl_jiffies,
2433         },
2434         {
2435                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2436                 .procname       =       "mtu_expires",
2437                 .data           =       &ip6_rt_mtu_expires,
2438                 .maxlen         =       sizeof(int),
2439                 .mode           =       0644,
2440                 .proc_handler   =       &proc_dointvec_jiffies,
2441                 .strategy       =       &sysctl_jiffies,
2442         },
2443         {
2444                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2445                 .procname       =       "min_adv_mss",
2446                 .data           =       &ip6_rt_min_advmss,
2447                 .maxlen         =       sizeof(int),
2448                 .mode           =       0644,
2449                 .proc_handler   =       &proc_dointvec_jiffies,
2450                 .strategy       =       &sysctl_jiffies,
2451         },
2452         {
2453                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2454                 .procname       =       "gc_min_interval_ms",
2455                 .data           =       &ip6_rt_gc_min_interval,
2456                 .maxlen         =       sizeof(int),
2457                 .mode           =       0644,
2458                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2459                 .strategy       =       &sysctl_ms_jiffies,
2460         },
2461         { .ctl_name = 0 }
2462 };
2463
2464 #endif
2465
2466 void __init ip6_route_init(void)
2467 {
2468         struct proc_dir_entry *p;
2469
2470         ip6_dst_ops.kmem_cachep =
2471                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2472                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2473         fib6_init();
2474 #ifdef  CONFIG_PROC_FS
2475         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2476         if (p)
2477                 p->owner = THIS_MODULE;
2478
2479         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2480 #endif
2481 #ifdef CONFIG_XFRM
2482         xfrm6_init();
2483 #endif
2484 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2485         fib6_rules_init();
2486 #endif
2487 }
2488
2489 void ip6_route_cleanup(void)
2490 {
2491 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2492         fib6_rules_cleanup();
2493 #endif
2494 #ifdef CONFIG_PROC_FS
2495         proc_net_remove("ipv6_route");
2496         proc_net_remove("rt6_stats");
2497 #endif
2498 #ifdef CONFIG_XFRM
2499         xfrm6_fini();
2500 #endif
2501         rt6_ifdown(NULL);
2502         fib6_gc_cleanup();
2503         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2504 }