Pull thinkpad-2.6.24 into release branch
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  With a value of 3 or more, RDBG() and
 * RT6_TRACE() expand to printk() calls; below that they compile to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
73
74 #define CLONE_OFFLINK_ROUTE 0
75
76 static int ip6_rt_max_size = 4096;
77 static int ip6_rt_gc_min_interval = HZ / 2;
78 static int ip6_rt_gc_timeout = 60*HZ;
79 int ip6_rt_gc_interval = 30*HZ;
80 static int ip6_rt_gc_elasticity = 9;
81 static int ip6_rt_mtu_expires = 10*60*HZ;
82 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
83
84 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
85 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(void);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
99                                            struct in6_addr *gwaddr, int ifindex,
100                                            unsigned pref);
101 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex);
103 #endif
104
105 static struct dst_ops ip6_dst_ops = {
106         .family                 =       AF_INET6,
107         .protocol               =       __constant_htons(ETH_P_IPV6),
108         .gc                     =       ip6_dst_gc,
109         .gc_thresh              =       1024,
110         .check                  =       ip6_dst_check,
111         .destroy                =       ip6_dst_destroy,
112         .ifdown                 =       ip6_dst_ifdown,
113         .negative_advice        =       ip6_negative_advice,
114         .link_failure           =       ip6_link_failure,
115         .update_pmtu            =       ip6_rt_update_pmtu,
116         .entry_size             =       sizeof(struct rt6_info),
117 };
118
119 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
120 {
121 }
122
123 static struct dst_ops ip6_dst_blackhole_ops = {
124         .family                 =       AF_INET6,
125         .protocol               =       __constant_htons(ETH_P_IPV6),
126         .destroy                =       ip6_dst_destroy,
127         .check                  =       ip6_dst_check,
128         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
129         .entry_size             =       sizeof(struct rt6_info),
130 };
131
132 struct rt6_info ip6_null_entry = {
133         .u = {
134                 .dst = {
135                         .__refcnt       = ATOMIC_INIT(1),
136                         .__use          = 1,
137                         .obsolete       = -1,
138                         .error          = -ENETUNREACH,
139                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
140                         .input          = ip6_pkt_discard,
141                         .output         = ip6_pkt_discard_out,
142                         .ops            = &ip6_dst_ops,
143                         .path           = (struct dst_entry*)&ip6_null_entry,
144                 }
145         },
146         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
147         .rt6i_metric    = ~(u32) 0,
148         .rt6i_ref       = ATOMIC_INIT(1),
149 };
150
151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
152
/* Packet handlers installed on ip6_prohibit_entry and ip6_blk_hole_entry. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
static int ip6_pkt_blk_hole(struct sk_buff *skb);
156
157 struct rt6_info ip6_prohibit_entry = {
158         .u = {
159                 .dst = {
160                         .__refcnt       = ATOMIC_INIT(1),
161                         .__use          = 1,
162                         .obsolete       = -1,
163                         .error          = -EACCES,
164                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
165                         .input          = ip6_pkt_prohibit,
166                         .output         = ip6_pkt_prohibit_out,
167                         .ops            = &ip6_dst_ops,
168                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
169                 }
170         },
171         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
172         .rt6i_metric    = ~(u32) 0,
173         .rt6i_ref       = ATOMIC_INIT(1),
174 };
175
176 struct rt6_info ip6_blk_hole_entry = {
177         .u = {
178                 .dst = {
179                         .__refcnt       = ATOMIC_INIT(1),
180                         .__use          = 1,
181                         .obsolete       = -1,
182                         .error          = -EINVAL,
183                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
184                         .input          = ip6_pkt_blk_hole,
185                         .output         = ip6_pkt_blk_hole,
186                         .ops            = &ip6_dst_ops,
187                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
188                 }
189         },
190         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
191         .rt6i_metric    = ~(u32) 0,
192         .rt6i_ref       = ATOMIC_INIT(1),
193 };
194
195 #endif
196
197 /* allocate dst with ip6_dst_ops */
198 static __inline__ struct rt6_info *ip6_dst_alloc(void)
199 {
200         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
201 }
202
203 static void ip6_dst_destroy(struct dst_entry *dst)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207
208         if (idev != NULL) {
209                 rt->rt6i_idev = NULL;
210                 in6_dev_put(idev);
211         }
212 }
213
214 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
215                            int how)
216 {
217         struct rt6_info *rt = (struct rt6_info *)dst;
218         struct inet6_dev *idev = rt->rt6i_idev;
219
220         if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) {
221                 struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev);
222                 if (loopback_idev != NULL) {
223                         rt->rt6i_idev = loopback_idev;
224                         in6_dev_put(idev);
225                 }
226         }
227 }
228
229 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
230 {
231         return (rt->rt6i_flags & RTF_EXPIRES &&
232                 time_after(jiffies, rt->rt6i_expires));
233 }
234
235 static inline int rt6_need_strict(struct in6_addr *daddr)
236 {
237         return (ipv6_addr_type(daddr) &
238                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
239 }
240
241 /*
242  *      Route lookup. Any table->tb6_lock is implied.
243  */
244
245 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
246                                                     int oif,
247                                                     int strict)
248 {
249         struct rt6_info *local = NULL;
250         struct rt6_info *sprt;
251
252         if (oif) {
253                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
254                         struct net_device *dev = sprt->rt6i_dev;
255                         if (dev->ifindex == oif)
256                                 return sprt;
257                         if (dev->flags & IFF_LOOPBACK) {
258                                 if (sprt->rt6i_idev == NULL ||
259                                     sprt->rt6i_idev->dev->ifindex != oif) {
260                                         if (strict && oif)
261                                                 continue;
262                                         if (local && (!oif ||
263                                                       local->rt6i_idev->dev->ifindex == oif))
264                                                 continue;
265                                 }
266                                 local = sprt;
267                         }
268                 }
269
270                 if (local)
271                         return local;
272
273                 if (strict)
274                         return &ip6_null_entry;
275         }
276         return rt;
277 }
278
279 #ifdef CONFIG_IPV6_ROUTER_PREF
280 static void rt6_probe(struct rt6_info *rt)
281 {
282         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
283         /*
284          * Okay, this does not seem to be appropriate
285          * for now, however, we need to check if it
286          * is really so; aka Router Reachability Probing.
287          *
288          * Router Reachability Probe MUST be rate-limited
289          * to no more than one per minute.
290          */
291         if (!neigh || (neigh->nud_state & NUD_VALID))
292                 return;
293         read_lock_bh(&neigh->lock);
294         if (!(neigh->nud_state & NUD_VALID) &&
295             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
296                 struct in6_addr mcaddr;
297                 struct in6_addr *target;
298
299                 neigh->updated = jiffies;
300                 read_unlock_bh(&neigh->lock);
301
302                 target = (struct in6_addr *)&neigh->primary_key;
303                 addrconf_addr_solict_mult(target, &mcaddr);
304                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
305         } else
306                 read_unlock_bh(&neigh->lock);
307 }
308 #else
/* Without CONFIG_IPV6_ROUTER_PREF there is no router probing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
313 #endif
314
315 /*
316  * Default Router Selection (RFC 2461 6.3.6)
317  */
318 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
319 {
320         struct net_device *dev = rt->rt6i_dev;
321         if (!oif || dev->ifindex == oif)
322                 return 2;
323         if ((dev->flags & IFF_LOOPBACK) &&
324             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
325                 return 1;
326         return 0;
327 }
328
329 static inline int rt6_check_neigh(struct rt6_info *rt)
330 {
331         struct neighbour *neigh = rt->rt6i_nexthop;
332         int m = 0;
333         if (rt->rt6i_flags & RTF_NONEXTHOP ||
334             !(rt->rt6i_flags & RTF_GATEWAY))
335                 m = 1;
336         else if (neigh) {
337                 read_lock_bh(&neigh->lock);
338                 if (neigh->nud_state & NUD_VALID)
339                         m = 2;
340                 else if (!(neigh->nud_state & NUD_FAILED))
341                         m = 1;
342                 read_unlock_bh(&neigh->lock);
343         }
344         return m;
345 }
346
347 static int rt6_score_route(struct rt6_info *rt, int oif,
348                            int strict)
349 {
350         int m, n;
351
352         m = rt6_check_dev(rt, oif);
353         if (!m && (strict & RT6_LOOKUP_F_IFACE))
354                 return -1;
355 #ifdef CONFIG_IPV6_ROUTER_PREF
356         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
357 #endif
358         n = rt6_check_neigh(rt);
359         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
360                 return -1;
361         return m;
362 }
363
364 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
365                                    int *mpri, struct rt6_info *match)
366 {
367         int m;
368
369         if (rt6_check_expired(rt))
370                 goto out;
371
372         m = rt6_score_route(rt, oif, strict);
373         if (m < 0)
374                 goto out;
375
376         if (m > *mpri) {
377                 if (strict & RT6_LOOKUP_F_REACHABLE)
378                         rt6_probe(match);
379                 *mpri = m;
380                 match = rt;
381         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
382                 rt6_probe(rt);
383         }
384
385 out:
386         return match;
387 }
388
389 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
390                                      struct rt6_info *rr_head,
391                                      u32 metric, int oif, int strict)
392 {
393         struct rt6_info *rt, *match;
394         int mpri = -1;
395
396         match = NULL;
397         for (rt = rr_head; rt && rt->rt6i_metric == metric;
398              rt = rt->u.dst.rt6_next)
399                 match = find_match(rt, oif, strict, &mpri, match);
400         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
401              rt = rt->u.dst.rt6_next)
402                 match = find_match(rt, oif, strict, &mpri, match);
403
404         return match;
405 }
406
407 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
408 {
409         struct rt6_info *match, *rt0;
410
411         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
412                   __FUNCTION__, fn->leaf, oif);
413
414         rt0 = fn->rr_ptr;
415         if (!rt0)
416                 fn->rr_ptr = rt0 = fn->leaf;
417
418         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
419
420         if (!match &&
421             (strict & RT6_LOOKUP_F_REACHABLE)) {
422                 struct rt6_info *next = rt0->u.dst.rt6_next;
423
424                 /* no entries matched; do round-robin */
425                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
426                         next = fn->leaf;
427
428                 if (next != rt0)
429                         fn->rr_ptr = next;
430         }
431
432         RT6_TRACE("%s() => %p\n",
433                   __FUNCTION__, match);
434
435         return (match ? match : &ip6_null_entry);
436 }
437
438 #ifdef CONFIG_IPV6_ROUTE_INFO
439 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
440                   struct in6_addr *gwaddr)
441 {
442         struct route_info *rinfo = (struct route_info *) opt;
443         struct in6_addr prefix_buf, *prefix;
444         unsigned int pref;
445         u32 lifetime;
446         struct rt6_info *rt;
447
448         if (len < sizeof(struct route_info)) {
449                 return -EINVAL;
450         }
451
452         /* Sanity check for prefix_len and length */
453         if (rinfo->length > 3) {
454                 return -EINVAL;
455         } else if (rinfo->prefix_len > 128) {
456                 return -EINVAL;
457         } else if (rinfo->prefix_len > 64) {
458                 if (rinfo->length < 2) {
459                         return -EINVAL;
460                 }
461         } else if (rinfo->prefix_len > 0) {
462                 if (rinfo->length < 1) {
463                         return -EINVAL;
464                 }
465         }
466
467         pref = rinfo->route_pref;
468         if (pref == ICMPV6_ROUTER_PREF_INVALID)
469                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
470
471         lifetime = ntohl(rinfo->lifetime);
472         if (lifetime == 0xffffffff) {
473                 /* infinity */
474         } else if (lifetime > 0x7fffffff/HZ) {
475                 /* Avoid arithmetic overflow */
476                 lifetime = 0x7fffffff/HZ - 1;
477         }
478
479         if (rinfo->length == 3)
480                 prefix = (struct in6_addr *)rinfo->prefix;
481         else {
482                 /* this function is safe */
483                 ipv6_addr_prefix(&prefix_buf,
484                                  (struct in6_addr *)rinfo->prefix,
485                                  rinfo->prefix_len);
486                 prefix = &prefix_buf;
487         }
488
489         rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
490
491         if (rt && !lifetime) {
492                 ip6_del_rt(rt);
493                 rt = NULL;
494         }
495
496         if (!rt && lifetime)
497                 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
498                                         pref);
499         else if (rt)
500                 rt->rt6i_flags = RTF_ROUTEINFO |
501                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
502
503         if (rt) {
504                 if (lifetime == 0xffffffff) {
505                         rt->rt6i_flags &= ~RTF_EXPIRES;
506                 } else {
507                         rt->rt6i_expires = jiffies + HZ * lifetime;
508                         rt->rt6i_flags |= RTF_EXPIRES;
509                 }
510                 dst_release(&rt->u.dst);
511         }
512         return 0;
513 }
514 #endif
515
/*
 * If the lookup ended on ip6_null_entry, walk back up the fib6 tree looking
 * for a node that carries route information.
 *
 * This macro is not self-contained: it reads the caller's locals "rt" and
 * "fn", and jumps to the caller's labels "out" (tree root reached) and
 * "restart" (a node with RTN_RTINFO found, "fn" updated).  When the parent
 * has a subtree other than the current node, that subtree is searched with
 * @saddr instead of stepping to the parent.
 */
#define BACKTRACK(saddr) \
do { \
	struct fib6_node *pn; \
	if (rt != &ip6_null_entry) \
		break; \
	for (;;) { \
		if (fn->fn_flags & RTN_TL_ROOT) \
			goto out; \
		pn = fn->parent; \
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
		else \
			fn = pn; \
		if (fn->fn_flags & RTN_RTINFO) \
			goto restart; \
	} \
} while(0)
533
534 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
535                                              struct flowi *fl, int flags)
536 {
537         struct fib6_node *fn;
538         struct rt6_info *rt;
539
540         read_lock_bh(&table->tb6_lock);
541         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
542 restart:
543         rt = fn->leaf;
544         rt = rt6_device_match(rt, fl->oif, flags);
545         BACKTRACK(&fl->fl6_src);
546 out:
547         dst_use(&rt->u.dst, jiffies);
548         read_unlock_bh(&table->tb6_lock);
549         return rt;
550
551 }
552
553 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
554                             int oif, int strict)
555 {
556         struct flowi fl = {
557                 .oif = oif,
558                 .nl_u = {
559                         .ip6_u = {
560                                 .daddr = *daddr,
561                         },
562                 },
563         };
564         struct dst_entry *dst;
565         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
566
567         if (saddr) {
568                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
569                 flags |= RT6_LOOKUP_F_HAS_SADDR;
570         }
571
572         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
573         if (dst->error == 0)
574                 return (struct rt6_info *) dst;
575
576         dst_release(dst);
577
578         return NULL;
579 }
580
581 EXPORT_SYMBOL(rt6_lookup);
582
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In either case, unless the caller holds a reference to
   the route, it may be destroyed.
 */
588
589 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
590 {
591         int err;
592         struct fib6_table *table;
593
594         table = rt->rt6i_table;
595         write_lock_bh(&table->tb6_lock);
596         err = fib6_add(&table->tb6_root, rt, info);
597         write_unlock_bh(&table->tb6_lock);
598
599         return err;
600 }
601
602 int ip6_ins_rt(struct rt6_info *rt)
603 {
604         return __ip6_ins_rt(rt, NULL);
605 }
606
607 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
608                                       struct in6_addr *saddr)
609 {
610         struct rt6_info *rt;
611
612         /*
613          *      Clone the route.
614          */
615
616         rt = ip6_rt_copy(ort);
617
618         if (rt) {
619                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
620                         if (rt->rt6i_dst.plen != 128 &&
621                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
622                                 rt->rt6i_flags |= RTF_ANYCAST;
623                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
624                 }
625
626                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
627                 rt->rt6i_dst.plen = 128;
628                 rt->rt6i_flags |= RTF_CACHE;
629                 rt->u.dst.flags |= DST_HOST;
630
631 #ifdef CONFIG_IPV6_SUBTREES
632                 if (rt->rt6i_src.plen && saddr) {
633                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
634                         rt->rt6i_src.plen = 128;
635                 }
636 #endif
637
638                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
639
640         }
641
642         return rt;
643 }
644
645 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
646 {
647         struct rt6_info *rt = ip6_rt_copy(ort);
648         if (rt) {
649                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
650                 rt->rt6i_dst.plen = 128;
651                 rt->rt6i_flags |= RTF_CACHE;
652                 rt->u.dst.flags |= DST_HOST;
653                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
654         }
655         return rt;
656 }
657
658 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
659                                             struct flowi *fl, int flags)
660 {
661         struct fib6_node *fn;
662         struct rt6_info *rt, *nrt;
663         int strict = 0;
664         int attempts = 3;
665         int err;
666         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
667
668         strict |= flags & RT6_LOOKUP_F_IFACE;
669
670 relookup:
671         read_lock_bh(&table->tb6_lock);
672
673 restart_2:
674         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
675
676 restart:
677         rt = rt6_select(fn, oif, strict | reachable);
678         BACKTRACK(&fl->fl6_src);
679         if (rt == &ip6_null_entry ||
680             rt->rt6i_flags & RTF_CACHE)
681                 goto out;
682
683         dst_hold(&rt->u.dst);
684         read_unlock_bh(&table->tb6_lock);
685
686         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
687                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
688         else {
689 #if CLONE_OFFLINK_ROUTE
690                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
691 #else
692                 goto out2;
693 #endif
694         }
695
696         dst_release(&rt->u.dst);
697         rt = nrt ? : &ip6_null_entry;
698
699         dst_hold(&rt->u.dst);
700         if (nrt) {
701                 err = ip6_ins_rt(nrt);
702                 if (!err)
703                         goto out2;
704         }
705
706         if (--attempts <= 0)
707                 goto out2;
708
709         /*
710          * Race condition! In the gap, when table->tb6_lock was
711          * released someone could insert this route.  Relookup.
712          */
713         dst_release(&rt->u.dst);
714         goto relookup;
715
716 out:
717         if (reachable) {
718                 reachable = 0;
719                 goto restart_2;
720         }
721         dst_hold(&rt->u.dst);
722         read_unlock_bh(&table->tb6_lock);
723 out2:
724         rt->u.dst.lastuse = jiffies;
725         rt->u.dst.__use++;
726
727         return rt;
728 }
729
730 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
731                                             struct flowi *fl, int flags)
732 {
733         return ip6_pol_route(table, fl->iif, fl, flags);
734 }
735
736 void ip6_route_input(struct sk_buff *skb)
737 {
738         struct ipv6hdr *iph = ipv6_hdr(skb);
739         int flags = RT6_LOOKUP_F_HAS_SADDR;
740         struct flowi fl = {
741                 .iif = skb->dev->ifindex,
742                 .nl_u = {
743                         .ip6_u = {
744                                 .daddr = iph->daddr,
745                                 .saddr = iph->saddr,
746                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
747                         },
748                 },
749                 .mark = skb->mark,
750                 .proto = iph->nexthdr,
751         };
752
753         if (rt6_need_strict(&iph->daddr))
754                 flags |= RT6_LOOKUP_F_IFACE;
755
756         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
757 }
758
759 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
760                                              struct flowi *fl, int flags)
761 {
762         return ip6_pol_route(table, fl->oif, fl, flags);
763 }
764
765 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
766 {
767         int flags = 0;
768
769         if (rt6_need_strict(&fl->fl6_dst))
770                 flags |= RT6_LOOKUP_F_IFACE;
771
772         if (!ipv6_addr_any(&fl->fl6_src))
773                 flags |= RT6_LOOKUP_F_HAS_SADDR;
774
775         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
776 }
777
778 EXPORT_SYMBOL(ip6_route_output);
779
/* Packet handler for blackhole dsts: free the skb and report success. */
static int ip6_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
785
786 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
787 {
788         struct rt6_info *ort = (struct rt6_info *) *dstp;
789         struct rt6_info *rt = (struct rt6_info *)
790                 dst_alloc(&ip6_dst_blackhole_ops);
791         struct dst_entry *new = NULL;
792
793         if (rt) {
794                 new = &rt->u.dst;
795
796                 atomic_set(&new->__refcnt, 1);
797                 new->__use = 1;
798                 new->input = ip6_blackhole_output;
799                 new->output = ip6_blackhole_output;
800
801                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
802                 new->dev = ort->u.dst.dev;
803                 if (new->dev)
804                         dev_hold(new->dev);
805                 rt->rt6i_idev = ort->rt6i_idev;
806                 if (rt->rt6i_idev)
807                         in6_dev_hold(rt->rt6i_idev);
808                 rt->rt6i_expires = 0;
809
810                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
811                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
812                 rt->rt6i_metric = 0;
813
814                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
815 #ifdef CONFIG_IPV6_SUBTREES
816                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
817 #endif
818
819                 dst_free(new);
820         }
821
822         dst_release(*dstp);
823         *dstp = new;
824         return (new ? 0 : -ENOMEM);
825 }
826 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
827
828 /*
829  *      Destination cache support functions
830  */
831
832 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
833 {
834         struct rt6_info *rt;
835
836         rt = (struct rt6_info *) dst;
837
838         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
839                 return dst;
840
841         return NULL;
842 }
843
844 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
845 {
846         struct rt6_info *rt = (struct rt6_info *) dst;
847
848         if (rt) {
849                 if (rt->rt6i_flags & RTF_CACHE)
850                         ip6_del_rt(rt);
851                 else
852                         dst_release(dst);
853         }
854         return NULL;
855 }
856
857 static void ip6_link_failure(struct sk_buff *skb)
858 {
859         struct rt6_info *rt;
860
861         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
862
863         rt = (struct rt6_info *) skb->dst;
864         if (rt) {
865                 if (rt->rt6i_flags&RTF_CACHE) {
866                         dst_set_expires(&rt->u.dst, 0);
867                         rt->rt6i_flags |= RTF_EXPIRES;
868                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
869                         rt->rt6i_node->fn_sernum = -1;
870         }
871 }
872
873 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
874 {
875         struct rt6_info *rt6 = (struct rt6_info*)dst;
876
877         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
878                 rt6->rt6i_flags |= RTF_MODIFIED;
879                 if (mtu < IPV6_MIN_MTU) {
880                         mtu = IPV6_MIN_MTU;
881                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
882                 }
883                 dst->metrics[RTAX_MTU-1] = mtu;
884                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
885         }
886 }
887
888 static int ipv6_get_mtu(struct net_device *dev);
889
890 static inline unsigned int ipv6_advmss(unsigned int mtu)
891 {
892         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
893
894         if (mtu < ip6_rt_min_advmss)
895                 mtu = ip6_rt_min_advmss;
896
897         /*
898          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
899          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
900          * IPV6_MAXPLEN is also valid and means: "any MSS,
901          * rely only on pmtu discovery"
902          */
903         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
904                 mtu = IPV6_MAXPLEN;
905         return mtu;
906 }
907
908 static struct dst_entry *ndisc_dst_gc_list;
909 static DEFINE_SPINLOCK(ndisc_lock);
910
911 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
912                                   struct neighbour *neigh,
913                                   struct in6_addr *addr,
914                                   int (*output)(struct sk_buff *))
915 {
916         struct rt6_info *rt;
917         struct inet6_dev *idev = in6_dev_get(dev);
918
919         if (unlikely(idev == NULL))
920                 return NULL;
921
922         rt = ip6_dst_alloc();
923         if (unlikely(rt == NULL)) {
924                 in6_dev_put(idev);
925                 goto out;
926         }
927
928         dev_hold(dev);
929         if (neigh)
930                 neigh_hold(neigh);
931         else
932                 neigh = ndisc_get_neigh(dev, addr);
933
934         rt->rt6i_dev      = dev;
935         rt->rt6i_idev     = idev;
936         rt->rt6i_nexthop  = neigh;
937         atomic_set(&rt->u.dst.__refcnt, 1);
938         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
939         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
940         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
941         rt->u.dst.output  = output;
942
943 #if 0   /* there's no chance to use these for ndisc */
944         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
945                                 ? DST_HOST
946                                 : 0;
947         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
948         rt->rt6i_dst.plen = 128;
949 #endif
950
951         spin_lock_bh(&ndisc_lock);
952         rt->u.dst.next = ndisc_dst_gc_list;
953         ndisc_dst_gc_list = &rt->u.dst;
954         spin_unlock_bh(&ndisc_lock);
955
956         fib6_force_start_gc();
957
958 out:
959         return &rt->u.dst;
960 }
961
962 int ndisc_dst_gc(int *more)
963 {
964         struct dst_entry *dst, *next, **pprev;
965         int freed;
966
967         next = NULL;
968         freed = 0;
969
970         spin_lock_bh(&ndisc_lock);
971         pprev = &ndisc_dst_gc_list;
972
973         while ((dst = *pprev) != NULL) {
974                 if (!atomic_read(&dst->__refcnt)) {
975                         *pprev = dst->next;
976                         dst_free(dst);
977                         freed++;
978                 } else {
979                         pprev = &dst->next;
980                         (*more)++;
981                 }
982         }
983
984         spin_unlock_bh(&ndisc_lock);
985
986         return freed;
987 }
988
989 static int ip6_dst_gc(void)
990 {
991         static unsigned expire = 30*HZ;
992         static unsigned long last_gc;
993         unsigned long now = jiffies;
994
995         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
996             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
997                 goto out;
998
999         expire++;
1000         fib6_run_gc(expire);
1001         last_gc = now;
1002         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1003                 expire = ip6_rt_gc_timeout>>1;
1004
1005 out:
1006         expire -= expire>>ip6_rt_gc_elasticity;
1007         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1008 }
1009
1010 /* Clean host part of a prefix. Not necessary in radix tree,
1011    but results in cleaner routing tables.
1012
1013    Remove it only when all the things will work!
1014  */
1015
1016 static int ipv6_get_mtu(struct net_device *dev)
1017 {
1018         int mtu = IPV6_MIN_MTU;
1019         struct inet6_dev *idev;
1020
1021         idev = in6_dev_get(dev);
1022         if (idev) {
1023                 mtu = idev->cnf.mtu6;
1024                 in6_dev_put(idev);
1025         }
1026         return mtu;
1027 }
1028
1029 int ipv6_get_hoplimit(struct net_device *dev)
1030 {
1031         int hoplimit = ipv6_devconf.hop_limit;
1032         struct inet6_dev *idev;
1033
1034         idev = in6_dev_get(dev);
1035         if (idev) {
1036                 hoplimit = idev->cnf.hop_limit;
1037                 in6_dev_put(idev);
1038         }
1039         return hoplimit;
1040 }
1041
1042 /*
1043  *
1044  */
1045
1046 int ip6_route_add(struct fib6_config *cfg)
1047 {
1048         int err;
1049         struct rt6_info *rt = NULL;
1050         struct net_device *dev = NULL;
1051         struct inet6_dev *idev = NULL;
1052         struct fib6_table *table;
1053         int addr_type;
1054
1055         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1056                 return -EINVAL;
1057 #ifndef CONFIG_IPV6_SUBTREES
1058         if (cfg->fc_src_len)
1059                 return -EINVAL;
1060 #endif
1061         if (cfg->fc_ifindex) {
1062                 err = -ENODEV;
1063                 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1064                 if (!dev)
1065                         goto out;
1066                 idev = in6_dev_get(dev);
1067                 if (!idev)
1068                         goto out;
1069         }
1070
1071         if (cfg->fc_metric == 0)
1072                 cfg->fc_metric = IP6_RT_PRIO_USER;
1073
1074         table = fib6_new_table(cfg->fc_table);
1075         if (table == NULL) {
1076                 err = -ENOBUFS;
1077                 goto out;
1078         }
1079
1080         rt = ip6_dst_alloc();
1081
1082         if (rt == NULL) {
1083                 err = -ENOMEM;
1084                 goto out;
1085         }
1086
1087         rt->u.dst.obsolete = -1;
1088         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1089
1090         if (cfg->fc_protocol == RTPROT_UNSPEC)
1091                 cfg->fc_protocol = RTPROT_BOOT;
1092         rt->rt6i_protocol = cfg->fc_protocol;
1093
1094         addr_type = ipv6_addr_type(&cfg->fc_dst);
1095
1096         if (addr_type & IPV6_ADDR_MULTICAST)
1097                 rt->u.dst.input = ip6_mc_input;
1098         else
1099                 rt->u.dst.input = ip6_forward;
1100
1101         rt->u.dst.output = ip6_output;
1102
1103         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1104         rt->rt6i_dst.plen = cfg->fc_dst_len;
1105         if (rt->rt6i_dst.plen == 128)
1106                rt->u.dst.flags = DST_HOST;
1107
1108 #ifdef CONFIG_IPV6_SUBTREES
1109         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1110         rt->rt6i_src.plen = cfg->fc_src_len;
1111 #endif
1112
1113         rt->rt6i_metric = cfg->fc_metric;
1114
1115         /* We cannot add true routes via loopback here,
1116            they would result in kernel looping; promote them to reject routes
1117          */
1118         if ((cfg->fc_flags & RTF_REJECT) ||
1119             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1120                 /* hold loopback dev/idev if we haven't done so. */
1121                 if (dev != init_net.loopback_dev) {
1122                         if (dev) {
1123                                 dev_put(dev);
1124                                 in6_dev_put(idev);
1125                         }
1126                         dev = init_net.loopback_dev;
1127                         dev_hold(dev);
1128                         idev = in6_dev_get(dev);
1129                         if (!idev) {
1130                                 err = -ENODEV;
1131                                 goto out;
1132                         }
1133                 }
1134                 rt->u.dst.output = ip6_pkt_discard_out;
1135                 rt->u.dst.input = ip6_pkt_discard;
1136                 rt->u.dst.error = -ENETUNREACH;
1137                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1138                 goto install_route;
1139         }
1140
1141         if (cfg->fc_flags & RTF_GATEWAY) {
1142                 struct in6_addr *gw_addr;
1143                 int gwa_type;
1144
1145                 gw_addr = &cfg->fc_gateway;
1146                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1147                 gwa_type = ipv6_addr_type(gw_addr);
1148
1149                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1150                         struct rt6_info *grt;
1151
1152                         /* IPv6 strictly inhibits using not link-local
1153                            addresses as nexthop address.
1154                            Otherwise, router will not able to send redirects.
1155                            It is very good, but in some (rare!) circumstances
1156                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1157                            some exceptions. --ANK
1158                          */
1159                         err = -EINVAL;
1160                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1161                                 goto out;
1162
1163                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1164
1165                         err = -EHOSTUNREACH;
1166                         if (grt == NULL)
1167                                 goto out;
1168                         if (dev) {
1169                                 if (dev != grt->rt6i_dev) {
1170                                         dst_release(&grt->u.dst);
1171                                         goto out;
1172                                 }
1173                         } else {
1174                                 dev = grt->rt6i_dev;
1175                                 idev = grt->rt6i_idev;
1176                                 dev_hold(dev);
1177                                 in6_dev_hold(grt->rt6i_idev);
1178                         }
1179                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1180                                 err = 0;
1181                         dst_release(&grt->u.dst);
1182
1183                         if (err)
1184                                 goto out;
1185                 }
1186                 err = -EINVAL;
1187                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1188                         goto out;
1189         }
1190
1191         err = -ENODEV;
1192         if (dev == NULL)
1193                 goto out;
1194
1195         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1196                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1197                 if (IS_ERR(rt->rt6i_nexthop)) {
1198                         err = PTR_ERR(rt->rt6i_nexthop);
1199                         rt->rt6i_nexthop = NULL;
1200                         goto out;
1201                 }
1202         }
1203
1204         rt->rt6i_flags = cfg->fc_flags;
1205
1206 install_route:
1207         if (cfg->fc_mx) {
1208                 struct nlattr *nla;
1209                 int remaining;
1210
1211                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1212                         int type = nla_type(nla);
1213
1214                         if (type) {
1215                                 if (type > RTAX_MAX) {
1216                                         err = -EINVAL;
1217                                         goto out;
1218                                 }
1219
1220                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1221                         }
1222                 }
1223         }
1224
1225         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1226                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1227         if (!rt->u.dst.metrics[RTAX_MTU-1])
1228                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1229         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1230                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1231         rt->u.dst.dev = dev;
1232         rt->rt6i_idev = idev;
1233         rt->rt6i_table = table;
1234         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1235
1236 out:
1237         if (dev)
1238                 dev_put(dev);
1239         if (idev)
1240                 in6_dev_put(idev);
1241         if (rt)
1242                 dst_free(&rt->u.dst);
1243         return err;
1244 }
1245
1246 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1247 {
1248         int err;
1249         struct fib6_table *table;
1250
1251         if (rt == &ip6_null_entry)
1252                 return -ENOENT;
1253
1254         table = rt->rt6i_table;
1255         write_lock_bh(&table->tb6_lock);
1256
1257         err = fib6_del(rt, info);
1258         dst_release(&rt->u.dst);
1259
1260         write_unlock_bh(&table->tb6_lock);
1261
1262         return err;
1263 }
1264
1265 int ip6_del_rt(struct rt6_info *rt)
1266 {
1267         return __ip6_del_rt(rt, NULL);
1268 }
1269
1270 static int ip6_route_del(struct fib6_config *cfg)
1271 {
1272         struct fib6_table *table;
1273         struct fib6_node *fn;
1274         struct rt6_info *rt;
1275         int err = -ESRCH;
1276
1277         table = fib6_get_table(cfg->fc_table);
1278         if (table == NULL)
1279                 return err;
1280
1281         read_lock_bh(&table->tb6_lock);
1282
1283         fn = fib6_locate(&table->tb6_root,
1284                          &cfg->fc_dst, cfg->fc_dst_len,
1285                          &cfg->fc_src, cfg->fc_src_len);
1286
1287         if (fn) {
1288                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1289                         if (cfg->fc_ifindex &&
1290                             (rt->rt6i_dev == NULL ||
1291                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1292                                 continue;
1293                         if (cfg->fc_flags & RTF_GATEWAY &&
1294                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1295                                 continue;
1296                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1297                                 continue;
1298                         dst_hold(&rt->u.dst);
1299                         read_unlock_bh(&table->tb6_lock);
1300
1301                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1302                 }
1303         }
1304         read_unlock_bh(&table->tb6_lock);
1305
1306         return err;
1307 }
1308
1309 /*
1310  *      Handle redirects
1311  */
1312 struct ip6rd_flowi {
1313         struct flowi fl;
1314         struct in6_addr gateway;
1315 };
1316
1317 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1318                                              struct flowi *fl,
1319                                              int flags)
1320 {
1321         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1322         struct rt6_info *rt;
1323         struct fib6_node *fn;
1324
1325         /*
1326          * Get the "current" route for this destination and
1327          * check if the redirect has come from approriate router.
1328          *
1329          * RFC 2461 specifies that redirects should only be
1330          * accepted if they come from the nexthop to the target.
1331          * Due to the way the routes are chosen, this notion
1332          * is a bit fuzzy and one might need to check all possible
1333          * routes.
1334          */
1335
1336         read_lock_bh(&table->tb6_lock);
1337         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1338 restart:
1339         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1340                 /*
1341                  * Current route is on-link; redirect is always invalid.
1342                  *
1343                  * Seems, previous statement is not true. It could
1344                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1345                  * But then router serving it might decide, that we should
1346                  * know truth 8)8) --ANK (980726).
1347                  */
1348                 if (rt6_check_expired(rt))
1349                         continue;
1350                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1351                         continue;
1352                 if (fl->oif != rt->rt6i_dev->ifindex)
1353                         continue;
1354                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1355                         continue;
1356                 break;
1357         }
1358
1359         if (!rt)
1360                 rt = &ip6_null_entry;
1361         BACKTRACK(&fl->fl6_src);
1362 out:
1363         dst_hold(&rt->u.dst);
1364
1365         read_unlock_bh(&table->tb6_lock);
1366
1367         return rt;
1368 };
1369
1370 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1371                                            struct in6_addr *src,
1372                                            struct in6_addr *gateway,
1373                                            struct net_device *dev)
1374 {
1375         int flags = RT6_LOOKUP_F_HAS_SADDR;
1376         struct ip6rd_flowi rdfl = {
1377                 .fl = {
1378                         .oif = dev->ifindex,
1379                         .nl_u = {
1380                                 .ip6_u = {
1381                                         .daddr = *dest,
1382                                         .saddr = *src,
1383                                 },
1384                         },
1385                 },
1386                 .gateway = *gateway,
1387         };
1388
1389         if (rt6_need_strict(dest))
1390                 flags |= RT6_LOOKUP_F_IFACE;
1391
1392         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1393 }
1394
1395 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1396                   struct in6_addr *saddr,
1397                   struct neighbour *neigh, u8 *lladdr, int on_link)
1398 {
1399         struct rt6_info *rt, *nrt = NULL;
1400         struct netevent_redirect netevent;
1401
1402         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1403
1404         if (rt == &ip6_null_entry) {
1405                 if (net_ratelimit())
1406                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1407                                "for redirect target\n");
1408                 goto out;
1409         }
1410
1411         /*
1412          *      We have finally decided to accept it.
1413          */
1414
1415         neigh_update(neigh, lladdr, NUD_STALE,
1416                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1417                      NEIGH_UPDATE_F_OVERRIDE|
1418                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1419                                      NEIGH_UPDATE_F_ISROUTER))
1420                      );
1421
1422         /*
1423          * Redirect received -> path was valid.
1424          * Look, redirects are sent only in response to data packets,
1425          * so that this nexthop apparently is reachable. --ANK
1426          */
1427         dst_confirm(&rt->u.dst);
1428
1429         /* Duplicate redirect: silently ignore. */
1430         if (neigh == rt->u.dst.neighbour)
1431                 goto out;
1432
1433         nrt = ip6_rt_copy(rt);
1434         if (nrt == NULL)
1435                 goto out;
1436
1437         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1438         if (on_link)
1439                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1440
1441         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1442         nrt->rt6i_dst.plen = 128;
1443         nrt->u.dst.flags |= DST_HOST;
1444
1445         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1446         nrt->rt6i_nexthop = neigh_clone(neigh);
1447         /* Reset pmtu, it may be better */
1448         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1449         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1450
1451         if (ip6_ins_rt(nrt))
1452                 goto out;
1453
1454         netevent.old = &rt->u.dst;
1455         netevent.new = &nrt->u.dst;
1456         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1457
1458         if (rt->rt6i_flags&RTF_CACHE) {
1459                 ip6_del_rt(rt);
1460                 return;
1461         }
1462
1463 out:
1464         dst_release(&rt->u.dst);
1465         return;
1466 }
1467
1468 /*
1469  *      Handle ICMP "packet too big" messages
1470  *      i.e. Path MTU discovery
1471  */
1472
1473 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1474                         struct net_device *dev, u32 pmtu)
1475 {
1476         struct rt6_info *rt, *nrt;
1477         int allfrag = 0;
1478
1479         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1480         if (rt == NULL)
1481                 return;
1482
1483         if (pmtu >= dst_mtu(&rt->u.dst))
1484                 goto out;
1485
1486         if (pmtu < IPV6_MIN_MTU) {
1487                 /*
1488                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1489                  * MTU (1280) and a fragment header should always be included
1490                  * after a node receiving Too Big message reporting PMTU is
1491                  * less than the IPv6 Minimum Link MTU.
1492                  */
1493                 pmtu = IPV6_MIN_MTU;
1494                 allfrag = 1;
1495         }
1496
1497         /* New mtu received -> path was valid.
1498            They are sent only in response to data packets,
1499            so that this nexthop apparently is reachable. --ANK
1500          */
1501         dst_confirm(&rt->u.dst);
1502
1503         /* Host route. If it is static, it would be better
1504            not to override it, but add new one, so that
1505            when cache entry will expire old pmtu
1506            would return automatically.
1507          */
1508         if (rt->rt6i_flags & RTF_CACHE) {
1509                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1510                 if (allfrag)
1511                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1512                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1513                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1514                 goto out;
1515         }
1516
1517         /* Network route.
1518            Two cases are possible:
1519            1. It is connected route. Action: COW
1520            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1521          */
1522         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1523                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1524         else
1525                 nrt = rt6_alloc_clone(rt, daddr);
1526
1527         if (nrt) {
1528                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1529                 if (allfrag)
1530                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1531
1532                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1533                  * happened within 5 mins, the recommended timer is 10 mins.
1534                  * Here this route expiration time is set to ip6_rt_mtu_expires
1535                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1536                  * and detecting PMTU increase will be automatically happened.
1537                  */
1538                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1539                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1540
1541                 ip6_ins_rt(nrt);
1542         }
1543 out:
1544         dst_release(&rt->u.dst);
1545 }
1546
1547 /*
1548  *      Misc support functions
1549  */
1550
1551 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1552 {
1553         struct rt6_info *rt = ip6_dst_alloc();
1554
1555         if (rt) {
1556                 rt->u.dst.input = ort->u.dst.input;
1557                 rt->u.dst.output = ort->u.dst.output;
1558
1559                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1560                 rt->u.dst.error = ort->u.dst.error;
1561                 rt->u.dst.dev = ort->u.dst.dev;
1562                 if (rt->u.dst.dev)
1563                         dev_hold(rt->u.dst.dev);
1564                 rt->rt6i_idev = ort->rt6i_idev;
1565                 if (rt->rt6i_idev)
1566                         in6_dev_hold(rt->rt6i_idev);
1567                 rt->u.dst.lastuse = jiffies;
1568                 rt->rt6i_expires = 0;
1569
1570                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1571                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1572                 rt->rt6i_metric = 0;
1573
1574                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1575 #ifdef CONFIG_IPV6_SUBTREES
1576                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1577 #endif
1578                 rt->rt6i_table = ort->rt6i_table;
1579         }
1580         return rt;
1581 }
1582
1583 #ifdef CONFIG_IPV6_ROUTE_INFO
1584 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1585                                            struct in6_addr *gwaddr, int ifindex)
1586 {
1587         struct fib6_node *fn;
1588         struct rt6_info *rt = NULL;
1589         struct fib6_table *table;
1590
1591         table = fib6_get_table(RT6_TABLE_INFO);
1592         if (table == NULL)
1593                 return NULL;
1594
1595         write_lock_bh(&table->tb6_lock);
1596         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1597         if (!fn)
1598                 goto out;
1599
1600         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1601                 if (rt->rt6i_dev->ifindex != ifindex)
1602                         continue;
1603                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1604                         continue;
1605                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1606                         continue;
1607                 dst_hold(&rt->u.dst);
1608                 break;
1609         }
1610 out:
1611         write_unlock_bh(&table->tb6_lock);
1612         return rt;
1613 }
1614
1615 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1616                                            struct in6_addr *gwaddr, int ifindex,
1617                                            unsigned pref)
1618 {
1619         struct fib6_config cfg = {
1620                 .fc_table       = RT6_TABLE_INFO,
1621                 .fc_metric      = 1024,
1622                 .fc_ifindex     = ifindex,
1623                 .fc_dst_len     = prefixlen,
1624                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1625                                   RTF_UP | RTF_PREF(pref),
1626         };
1627
1628         ipv6_addr_copy(&cfg.fc_dst, prefix);
1629         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1630
1631         /* We should treat it as a default route if prefix length is 0. */
1632         if (!prefixlen)
1633                 cfg.fc_flags |= RTF_DEFAULT;
1634
1635         ip6_route_add(&cfg);
1636
1637         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1638 }
1639 #endif
1640
1641 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1642 {
1643         struct rt6_info *rt;
1644         struct fib6_table *table;
1645
1646         table = fib6_get_table(RT6_TABLE_DFLT);
1647         if (table == NULL)
1648                 return NULL;
1649
1650         write_lock_bh(&table->tb6_lock);
1651         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1652                 if (dev == rt->rt6i_dev &&
1653                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1654                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1655                         break;
1656         }
1657         if (rt)
1658                 dst_hold(&rt->u.dst);
1659         write_unlock_bh(&table->tb6_lock);
1660         return rt;
1661 }
1662
1663 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1664                                      struct net_device *dev,
1665                                      unsigned int pref)
1666 {
1667         struct fib6_config cfg = {
1668                 .fc_table       = RT6_TABLE_DFLT,
1669                 .fc_metric      = 1024,
1670                 .fc_ifindex     = dev->ifindex,
1671                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1672                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1673         };
1674
1675         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1676
1677         ip6_route_add(&cfg);
1678
1679         return rt6_get_dflt_router(gwaddr, dev);
1680 }
1681
1682 void rt6_purge_dflt_routers(void)
1683 {
1684         struct rt6_info *rt;
1685         struct fib6_table *table;
1686
1687         /* NOTE: Keep consistent with rt6_get_dflt_router */
1688         table = fib6_get_table(RT6_TABLE_DFLT);
1689         if (table == NULL)
1690                 return;
1691
1692 restart:
1693         read_lock_bh(&table->tb6_lock);
1694         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1695                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1696                         dst_hold(&rt->u.dst);
1697                         read_unlock_bh(&table->tb6_lock);
1698                         ip6_del_rt(rt);
1699                         goto restart;
1700                 }
1701         }
1702         read_unlock_bh(&table->tb6_lock);
1703 }
1704
1705 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1706                                  struct fib6_config *cfg)
1707 {
1708         memset(cfg, 0, sizeof(*cfg));
1709
1710         cfg->fc_table = RT6_TABLE_MAIN;
1711         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1712         cfg->fc_metric = rtmsg->rtmsg_metric;
1713         cfg->fc_expires = rtmsg->rtmsg_info;
1714         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1715         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1716         cfg->fc_flags = rtmsg->rtmsg_flags;
1717
1718         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1719         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1720         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1721 }
1722
1723 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1724 {
1725         struct fib6_config cfg;
1726         struct in6_rtmsg rtmsg;
1727         int err;
1728
1729         switch(cmd) {
1730         case SIOCADDRT:         /* Add a route */
1731         case SIOCDELRT:         /* Delete a route */
1732                 if (!capable(CAP_NET_ADMIN))
1733                         return -EPERM;
1734                 err = copy_from_user(&rtmsg, arg,
1735                                      sizeof(struct in6_rtmsg));
1736                 if (err)
1737                         return -EFAULT;
1738
1739                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1740
1741                 rtnl_lock();
1742                 switch (cmd) {
1743                 case SIOCADDRT:
1744                         err = ip6_route_add(&cfg);
1745                         break;
1746                 case SIOCDELRT:
1747                         err = ip6_route_del(&cfg);
1748                         break;
1749                 default:
1750                         err = -EINVAL;
1751                 }
1752                 rtnl_unlock();
1753
1754                 return err;
1755         }
1756
1757         return -EINVAL;
1758 }
1759
1760 /*
1761  *      Drop the packet on the floor
1762  */
1763
1764 static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1765                                int ipstats_mib_noroutes)
1766 {
1767         int type;
1768         switch (ipstats_mib_noroutes) {
1769         case IPSTATS_MIB_INNOROUTES:
1770                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1771                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1772                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1773                         break;
1774                 }
1775                 /* FALLTHROUGH */
1776         case IPSTATS_MIB_OUTNOROUTES:
1777                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1778                 break;
1779         }
1780         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1781         kfree_skb(skb);
1782         return 0;
1783 }
1784
1785 static int ip6_pkt_discard(struct sk_buff *skb)
1786 {
1787         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1788 }
1789
1790 static int ip6_pkt_discard_out(struct sk_buff *skb)
1791 {
1792         skb->dev = skb->dst->dev;
1793         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1794 }
1795
1796 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1797
1798 static int ip6_pkt_prohibit(struct sk_buff *skb)
1799 {
1800         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1801 }
1802
1803 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1804 {
1805         skb->dev = skb->dst->dev;
1806         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1807 }
1808
/*
 * Silently discard skb: it is freed without sending any ICMPv6 error
 * and without touching any statistics counter.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
1814
1815 #endif
1816
1817 /*
1818  *      Allocate a dst for local (unicast / anycast) address.
1819  */
1820
1821 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1822                                     const struct in6_addr *addr,
1823                                     int anycast)
1824 {
1825         struct rt6_info *rt = ip6_dst_alloc();
1826
1827         if (rt == NULL)
1828                 return ERR_PTR(-ENOMEM);
1829
1830         dev_hold(init_net.loopback_dev);
1831         in6_dev_hold(idev);
1832
1833         rt->u.dst.flags = DST_HOST;
1834         rt->u.dst.input = ip6_input;
1835         rt->u.dst.output = ip6_output;
1836         rt->rt6i_dev = init_net.loopback_dev;
1837         rt->rt6i_idev = idev;
1838         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1839         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1840         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1841         rt->u.dst.obsolete = -1;
1842
1843         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1844         if (anycast)
1845                 rt->rt6i_flags |= RTF_ANYCAST;
1846         else
1847                 rt->rt6i_flags |= RTF_LOCAL;
1848         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1849         if (rt->rt6i_nexthop == NULL) {
1850                 dst_free(&rt->u.dst);
1851                 return ERR_PTR(-ENOMEM);
1852         }
1853
1854         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1855         rt->rt6i_dst.plen = 128;
1856         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1857
1858         atomic_set(&rt->u.dst.__refcnt, 1);
1859
1860         return rt;
1861 }
1862
1863 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1864 {
1865         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1866             rt != &ip6_null_entry) {
1867                 RT6_TRACE("deleted by ifdown %p\n", rt);
1868                 return -1;
1869         }
1870         return 0;
1871 }
1872
1873 void rt6_ifdown(struct net_device *dev)
1874 {
1875         fib6_clean_all(fib6_ifdown, 0, dev);
1876 }
1877
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
1883
1884 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1885 {
1886         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1887         struct inet6_dev *idev;
1888
1889         /* In IPv6 pmtu discovery is not optional,
1890            so that RTAX_MTU lock cannot disable it.
1891            We still use this lock to block changes
1892            caused by addrconf/ndisc.
1893         */
1894
1895         idev = __in6_dev_get(arg->dev);
1896         if (idev == NULL)
1897                 return 0;
1898
1899         /* For administrative MTU increase, there is no way to discover
1900            IPv6 PMTU increase, so PMTU increase should be updated here.
1901            Since RFC 1981 doesn't include administrative MTU increase
1902            update PMTU increase is a MUST. (i.e. jumbo frame)
1903          */
1904         /*
1905            If new MTU is less than route PMTU, this new MTU will be the
1906            lowest MTU in the path, update the route PMTU to reflect PMTU
1907            decreases; if new MTU is greater than route PMTU, and the
1908            old MTU is the lowest MTU in the path, update the route PMTU
1909            to reflect the increase. In this case if the other nodes' MTU
1910            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1911            PMTU discouvery.
1912          */
1913         if (rt->rt6i_dev == arg->dev &&
1914             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1915             (dst_mtu(&rt->u.dst) > arg->mtu ||
1916              (dst_mtu(&rt->u.dst) < arg->mtu &&
1917               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1918                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1919                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1920         }
1921         return 0;
1922 }
1923
1924 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1925 {
1926         struct rt6_mtu_change_arg arg = {
1927                 .dev = dev,
1928                 .mtu = mtu,
1929         };
1930
1931         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1932 }
1933
1934 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1935         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1936         [RTA_OIF]               = { .type = NLA_U32 },
1937         [RTA_IIF]               = { .type = NLA_U32 },
1938         [RTA_PRIORITY]          = { .type = NLA_U32 },
1939         [RTA_METRICS]           = { .type = NLA_NESTED },
1940 };
1941
1942 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1943                               struct fib6_config *cfg)
1944 {
1945         struct rtmsg *rtm;
1946         struct nlattr *tb[RTA_MAX+1];
1947         int err;
1948
1949         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1950         if (err < 0)
1951                 goto errout;
1952
1953         err = -EINVAL;
1954         rtm = nlmsg_data(nlh);
1955         memset(cfg, 0, sizeof(*cfg));
1956
1957         cfg->fc_table = rtm->rtm_table;
1958         cfg->fc_dst_len = rtm->rtm_dst_len;
1959         cfg->fc_src_len = rtm->rtm_src_len;
1960         cfg->fc_flags = RTF_UP;
1961         cfg->fc_protocol = rtm->rtm_protocol;
1962
1963         if (rtm->rtm_type == RTN_UNREACHABLE)
1964                 cfg->fc_flags |= RTF_REJECT;
1965
1966         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1967         cfg->fc_nlinfo.nlh = nlh;
1968
1969         if (tb[RTA_GATEWAY]) {
1970                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1971                 cfg->fc_flags |= RTF_GATEWAY;
1972         }
1973
1974         if (tb[RTA_DST]) {
1975                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1976
1977                 if (nla_len(tb[RTA_DST]) < plen)
1978                         goto errout;
1979
1980                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1981         }
1982
1983         if (tb[RTA_SRC]) {
1984                 int plen = (rtm->rtm_src_len + 7) >> 3;
1985
1986                 if (nla_len(tb[RTA_SRC]) < plen)
1987                         goto errout;
1988
1989                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1990         }
1991
1992         if (tb[RTA_OIF])
1993                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1994
1995         if (tb[RTA_PRIORITY])
1996                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1997
1998         if (tb[RTA_METRICS]) {
1999                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2000                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2001         }
2002
2003         if (tb[RTA_TABLE])
2004                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2005
2006         err = 0;
2007 errout:
2008         return err;
2009 }
2010
2011 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2012 {
2013         struct fib6_config cfg;
2014         int err;
2015
2016         err = rtm_to_fib6_config(skb, nlh, &cfg);
2017         if (err < 0)
2018                 return err;
2019
2020         return ip6_route_del(&cfg);
2021 }
2022
2023 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2024 {
2025         struct fib6_config cfg;
2026         int err;
2027
2028         err = rtm_to_fib6_config(skb, nlh, &cfg);
2029         if (err < 0)
2030                 return err;
2031
2032         return ip6_route_add(&cfg);
2033 }
2034
2035 static inline size_t rt6_nlmsg_size(void)
2036 {
2037         return NLMSG_ALIGN(sizeof(struct rtmsg))
2038                + nla_total_size(16) /* RTA_SRC */
2039                + nla_total_size(16) /* RTA_DST */
2040                + nla_total_size(16) /* RTA_GATEWAY */
2041                + nla_total_size(16) /* RTA_PREFSRC */
2042                + nla_total_size(4) /* RTA_TABLE */
2043                + nla_total_size(4) /* RTA_IIF */
2044                + nla_total_size(4) /* RTA_OIF */
2045                + nla_total_size(4) /* RTA_PRIORITY */
2046                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2047                + nla_total_size(sizeof(struct rta_cacheinfo));
2048 }
2049
2050 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2051                          struct in6_addr *dst, struct in6_addr *src,
2052                          int iif, int type, u32 pid, u32 seq,
2053                          int prefix, unsigned int flags)
2054 {
2055         struct rtmsg *rtm;
2056         struct nlmsghdr *nlh;
2057         long expires;
2058         u32 table;
2059
2060         if (prefix) {   /* user wants prefix routes only */
2061                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2062                         /* success since this is not a prefix route */
2063                         return 1;
2064                 }
2065         }
2066
2067         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2068         if (nlh == NULL)
2069                 return -EMSGSIZE;
2070
2071         rtm = nlmsg_data(nlh);
2072         rtm->rtm_family = AF_INET6;
2073         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2074         rtm->rtm_src_len = rt->rt6i_src.plen;
2075         rtm->rtm_tos = 0;
2076         if (rt->rt6i_table)
2077                 table = rt->rt6i_table->tb6_id;
2078         else
2079                 table = RT6_TABLE_UNSPEC;
2080         rtm->rtm_table = table;
2081         NLA_PUT_U32(skb, RTA_TABLE, table);
2082         if (rt->rt6i_flags&RTF_REJECT)
2083                 rtm->rtm_type = RTN_UNREACHABLE;
2084         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2085                 rtm->rtm_type = RTN_LOCAL;
2086         else
2087                 rtm->rtm_type = RTN_UNICAST;
2088         rtm->rtm_flags = 0;
2089         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2090         rtm->rtm_protocol = rt->rt6i_protocol;
2091         if (rt->rt6i_flags&RTF_DYNAMIC)
2092                 rtm->rtm_protocol = RTPROT_REDIRECT;
2093         else if (rt->rt6i_flags & RTF_ADDRCONF)
2094                 rtm->rtm_protocol = RTPROT_KERNEL;
2095         else if (rt->rt6i_flags&RTF_DEFAULT)
2096                 rtm->rtm_protocol = RTPROT_RA;
2097
2098         if (rt->rt6i_flags&RTF_CACHE)
2099                 rtm->rtm_flags |= RTM_F_CLONED;
2100
2101         if (dst) {
2102                 NLA_PUT(skb, RTA_DST, 16, dst);
2103                 rtm->rtm_dst_len = 128;
2104         } else if (rtm->rtm_dst_len)
2105                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2106 #ifdef CONFIG_IPV6_SUBTREES
2107         if (src) {
2108                 NLA_PUT(skb, RTA_SRC, 16, src);
2109                 rtm->rtm_src_len = 128;
2110         } else if (rtm->rtm_src_len)
2111                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2112 #endif
2113         if (iif)
2114                 NLA_PUT_U32(skb, RTA_IIF, iif);
2115         else if (dst) {
2116                 struct in6_addr saddr_buf;
2117                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2118                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2119         }
2120
2121         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2122                 goto nla_put_failure;
2123
2124         if (rt->u.dst.neighbour)
2125                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2126
2127         if (rt->u.dst.dev)
2128                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2129
2130         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2131
2132         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2133         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2134                                expires, rt->u.dst.error) < 0)
2135                 goto nla_put_failure;
2136
2137         return nlmsg_end(skb, nlh);
2138
2139 nla_put_failure:
2140         nlmsg_cancel(skb, nlh);
2141         return -EMSGSIZE;
2142 }
2143
2144 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2145 {
2146         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2147         int prefix;
2148
2149         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2150                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2151                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2152         } else
2153                 prefix = 0;
2154
2155         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2156                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2157                      prefix, NLM_F_MULTI);
2158 }
2159
2160 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2161 {
2162         struct nlattr *tb[RTA_MAX+1];
2163         struct rt6_info *rt;
2164         struct sk_buff *skb;
2165         struct rtmsg *rtm;
2166         struct flowi fl;
2167         int err, iif = 0;
2168
2169         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2170         if (err < 0)
2171                 goto errout;
2172
2173         err = -EINVAL;
2174         memset(&fl, 0, sizeof(fl));
2175
2176         if (tb[RTA_SRC]) {
2177                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2178                         goto errout;
2179
2180                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2181         }
2182
2183         if (tb[RTA_DST]) {
2184                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2185                         goto errout;
2186
2187                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2188         }
2189
2190         if (tb[RTA_IIF])
2191                 iif = nla_get_u32(tb[RTA_IIF]);
2192
2193         if (tb[RTA_OIF])
2194                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2195
2196         if (iif) {
2197                 struct net_device *dev;
2198                 dev = __dev_get_by_index(&init_net, iif);
2199                 if (!dev) {
2200                         err = -ENODEV;
2201                         goto errout;
2202                 }
2203         }
2204
2205         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2206         if (skb == NULL) {
2207                 err = -ENOBUFS;
2208                 goto errout;
2209         }
2210
2211         /* Reserve room for dummy headers, this skb can pass
2212            through good chunk of routing engine.
2213          */
2214         skb_reset_mac_header(skb);
2215         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2216
2217         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2218         skb->dst = &rt->u.dst;
2219
2220         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2221                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2222                             nlh->nlmsg_seq, 0, 0);
2223         if (err < 0) {
2224                 kfree_skb(skb);
2225                 goto errout;
2226         }
2227
2228         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2229 errout:
2230         return err;
2231 }
2232
2233 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2234 {
2235         struct sk_buff *skb;
2236         u32 pid = 0, seq = 0;
2237         struct nlmsghdr *nlh = NULL;
2238         int err = -ENOBUFS;
2239
2240         if (info) {
2241                 pid = info->pid;
2242                 nlh = info->nlh;
2243                 if (nlh)
2244                         seq = nlh->nlmsg_seq;
2245         }
2246
2247         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2248         if (skb == NULL)
2249                 goto errout;
2250
2251         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2252         if (err < 0) {
2253                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2254                 WARN_ON(err == -EMSGSIZE);
2255                 kfree_skb(skb);
2256                 goto errout;
2257         }
2258         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2259 errout:
2260         if (err < 0)
2261                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2262 }
2263
2264 /*
2265  *      /proc
2266  */
2267
2268 #ifdef CONFIG_PROC_FS
2269
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is used by
 * the seq_file based /proc code that follows - confirm there are no
 * other users before removing them.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2280
2281 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2282 {
2283         struct seq_file *m = p_arg;
2284
2285         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2286                    rt->rt6i_dst.plen);
2287
2288 #ifdef CONFIG_IPV6_SUBTREES
2289         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2290                    rt->rt6i_src.plen);
2291 #else
2292         seq_puts(m, "00000000000000000000000000000000 00 ");
2293 #endif
2294
2295         if (rt->rt6i_nexthop) {
2296                 seq_printf(m, NIP6_SEQFMT,
2297                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2298         } else {
2299                 seq_puts(m, "00000000000000000000000000000000");
2300         }
2301         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2302                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2303                    rt->u.dst.__use, rt->rt6i_flags,
2304                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2305         return 0;
2306 }
2307
2308 static int ipv6_route_show(struct seq_file *m, void *v)
2309 {
2310         fib6_clean_all(rt6_info_route, 0, m);
2311         return 0;
2312 }
2313
2314 static int ipv6_route_open(struct inode *inode, struct file *file)
2315 {
2316         return single_open(file, ipv6_route_show, NULL);
2317 }
2318
2319 static const struct file_operations ipv6_route_proc_fops = {
2320         .owner          = THIS_MODULE,
2321         .open           = ipv6_route_open,
2322         .read           = seq_read,
2323         .llseek         = seq_lseek,
2324         .release        = single_release,
2325 };
2326
2327 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2328 {
2329         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2330                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2331                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2332                       rt6_stats.fib_rt_cache,
2333                       atomic_read(&ip6_dst_ops.entries),
2334                       rt6_stats.fib_discarded_routes);
2335
2336         return 0;
2337 }
2338
2339 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2340 {
2341         return single_open(file, rt6_stats_seq_show, NULL);
2342 }
2343
2344 static const struct file_operations rt6_stats_seq_fops = {
2345         .owner   = THIS_MODULE,
2346         .open    = rt6_stats_seq_open,
2347         .read    = seq_read,
2348         .llseek  = seq_lseek,
2349         .release = single_release,
2350 };
2351 #endif  /* CONFIG_PROC_FS */
2352
2353 #ifdef CONFIG_SYSCTL
2354
2355 static int flush_delay;
2356
2357 static
2358 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2359                               void __user *buffer, size_t *lenp, loff_t *ppos)
2360 {
2361         if (write) {
2362                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2363                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2364                 return 0;
2365         } else
2366                 return -EINVAL;
2367 }
2368
2369 ctl_table ipv6_route_table[] = {
2370         {
2371                 .procname       =       "flush",
2372                 .data           =       &flush_delay,
2373                 .maxlen         =       sizeof(int),
2374                 .mode           =       0200,
2375                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2376         },
2377         {
2378                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2379                 .procname       =       "gc_thresh",
2380                 .data           =       &ip6_dst_ops.gc_thresh,
2381                 .maxlen         =       sizeof(int),
2382                 .mode           =       0644,
2383                 .proc_handler   =       &proc_dointvec,
2384         },
2385         {
2386                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2387                 .procname       =       "max_size",
2388                 .data           =       &ip6_rt_max_size,
2389                 .maxlen         =       sizeof(int),
2390                 .mode           =       0644,
2391                 .proc_handler   =       &proc_dointvec,
2392         },
2393         {
2394                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2395                 .procname       =       "gc_min_interval",
2396                 .data           =       &ip6_rt_gc_min_interval,
2397                 .maxlen         =       sizeof(int),
2398                 .mode           =       0644,
2399                 .proc_handler   =       &proc_dointvec_jiffies,
2400                 .strategy       =       &sysctl_jiffies,
2401         },
2402         {
2403                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2404                 .procname       =       "gc_timeout",
2405                 .data           =       &ip6_rt_gc_timeout,
2406                 .maxlen         =       sizeof(int),
2407                 .mode           =       0644,
2408                 .proc_handler   =       &proc_dointvec_jiffies,
2409                 .strategy       =       &sysctl_jiffies,
2410         },
2411         {
2412                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2413                 .procname       =       "gc_interval",
2414                 .data           =       &ip6_rt_gc_interval,
2415                 .maxlen         =       sizeof(int),
2416                 .mode           =       0644,
2417                 .proc_handler   =       &proc_dointvec_jiffies,
2418                 .strategy       =       &sysctl_jiffies,
2419         },
2420         {
2421                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2422                 .procname       =       "gc_elasticity",
2423                 .data           =       &ip6_rt_gc_elasticity,
2424                 .maxlen         =       sizeof(int),
2425                 .mode           =       0644,
2426                 .proc_handler   =       &proc_dointvec_jiffies,
2427                 .strategy       =       &sysctl_jiffies,
2428         },
2429         {
2430                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2431                 .procname       =       "mtu_expires",
2432                 .data           =       &ip6_rt_mtu_expires,
2433                 .maxlen         =       sizeof(int),
2434                 .mode           =       0644,
2435                 .proc_handler   =       &proc_dointvec_jiffies,
2436                 .strategy       =       &sysctl_jiffies,
2437         },
2438         {
2439                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2440                 .procname       =       "min_adv_mss",
2441                 .data           =       &ip6_rt_min_advmss,
2442                 .maxlen         =       sizeof(int),
2443                 .mode           =       0644,
2444                 .proc_handler   =       &proc_dointvec_jiffies,
2445                 .strategy       =       &sysctl_jiffies,
2446         },
2447         {
2448                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2449                 .procname       =       "gc_min_interval_ms",
2450                 .data           =       &ip6_rt_gc_min_interval,
2451                 .maxlen         =       sizeof(int),
2452                 .mode           =       0644,
2453                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2454                 .strategy       =       &sysctl_ms_jiffies,
2455         },
2456         { .ctl_name = 0 }
2457 };
2458
2459 #endif
2460
2461 void __init ip6_route_init(void)
2462 {
2463         ip6_dst_ops.kmem_cachep =
2464                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2465                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2466         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2467
2468         fib6_init();
2469         proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops);
2470         proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2471 #ifdef CONFIG_XFRM
2472         xfrm6_init();
2473 #endif
2474 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2475         fib6_rules_init();
2476 #endif
2477
2478         __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2479         __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2480         __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2481 }
2482
2483 void ip6_route_cleanup(void)
2484 {
2485 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2486         fib6_rules_cleanup();
2487 #endif
2488 #ifdef CONFIG_PROC_FS
2489         proc_net_remove(&init_net, "ipv6_route");
2490         proc_net_remove(&init_net, "rt6_stats");
2491 #endif
2492 #ifdef CONFIG_XFRM
2493         xfrm6_fini();
2494 #endif
2495         rt6_ifdown(NULL);
2496         fib6_gc_cleanup();
2497         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2498 }