Merge commit 'v2.6.26' into core/locking
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/mroute6.h>
40 #include <linux/init.h>
41 #include <linux/if_arp.h>
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #include <linux/nsproxy.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  When RT6_DEBUG is 3 or more, RDBG()
 * expands to a printk call and RT6_TRACE() to a KERN_DEBUG printk;
 * otherwise both expand to nothing.
 */
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67
68 #if RT6_DEBUG >= 3
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
72 #define RDBG(x)
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75
/*
 * Compile-time switch tested in ip6_pol_route(): when non-zero, a selected
 * route that is not handed to rt6_alloc_cow() is duplicated with
 * rt6_alloc_clone(); when zero, that route is returned as it is.
 */
76 #define CLONE_OFFLINK_ROUTE 0
77
/*
 * Forward declarations of file-local functions that are referenced (in the
 * dst_ops tables, the route templates and the lookup code below) before
 * their definitions appear.
 */
78 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
79 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sk_buff *skb);
88 static void             ip6_link_failure(struct sk_buff *skb);
89 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90
/*
 * Helpers used by rt6_route_rcv() to look up and to create the route that
 * corresponds to a received route information option.
 */
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93                                            struct in6_addr *prefix, int prefixlen,
94                                            struct in6_addr *gwaddr, int ifindex,
95                                            unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97                                            struct in6_addr *prefix, int prefixlen,
98                                            struct in6_addr *gwaddr, int ifindex);
99 #endif
100
/*
 * Template dst_ops for IPv6 routes.  It names the address family and
 * protocol, the garbage collector (ip6_dst_gc, with gc_thresh 1024) and
 * the dst callbacks implemented in this file.  entry_size is
 * sizeof(struct rt6_info) and the entry counter starts at zero.
 * NOTE(review): presumably copied into each namespace's ip6_dst_ops --
 * confirm against the per-net init code.
 */
101 static struct dst_ops ip6_dst_ops_template = {
102         .family                 =       AF_INET6,
103         .protocol               =       __constant_htons(ETH_P_IPV6),
104         .gc                     =       ip6_dst_gc,
105         .gc_thresh              =       1024,
106         .check                  =       ip6_dst_check,
107         .destroy                =       ip6_dst_destroy,
108         .ifdown                 =       ip6_dst_ifdown,
109         .negative_advice        =       ip6_negative_advice,
110         .link_failure           =       ip6_link_failure,
111         .update_pmtu            =       ip6_rt_update_pmtu,
112         .local_out              =       __ip6_local_out,
113         .entry_size             =       sizeof(struct rt6_info),
114         .entries                =       ATOMIC_INIT(0),
115 };
116
/*
 * update_pmtu callback installed in ip6_dst_blackhole_ops.  Deliberately
 * empty: both arguments are ignored and nothing is modified.
 */
117 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
118 {
119 }
120
/*
 * dst_ops used for the entries allocated by ip6_dst_blackhole().  It
 * shares the destroy and check callbacks with the regular IPv6 ops and
 * uses the no-op PMTU handler; no gc, ifdown, negative_advice or
 * link_failure callbacks are set.
 */
121 static struct dst_ops ip6_dst_blackhole_ops = {
122         .family                 =       AF_INET6,
123         .protocol               =       __constant_htons(ETH_P_IPV6),
124         .destroy                =       ip6_dst_destroy,
125         .check                  =       ip6_dst_check,
126         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
127         .entry_size             =       sizeof(struct rt6_info),
128         .entries                =       ATOMIC_INIT(0),
129 };
130
/*
 * Template for the "no route" entry: a reject route without next hop that
 * carries error -ENETUNREACH, hop limit 255, the largest possible metric,
 * and hands packets to ip6_pkt_discard / ip6_pkt_discard_out.
 * The lookup code in this file returns net->ipv6.ip6_null_entry when
 * nothing matches.
 * NOTE(review): presumably that per-namespace entry is created from this
 * template -- confirm against the per-net init code.
 */
131 static struct rt6_info ip6_null_entry_template = {
132         .u = {
133                 .dst = {
134                         .__refcnt       = ATOMIC_INIT(1),
135                         .__use          = 1,
136                         .obsolete       = -1,
137                         .error          = -ENETUNREACH,
138                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
139                         .input          = ip6_pkt_discard,
140                         .output         = ip6_pkt_discard_out,
141                 }
142         },
143         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
144         .rt6i_metric    = ~(u32) 0,
145         .rt6i_ref       = ATOMIC_INIT(1),
146 };
147
/*
 * Additional reject-route templates, only built with
 * CONFIG_IPV6_MULTIPLE_TABLES.  Both have the same shape as the null entry
 * template and differ only in error code and packet handlers.
 */
148 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
149
150 static int ip6_pkt_prohibit(struct sk_buff *skb);
151 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
152
/* "Prohibit" entry: error -EACCES, packets go to the ip6_pkt_prohibit handlers. */
153 static struct rt6_info ip6_prohibit_entry_template = {
154         .u = {
155                 .dst = {
156                         .__refcnt       = ATOMIC_INIT(1),
157                         .__use          = 1,
158                         .obsolete       = -1,
159                         .error          = -EACCES,
160                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
161                         .input          = ip6_pkt_prohibit,
162                         .output         = ip6_pkt_prohibit_out,
163                 }
164         },
165         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
166         .rt6i_metric    = ~(u32) 0,
167         .rt6i_ref       = ATOMIC_INIT(1),
168 };
169
/* "Blackhole" entry: error -EINVAL, packets go to dst_discard in both directions. */
170 static struct rt6_info ip6_blk_hole_entry_template = {
171         .u = {
172                 .dst = {
173                         .__refcnt       = ATOMIC_INIT(1),
174                         .__use          = 1,
175                         .obsolete       = -1,
176                         .error          = -EINVAL,
177                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
178                         .input          = dst_discard,
179                         .output         = dst_discard,
180                 }
181         },
182         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
183         .rt6i_metric    = ~(u32) 0,
184         .rt6i_ref       = ATOMIC_INIT(1),
185 };
186
187 #endif
188
/*
 * Allocate a dst entry from @ops and return it typed as an rt6_info.
 * The result of dst_alloc() is passed through unchanged, so the caller
 * gets whatever dst_alloc() reports on failure.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        void *entry = dst_alloc(ops);

        return (struct rt6_info *)entry;
}
194
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197         struct rt6_info *rt = (struct rt6_info *)dst;
198         struct inet6_dev *idev = rt->rt6i_idev;
199
200         if (idev != NULL) {
201                 rt->rt6i_idev = NULL;
202                 in6_dev_put(idev);
203         }
204 }
205
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207                            int how)
208 {
209         struct rt6_info *rt = (struct rt6_info *)dst;
210         struct inet6_dev *idev = rt->rt6i_idev;
211         struct net_device *loopback_dev =
212                 dev_net(dev)->loopback_dev;
213
214         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215                 struct inet6_dev *loopback_idev =
216                         in6_dev_get(loopback_dev);
217                 if (loopback_idev != NULL) {
218                         rt->rt6i_idev = loopback_idev;
219                         in6_dev_put(idev);
220                 }
221         }
222 }
223
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226         return (rt->rt6i_flags & RTF_EXPIRES &&
227                 time_after(jiffies, rt->rt6i_expires));
228 }
229
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232         return (ipv6_addr_type(daddr) &
233                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
234 }
235
236 /*
237  *      Route lookup. Any table->tb6_lock is implied.
238  */
239
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241                                                     struct rt6_info *rt,
242                                                     int oif,
243                                                     int flags)
244 {
245         struct rt6_info *local = NULL;
246         struct rt6_info *sprt;
247
248         if (oif) {
249                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
250                         struct net_device *dev = sprt->rt6i_dev;
251                         if (dev->ifindex == oif)
252                                 return sprt;
253                         if (dev->flags & IFF_LOOPBACK) {
254                                 if (sprt->rt6i_idev == NULL ||
255                                     sprt->rt6i_idev->dev->ifindex != oif) {
256                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
257                                                 continue;
258                                         if (local && (!oif ||
259                                                       local->rt6i_idev->dev->ifindex == oif))
260                                                 continue;
261                                 }
262                                 local = sprt;
263                         }
264                 }
265
266                 if (local)
267                         return local;
268
269                 if (flags & RT6_LOOKUP_F_IFACE)
270                         return net->ipv6.ip6_null_entry;
271         }
272         return rt;
273 }
274
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - router reachability probing for a candidate route
 * @rt: route whose next hop should be probed; may be NULL
 *
 * Nothing happens when there is no route, no next-hop neighbour entry, or
 * the neighbour is already in a NUD_VALID state.  Otherwise, if more than
 * cnf.rtr_probe_interval jiffies have passed since the neighbour was last
 * updated, neigh->updated is stamped with the current time and a Neighbour
 * Solicitation for the neighbour's address is sent to the corresponding
 * solicited-node multicast address.
 *
 * The neighbour lock is taken for writing because neigh->updated is
 * modified under it; with a read lock two CPUs could both pass the
 * interval test and both send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int due;

        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        due = !(neigh->nud_state & NUD_VALID) &&
              time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
        if (due)
                neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        if (!due)
                return;

        /* The solicitation itself is sent without the neighbour lock held. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
310
311 /*
312  * Default Router Selection (RFC 2461 6.3.6)
313  */
314 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
315 {
316         struct net_device *dev = rt->rt6i_dev;
317         if (!oif || dev->ifindex == oif)
318                 return 2;
319         if ((dev->flags & IFF_LOOPBACK) &&
320             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
321                 return 1;
322         return 0;
323 }
324
325 static inline int rt6_check_neigh(struct rt6_info *rt)
326 {
327         struct neighbour *neigh = rt->rt6i_nexthop;
328         int m;
329         if (rt->rt6i_flags & RTF_NONEXTHOP ||
330             !(rt->rt6i_flags & RTF_GATEWAY))
331                 m = 1;
332         else if (neigh) {
333                 read_lock_bh(&neigh->lock);
334                 if (neigh->nud_state & NUD_VALID)
335                         m = 2;
336 #ifdef CONFIG_IPV6_ROUTER_PREF
337                 else if (neigh->nud_state & NUD_FAILED)
338                         m = 0;
339 #endif
340                 else
341                         m = 1;
342                 read_unlock_bh(&neigh->lock);
343         } else
344                 m = 0;
345         return m;
346 }
347
348 static int rt6_score_route(struct rt6_info *rt, int oif,
349                            int strict)
350 {
351         int m, n;
352
353         m = rt6_check_dev(rt, oif);
354         if (!m && (strict & RT6_LOOKUP_F_IFACE))
355                 return -1;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
358 #endif
359         n = rt6_check_neigh(rt);
360         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
361                 return -1;
362         return m;
363 }
364
365 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
366                                    int *mpri, struct rt6_info *match)
367 {
368         int m;
369
370         if (rt6_check_expired(rt))
371                 goto out;
372
373         m = rt6_score_route(rt, oif, strict);
374         if (m < 0)
375                 goto out;
376
377         if (m > *mpri) {
378                 if (strict & RT6_LOOKUP_F_REACHABLE)
379                         rt6_probe(match);
380                 *mpri = m;
381                 match = rt;
382         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
383                 rt6_probe(rt);
384         }
385
386 out:
387         return match;
388 }
389
390 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
391                                      struct rt6_info *rr_head,
392                                      u32 metric, int oif, int strict)
393 {
394         struct rt6_info *rt, *match;
395         int mpri = -1;
396
397         match = NULL;
398         for (rt = rr_head; rt && rt->rt6i_metric == metric;
399              rt = rt->u.dst.rt6_next)
400                 match = find_match(rt, oif, strict, &mpri, match);
401         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
402              rt = rt->u.dst.rt6_next)
403                 match = find_match(rt, oif, strict, &mpri, match);
404
405         return match;
406 }
407
408 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
409 {
410         struct rt6_info *match, *rt0;
411         struct net *net;
412
413         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
414                   __func__, fn->leaf, oif);
415
416         rt0 = fn->rr_ptr;
417         if (!rt0)
418                 fn->rr_ptr = rt0 = fn->leaf;
419
420         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
421
422         if (!match &&
423             (strict & RT6_LOOKUP_F_REACHABLE)) {
424                 struct rt6_info *next = rt0->u.dst.rt6_next;
425
426                 /* no entries matched; do round-robin */
427                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
428                         next = fn->leaf;
429
430                 if (next != rt0)
431                         fn->rr_ptr = next;
432         }
433
434         RT6_TRACE("%s() => %p\n",
435                   __func__, match);
436
437         net = dev_net(rt0->rt6i_dev);
438         return (match ? match : net->ipv6.ip6_null_entry);
439 }
440
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received route information option
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then creates, refreshes or deletes the matching
 * route: a zero lifetime removes an existing route, a non-zero lifetime
 * adds a missing one, and an existing route gets its preference bits and
 * expiry refreshed.
 *
 * Returns 0 on success and -EINVAL when the option is too short or its
 * length and prefix_len fields are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                /* A full 16-byte prefix is present; use it in place. */
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev->ifindex, pref);
        } else if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->u.dst);

        return 0;
}
#endif
514
/*
 * BACKTRACK - climb the fib tree when the current node produced no route.
 *
 * This macro is not self-contained: it reads the caller's local variables
 * 'rt' and 'fn' and jumps to the caller's labels 'out' and 'restart'.
 *
 * It acts only when rt is __net's ip6_null_entry.  It then loops:
 *   - if fn is flagged RTN_TL_ROOT, jump to 'out';
 *   - otherwise look at fn's parent: if the parent has a subtree that is
 *     not fn itself, continue with the result of fib6_lookup() in that
 *     subtree for 'saddr'; else continue with the parent;
 *   - once the new fn is flagged RTN_RTINFO, jump to 'restart'.
 * If rt is anything else, execution falls through to the statement after
 * the macro.
 */
515 #define BACKTRACK(__net, saddr)                 \
516 do { \
517         if (rt == __net->ipv6.ip6_null_entry) { \
518                 struct fib6_node *pn; \
519                 while (1) { \
520                         if (fn->fn_flags & RTN_TL_ROOT) \
521                                 goto out; \
522                         pn = fn->parent; \
523                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
524                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
525                         else \
526                                 fn = pn; \
527                         if (fn->fn_flags & RTN_RTINFO) \
528                                 goto restart; \
529                 } \
530         } \
531 } while(0)
532
533 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
534                                              struct fib6_table *table,
535                                              struct flowi *fl, int flags)
536 {
537         struct fib6_node *fn;
538         struct rt6_info *rt;
539
540         read_lock_bh(&table->tb6_lock);
541         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
542 restart:
543         rt = fn->leaf;
544         rt = rt6_device_match(net, rt, fl->oif, flags);
545         BACKTRACK(net, &fl->fl6_src);
546 out:
547         dst_use(&rt->u.dst, jiffies);
548         read_unlock_bh(&table->tb6_lock);
549         return rt;
550
551 }
552
553 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
554                             const struct in6_addr *saddr, int oif, int strict)
555 {
556         struct flowi fl = {
557                 .oif = oif,
558                 .nl_u = {
559                         .ip6_u = {
560                                 .daddr = *daddr,
561                         },
562                 },
563         };
564         struct dst_entry *dst;
565         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
566
567         if (saddr) {
568                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
569                 flags |= RT6_LOOKUP_F_HAS_SADDR;
570         }
571
572         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
573         if (dst->error == 0)
574                 return (struct rt6_info *) dst;
575
576         dst_release(dst);
577
578         return NULL;
579 }
580
581 EXPORT_SYMBOL(rt6_lookup);
582
583 /* ip6_ins_rt is called with FREE table->tb6_lock.
584    It takes new route entry, the addition fails by any reason the
585    route is freed. In any case, if caller does not hold it, it may
586    be destroyed.
587  */
588
589 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
590 {
591         int err;
592         struct fib6_table *table;
593
594         table = rt->rt6i_table;
595         write_lock_bh(&table->tb6_lock);
596         err = fib6_add(&table->tb6_root, rt, info);
597         write_unlock_bh(&table->tb6_lock);
598
599         return err;
600 }
601
602 int ip6_ins_rt(struct rt6_info *rt)
603 {
604         struct nl_info info = {
605                 .nl_net = dev_net(rt->rt6i_dev),
606         };
607         return __ip6_ins_rt(rt, &info);
608 }
609
610 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
611                                       struct in6_addr *saddr)
612 {
613         struct rt6_info *rt;
614
615         /*
616          *      Clone the route.
617          */
618
619         rt = ip6_rt_copy(ort);
620
621         if (rt) {
622                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
623                         if (rt->rt6i_dst.plen != 128 &&
624                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
625                                 rt->rt6i_flags |= RTF_ANYCAST;
626                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
627                 }
628
629                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
630                 rt->rt6i_dst.plen = 128;
631                 rt->rt6i_flags |= RTF_CACHE;
632                 rt->u.dst.flags |= DST_HOST;
633
634 #ifdef CONFIG_IPV6_SUBTREES
635                 if (rt->rt6i_src.plen && saddr) {
636                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
637                         rt->rt6i_src.plen = 128;
638                 }
639 #endif
640
641                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
642
643         }
644
645         return rt;
646 }
647
648 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
649 {
650         struct rt6_info *rt = ip6_rt_copy(ort);
651         if (rt) {
652                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
653                 rt->rt6i_dst.plen = 128;
654                 rt->rt6i_flags |= RTF_CACHE;
655                 rt->u.dst.flags |= DST_HOST;
656                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
657         }
658         return rt;
659 }
660
/*
 * ip6_pol_route - policy route lookup shared by the input and output paths
 * @net:   namespace (supplies the null entry)
 * @table: fib table to search
 * @oif:   interface index to match (callers pass fl->iif or fl->oif)
 * @fl:    flow being routed
 * @flags: lookup flags; only RT6_LOOKUP_F_IFACE is taken from them here
 *
 * Selects a route with rt6_select().  A selected route that is neither the
 * null entry nor RTF_CACHE and that lacks a next hop (without being
 * RTF_NONEXTHOP) is copied with rt6_alloc_cow() and the copy is inserted
 * into the table; if that did not succeed the whole lookup is retried, up
 * to three passes in total.
 *
 * Returns a route on which a reference has been taken with dst_hold() and
 * whose lastuse/__use fields have been updated.
 */
661 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
662                                       struct flowi *fl, int flags)
663 {
664         struct fib6_node *fn;
665         struct rt6_info *rt, *nrt;
666         int strict = 0;
667         int attempts = 3;
668         int err;
        /* Reachability is only insisted upon when forwarding is off. */
669         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
670
671         strict |= flags & RT6_LOOKUP_F_IFACE;
672
        /* Every retry pass re-enters here and takes the table read lock. */
673 relookup:
674         read_lock_bh(&table->tb6_lock);
675
        /* Re-entered from 'out' (lock still held) after dropping 'reachable'. */
676 restart_2:
677         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
678
        /* BACKTRACK() jumps here with a new fn, or to 'out' at the tree root. */
679 restart:
680         rt = rt6_select(fn, oif, strict | reachable);
681
682         BACKTRACK(net, &fl->fl6_src);
683         if (rt == net->ipv6.ip6_null_entry ||
684             rt->rt6i_flags & RTF_CACHE)
685                 goto out;
686
        /* Pin the route before dropping the lock for the copy below. */
687         dst_hold(&rt->u.dst);
688         read_unlock_bh(&table->tb6_lock);
689
690         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
691                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
692         else {
693 #if CLONE_OFFLINK_ROUTE
694                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
695 #else
                /* Use the selected route itself; the reference taken above is kept. */
696                 goto out2;
697 #endif
698         }
699
        /* From here on 'rt' is the copy, or the null entry if copying failed. */
700         dst_release(&rt->u.dst);
701         rt = nrt ? : net->ipv6.ip6_null_entry;
702
703         dst_hold(&rt->u.dst);
704         if (nrt) {
705                 err = ip6_ins_rt(nrt);
706                 if (!err)
707                         goto out2;
708         }
709
        /* Out of retries: return what we have, with the reference taken above. */
710         if (--attempts <= 0)
711                 goto out2;
712
713         /*
714          * Race condition! In the gap, when table->tb6_lock was
715          * released someone could insert this route.  Relookup.
716          */
717         dst_release(&rt->u.dst);
718         goto relookup;
719
        /*
         * Reached, with the read lock held, when the result is the null entry
         * or an RTF_CACHE route.  If the reachability requirement was in
         * force, it is dropped and the lookup is run once more before the
         * result is accepted.
         */
720 out:
721         if (reachable) {
722                 reachable = 0;
723                 goto restart_2;
724         }
725         dst_hold(&rt->u.dst);
726         read_unlock_bh(&table->tb6_lock);
        /* Common exit: lock released, reference held on rt. */
727 out2:
728         rt->u.dst.lastuse = jiffies;
729         rt->u.dst.__use++;
730
731         return rt;
732 }
733
734 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
735                                             struct flowi *fl, int flags)
736 {
737         return ip6_pol_route(net, table, fl->iif, fl, flags);
738 }
739
740 void ip6_route_input(struct sk_buff *skb)
741 {
742         struct ipv6hdr *iph = ipv6_hdr(skb);
743         struct net *net = dev_net(skb->dev);
744         int flags = RT6_LOOKUP_F_HAS_SADDR;
745         struct flowi fl = {
746                 .iif = skb->dev->ifindex,
747                 .nl_u = {
748                         .ip6_u = {
749                                 .daddr = iph->daddr,
750                                 .saddr = iph->saddr,
751                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
752                         },
753                 },
754                 .mark = skb->mark,
755                 .proto = iph->nexthdr,
756         };
757
758         if (rt6_need_strict(&iph->daddr))
759                 flags |= RT6_LOOKUP_F_IFACE;
760
761         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
762 }
763
764 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
765                                              struct flowi *fl, int flags)
766 {
767         return ip6_pol_route(net, table, fl->oif, fl, flags);
768 }
769
770 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
771                                     struct flowi *fl)
772 {
773         int flags = 0;
774
775         if (rt6_need_strict(&fl->fl6_dst))
776                 flags |= RT6_LOOKUP_F_IFACE;
777
778         if (!ipv6_addr_any(&fl->fl6_src))
779                 flags |= RT6_LOOKUP_F_HAS_SADDR;
780         else if (sk) {
781                 unsigned int prefs = inet6_sk(sk)->srcprefs;
782                 if (prefs & IPV6_PREFER_SRC_TMP)
783                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
784                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
785                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
786                 if (prefs & IPV6_PREFER_SRC_COA)
787                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
788         }
789
790         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
791 }
792
793 EXPORT_SYMBOL(ip6_route_output);
794
795 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
796 {
797         struct rt6_info *ort = (struct rt6_info *) *dstp;
798         struct rt6_info *rt = (struct rt6_info *)
799                 dst_alloc(&ip6_dst_blackhole_ops);
800         struct dst_entry *new = NULL;
801
802         if (rt) {
803                 new = &rt->u.dst;
804
805                 atomic_set(&new->__refcnt, 1);
806                 new->__use = 1;
807                 new->input = dst_discard;
808                 new->output = dst_discard;
809
810                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
811                 new->dev = ort->u.dst.dev;
812                 if (new->dev)
813                         dev_hold(new->dev);
814                 rt->rt6i_idev = ort->rt6i_idev;
815                 if (rt->rt6i_idev)
816                         in6_dev_hold(rt->rt6i_idev);
817                 rt->rt6i_expires = 0;
818
819                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
820                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
821                 rt->rt6i_metric = 0;
822
823                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
824 #ifdef CONFIG_IPV6_SUBTREES
825                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
826 #endif
827
828                 dst_free(new);
829         }
830
831         dst_release(*dstp);
832         *dstp = new;
833         return (new ? 0 : -ENOMEM);
834 }
835 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
836
837 /*
838  *      Destination cache support functions
839  */
840
841 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
842 {
843         struct rt6_info *rt;
844
845         rt = (struct rt6_info *) dst;
846
847         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
848                 return dst;
849
850         return NULL;
851 }
852
853 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
854 {
855         struct rt6_info *rt = (struct rt6_info *) dst;
856
857         if (rt) {
858                 if (rt->rt6i_flags & RTF_CACHE)
859                         ip6_del_rt(rt);
860                 else
861                         dst_release(dst);
862         }
863         return NULL;
864 }
865
866 static void ip6_link_failure(struct sk_buff *skb)
867 {
868         struct rt6_info *rt;
869
870         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
871
872         rt = (struct rt6_info *) skb->dst;
873         if (rt) {
874                 if (rt->rt6i_flags&RTF_CACHE) {
875                         dst_set_expires(&rt->u.dst, 0);
876                         rt->rt6i_flags |= RTF_EXPIRES;
877                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
878                         rt->rt6i_node->fn_sernum = -1;
879         }
880 }
881
882 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
883 {
884         struct rt6_info *rt6 = (struct rt6_info*)dst;
885
886         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
887                 rt6->rt6i_flags |= RTF_MODIFIED;
888                 if (mtu < IPV6_MIN_MTU) {
889                         mtu = IPV6_MIN_MTU;
890                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
891                 }
892                 dst->metrics[RTAX_MTU-1] = mtu;
893                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
894         }
895 }
896
897 static int ipv6_get_mtu(struct net_device *dev);
898
899 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
900 {
901         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
902
903         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
904                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
905
906         /*
907          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
908          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
909          * IPV6_MAXPLEN is also valid and means: "any MSS,
910          * rely only on pmtu discovery"
911          */
912         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
913                 mtu = IPV6_MAXPLEN;
914         return mtu;
915 }
916
917 static struct dst_entry *icmp6_dst_gc_list;
918 static DEFINE_SPINLOCK(icmp6_dst_lock);
919
920 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
921                                   struct neighbour *neigh,
922                                   const struct in6_addr *addr)
923 {
924         struct rt6_info *rt;
925         struct inet6_dev *idev = in6_dev_get(dev);
926         struct net *net = dev_net(dev);
927
928         if (unlikely(idev == NULL))
929                 return NULL;
930
931         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
932         if (unlikely(rt == NULL)) {
933                 in6_dev_put(idev);
934                 goto out;
935         }
936
937         dev_hold(dev);
938         if (neigh)
939                 neigh_hold(neigh);
940         else
941                 neigh = ndisc_get_neigh(dev, addr);
942
943         rt->rt6i_dev      = dev;
944         rt->rt6i_idev     = idev;
945         rt->rt6i_nexthop  = neigh;
946         atomic_set(&rt->u.dst.__refcnt, 1);
947         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
948         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
949         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
950         rt->u.dst.output  = ip6_output;
951
952 #if 0   /* there's no chance to use these for ndisc */
953         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
954                                 ? DST_HOST
955                                 : 0;
956         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
957         rt->rt6i_dst.plen = 128;
958 #endif
959
960         spin_lock_bh(&icmp6_dst_lock);
961         rt->u.dst.next = icmp6_dst_gc_list;
962         icmp6_dst_gc_list = &rt->u.dst;
963         spin_unlock_bh(&icmp6_dst_lock);
964
965         fib6_force_start_gc(net);
966
967 out:
968         return &rt->u.dst;
969 }
970
971 int icmp6_dst_gc(int *more)
972 {
973         struct dst_entry *dst, *next, **pprev;
974         int freed;
975
976         next = NULL;
977         freed = 0;
978
979         spin_lock_bh(&icmp6_dst_lock);
980         pprev = &icmp6_dst_gc_list;
981
982         while ((dst = *pprev) != NULL) {
983                 if (!atomic_read(&dst->__refcnt)) {
984                         *pprev = dst->next;
985                         dst_free(dst);
986                         freed++;
987                 } else {
988                         pprev = &dst->next;
989                         (*more)++;
990                 }
991         }
992
993         spin_unlock_bh(&icmp6_dst_lock);
994
995         return freed;
996 }
997
998 static int ip6_dst_gc(struct dst_ops *ops)
999 {
1000         unsigned long now = jiffies;
1001         struct net *net = ops->dst_net;
1002         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1003         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1004         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1005         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1006         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1007
1008         if (time_after(rt_last_gc + rt_min_interval, now) &&
1009             atomic_read(&ops->entries) <= rt_max_size)
1010                 goto out;
1011
1012         net->ipv6.ip6_rt_gc_expire++;
1013         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1014         net->ipv6.ip6_rt_last_gc = now;
1015         if (atomic_read(&ops->entries) < ops->gc_thresh)
1016                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1017 out:
1018         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1019         return (atomic_read(&ops->entries) > rt_max_size);
1020 }
1021
1022 /* Clean host part of a prefix. Not necessary in radix tree,
1023    but results in cleaner routing tables.
1024
1025    Remove it only when all the things will work!
1026  */
1027
1028 static int ipv6_get_mtu(struct net_device *dev)
1029 {
1030         int mtu = IPV6_MIN_MTU;
1031         struct inet6_dev *idev;
1032
1033         idev = in6_dev_get(dev);
1034         if (idev) {
1035                 mtu = idev->cnf.mtu6;
1036                 in6_dev_put(idev);
1037         }
1038         return mtu;
1039 }
1040
1041 int ip6_dst_hoplimit(struct dst_entry *dst)
1042 {
1043         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1044         if (hoplimit < 0) {
1045                 struct net_device *dev = dst->dev;
1046                 struct inet6_dev *idev = in6_dev_get(dev);
1047                 if (idev) {
1048                         hoplimit = idev->cnf.hop_limit;
1049                         in6_dev_put(idev);
1050                 } else
1051                         hoplimit = ipv6_devconf.hop_limit;
1052         }
1053         return hoplimit;
1054 }
1055
1056 /*
1057  *
1058  */
1059
1060 int ip6_route_add(struct fib6_config *cfg)
1061 {
1062         int err;
1063         struct net *net = cfg->fc_nlinfo.nl_net;
1064         struct rt6_info *rt = NULL;
1065         struct net_device *dev = NULL;
1066         struct inet6_dev *idev = NULL;
1067         struct fib6_table *table;
1068         int addr_type;
1069
1070         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1071                 return -EINVAL;
1072 #ifndef CONFIG_IPV6_SUBTREES
1073         if (cfg->fc_src_len)
1074                 return -EINVAL;
1075 #endif
1076         if (cfg->fc_ifindex) {
1077                 err = -ENODEV;
1078                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1079                 if (!dev)
1080                         goto out;
1081                 idev = in6_dev_get(dev);
1082                 if (!idev)
1083                         goto out;
1084         }
1085
1086         if (cfg->fc_metric == 0)
1087                 cfg->fc_metric = IP6_RT_PRIO_USER;
1088
1089         table = fib6_new_table(net, cfg->fc_table);
1090         if (table == NULL) {
1091                 err = -ENOBUFS;
1092                 goto out;
1093         }
1094
1095         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1096
1097         if (rt == NULL) {
1098                 err = -ENOMEM;
1099                 goto out;
1100         }
1101
1102         rt->u.dst.obsolete = -1;
1103         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1104                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1105                                 0;
1106
1107         if (cfg->fc_protocol == RTPROT_UNSPEC)
1108                 cfg->fc_protocol = RTPROT_BOOT;
1109         rt->rt6i_protocol = cfg->fc_protocol;
1110
1111         addr_type = ipv6_addr_type(&cfg->fc_dst);
1112
1113         if (addr_type & IPV6_ADDR_MULTICAST)
1114                 rt->u.dst.input = ip6_mc_input;
1115         else
1116                 rt->u.dst.input = ip6_forward;
1117
1118         rt->u.dst.output = ip6_output;
1119
1120         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1121         rt->rt6i_dst.plen = cfg->fc_dst_len;
1122         if (rt->rt6i_dst.plen == 128)
1123                rt->u.dst.flags = DST_HOST;
1124
1125 #ifdef CONFIG_IPV6_SUBTREES
1126         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1127         rt->rt6i_src.plen = cfg->fc_src_len;
1128 #endif
1129
1130         rt->rt6i_metric = cfg->fc_metric;
1131
1132         /* We cannot add true routes via loopback here,
1133            they would result in kernel looping; promote them to reject routes
1134          */
1135         if ((cfg->fc_flags & RTF_REJECT) ||
1136             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1137                 /* hold loopback dev/idev if we haven't done so. */
1138                 if (dev != net->loopback_dev) {
1139                         if (dev) {
1140                                 dev_put(dev);
1141                                 in6_dev_put(idev);
1142                         }
1143                         dev = net->loopback_dev;
1144                         dev_hold(dev);
1145                         idev = in6_dev_get(dev);
1146                         if (!idev) {
1147                                 err = -ENODEV;
1148                                 goto out;
1149                         }
1150                 }
1151                 rt->u.dst.output = ip6_pkt_discard_out;
1152                 rt->u.dst.input = ip6_pkt_discard;
1153                 rt->u.dst.error = -ENETUNREACH;
1154                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1155                 goto install_route;
1156         }
1157
1158         if (cfg->fc_flags & RTF_GATEWAY) {
1159                 struct in6_addr *gw_addr;
1160                 int gwa_type;
1161
1162                 gw_addr = &cfg->fc_gateway;
1163                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1164                 gwa_type = ipv6_addr_type(gw_addr);
1165
1166                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1167                         struct rt6_info *grt;
1168
1169                         /* IPv6 strictly inhibits using not link-local
1170                            addresses as nexthop address.
1171                            Otherwise, router will not able to send redirects.
1172                            It is very good, but in some (rare!) circumstances
1173                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1174                            some exceptions. --ANK
1175                          */
1176                         err = -EINVAL;
1177                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1178                                 goto out;
1179
1180                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1181
1182                         err = -EHOSTUNREACH;
1183                         if (grt == NULL)
1184                                 goto out;
1185                         if (dev) {
1186                                 if (dev != grt->rt6i_dev) {
1187                                         dst_release(&grt->u.dst);
1188                                         goto out;
1189                                 }
1190                         } else {
1191                                 dev = grt->rt6i_dev;
1192                                 idev = grt->rt6i_idev;
1193                                 dev_hold(dev);
1194                                 in6_dev_hold(grt->rt6i_idev);
1195                         }
1196                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1197                                 err = 0;
1198                         dst_release(&grt->u.dst);
1199
1200                         if (err)
1201                                 goto out;
1202                 }
1203                 err = -EINVAL;
1204                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1205                         goto out;
1206         }
1207
1208         err = -ENODEV;
1209         if (dev == NULL)
1210                 goto out;
1211
1212         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1213                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1214                 if (IS_ERR(rt->rt6i_nexthop)) {
1215                         err = PTR_ERR(rt->rt6i_nexthop);
1216                         rt->rt6i_nexthop = NULL;
1217                         goto out;
1218                 }
1219         }
1220
1221         rt->rt6i_flags = cfg->fc_flags;
1222
1223 install_route:
1224         if (cfg->fc_mx) {
1225                 struct nlattr *nla;
1226                 int remaining;
1227
1228                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1229                         int type = nla_type(nla);
1230
1231                         if (type) {
1232                                 if (type > RTAX_MAX) {
1233                                         err = -EINVAL;
1234                                         goto out;
1235                                 }
1236
1237                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1238                         }
1239                 }
1240         }
1241
1242         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1243                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1244         if (!dst_metric(&rt->u.dst, RTAX_MTU))
1245                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1246         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1247                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1248         rt->u.dst.dev = dev;
1249         rt->rt6i_idev = idev;
1250         rt->rt6i_table = table;
1251
1252         cfg->fc_nlinfo.nl_net = dev_net(dev);
1253
1254         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1255
1256 out:
1257         if (dev)
1258                 dev_put(dev);
1259         if (idev)
1260                 in6_dev_put(idev);
1261         if (rt)
1262                 dst_free(&rt->u.dst);
1263         return err;
1264 }
1265
1266 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1267 {
1268         int err;
1269         struct fib6_table *table;
1270         struct net *net = dev_net(rt->rt6i_dev);
1271
1272         if (rt == net->ipv6.ip6_null_entry)
1273                 return -ENOENT;
1274
1275         table = rt->rt6i_table;
1276         write_lock_bh(&table->tb6_lock);
1277
1278         err = fib6_del(rt, info);
1279         dst_release(&rt->u.dst);
1280
1281         write_unlock_bh(&table->tb6_lock);
1282
1283         return err;
1284 }
1285
1286 int ip6_del_rt(struct rt6_info *rt)
1287 {
1288         struct nl_info info = {
1289                 .nl_net = dev_net(rt->rt6i_dev),
1290         };
1291         return __ip6_del_rt(rt, &info);
1292 }
1293
1294 static int ip6_route_del(struct fib6_config *cfg)
1295 {
1296         struct fib6_table *table;
1297         struct fib6_node *fn;
1298         struct rt6_info *rt;
1299         int err = -ESRCH;
1300
1301         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1302         if (table == NULL)
1303                 return err;
1304
1305         read_lock_bh(&table->tb6_lock);
1306
1307         fn = fib6_locate(&table->tb6_root,
1308                          &cfg->fc_dst, cfg->fc_dst_len,
1309                          &cfg->fc_src, cfg->fc_src_len);
1310
1311         if (fn) {
1312                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1313                         if (cfg->fc_ifindex &&
1314                             (rt->rt6i_dev == NULL ||
1315                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1316                                 continue;
1317                         if (cfg->fc_flags & RTF_GATEWAY &&
1318                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1319                                 continue;
1320                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1321                                 continue;
1322                         dst_hold(&rt->u.dst);
1323                         read_unlock_bh(&table->tb6_lock);
1324
1325                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1326                 }
1327         }
1328         read_unlock_bh(&table->tb6_lock);
1329
1330         return err;
1331 }
1332
1333 /*
1334  *      Handle redirects
1335  */
1336 struct ip6rd_flowi {
1337         struct flowi fl;
1338         struct in6_addr gateway;
1339 };
1340
1341 static struct rt6_info *__ip6_route_redirect(struct net *net,
1342                                              struct fib6_table *table,
1343                                              struct flowi *fl,
1344                                              int flags)
1345 {
1346         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1347         struct rt6_info *rt;
1348         struct fib6_node *fn;
1349
1350         /*
1351          * Get the "current" route for this destination and
1352          * check if the redirect has come from approriate router.
1353          *
1354          * RFC 2461 specifies that redirects should only be
1355          * accepted if they come from the nexthop to the target.
1356          * Due to the way the routes are chosen, this notion
1357          * is a bit fuzzy and one might need to check all possible
1358          * routes.
1359          */
1360
1361         read_lock_bh(&table->tb6_lock);
1362         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1363 restart:
1364         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1365                 /*
1366                  * Current route is on-link; redirect is always invalid.
1367                  *
1368                  * Seems, previous statement is not true. It could
1369                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1370                  * But then router serving it might decide, that we should
1371                  * know truth 8)8) --ANK (980726).
1372                  */
1373                 if (rt6_check_expired(rt))
1374                         continue;
1375                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1376                         continue;
1377                 if (fl->oif != rt->rt6i_dev->ifindex)
1378                         continue;
1379                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1380                         continue;
1381                 break;
1382         }
1383
1384         if (!rt)
1385                 rt = net->ipv6.ip6_null_entry;
1386         BACKTRACK(net, &fl->fl6_src);
1387 out:
1388         dst_hold(&rt->u.dst);
1389
1390         read_unlock_bh(&table->tb6_lock);
1391
1392         return rt;
1393 };
1394
1395 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1396                                            struct in6_addr *src,
1397                                            struct in6_addr *gateway,
1398                                            struct net_device *dev)
1399 {
1400         int flags = RT6_LOOKUP_F_HAS_SADDR;
1401         struct net *net = dev_net(dev);
1402         struct ip6rd_flowi rdfl = {
1403                 .fl = {
1404                         .oif = dev->ifindex,
1405                         .nl_u = {
1406                                 .ip6_u = {
1407                                         .daddr = *dest,
1408                                         .saddr = *src,
1409                                 },
1410                         },
1411                 },
1412                 .gateway = *gateway,
1413         };
1414
1415         if (rt6_need_strict(dest))
1416                 flags |= RT6_LOOKUP_F_IFACE;
1417
1418         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1419                                                    flags, __ip6_route_redirect);
1420 }
1421
1422 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1423                   struct in6_addr *saddr,
1424                   struct neighbour *neigh, u8 *lladdr, int on_link)
1425 {
1426         struct rt6_info *rt, *nrt = NULL;
1427         struct netevent_redirect netevent;
1428         struct net *net = dev_net(neigh->dev);
1429
1430         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1431
1432         if (rt == net->ipv6.ip6_null_entry) {
1433                 if (net_ratelimit())
1434                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1435                                "for redirect target\n");
1436                 goto out;
1437         }
1438
1439         /*
1440          *      We have finally decided to accept it.
1441          */
1442
1443         neigh_update(neigh, lladdr, NUD_STALE,
1444                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1445                      NEIGH_UPDATE_F_OVERRIDE|
1446                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1447                                      NEIGH_UPDATE_F_ISROUTER))
1448                      );
1449
1450         /*
1451          * Redirect received -> path was valid.
1452          * Look, redirects are sent only in response to data packets,
1453          * so that this nexthop apparently is reachable. --ANK
1454          */
1455         dst_confirm(&rt->u.dst);
1456
1457         /* Duplicate redirect: silently ignore. */
1458         if (neigh == rt->u.dst.neighbour)
1459                 goto out;
1460
1461         nrt = ip6_rt_copy(rt);
1462         if (nrt == NULL)
1463                 goto out;
1464
1465         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1466         if (on_link)
1467                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1468
1469         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1470         nrt->rt6i_dst.plen = 128;
1471         nrt->u.dst.flags |= DST_HOST;
1472
1473         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1474         nrt->rt6i_nexthop = neigh_clone(neigh);
1475         /* Reset pmtu, it may be better */
1476         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1477         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1478                                                         dst_mtu(&nrt->u.dst));
1479
1480         if (ip6_ins_rt(nrt))
1481                 goto out;
1482
1483         netevent.old = &rt->u.dst;
1484         netevent.new = &nrt->u.dst;
1485         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1486
1487         if (rt->rt6i_flags&RTF_CACHE) {
1488                 ip6_del_rt(rt);
1489                 return;
1490         }
1491
1492 out:
1493         dst_release(&rt->u.dst);
1494         return;
1495 }
1496
1497 /*
1498  *      Handle ICMP "packet too big" messages
1499  *      i.e. Path MTU discovery
1500  */
1501
1502 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1503                         struct net_device *dev, u32 pmtu)
1504 {
1505         struct rt6_info *rt, *nrt;
1506         struct net *net = dev_net(dev);
1507         int allfrag = 0;
1508
1509         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1510         if (rt == NULL)
1511                 return;
1512
1513         if (pmtu >= dst_mtu(&rt->u.dst))
1514                 goto out;
1515
1516         if (pmtu < IPV6_MIN_MTU) {
1517                 /*
1518                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1519                  * MTU (1280) and a fragment header should always be included
1520                  * after a node receiving Too Big message reporting PMTU is
1521                  * less than the IPv6 Minimum Link MTU.
1522                  */
1523                 pmtu = IPV6_MIN_MTU;
1524                 allfrag = 1;
1525         }
1526
1527         /* New mtu received -> path was valid.
1528            They are sent only in response to data packets,
1529            so that this nexthop apparently is reachable. --ANK
1530          */
1531         dst_confirm(&rt->u.dst);
1532
1533         /* Host route. If it is static, it would be better
1534            not to override it, but add new one, so that
1535            when cache entry will expire old pmtu
1536            would return automatically.
1537          */
1538         if (rt->rt6i_flags & RTF_CACHE) {
1539                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1540                 if (allfrag)
1541                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1542                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1543                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1544                 goto out;
1545         }
1546
1547         /* Network route.
1548            Two cases are possible:
1549            1. It is connected route. Action: COW
1550            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1551          */
1552         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1553                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1554         else
1555                 nrt = rt6_alloc_clone(rt, daddr);
1556
1557         if (nrt) {
1558                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1559                 if (allfrag)
1560                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1561
1562                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1563                  * happened within 5 mins, the recommended timer is 10 mins.
1564                  * Here this route expiration time is set to ip6_rt_mtu_expires
1565                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1566                  * and detecting PMTU increase will be automatically happened.
1567                  */
1568                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1569                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1570
1571                 ip6_ins_rt(nrt);
1572         }
1573 out:
1574         dst_release(&rt->u.dst);
1575 }
1576
1577 /*
1578  *      Misc support functions
1579  */
1580
1581 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1582 {
1583         struct net *net = dev_net(ort->rt6i_dev);
1584         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1585
1586         if (rt) {
1587                 rt->u.dst.input = ort->u.dst.input;
1588                 rt->u.dst.output = ort->u.dst.output;
1589
1590                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1591                 rt->u.dst.error = ort->u.dst.error;
1592                 rt->u.dst.dev = ort->u.dst.dev;
1593                 if (rt->u.dst.dev)
1594                         dev_hold(rt->u.dst.dev);
1595                 rt->rt6i_idev = ort->rt6i_idev;
1596                 if (rt->rt6i_idev)
1597                         in6_dev_hold(rt->rt6i_idev);
1598                 rt->u.dst.lastuse = jiffies;
1599                 rt->rt6i_expires = 0;
1600
1601                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1602                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1603                 rt->rt6i_metric = 0;
1604
1605                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1606 #ifdef CONFIG_IPV6_SUBTREES
1607                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1608 #endif
1609                 rt->rt6i_table = ort->rt6i_table;
1610         }
1611         return rt;
1612 }
1613
1614 #ifdef CONFIG_IPV6_ROUTE_INFO
1615 static struct rt6_info *rt6_get_route_info(struct net *net,
1616                                            struct in6_addr *prefix, int prefixlen,
1617                                            struct in6_addr *gwaddr, int ifindex)
1618 {
1619         struct fib6_node *fn;
1620         struct rt6_info *rt = NULL;
1621         struct fib6_table *table;
1622
1623         table = fib6_get_table(net, RT6_TABLE_INFO);
1624         if (table == NULL)
1625                 return NULL;
1626
1627         write_lock_bh(&table->tb6_lock);
1628         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1629         if (!fn)
1630                 goto out;
1631
1632         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1633                 if (rt->rt6i_dev->ifindex != ifindex)
1634                         continue;
1635                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1636                         continue;
1637                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1638                         continue;
1639                 dst_hold(&rt->u.dst);
1640                 break;
1641         }
1642 out:
1643         write_unlock_bh(&table->tb6_lock);
1644         return rt;
1645 }
1646
1647 static struct rt6_info *rt6_add_route_info(struct net *net,
1648                                            struct in6_addr *prefix, int prefixlen,
1649                                            struct in6_addr *gwaddr, int ifindex,
1650                                            unsigned pref)
1651 {
1652         struct fib6_config cfg = {
1653                 .fc_table       = RT6_TABLE_INFO,
1654                 .fc_metric      = IP6_RT_PRIO_USER,
1655                 .fc_ifindex     = ifindex,
1656                 .fc_dst_len     = prefixlen,
1657                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1658                                   RTF_UP | RTF_PREF(pref),
1659                 .fc_nlinfo.pid = 0,
1660                 .fc_nlinfo.nlh = NULL,
1661                 .fc_nlinfo.nl_net = net,
1662         };
1663
1664         ipv6_addr_copy(&cfg.fc_dst, prefix);
1665         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1666
1667         /* We should treat it as a default route if prefix length is 0. */
1668         if (!prefixlen)
1669                 cfg.fc_flags |= RTF_DEFAULT;
1670
1671         ip6_route_add(&cfg);
1672
1673         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1674 }
1675 #endif
1676
1677 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1678 {
1679         struct rt6_info *rt;
1680         struct fib6_table *table;
1681
1682         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1683         if (table == NULL)
1684                 return NULL;
1685
1686         write_lock_bh(&table->tb6_lock);
1687         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1688                 if (dev == rt->rt6i_dev &&
1689                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1690                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1691                         break;
1692         }
1693         if (rt)
1694                 dst_hold(&rt->u.dst);
1695         write_unlock_bh(&table->tb6_lock);
1696         return rt;
1697 }
1698
1699 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1700                                      struct net_device *dev,
1701                                      unsigned int pref)
1702 {
1703         struct fib6_config cfg = {
1704                 .fc_table       = RT6_TABLE_DFLT,
1705                 .fc_metric      = IP6_RT_PRIO_USER,
1706                 .fc_ifindex     = dev->ifindex,
1707                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1708                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1709                 .fc_nlinfo.pid = 0,
1710                 .fc_nlinfo.nlh = NULL,
1711                 .fc_nlinfo.nl_net = dev_net(dev),
1712         };
1713
1714         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1715
1716         ip6_route_add(&cfg);
1717
1718         return rt6_get_dflt_router(gwaddr, dev);
1719 }
1720
1721 void rt6_purge_dflt_routers(struct net *net)
1722 {
1723         struct rt6_info *rt;
1724         struct fib6_table *table;
1725
1726         /* NOTE: Keep consistent with rt6_get_dflt_router */
1727         table = fib6_get_table(net, RT6_TABLE_DFLT);
1728         if (table == NULL)
1729                 return;
1730
1731 restart:
1732         read_lock_bh(&table->tb6_lock);
1733         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1734                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1735                         dst_hold(&rt->u.dst);
1736                         read_unlock_bh(&table->tb6_lock);
1737                         ip6_del_rt(rt);
1738                         goto restart;
1739                 }
1740         }
1741         read_unlock_bh(&table->tb6_lock);
1742 }
1743
1744 static void rtmsg_to_fib6_config(struct net *net,
1745                                  struct in6_rtmsg *rtmsg,
1746                                  struct fib6_config *cfg)
1747 {
1748         memset(cfg, 0, sizeof(*cfg));
1749
1750         cfg->fc_table = RT6_TABLE_MAIN;
1751         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1752         cfg->fc_metric = rtmsg->rtmsg_metric;
1753         cfg->fc_expires = rtmsg->rtmsg_info;
1754         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1755         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1756         cfg->fc_flags = rtmsg->rtmsg_flags;
1757
1758         cfg->fc_nlinfo.nl_net = net;
1759
1760         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1761         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1762         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1763 }
1764
1765 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1766 {
1767         struct fib6_config cfg;
1768         struct in6_rtmsg rtmsg;
1769         int err;
1770
1771         switch(cmd) {
1772         case SIOCADDRT:         /* Add a route */
1773         case SIOCDELRT:         /* Delete a route */
1774                 if (!capable(CAP_NET_ADMIN))
1775                         return -EPERM;
1776                 err = copy_from_user(&rtmsg, arg,
1777                                      sizeof(struct in6_rtmsg));
1778                 if (err)
1779                         return -EFAULT;
1780
1781                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1782
1783                 rtnl_lock();
1784                 switch (cmd) {
1785                 case SIOCADDRT:
1786                         err = ip6_route_add(&cfg);
1787                         break;
1788                 case SIOCDELRT:
1789                         err = ip6_route_del(&cfg);
1790                         break;
1791                 default:
1792                         err = -EINVAL;
1793                 }
1794                 rtnl_unlock();
1795
1796                 return err;
1797         }
1798
1799         return -EINVAL;
1800 }
1801
1802 /*
1803  *      Drop the packet on the floor
1804  */
1805
1806 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1807 {
1808         int type;
1809         switch (ipstats_mib_noroutes) {
1810         case IPSTATS_MIB_INNOROUTES:
1811                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1812                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1813                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1814                         break;
1815                 }
1816                 /* FALLTHROUGH */
1817         case IPSTATS_MIB_OUTNOROUTES:
1818                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1819                 break;
1820         }
1821         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1822         kfree_skb(skb);
1823         return 0;
1824 }
1825
1826 static int ip6_pkt_discard(struct sk_buff *skb)
1827 {
1828         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1829 }
1830
1831 static int ip6_pkt_discard_out(struct sk_buff *skb)
1832 {
1833         skb->dev = skb->dst->dev;
1834         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1835 }
1836
1837 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1838
1839 static int ip6_pkt_prohibit(struct sk_buff *skb)
1840 {
1841         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1842 }
1843
1844 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1845 {
1846         skb->dev = skb->dst->dev;
1847         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1848 }
1849
1850 #endif
1851
1852 /*
1853  *      Allocate a dst for local (unicast / anycast) address.
1854  */
1855
1856 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1857                                     const struct in6_addr *addr,
1858                                     int anycast)
1859 {
1860         struct net *net = dev_net(idev->dev);
1861         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1862
1863         if (rt == NULL)
1864                 return ERR_PTR(-ENOMEM);
1865
1866         dev_hold(net->loopback_dev);
1867         in6_dev_hold(idev);
1868
1869         rt->u.dst.flags = DST_HOST;
1870         rt->u.dst.input = ip6_input;
1871         rt->u.dst.output = ip6_output;
1872         rt->rt6i_dev = net->loopback_dev;
1873         rt->rt6i_idev = idev;
1874         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1875         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1876         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1877         rt->u.dst.obsolete = -1;
1878
1879         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1880         if (anycast)
1881                 rt->rt6i_flags |= RTF_ANYCAST;
1882         else
1883                 rt->rt6i_flags |= RTF_LOCAL;
1884         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1885         if (rt->rt6i_nexthop == NULL) {
1886                 dst_free(&rt->u.dst);
1887                 return ERR_PTR(-ENOMEM);
1888         }
1889
1890         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1891         rt->rt6i_dst.plen = 128;
1892         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1893
1894         atomic_set(&rt->u.dst.__refcnt, 1);
1895
1896         return rt;
1897 }
1898
/*
 * Argument bundle passed through fib6_clean_all() to fib6_ifdown().
 */
struct arg_dev_net {
        struct net_device *dev;         /* device to match; NULL matches every device */
        struct net *net;                /* namespace whose ip6_null_entry must be kept */
};
1903
1904 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1905 {
1906         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1907         struct net *net = ((struct arg_dev_net *)arg)->net;
1908
1909         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1910             rt != net->ipv6.ip6_null_entry) {
1911                 RT6_TRACE("deleted by ifdown %p\n", rt);
1912                 return -1;
1913         }
1914         return 0;
1915 }
1916
1917 void rt6_ifdown(struct net *net, struct net_device *dev)
1918 {
1919         struct arg_dev_net adn = {
1920                 .dev = dev,
1921                 .net = net,
1922         };
1923
1924         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1925 }
1926
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
1932
1933 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1934 {
1935         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1936         struct inet6_dev *idev;
1937         struct net *net = dev_net(arg->dev);
1938
1939         /* In IPv6 pmtu discovery is not optional,
1940            so that RTAX_MTU lock cannot disable it.
1941            We still use this lock to block changes
1942            caused by addrconf/ndisc.
1943         */
1944
1945         idev = __in6_dev_get(arg->dev);
1946         if (idev == NULL)
1947                 return 0;
1948
1949         /* For administrative MTU increase, there is no way to discover
1950            IPv6 PMTU increase, so PMTU increase should be updated here.
1951            Since RFC 1981 doesn't include administrative MTU increase
1952            update PMTU increase is a MUST. (i.e. jumbo frame)
1953          */
1954         /*
1955            If new MTU is less than route PMTU, this new MTU will be the
1956            lowest MTU in the path, update the route PMTU to reflect PMTU
1957            decreases; if new MTU is greater than route PMTU, and the
1958            old MTU is the lowest MTU in the path, update the route PMTU
1959            to reflect the increase. In this case if the other nodes' MTU
1960            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1961            PMTU discouvery.
1962          */
1963         if (rt->rt6i_dev == arg->dev &&
1964             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1965             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1966              (dst_mtu(&rt->u.dst) < arg->mtu &&
1967               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1968                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1969                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1970         }
1971         return 0;
1972 }
1973
1974 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1975 {
1976         struct rt6_mtu_change_arg arg = {
1977                 .dev = dev,
1978                 .mtu = mtu,
1979         };
1980
1981         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
1982 }
1983
/*
 * Netlink attribute policy handed to nlmsg_parse() by the RTM_*ROUTE
 * handlers below.  Attributes not listed here (e.g. RTA_DST, RTA_SRC,
 * RTA_TABLE) get a zero-initialised policy entry, so their lengths are
 * checked by the handlers themselves.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
};
1991
1992 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1993                               struct fib6_config *cfg)
1994 {
1995         struct rtmsg *rtm;
1996         struct nlattr *tb[RTA_MAX+1];
1997         int err;
1998
1999         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2000         if (err < 0)
2001                 goto errout;
2002
2003         err = -EINVAL;
2004         rtm = nlmsg_data(nlh);
2005         memset(cfg, 0, sizeof(*cfg));
2006
2007         cfg->fc_table = rtm->rtm_table;
2008         cfg->fc_dst_len = rtm->rtm_dst_len;
2009         cfg->fc_src_len = rtm->rtm_src_len;
2010         cfg->fc_flags = RTF_UP;
2011         cfg->fc_protocol = rtm->rtm_protocol;
2012
2013         if (rtm->rtm_type == RTN_UNREACHABLE)
2014                 cfg->fc_flags |= RTF_REJECT;
2015
2016         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2017         cfg->fc_nlinfo.nlh = nlh;
2018         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2019
2020         if (tb[RTA_GATEWAY]) {
2021                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2022                 cfg->fc_flags |= RTF_GATEWAY;
2023         }
2024
2025         if (tb[RTA_DST]) {
2026                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2027
2028                 if (nla_len(tb[RTA_DST]) < plen)
2029                         goto errout;
2030
2031                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2032         }
2033
2034         if (tb[RTA_SRC]) {
2035                 int plen = (rtm->rtm_src_len + 7) >> 3;
2036
2037                 if (nla_len(tb[RTA_SRC]) < plen)
2038                         goto errout;
2039
2040                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2041         }
2042
2043         if (tb[RTA_OIF])
2044                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2045
2046         if (tb[RTA_PRIORITY])
2047                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2048
2049         if (tb[RTA_METRICS]) {
2050                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2051                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2052         }
2053
2054         if (tb[RTA_TABLE])
2055                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2056
2057         err = 0;
2058 errout:
2059         return err;
2060 }
2061
2062 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2063 {
2064         struct fib6_config cfg;
2065         int err;
2066
2067         err = rtm_to_fib6_config(skb, nlh, &cfg);
2068         if (err < 0)
2069                 return err;
2070
2071         return ip6_route_del(&cfg);
2072 }
2073
2074 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2075 {
2076         struct fib6_config cfg;
2077         int err;
2078
2079         err = rtm_to_fib6_config(skb, nlh, &cfg);
2080         if (err < 0)
2081                 return err;
2082
2083         return ip6_route_add(&cfg);
2084 }
2085
2086 static inline size_t rt6_nlmsg_size(void)
2087 {
2088         return NLMSG_ALIGN(sizeof(struct rtmsg))
2089                + nla_total_size(16) /* RTA_SRC */
2090                + nla_total_size(16) /* RTA_DST */
2091                + nla_total_size(16) /* RTA_GATEWAY */
2092                + nla_total_size(16) /* RTA_PREFSRC */
2093                + nla_total_size(4) /* RTA_TABLE */
2094                + nla_total_size(4) /* RTA_IIF */
2095                + nla_total_size(4) /* RTA_OIF */
2096                + nla_total_size(4) /* RTA_PRIORITY */
2097                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2098                + nla_total_size(sizeof(struct rta_cacheinfo));
2099 }
2100
2101 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2102                          struct in6_addr *dst, struct in6_addr *src,
2103                          int iif, int type, u32 pid, u32 seq,
2104                          int prefix, int nowait, unsigned int flags)
2105 {
2106         struct rtmsg *rtm;
2107         struct nlmsghdr *nlh;
2108         long expires;
2109         u32 table;
2110
2111         if (prefix) {   /* user wants prefix routes only */
2112                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2113                         /* success since this is not a prefix route */
2114                         return 1;
2115                 }
2116         }
2117
2118         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2119         if (nlh == NULL)
2120                 return -EMSGSIZE;
2121
2122         rtm = nlmsg_data(nlh);
2123         rtm->rtm_family = AF_INET6;
2124         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2125         rtm->rtm_src_len = rt->rt6i_src.plen;
2126         rtm->rtm_tos = 0;
2127         if (rt->rt6i_table)
2128                 table = rt->rt6i_table->tb6_id;
2129         else
2130                 table = RT6_TABLE_UNSPEC;
2131         rtm->rtm_table = table;
2132         NLA_PUT_U32(skb, RTA_TABLE, table);
2133         if (rt->rt6i_flags&RTF_REJECT)
2134                 rtm->rtm_type = RTN_UNREACHABLE;
2135         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2136                 rtm->rtm_type = RTN_LOCAL;
2137         else
2138                 rtm->rtm_type = RTN_UNICAST;
2139         rtm->rtm_flags = 0;
2140         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2141         rtm->rtm_protocol = rt->rt6i_protocol;
2142         if (rt->rt6i_flags&RTF_DYNAMIC)
2143                 rtm->rtm_protocol = RTPROT_REDIRECT;
2144         else if (rt->rt6i_flags & RTF_ADDRCONF)
2145                 rtm->rtm_protocol = RTPROT_KERNEL;
2146         else if (rt->rt6i_flags&RTF_DEFAULT)
2147                 rtm->rtm_protocol = RTPROT_RA;
2148
2149         if (rt->rt6i_flags&RTF_CACHE)
2150                 rtm->rtm_flags |= RTM_F_CLONED;
2151
2152         if (dst) {
2153                 NLA_PUT(skb, RTA_DST, 16, dst);
2154                 rtm->rtm_dst_len = 128;
2155         } else if (rtm->rtm_dst_len)
2156                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2157 #ifdef CONFIG_IPV6_SUBTREES
2158         if (src) {
2159                 NLA_PUT(skb, RTA_SRC, 16, src);
2160                 rtm->rtm_src_len = 128;
2161         } else if (rtm->rtm_src_len)
2162                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2163 #endif
2164         if (iif) {
2165 #ifdef CONFIG_IPV6_MROUTE
2166                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2167                         int err = ip6mr_get_route(skb, rtm, nowait);
2168                         if (err <= 0) {
2169                                 if (!nowait) {
2170                                         if (err == 0)
2171                                                 return 0;
2172                                         goto nla_put_failure;
2173                                 } else {
2174                                         if (err == -EMSGSIZE)
2175                                                 goto nla_put_failure;
2176                                 }
2177                         }
2178                 } else
2179 #endif
2180                         NLA_PUT_U32(skb, RTA_IIF, iif);
2181         } else if (dst) {
2182                 struct in6_addr saddr_buf;
2183                 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2184                                        dst, 0, &saddr_buf) == 0)
2185                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2186         }
2187
2188         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2189                 goto nla_put_failure;
2190
2191         if (rt->u.dst.neighbour)
2192                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2193
2194         if (rt->u.dst.dev)
2195                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2196
2197         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2198
2199         if (!(rt->rt6i_flags & RTF_EXPIRES))
2200                 expires = 0;
2201         else if (rt->rt6i_expires - jiffies < INT_MAX)
2202                 expires = rt->rt6i_expires - jiffies;
2203         else
2204                 expires = INT_MAX;
2205
2206         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2207                                expires, rt->u.dst.error) < 0)
2208                 goto nla_put_failure;
2209
2210         return nlmsg_end(skb, nlh);
2211
2212 nla_put_failure:
2213         nlmsg_cancel(skb, nlh);
2214         return -EMSGSIZE;
2215 }
2216
2217 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2218 {
2219         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2220         int prefix;
2221
2222         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2223                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2224                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2225         } else
2226                 prefix = 0;
2227
2228         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2229                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2230                      prefix, 0, NLM_F_MULTI);
2231 }
2232
2233 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2234 {
2235         struct net *net = sock_net(in_skb->sk);
2236         struct nlattr *tb[RTA_MAX+1];
2237         struct rt6_info *rt;
2238         struct sk_buff *skb;
2239         struct rtmsg *rtm;
2240         struct flowi fl;
2241         int err, iif = 0;
2242
2243         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2244         if (err < 0)
2245                 goto errout;
2246
2247         err = -EINVAL;
2248         memset(&fl, 0, sizeof(fl));
2249
2250         if (tb[RTA_SRC]) {
2251                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2252                         goto errout;
2253
2254                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2255         }
2256
2257         if (tb[RTA_DST]) {
2258                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2259                         goto errout;
2260
2261                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2262         }
2263
2264         if (tb[RTA_IIF])
2265                 iif = nla_get_u32(tb[RTA_IIF]);
2266
2267         if (tb[RTA_OIF])
2268                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2269
2270         if (iif) {
2271                 struct net_device *dev;
2272                 dev = __dev_get_by_index(net, iif);
2273                 if (!dev) {
2274                         err = -ENODEV;
2275                         goto errout;
2276                 }
2277         }
2278
2279         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2280         if (skb == NULL) {
2281                 err = -ENOBUFS;
2282                 goto errout;
2283         }
2284
2285         /* Reserve room for dummy headers, this skb can pass
2286            through good chunk of routing engine.
2287          */
2288         skb_reset_mac_header(skb);
2289         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2290
2291         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2292         skb->dst = &rt->u.dst;
2293
2294         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2295                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2296                             nlh->nlmsg_seq, 0, 0, 0);
2297         if (err < 0) {
2298                 kfree_skb(skb);
2299                 goto errout;
2300         }
2301
2302         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2303 errout:
2304         return err;
2305 }
2306
2307 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2308 {
2309         struct sk_buff *skb;
2310         struct net *net = info->nl_net;
2311         u32 seq;
2312         int err;
2313
2314         err = -ENOBUFS;
2315         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2316
2317         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2318         if (skb == NULL)
2319                 goto errout;
2320
2321         err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2322                                 event, info->pid, seq, 0, 0, 0);
2323         if (err < 0) {
2324                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2325                 WARN_ON(err == -EMSGSIZE);
2326                 kfree_skb(skb);
2327                 goto errout;
2328         }
2329         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2330                           info->nlh, gfp_any());
2331 errout:
2332         if (err < 0)
2333                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2334 }
2335
2336 static int ip6_route_dev_notify(struct notifier_block *this,
2337                                 unsigned long event, void *data)
2338 {
2339         struct net_device *dev = (struct net_device *)data;
2340         struct net *net = dev_net(dev);
2341
2342         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2343                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2344                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2345 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2346                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2347                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2348                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2349                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2350 #endif
2351         }
2352
2353         return NOTIFY_OK;
2354 }
2355
2356 /*
2357  *      /proc
2358  */
2359
2360 #ifdef CONFIG_PROC_FS
2361
/*
 * NOTE(review): presumably the byte length of one /proc route line;
 * nothing in the seq_file based code below references it - confirm
 * whether it is still needed.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2363
/*
 * NOTE(review): looks like the cursor state of an older buffer-based
 * /proc reader; the seq_file handlers below do not use it - confirm
 * whether any other code still does.
 */
struct rt6_proc_arg
{
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2372
2373 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2374 {
2375         struct seq_file *m = p_arg;
2376
2377         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2378                    rt->rt6i_dst.plen);
2379
2380 #ifdef CONFIG_IPV6_SUBTREES
2381         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2382                    rt->rt6i_src.plen);
2383 #else
2384         seq_puts(m, "00000000000000000000000000000000 00 ");
2385 #endif
2386
2387         if (rt->rt6i_nexthop) {
2388                 seq_printf(m, NIP6_SEQFMT,
2389                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2390         } else {
2391                 seq_puts(m, "00000000000000000000000000000000");
2392         }
2393         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2394                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2395                    rt->u.dst.__use, rt->rt6i_flags,
2396                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2397         return 0;
2398 }
2399
2400 static int ipv6_route_show(struct seq_file *m, void *v)
2401 {
2402         struct net *net = (struct net *)m->private;
2403         fib6_clean_all(net, rt6_info_route, 0, m);
2404         return 0;
2405 }
2406
2407 static int ipv6_route_open(struct inode *inode, struct file *file)
2408 {
2409         int err;
2410         struct net *net = get_proc_net(inode);
2411         if (!net)
2412                 return -ENXIO;
2413
2414         err = single_open(file, ipv6_route_show, net);
2415         if (err < 0) {
2416                 put_net(net);
2417                 return err;
2418         }
2419
2420         return 0;
2421 }
2422
2423 static int ipv6_route_release(struct inode *inode, struct file *file)
2424 {
2425         struct seq_file *seq = file->private_data;
2426         struct net *net = seq->private;
2427         put_net(net);
2428         return single_release(inode, file);
2429 }
2430
/* File operations of the seq_file based route listing. */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = ipv6_route_release,
};
2438
2439 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2440 {
2441         struct net *net = (struct net *)seq->private;
2442         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2443                    net->ipv6.rt6_stats->fib_nodes,
2444                    net->ipv6.rt6_stats->fib_route_nodes,
2445                    net->ipv6.rt6_stats->fib_rt_alloc,
2446                    net->ipv6.rt6_stats->fib_rt_entries,
2447                    net->ipv6.rt6_stats->fib_rt_cache,
2448                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2449                    net->ipv6.rt6_stats->fib_discarded_routes);
2450
2451         return 0;
2452 }
2453
2454 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2455 {
2456         int err;
2457         struct net *net = get_proc_net(inode);
2458         if (!net)
2459                 return -ENXIO;
2460
2461         err = single_open(file, rt6_stats_seq_show, net);
2462         if (err < 0) {
2463                 put_net(net);
2464                 return err;
2465         }
2466
2467         return 0;
2468 }
2469
2470 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2471 {
2472         struct seq_file *seq = file->private_data;
2473         struct net *net = (struct net *)seq->private;
2474         put_net(net);
2475         return single_release(inode, file);
2476 }
2477
/* File operations of the seq_file based routing statistics file. */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = rt6_stats_seq_release,
};
2485 #endif  /* CONFIG_PROC_FS */
2486
2487 #ifdef CONFIG_SYSCTL
2488
2489 static
2490 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2491                               void __user *buffer, size_t *lenp, loff_t *ppos)
2492 {
2493         struct net *net = current->nsproxy->net_ns;
2494         int delay = net->ipv6.sysctl.flush_delay;
2495         if (write) {
2496                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2497                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2498                 return 0;
2499         } else
2500                 return -EINVAL;
2501 }
2502
2503 ctl_table ipv6_route_table_template[] = {
2504         {
2505                 .procname       =       "flush",
2506                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2507                 .maxlen         =       sizeof(int),
2508                 .mode           =       0200,
2509                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2510         },
2511         {
2512                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2513                 .procname       =       "gc_thresh",
2514                 .data           =       &ip6_dst_ops_template.gc_thresh,
2515                 .maxlen         =       sizeof(int),
2516                 .mode           =       0644,
2517                 .proc_handler   =       &proc_dointvec,
2518         },
2519         {
2520                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2521                 .procname       =       "max_size",
2522                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2523                 .maxlen         =       sizeof(int),
2524                 .mode           =       0644,
2525                 .proc_handler   =       &proc_dointvec,
2526         },
2527         {
2528                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2529                 .procname       =       "gc_min_interval",
2530                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2531                 .maxlen         =       sizeof(int),
2532                 .mode           =       0644,
2533                 .proc_handler   =       &proc_dointvec_jiffies,
2534                 .strategy       =       &sysctl_jiffies,
2535         },
2536         {
2537                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2538                 .procname       =       "gc_timeout",
2539                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2540                 .maxlen         =       sizeof(int),
2541                 .mode           =       0644,
2542                 .proc_handler   =       &proc_dointvec_jiffies,
2543                 .strategy       =       &sysctl_jiffies,
2544         },
2545         {
2546                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2547                 .procname       =       "gc_interval",
2548                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2549                 .maxlen         =       sizeof(int),
2550                 .mode           =       0644,
2551                 .proc_handler   =       &proc_dointvec_jiffies,
2552                 .strategy       =       &sysctl_jiffies,
2553         },
2554         {
2555                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2556                 .procname       =       "gc_elasticity",
2557                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2558                 .maxlen         =       sizeof(int),
2559                 .mode           =       0644,
2560                 .proc_handler   =       &proc_dointvec_jiffies,
2561                 .strategy       =       &sysctl_jiffies,
2562         },
2563         {
2564                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2565                 .procname       =       "mtu_expires",
2566                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2567                 .maxlen         =       sizeof(int),
2568                 .mode           =       0644,
2569                 .proc_handler   =       &proc_dointvec_jiffies,
2570                 .strategy       =       &sysctl_jiffies,
2571         },
2572         {
2573                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2574                 .procname       =       "min_adv_mss",
2575                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2576                 .maxlen         =       sizeof(int),
2577                 .mode           =       0644,
2578                 .proc_handler   =       &proc_dointvec_jiffies,
2579                 .strategy       =       &sysctl_jiffies,
2580         },
2581         {
2582                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2583                 .procname       =       "gc_min_interval_ms",
2584                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2585                 .maxlen         =       sizeof(int),
2586                 .mode           =       0644,
2587                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2588                 .strategy       =       &sysctl_ms_jiffies,
2589         },
2590         { .ctl_name = 0 }
2591 };
2592
2593 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2594 {
2595         struct ctl_table *table;
2596
2597         table = kmemdup(ipv6_route_table_template,
2598                         sizeof(ipv6_route_table_template),
2599                         GFP_KERNEL);
2600
2601         if (table) {
2602                 table[0].data = &net->ipv6.sysctl.flush_delay;
2603                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2604                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2605                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2606                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2607                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2608                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2609                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2610                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2611         }
2612
2613         return table;
2614 }
2615 #endif
2616
2617 static int ip6_route_net_init(struct net *net)
2618 {
2619         int ret = -ENOMEM;
2620
2621         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2622                                         sizeof(*net->ipv6.ip6_dst_ops),
2623                                         GFP_KERNEL);
2624         if (!net->ipv6.ip6_dst_ops)
2625                 goto out;
2626         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2627
2628         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2629                                            sizeof(*net->ipv6.ip6_null_entry),
2630                                            GFP_KERNEL);
2631         if (!net->ipv6.ip6_null_entry)
2632                 goto out_ip6_dst_ops;
2633         net->ipv6.ip6_null_entry->u.dst.path =
2634                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2635         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2636
2637 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2638         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2639                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2640                                                GFP_KERNEL);
2641         if (!net->ipv6.ip6_prohibit_entry) {
2642                 kfree(net->ipv6.ip6_null_entry);
2643                 goto out;
2644         }
2645         net->ipv6.ip6_prohibit_entry->u.dst.path =
2646                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2647         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2648
2649         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2650                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2651                                                GFP_KERNEL);
2652         if (!net->ipv6.ip6_blk_hole_entry) {
2653                 kfree(net->ipv6.ip6_null_entry);
2654                 kfree(net->ipv6.ip6_prohibit_entry);
2655                 goto out;
2656         }
2657         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2658                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2659         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2660 #endif
2661
2662 #ifdef CONFIG_PROC_FS
2663         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2664         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2665 #endif
2666         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2667
2668         ret = 0;
2669 out:
2670         return ret;
2671
2672 out_ip6_dst_ops:
2673         release_net(net->ipv6.ip6_dst_ops->dst_net);
2674         kfree(net->ipv6.ip6_dst_ops);
2675         goto out;
2676 }
2677
2678 static void ip6_route_net_exit(struct net *net)
2679 {
2680 #ifdef CONFIG_PROC_FS
2681         proc_net_remove(net, "ipv6_route");
2682         proc_net_remove(net, "rt6_stats");
2683 #endif
2684         kfree(net->ipv6.ip6_null_entry);
2685 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2686         kfree(net->ipv6.ip6_prohibit_entry);
2687         kfree(net->ipv6.ip6_blk_hole_entry);
2688 #endif
2689         release_net(net->ipv6.ip6_dst_ops->dst_net);
2690         kfree(net->ipv6.ip6_dst_ops);
2691 }
2692
2693 static struct pernet_operations ip6_route_net_ops = {
2694         .init = ip6_route_net_init,
2695         .exit = ip6_route_net_exit,
2696 };
2697
2698 static struct notifier_block ip6_route_dev_notifier = {
2699         .notifier_call = ip6_route_dev_notify,
2700         .priority = 0,
2701 };
2702
2703 int __init ip6_route_init(void)
2704 {
2705         int ret;
2706
2707         ret = -ENOMEM;
2708         ip6_dst_ops_template.kmem_cachep =
2709                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2710                                   SLAB_HWCACHE_ALIGN, NULL);
2711         if (!ip6_dst_ops_template.kmem_cachep)
2712                 goto out;;
2713
2714         ret = register_pernet_subsys(&ip6_route_net_ops);
2715         if (ret)
2716                 goto out_kmem_cache;
2717
2718         /* Registering of the loopback is done before this portion of code,
2719          * the loopback reference in rt6_info will not be taken, do it
2720          * manually for init_net */
2721         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2722         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2723   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2724         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2725         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2726         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2727         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2728   #endif
2729         ret = fib6_init();
2730         if (ret)
2731                 goto out_register_subsys;
2732
2733         ret = xfrm6_init();
2734         if (ret)
2735                 goto out_fib6_init;
2736
2737         ret = fib6_rules_init();
2738         if (ret)
2739                 goto xfrm6_init;
2740
2741         ret = -ENOBUFS;
2742         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2743             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2744             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2745                 goto fib6_rules_init;
2746
2747         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2748         if (ret)
2749                 goto fib6_rules_init;
2750
2751 out:
2752         return ret;
2753
2754 fib6_rules_init:
2755         fib6_rules_cleanup();
2756 xfrm6_init:
2757         xfrm6_fini();
2758 out_fib6_init:
2759         fib6_gc_cleanup();
2760 out_register_subsys:
2761         unregister_pernet_subsys(&ip6_route_net_ops);
2762 out_kmem_cache:
2763         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2764         goto out;
2765 }
2766
2767 void ip6_route_cleanup(void)
2768 {
2769         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2770         fib6_rules_cleanup();
2771         xfrm6_fini();
2772         fib6_gc_cleanup();
2773         unregister_pernet_subsys(&ip6_route_net_ops);
2774         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2775 }