Merge branch 'for-2.6.30' into for-2.6.31
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  Raise RT6_DEBUG to 3 (or more) to
 * turn RDBG() and RT6_TRACE() into real printk() calls; below that
 * both expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Compile-time switch tested with #if in ip6_pol_route(): when non-zero,
 * routes that already have a nexthop are cloned via rt6_alloc_clone().
 */
#define CLONE_OFFLINK_ROUTE 0
75
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void             ip6_dst_destroy(struct dst_entry *);
80 static void             ip6_dst_ifdown(struct dst_entry *,
81                                        struct net_device *dev, int how);
82 static int               ip6_dst_gc(struct dst_ops *ops);
83
84 static int              ip6_pkt_discard(struct sk_buff *skb);
85 static int              ip6_pkt_discard_out(struct sk_buff *skb);
86 static void             ip6_link_failure(struct sk_buff *skb);
87 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91                                            struct in6_addr *prefix, int prefixlen,
92                                            struct in6_addr *gwaddr, int ifindex,
93                                            unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95                                            struct in6_addr *prefix, int prefixlen,
96                                            struct in6_addr *gwaddr, int ifindex);
97 #endif
98
99 static struct dst_ops ip6_dst_ops_template = {
100         .family                 =       AF_INET6,
101         .protocol               =       cpu_to_be16(ETH_P_IPV6),
102         .gc                     =       ip6_dst_gc,
103         .gc_thresh              =       1024,
104         .check                  =       ip6_dst_check,
105         .destroy                =       ip6_dst_destroy,
106         .ifdown                 =       ip6_dst_ifdown,
107         .negative_advice        =       ip6_negative_advice,
108         .link_failure           =       ip6_link_failure,
109         .update_pmtu            =       ip6_rt_update_pmtu,
110         .local_out              =       __ip6_local_out,
111         .entries                =       ATOMIC_INIT(0),
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       cpu_to_be16(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124         .entries                =       ATOMIC_INIT(0),
125 };
126
127 static struct rt6_info ip6_null_entry_template = {
128         .u = {
129                 .dst = {
130                         .__refcnt       = ATOMIC_INIT(1),
131                         .__use          = 1,
132                         .obsolete       = -1,
133                         .error          = -ENETUNREACH,
134                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
135                         .input          = ip6_pkt_discard,
136                         .output         = ip6_pkt_discard_out,
137                 }
138         },
139         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
140         .rt6i_metric    = ~(u32) 0,
141         .rt6i_ref       = ATOMIC_INIT(1),
142 };
143
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145
146 static int ip6_pkt_prohibit(struct sk_buff *skb);
147 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148
149 static struct rt6_info ip6_prohibit_entry_template = {
150         .u = {
151                 .dst = {
152                         .__refcnt       = ATOMIC_INIT(1),
153                         .__use          = 1,
154                         .obsolete       = -1,
155                         .error          = -EACCES,
156                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
157                         .input          = ip6_pkt_prohibit,
158                         .output         = ip6_pkt_prohibit_out,
159                 }
160         },
161         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
162         .rt6i_metric    = ~(u32) 0,
163         .rt6i_ref       = ATOMIC_INIT(1),
164 };
165
166 static struct rt6_info ip6_blk_hole_entry_template = {
167         .u = {
168                 .dst = {
169                         .__refcnt       = ATOMIC_INIT(1),
170                         .__use          = 1,
171                         .obsolete       = -1,
172                         .error          = -EINVAL,
173                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
174                         .input          = dst_discard,
175                         .output         = dst_discard,
176                 }
177         },
178         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
179         .rt6i_metric    = ~(u32) 0,
180         .rt6i_ref       = ATOMIC_INIT(1),
181 };
182
183 #endif
184
/*
 * Allocate a dst entry through @ops and return it typed as an rt6_info.
 * The result is whatever dst_alloc() returned, so it may be NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
190
191 static void ip6_dst_destroy(struct dst_entry *dst)
192 {
193         struct rt6_info *rt = (struct rt6_info *)dst;
194         struct inet6_dev *idev = rt->rt6i_idev;
195
196         if (idev != NULL) {
197                 rt->rt6i_idev = NULL;
198                 in6_dev_put(idev);
199         }
200 }
201
202 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
203                            int how)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207         struct net_device *loopback_dev =
208                 dev_net(dev)->loopback_dev;
209
210         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
211                 struct inet6_dev *loopback_idev =
212                         in6_dev_get(loopback_dev);
213                 if (loopback_idev != NULL) {
214                         rt->rt6i_idev = loopback_idev;
215                         in6_dev_put(idev);
216                 }
217         }
218 }
219
220 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 {
222         return (rt->rt6i_flags & RTF_EXPIRES &&
223                 time_after(jiffies, rt->rt6i_expires));
224 }
225
226 static inline int rt6_need_strict(struct in6_addr *daddr)
227 {
228         return (ipv6_addr_type(daddr) &
229                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
230 }
231
232 /*
233  *      Route lookup. Any table->tb6_lock is implied.
234  */
235
236 static inline struct rt6_info *rt6_device_match(struct net *net,
237                                                     struct rt6_info *rt,
238                                                     struct in6_addr *saddr,
239                                                     int oif,
240                                                     int flags)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (!oif && ipv6_addr_any(saddr))
246                 goto out;
247
248         for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
249                 struct net_device *dev = sprt->rt6i_dev;
250
251                 if (oif) {
252                         if (dev->ifindex == oif)
253                                 return sprt;
254                         if (dev->flags & IFF_LOOPBACK) {
255                                 if (sprt->rt6i_idev == NULL ||
256                                     sprt->rt6i_idev->dev->ifindex != oif) {
257                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
258                                                 continue;
259                                         if (local && (!oif ||
260                                                       local->rt6i_idev->dev->ifindex == oif))
261                                                 continue;
262                                 }
263                                 local = sprt;
264                         }
265                 } else {
266                         if (ipv6_chk_addr(net, saddr, dev,
267                                           flags & RT6_LOOKUP_F_IFACE))
268                                 return sprt;
269                 }
270         }
271
272         if (oif) {
273                 if (local)
274                         return local;
275
276                 if (flags & RT6_LOOKUP_F_IFACE)
277                         return net->ipv6.ip6_null_entry;
278         }
279 out:
280         return rt;
281 }
282
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards
 * the nexthop of @rt if its neighbour entry is not in a NUD_VALID state
 * and the last update is older than the interface's rtr_probe_interval.
 *
 * @rt may be NULL, and routes without a nexthop are ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.  The "check the timestamp, then refresh it" step is
 * therefore done under the neighbour's *write* lock: it modifies
 * neigh->updated, and with only the read lock held two concurrent
 * callers could both pass the interval test and both send a probe.
 * The solicitation itself is sent after the lock has been dropped.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int do_probe = 0;

	/* Cheap unlocked test first; re-checked under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		do_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!do_probe)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
318
319 /*
320  * Default Router Selection (RFC 2461 6.3.6)
321  */
322 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
323 {
324         struct net_device *dev = rt->rt6i_dev;
325         if (!oif || dev->ifindex == oif)
326                 return 2;
327         if ((dev->flags & IFF_LOOPBACK) &&
328             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
329                 return 1;
330         return 0;
331 }
332
333 static inline int rt6_check_neigh(struct rt6_info *rt)
334 {
335         struct neighbour *neigh = rt->rt6i_nexthop;
336         int m;
337         if (rt->rt6i_flags & RTF_NONEXTHOP ||
338             !(rt->rt6i_flags & RTF_GATEWAY))
339                 m = 1;
340         else if (neigh) {
341                 read_lock_bh(&neigh->lock);
342                 if (neigh->nud_state & NUD_VALID)
343                         m = 2;
344 #ifdef CONFIG_IPV6_ROUTER_PREF
345                 else if (neigh->nud_state & NUD_FAILED)
346                         m = 0;
347 #endif
348                 else
349                         m = 1;
350                 read_unlock_bh(&neigh->lock);
351         } else
352                 m = 0;
353         return m;
354 }
355
356 static int rt6_score_route(struct rt6_info *rt, int oif,
357                            int strict)
358 {
359         int m, n;
360
361         m = rt6_check_dev(rt, oif);
362         if (!m && (strict & RT6_LOOKUP_F_IFACE))
363                 return -1;
364 #ifdef CONFIG_IPV6_ROUTER_PREF
365         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
366 #endif
367         n = rt6_check_neigh(rt);
368         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
369                 return -1;
370         return m;
371 }
372
373 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
374                                    int *mpri, struct rt6_info *match)
375 {
376         int m;
377
378         if (rt6_check_expired(rt))
379                 goto out;
380
381         m = rt6_score_route(rt, oif, strict);
382         if (m < 0)
383                 goto out;
384
385         if (m > *mpri) {
386                 if (strict & RT6_LOOKUP_F_REACHABLE)
387                         rt6_probe(match);
388                 *mpri = m;
389                 match = rt;
390         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
391                 rt6_probe(rt);
392         }
393
394 out:
395         return match;
396 }
397
398 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
399                                      struct rt6_info *rr_head,
400                                      u32 metric, int oif, int strict)
401 {
402         struct rt6_info *rt, *match;
403         int mpri = -1;
404
405         match = NULL;
406         for (rt = rr_head; rt && rt->rt6i_metric == metric;
407              rt = rt->u.dst.rt6_next)
408                 match = find_match(rt, oif, strict, &mpri, match);
409         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
410              rt = rt->u.dst.rt6_next)
411                 match = find_match(rt, oif, strict, &mpri, match);
412
413         return match;
414 }
415
416 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
417 {
418         struct rt6_info *match, *rt0;
419         struct net *net;
420
421         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
422                   __func__, fn->leaf, oif);
423
424         rt0 = fn->rr_ptr;
425         if (!rt0)
426                 fn->rr_ptr = rt0 = fn->leaf;
427
428         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
429
430         if (!match &&
431             (strict & RT6_LOOKUP_F_REACHABLE)) {
432                 struct rt6_info *next = rt0->u.dst.rt6_next;
433
434                 /* no entries matched; do round-robin */
435                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
436                         next = fn->leaf;
437
438                 if (next != rt0)
439                         fn->rr_ptr = next;
440         }
441
442         RT6_TRACE("%s() => %p\n",
443                   __func__, match);
444
445         net = dev_net(rt0->rt6i_dev);
446         return (match ? match : net->ipv6.ip6_null_entry);
447 }
448
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to look the route up with / install
 *
 * After validating the option, the matching route-info route is looked
 * up.  A zero lifetime deletes an existing route; a non-zero lifetime
 * creates the route if it is missing or refreshes the preference bits
 * of an existing one.  The expiry is then set from the lifetime (or
 * cleared when the lifetime is not finite).
 *
 * Returns 0 on success (including "nothing to do") and -EINVAL when
 * the option is too short or its length/prefix_len are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* the option carries a full address; use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt) {
		if (lifetime) {
			rt->rt6i_flags = RTF_ROUTEINFO |
					 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
		} else {
			ip6_del_rt(rt);
			rt = NULL;
		}
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
522
/*
 * BACKTRACK - climb the FIB tree after a lookup that yielded the null entry.
 *
 * Expects the expanding function to provide the locals "rt" and "fn"
 * and the labels "out" and "restart".  If rt is the null entry of
 * @__net, walk from fn towards the root: at each step either descend
 * into the parent's source subtree (looked up with @saddr) when that
 * subtree exists and is not where we came from, or move to the parent.
 * Jumps to "restart" at the first node flagged RTN_RTINFO, or to "out"
 * when a node flagged RTN_TL_ROOT is reached.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while(0)
540
541 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
542                                              struct fib6_table *table,
543                                              struct flowi *fl, int flags)
544 {
545         struct fib6_node *fn;
546         struct rt6_info *rt;
547
548         read_lock_bh(&table->tb6_lock);
549         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
550 restart:
551         rt = fn->leaf;
552         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
553         BACKTRACK(net, &fl->fl6_src);
554 out:
555         dst_use(&rt->u.dst, jiffies);
556         read_unlock_bh(&table->tb6_lock);
557         return rt;
558
559 }
560
561 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
562                             const struct in6_addr *saddr, int oif, int strict)
563 {
564         struct flowi fl = {
565                 .oif = oif,
566                 .nl_u = {
567                         .ip6_u = {
568                                 .daddr = *daddr,
569                         },
570                 },
571         };
572         struct dst_entry *dst;
573         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
574
575         if (saddr) {
576                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
577                 flags |= RT6_LOOKUP_F_HAS_SADDR;
578         }
579
580         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
581         if (dst->error == 0)
582                 return (struct rt6_info *) dst;
583
584         dst_release(dst);
585
586         return NULL;
587 }
588
589 EXPORT_SYMBOL(rt6_lookup);
590
591 /* ip6_ins_rt is called with FREE table->tb6_lock.
592    It takes new route entry, the addition fails by any reason the
593    route is freed. In any case, if caller does not hold it, it may
594    be destroyed.
595  */
596
597 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
598 {
599         int err;
600         struct fib6_table *table;
601
602         table = rt->rt6i_table;
603         write_lock_bh(&table->tb6_lock);
604         err = fib6_add(&table->tb6_root, rt, info);
605         write_unlock_bh(&table->tb6_lock);
606
607         return err;
608 }
609
610 int ip6_ins_rt(struct rt6_info *rt)
611 {
612         struct nl_info info = {
613                 .nl_net = dev_net(rt->rt6i_dev),
614         };
615         return __ip6_ins_rt(rt, &info);
616 }
617
618 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
619                                       struct in6_addr *saddr)
620 {
621         struct rt6_info *rt;
622
623         /*
624          *      Clone the route.
625          */
626
627         rt = ip6_rt_copy(ort);
628
629         if (rt) {
630                 struct neighbour *neigh;
631                 int attempts = !in_softirq();
632
633                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
634                         if (rt->rt6i_dst.plen != 128 &&
635                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
636                                 rt->rt6i_flags |= RTF_ANYCAST;
637                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
638                 }
639
640                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
641                 rt->rt6i_dst.plen = 128;
642                 rt->rt6i_flags |= RTF_CACHE;
643                 rt->u.dst.flags |= DST_HOST;
644
645 #ifdef CONFIG_IPV6_SUBTREES
646                 if (rt->rt6i_src.plen && saddr) {
647                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
648                         rt->rt6i_src.plen = 128;
649                 }
650 #endif
651
652         retry:
653                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
654                 if (IS_ERR(neigh)) {
655                         struct net *net = dev_net(rt->rt6i_dev);
656                         int saved_rt_min_interval =
657                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
658                         int saved_rt_elasticity =
659                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
660
661                         if (attempts-- > 0) {
662                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
663                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
664
665                                 ip6_dst_gc(net->ipv6.ip6_dst_ops);
666
667                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
668                                         saved_rt_elasticity;
669                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
670                                         saved_rt_min_interval;
671                                 goto retry;
672                         }
673
674                         if (net_ratelimit())
675                                 printk(KERN_WARNING
676                                        "Neighbour table overflow.\n");
677                         dst_free(&rt->u.dst);
678                         return NULL;
679                 }
680                 rt->rt6i_nexthop = neigh;
681
682         }
683
684         return rt;
685 }
686
687 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
688 {
689         struct rt6_info *rt = ip6_rt_copy(ort);
690         if (rt) {
691                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
692                 rt->rt6i_dst.plen = 128;
693                 rt->rt6i_flags |= RTF_CACHE;
694                 rt->u.dst.flags |= DST_HOST;
695                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
696         }
697         return rt;
698 }
699
700 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
701                                       struct flowi *fl, int flags)
702 {
703         struct fib6_node *fn;
704         struct rt6_info *rt, *nrt;
705         int strict = 0;
706         int attempts = 3;
707         int err;
708         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
709
710         strict |= flags & RT6_LOOKUP_F_IFACE;
711
712 relookup:
713         read_lock_bh(&table->tb6_lock);
714
715 restart_2:
716         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
717
718 restart:
719         rt = rt6_select(fn, oif, strict | reachable);
720
721         BACKTRACK(net, &fl->fl6_src);
722         if (rt == net->ipv6.ip6_null_entry ||
723             rt->rt6i_flags & RTF_CACHE)
724                 goto out;
725
726         dst_hold(&rt->u.dst);
727         read_unlock_bh(&table->tb6_lock);
728
729         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
730                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
731         else {
732 #if CLONE_OFFLINK_ROUTE
733                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
734 #else
735                 goto out2;
736 #endif
737         }
738
739         dst_release(&rt->u.dst);
740         rt = nrt ? : net->ipv6.ip6_null_entry;
741
742         dst_hold(&rt->u.dst);
743         if (nrt) {
744                 err = ip6_ins_rt(nrt);
745                 if (!err)
746                         goto out2;
747         }
748
749         if (--attempts <= 0)
750                 goto out2;
751
752         /*
753          * Race condition! In the gap, when table->tb6_lock was
754          * released someone could insert this route.  Relookup.
755          */
756         dst_release(&rt->u.dst);
757         goto relookup;
758
759 out:
760         if (reachable) {
761                 reachable = 0;
762                 goto restart_2;
763         }
764         dst_hold(&rt->u.dst);
765         read_unlock_bh(&table->tb6_lock);
766 out2:
767         rt->u.dst.lastuse = jiffies;
768         rt->u.dst.__use++;
769
770         return rt;
771 }
772
773 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
774                                             struct flowi *fl, int flags)
775 {
776         return ip6_pol_route(net, table, fl->iif, fl, flags);
777 }
778
779 void ip6_route_input(struct sk_buff *skb)
780 {
781         struct ipv6hdr *iph = ipv6_hdr(skb);
782         struct net *net = dev_net(skb->dev);
783         int flags = RT6_LOOKUP_F_HAS_SADDR;
784         struct flowi fl = {
785                 .iif = skb->dev->ifindex,
786                 .nl_u = {
787                         .ip6_u = {
788                                 .daddr = iph->daddr,
789                                 .saddr = iph->saddr,
790                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
791                         },
792                 },
793                 .mark = skb->mark,
794                 .proto = iph->nexthdr,
795         };
796
797         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
798                 flags |= RT6_LOOKUP_F_IFACE;
799
800         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
801 }
802
803 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
804                                              struct flowi *fl, int flags)
805 {
806         return ip6_pol_route(net, table, fl->oif, fl, flags);
807 }
808
809 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
810                                     struct flowi *fl)
811 {
812         int flags = 0;
813
814         if (rt6_need_strict(&fl->fl6_dst))
815                 flags |= RT6_LOOKUP_F_IFACE;
816
817         if (!ipv6_addr_any(&fl->fl6_src))
818                 flags |= RT6_LOOKUP_F_HAS_SADDR;
819         else if (sk) {
820                 unsigned int prefs = inet6_sk(sk)->srcprefs;
821                 if (prefs & IPV6_PREFER_SRC_TMP)
822                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
823                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
824                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
825                 if (prefs & IPV6_PREFER_SRC_COA)
826                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
827         }
828
829         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
830 }
831
832 EXPORT_SYMBOL(ip6_route_output);
833
834 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
835 {
836         struct rt6_info *ort = (struct rt6_info *) *dstp;
837         struct rt6_info *rt = (struct rt6_info *)
838                 dst_alloc(&ip6_dst_blackhole_ops);
839         struct dst_entry *new = NULL;
840
841         if (rt) {
842                 new = &rt->u.dst;
843
844                 atomic_set(&new->__refcnt, 1);
845                 new->__use = 1;
846                 new->input = dst_discard;
847                 new->output = dst_discard;
848
849                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
850                 new->dev = ort->u.dst.dev;
851                 if (new->dev)
852                         dev_hold(new->dev);
853                 rt->rt6i_idev = ort->rt6i_idev;
854                 if (rt->rt6i_idev)
855                         in6_dev_hold(rt->rt6i_idev);
856                 rt->rt6i_expires = 0;
857
858                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
859                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
860                 rt->rt6i_metric = 0;
861
862                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
863 #ifdef CONFIG_IPV6_SUBTREES
864                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
865 #endif
866
867                 dst_free(new);
868         }
869
870         dst_release(*dstp);
871         *dstp = new;
872         return (new ? 0 : -ENOMEM);
873 }
874 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
875
876 /*
877  *      Destination cache support functions
878  */
879
880 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
881 {
882         struct rt6_info *rt;
883
884         rt = (struct rt6_info *) dst;
885
886         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
887                 return dst;
888
889         return NULL;
890 }
891
892 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
893 {
894         struct rt6_info *rt = (struct rt6_info *) dst;
895
896         if (rt) {
897                 if (rt->rt6i_flags & RTF_CACHE)
898                         ip6_del_rt(rt);
899                 else
900                         dst_release(dst);
901         }
902         return NULL;
903 }
904
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907         struct rt6_info *rt;
908
909         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
910
911         rt = (struct rt6_info *) skb->dst;
912         if (rt) {
913                 if (rt->rt6i_flags&RTF_CACHE) {
914                         dst_set_expires(&rt->u.dst, 0);
915                         rt->rt6i_flags |= RTF_EXPIRES;
916                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917                         rt->rt6i_node->fn_sernum = -1;
918         }
919 }
920
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923         struct rt6_info *rt6 = (struct rt6_info*)dst;
924
925         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926                 rt6->rt6i_flags |= RTF_MODIFIED;
927                 if (mtu < IPV6_MIN_MTU) {
928                         mtu = IPV6_MIN_MTU;
929                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
930                 }
931                 dst->metrics[RTAX_MTU-1] = mtu;
932                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
933         }
934 }
935
936 static int ipv6_get_mtu(struct net_device *dev);
937
938 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
939 {
940         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
941
942         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
943                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
944
945         /*
946          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
947          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
948          * IPV6_MAXPLEN is also valid and means: "any MSS,
949          * rely only on pmtu discovery"
950          */
951         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
952                 mtu = IPV6_MAXPLEN;
953         return mtu;
954 }
955
956 static struct dst_entry *icmp6_dst_gc_list;
957 static DEFINE_SPINLOCK(icmp6_dst_lock);
958
959 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
960                                   struct neighbour *neigh,
961                                   const struct in6_addr *addr)
962 {
963         struct rt6_info *rt;
964         struct inet6_dev *idev = in6_dev_get(dev);
965         struct net *net = dev_net(dev);
966
967         if (unlikely(idev == NULL))
968                 return NULL;
969
970         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
971         if (unlikely(rt == NULL)) {
972                 in6_dev_put(idev);
973                 goto out;
974         }
975
976         dev_hold(dev);
977         if (neigh)
978                 neigh_hold(neigh);
979         else {
980                 neigh = ndisc_get_neigh(dev, addr);
981                 if (IS_ERR(neigh))
982                         neigh = NULL;
983         }
984
985         rt->rt6i_dev      = dev;
986         rt->rt6i_idev     = idev;
987         rt->rt6i_nexthop  = neigh;
988         atomic_set(&rt->u.dst.__refcnt, 1);
989         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
990         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
991         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
992         rt->u.dst.output  = ip6_output;
993
994 #if 0   /* there's no chance to use these for ndisc */
995         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
996                                 ? DST_HOST
997                                 : 0;
998         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
999         rt->rt6i_dst.plen = 128;
1000 #endif
1001
1002         spin_lock_bh(&icmp6_dst_lock);
1003         rt->u.dst.next = icmp6_dst_gc_list;
1004         icmp6_dst_gc_list = &rt->u.dst;
1005         spin_unlock_bh(&icmp6_dst_lock);
1006
1007         fib6_force_start_gc(net);
1008
1009 out:
1010         return &rt->u.dst;
1011 }
1012
1013 int icmp6_dst_gc(void)
1014 {
1015         struct dst_entry *dst, *next, **pprev;
1016         int more = 0;
1017
1018         next = NULL;
1019
1020         spin_lock_bh(&icmp6_dst_lock);
1021         pprev = &icmp6_dst_gc_list;
1022
1023         while ((dst = *pprev) != NULL) {
1024                 if (!atomic_read(&dst->__refcnt)) {
1025                         *pprev = dst->next;
1026                         dst_free(dst);
1027                 } else {
1028                         pprev = &dst->next;
1029                         ++more;
1030                 }
1031         }
1032
1033         spin_unlock_bh(&icmp6_dst_lock);
1034
1035         return more;
1036 }
1037
1038 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1039                             void *arg)
1040 {
1041         struct dst_entry *dst, **pprev;
1042
1043         spin_lock_bh(&icmp6_dst_lock);
1044         pprev = &icmp6_dst_gc_list;
1045         while ((dst = *pprev) != NULL) {
1046                 struct rt6_info *rt = (struct rt6_info *) dst;
1047                 if (func(rt, arg)) {
1048                         *pprev = dst->next;
1049                         dst_free(dst);
1050                 } else {
1051                         pprev = &dst->next;
1052                 }
1053         }
1054         spin_unlock_bh(&icmp6_dst_lock);
1055 }
1056
1057 static int ip6_dst_gc(struct dst_ops *ops)
1058 {
1059         unsigned long now = jiffies;
1060         struct net *net = ops->dst_net;
1061         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1062         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1063         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1064         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1065         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1066
1067         if (time_after(rt_last_gc + rt_min_interval, now) &&
1068             atomic_read(&ops->entries) <= rt_max_size)
1069                 goto out;
1070
1071         net->ipv6.ip6_rt_gc_expire++;
1072         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1073         net->ipv6.ip6_rt_last_gc = now;
1074         if (atomic_read(&ops->entries) < ops->gc_thresh)
1075                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1076 out:
1077         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1078         return (atomic_read(&ops->entries) > rt_max_size);
1079 }
1080
1081 /* Clean host part of a prefix. Not necessary in radix tree,
1082    but results in cleaner routing tables.
1083
1084    Remove it only when all the things will work!
1085  */
1086
1087 static int ipv6_get_mtu(struct net_device *dev)
1088 {
1089         int mtu = IPV6_MIN_MTU;
1090         struct inet6_dev *idev;
1091
1092         idev = in6_dev_get(dev);
1093         if (idev) {
1094                 mtu = idev->cnf.mtu6;
1095                 in6_dev_put(idev);
1096         }
1097         return mtu;
1098 }
1099
1100 int ip6_dst_hoplimit(struct dst_entry *dst)
1101 {
1102         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1103         if (hoplimit < 0) {
1104                 struct net_device *dev = dst->dev;
1105                 struct inet6_dev *idev = in6_dev_get(dev);
1106                 if (idev) {
1107                         hoplimit = idev->cnf.hop_limit;
1108                         in6_dev_put(idev);
1109                 } else
1110                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1111         }
1112         return hoplimit;
1113 }
1114
1115 /*
1116  *
1117  */
1118
1119 int ip6_route_add(struct fib6_config *cfg)
1120 {
1121         int err;
1122         struct net *net = cfg->fc_nlinfo.nl_net;
1123         struct rt6_info *rt = NULL;
1124         struct net_device *dev = NULL;
1125         struct inet6_dev *idev = NULL;
1126         struct fib6_table *table;
1127         int addr_type;
1128
1129         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1130                 return -EINVAL;
1131 #ifndef CONFIG_IPV6_SUBTREES
1132         if (cfg->fc_src_len)
1133                 return -EINVAL;
1134 #endif
1135         if (cfg->fc_ifindex) {
1136                 err = -ENODEV;
1137                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1138                 if (!dev)
1139                         goto out;
1140                 idev = in6_dev_get(dev);
1141                 if (!idev)
1142                         goto out;
1143         }
1144
1145         if (cfg->fc_metric == 0)
1146                 cfg->fc_metric = IP6_RT_PRIO_USER;
1147
1148         table = fib6_new_table(net, cfg->fc_table);
1149         if (table == NULL) {
1150                 err = -ENOBUFS;
1151                 goto out;
1152         }
1153
1154         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1155
1156         if (rt == NULL) {
1157                 err = -ENOMEM;
1158                 goto out;
1159         }
1160
1161         rt->u.dst.obsolete = -1;
1162         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1163                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1164                                 0;
1165
1166         if (cfg->fc_protocol == RTPROT_UNSPEC)
1167                 cfg->fc_protocol = RTPROT_BOOT;
1168         rt->rt6i_protocol = cfg->fc_protocol;
1169
1170         addr_type = ipv6_addr_type(&cfg->fc_dst);
1171
1172         if (addr_type & IPV6_ADDR_MULTICAST)
1173                 rt->u.dst.input = ip6_mc_input;
1174         else
1175                 rt->u.dst.input = ip6_forward;
1176
1177         rt->u.dst.output = ip6_output;
1178
1179         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1180         rt->rt6i_dst.plen = cfg->fc_dst_len;
1181         if (rt->rt6i_dst.plen == 128)
1182                rt->u.dst.flags = DST_HOST;
1183
1184 #ifdef CONFIG_IPV6_SUBTREES
1185         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1186         rt->rt6i_src.plen = cfg->fc_src_len;
1187 #endif
1188
1189         rt->rt6i_metric = cfg->fc_metric;
1190
1191         /* We cannot add true routes via loopback here,
1192            they would result in kernel looping; promote them to reject routes
1193          */
1194         if ((cfg->fc_flags & RTF_REJECT) ||
1195             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1196                 /* hold loopback dev/idev if we haven't done so. */
1197                 if (dev != net->loopback_dev) {
1198                         if (dev) {
1199                                 dev_put(dev);
1200                                 in6_dev_put(idev);
1201                         }
1202                         dev = net->loopback_dev;
1203                         dev_hold(dev);
1204                         idev = in6_dev_get(dev);
1205                         if (!idev) {
1206                                 err = -ENODEV;
1207                                 goto out;
1208                         }
1209                 }
1210                 rt->u.dst.output = ip6_pkt_discard_out;
1211                 rt->u.dst.input = ip6_pkt_discard;
1212                 rt->u.dst.error = -ENETUNREACH;
1213                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1214                 goto install_route;
1215         }
1216
1217         if (cfg->fc_flags & RTF_GATEWAY) {
1218                 struct in6_addr *gw_addr;
1219                 int gwa_type;
1220
1221                 gw_addr = &cfg->fc_gateway;
1222                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1223                 gwa_type = ipv6_addr_type(gw_addr);
1224
1225                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1226                         struct rt6_info *grt;
1227
1228                         /* IPv6 strictly inhibits using not link-local
1229                            addresses as nexthop address.
1230                            Otherwise, router will not able to send redirects.
1231                            It is very good, but in some (rare!) circumstances
1232                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1233                            some exceptions. --ANK
1234                          */
1235                         err = -EINVAL;
1236                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1237                                 goto out;
1238
1239                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1240
1241                         err = -EHOSTUNREACH;
1242                         if (grt == NULL)
1243                                 goto out;
1244                         if (dev) {
1245                                 if (dev != grt->rt6i_dev) {
1246                                         dst_release(&grt->u.dst);
1247                                         goto out;
1248                                 }
1249                         } else {
1250                                 dev = grt->rt6i_dev;
1251                                 idev = grt->rt6i_idev;
1252                                 dev_hold(dev);
1253                                 in6_dev_hold(grt->rt6i_idev);
1254                         }
1255                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1256                                 err = 0;
1257                         dst_release(&grt->u.dst);
1258
1259                         if (err)
1260                                 goto out;
1261                 }
1262                 err = -EINVAL;
1263                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1264                         goto out;
1265         }
1266
1267         err = -ENODEV;
1268         if (dev == NULL)
1269                 goto out;
1270
1271         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1272                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1273                 if (IS_ERR(rt->rt6i_nexthop)) {
1274                         err = PTR_ERR(rt->rt6i_nexthop);
1275                         rt->rt6i_nexthop = NULL;
1276                         goto out;
1277                 }
1278         }
1279
1280         rt->rt6i_flags = cfg->fc_flags;
1281
1282 install_route:
1283         if (cfg->fc_mx) {
1284                 struct nlattr *nla;
1285                 int remaining;
1286
1287                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1288                         int type = nla_type(nla);
1289
1290                         if (type) {
1291                                 if (type > RTAX_MAX) {
1292                                         err = -EINVAL;
1293                                         goto out;
1294                                 }
1295
1296                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1297                         }
1298                 }
1299         }
1300
1301         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1302                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1303         if (!dst_mtu(&rt->u.dst))
1304                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1305         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1306                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1307         rt->u.dst.dev = dev;
1308         rt->rt6i_idev = idev;
1309         rt->rt6i_table = table;
1310
1311         cfg->fc_nlinfo.nl_net = dev_net(dev);
1312
1313         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1314
1315 out:
1316         if (dev)
1317                 dev_put(dev);
1318         if (idev)
1319                 in6_dev_put(idev);
1320         if (rt)
1321                 dst_free(&rt->u.dst);
1322         return err;
1323 }
1324
1325 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1326 {
1327         int err;
1328         struct fib6_table *table;
1329         struct net *net = dev_net(rt->rt6i_dev);
1330
1331         if (rt == net->ipv6.ip6_null_entry)
1332                 return -ENOENT;
1333
1334         table = rt->rt6i_table;
1335         write_lock_bh(&table->tb6_lock);
1336
1337         err = fib6_del(rt, info);
1338         dst_release(&rt->u.dst);
1339
1340         write_unlock_bh(&table->tb6_lock);
1341
1342         return err;
1343 }
1344
1345 int ip6_del_rt(struct rt6_info *rt)
1346 {
1347         struct nl_info info = {
1348                 .nl_net = dev_net(rt->rt6i_dev),
1349         };
1350         return __ip6_del_rt(rt, &info);
1351 }
1352
1353 static int ip6_route_del(struct fib6_config *cfg)
1354 {
1355         struct fib6_table *table;
1356         struct fib6_node *fn;
1357         struct rt6_info *rt;
1358         int err = -ESRCH;
1359
1360         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1361         if (table == NULL)
1362                 return err;
1363
1364         read_lock_bh(&table->tb6_lock);
1365
1366         fn = fib6_locate(&table->tb6_root,
1367                          &cfg->fc_dst, cfg->fc_dst_len,
1368                          &cfg->fc_src, cfg->fc_src_len);
1369
1370         if (fn) {
1371                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1372                         if (cfg->fc_ifindex &&
1373                             (rt->rt6i_dev == NULL ||
1374                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1375                                 continue;
1376                         if (cfg->fc_flags & RTF_GATEWAY &&
1377                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1378                                 continue;
1379                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1380                                 continue;
1381                         dst_hold(&rt->u.dst);
1382                         read_unlock_bh(&table->tb6_lock);
1383
1384                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1385                 }
1386         }
1387         read_unlock_bh(&table->tb6_lock);
1388
1389         return err;
1390 }
1391
1392 /*
1393  *      Handle redirects
1394  */
1395 struct ip6rd_flowi {
1396         struct flowi fl;
1397         struct in6_addr gateway;
1398 };
1399
1400 static struct rt6_info *__ip6_route_redirect(struct net *net,
1401                                              struct fib6_table *table,
1402                                              struct flowi *fl,
1403                                              int flags)
1404 {
1405         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1406         struct rt6_info *rt;
1407         struct fib6_node *fn;
1408
1409         /*
1410          * Get the "current" route for this destination and
1411          * check if the redirect has come from approriate router.
1412          *
1413          * RFC 2461 specifies that redirects should only be
1414          * accepted if they come from the nexthop to the target.
1415          * Due to the way the routes are chosen, this notion
1416          * is a bit fuzzy and one might need to check all possible
1417          * routes.
1418          */
1419
1420         read_lock_bh(&table->tb6_lock);
1421         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1422 restart:
1423         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1424                 /*
1425                  * Current route is on-link; redirect is always invalid.
1426                  *
1427                  * Seems, previous statement is not true. It could
1428                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1429                  * But then router serving it might decide, that we should
1430                  * know truth 8)8) --ANK (980726).
1431                  */
1432                 if (rt6_check_expired(rt))
1433                         continue;
1434                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1435                         continue;
1436                 if (fl->oif != rt->rt6i_dev->ifindex)
1437                         continue;
1438                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1439                         continue;
1440                 break;
1441         }
1442
1443         if (!rt)
1444                 rt = net->ipv6.ip6_null_entry;
1445         BACKTRACK(net, &fl->fl6_src);
1446 out:
1447         dst_hold(&rt->u.dst);
1448
1449         read_unlock_bh(&table->tb6_lock);
1450
1451         return rt;
1452 };
1453
1454 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1455                                            struct in6_addr *src,
1456                                            struct in6_addr *gateway,
1457                                            struct net_device *dev)
1458 {
1459         int flags = RT6_LOOKUP_F_HAS_SADDR;
1460         struct net *net = dev_net(dev);
1461         struct ip6rd_flowi rdfl = {
1462                 .fl = {
1463                         .oif = dev->ifindex,
1464                         .nl_u = {
1465                                 .ip6_u = {
1466                                         .daddr = *dest,
1467                                         .saddr = *src,
1468                                 },
1469                         },
1470                 },
1471                 .gateway = *gateway,
1472         };
1473
1474         if (rt6_need_strict(dest))
1475                 flags |= RT6_LOOKUP_F_IFACE;
1476
1477         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1478                                                    flags, __ip6_route_redirect);
1479 }
1480
1481 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1482                   struct in6_addr *saddr,
1483                   struct neighbour *neigh, u8 *lladdr, int on_link)
1484 {
1485         struct rt6_info *rt, *nrt = NULL;
1486         struct netevent_redirect netevent;
1487         struct net *net = dev_net(neigh->dev);
1488
1489         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1490
1491         if (rt == net->ipv6.ip6_null_entry) {
1492                 if (net_ratelimit())
1493                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1494                                "for redirect target\n");
1495                 goto out;
1496         }
1497
1498         /*
1499          *      We have finally decided to accept it.
1500          */
1501
1502         neigh_update(neigh, lladdr, NUD_STALE,
1503                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1504                      NEIGH_UPDATE_F_OVERRIDE|
1505                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1506                                      NEIGH_UPDATE_F_ISROUTER))
1507                      );
1508
1509         /*
1510          * Redirect received -> path was valid.
1511          * Look, redirects are sent only in response to data packets,
1512          * so that this nexthop apparently is reachable. --ANK
1513          */
1514         dst_confirm(&rt->u.dst);
1515
1516         /* Duplicate redirect: silently ignore. */
1517         if (neigh == rt->u.dst.neighbour)
1518                 goto out;
1519
1520         nrt = ip6_rt_copy(rt);
1521         if (nrt == NULL)
1522                 goto out;
1523
1524         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1525         if (on_link)
1526                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1527
1528         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1529         nrt->rt6i_dst.plen = 128;
1530         nrt->u.dst.flags |= DST_HOST;
1531
1532         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1533         nrt->rt6i_nexthop = neigh_clone(neigh);
1534         /* Reset pmtu, it may be better */
1535         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1536         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1537                                                         dst_mtu(&nrt->u.dst));
1538
1539         if (ip6_ins_rt(nrt))
1540                 goto out;
1541
1542         netevent.old = &rt->u.dst;
1543         netevent.new = &nrt->u.dst;
1544         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1545
1546         if (rt->rt6i_flags&RTF_CACHE) {
1547                 ip6_del_rt(rt);
1548                 return;
1549         }
1550
1551 out:
1552         dst_release(&rt->u.dst);
1553         return;
1554 }
1555
1556 /*
1557  *      Handle ICMP "packet too big" messages
1558  *      i.e. Path MTU discovery
1559  */
1560
1561 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1562                         struct net_device *dev, u32 pmtu)
1563 {
1564         struct rt6_info *rt, *nrt;
1565         struct net *net = dev_net(dev);
1566         int allfrag = 0;
1567
1568         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1569         if (rt == NULL)
1570                 return;
1571
1572         if (pmtu >= dst_mtu(&rt->u.dst))
1573                 goto out;
1574
1575         if (pmtu < IPV6_MIN_MTU) {
1576                 /*
1577                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578                  * MTU (1280) and a fragment header should always be included
1579                  * after a node receiving Too Big message reporting PMTU is
1580                  * less than the IPv6 Minimum Link MTU.
1581                  */
1582                 pmtu = IPV6_MIN_MTU;
1583                 allfrag = 1;
1584         }
1585
1586         /* New mtu received -> path was valid.
1587            They are sent only in response to data packets,
1588            so that this nexthop apparently is reachable. --ANK
1589          */
1590         dst_confirm(&rt->u.dst);
1591
1592         /* Host route. If it is static, it would be better
1593            not to override it, but add new one, so that
1594            when cache entry will expire old pmtu
1595            would return automatically.
1596          */
1597         if (rt->rt6i_flags & RTF_CACHE) {
1598                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1599                 if (allfrag)
1600                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1601                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1602                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1603                 goto out;
1604         }
1605
1606         /* Network route.
1607            Two cases are possible:
1608            1. It is connected route. Action: COW
1609            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1610          */
1611         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1612                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1613         else
1614                 nrt = rt6_alloc_clone(rt, daddr);
1615
1616         if (nrt) {
1617                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1618                 if (allfrag)
1619                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1620
1621                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1622                  * happened within 5 mins, the recommended timer is 10 mins.
1623                  * Here this route expiration time is set to ip6_rt_mtu_expires
1624                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1625                  * and detecting PMTU increase will be automatically happened.
1626                  */
1627                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1628                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1629
1630                 ip6_ins_rt(nrt);
1631         }
1632 out:
1633         dst_release(&rt->u.dst);
1634 }
1635
1636 /*
1637  *      Misc support functions
1638  */
1639
1640 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1641 {
1642         struct net *net = dev_net(ort->rt6i_dev);
1643         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1644
1645         if (rt) {
1646                 rt->u.dst.input = ort->u.dst.input;
1647                 rt->u.dst.output = ort->u.dst.output;
1648
1649                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1650                 rt->u.dst.error = ort->u.dst.error;
1651                 rt->u.dst.dev = ort->u.dst.dev;
1652                 if (rt->u.dst.dev)
1653                         dev_hold(rt->u.dst.dev);
1654                 rt->rt6i_idev = ort->rt6i_idev;
1655                 if (rt->rt6i_idev)
1656                         in6_dev_hold(rt->rt6i_idev);
1657                 rt->u.dst.lastuse = jiffies;
1658                 rt->rt6i_expires = 0;
1659
1660                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1661                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1662                 rt->rt6i_metric = 0;
1663
1664                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1665 #ifdef CONFIG_IPV6_SUBTREES
1666                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1667 #endif
1668                 rt->rt6i_table = ort->rt6i_table;
1669         }
1670         return rt;
1671 }
1672
1673 #ifdef CONFIG_IPV6_ROUTE_INFO
1674 static struct rt6_info *rt6_get_route_info(struct net *net,
1675                                            struct in6_addr *prefix, int prefixlen,
1676                                            struct in6_addr *gwaddr, int ifindex)
1677 {
1678         struct fib6_node *fn;
1679         struct rt6_info *rt = NULL;
1680         struct fib6_table *table;
1681
1682         table = fib6_get_table(net, RT6_TABLE_INFO);
1683         if (table == NULL)
1684                 return NULL;
1685
1686         write_lock_bh(&table->tb6_lock);
1687         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1688         if (!fn)
1689                 goto out;
1690
1691         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1692                 if (rt->rt6i_dev->ifindex != ifindex)
1693                         continue;
1694                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1695                         continue;
1696                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1697                         continue;
1698                 dst_hold(&rt->u.dst);
1699                 break;
1700         }
1701 out:
1702         write_unlock_bh(&table->tb6_lock);
1703         return rt;
1704 }
1705
1706 static struct rt6_info *rt6_add_route_info(struct net *net,
1707                                            struct in6_addr *prefix, int prefixlen,
1708                                            struct in6_addr *gwaddr, int ifindex,
1709                                            unsigned pref)
1710 {
1711         struct fib6_config cfg = {
1712                 .fc_table       = RT6_TABLE_INFO,
1713                 .fc_metric      = IP6_RT_PRIO_USER,
1714                 .fc_ifindex     = ifindex,
1715                 .fc_dst_len     = prefixlen,
1716                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1717                                   RTF_UP | RTF_PREF(pref),
1718                 .fc_nlinfo.pid = 0,
1719                 .fc_nlinfo.nlh = NULL,
1720                 .fc_nlinfo.nl_net = net,
1721         };
1722
1723         ipv6_addr_copy(&cfg.fc_dst, prefix);
1724         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1725
1726         /* We should treat it as a default route if prefix length is 0. */
1727         if (!prefixlen)
1728                 cfg.fc_flags |= RTF_DEFAULT;
1729
1730         ip6_route_add(&cfg);
1731
1732         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1733 }
1734 #endif
1735
1736 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1737 {
1738         struct rt6_info *rt;
1739         struct fib6_table *table;
1740
1741         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1742         if (table == NULL)
1743                 return NULL;
1744
1745         write_lock_bh(&table->tb6_lock);
1746         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1747                 if (dev == rt->rt6i_dev &&
1748                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1749                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1750                         break;
1751         }
1752         if (rt)
1753                 dst_hold(&rt->u.dst);
1754         write_unlock_bh(&table->tb6_lock);
1755         return rt;
1756 }
1757
1758 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1759                                      struct net_device *dev,
1760                                      unsigned int pref)
1761 {
1762         struct fib6_config cfg = {
1763                 .fc_table       = RT6_TABLE_DFLT,
1764                 .fc_metric      = IP6_RT_PRIO_USER,
1765                 .fc_ifindex     = dev->ifindex,
1766                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1767                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1768                 .fc_nlinfo.pid = 0,
1769                 .fc_nlinfo.nlh = NULL,
1770                 .fc_nlinfo.nl_net = dev_net(dev),
1771         };
1772
1773         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1774
1775         ip6_route_add(&cfg);
1776
1777         return rt6_get_dflt_router(gwaddr, dev);
1778 }
1779
1780 void rt6_purge_dflt_routers(struct net *net)
1781 {
1782         struct rt6_info *rt;
1783         struct fib6_table *table;
1784
1785         /* NOTE: Keep consistent with rt6_get_dflt_router */
1786         table = fib6_get_table(net, RT6_TABLE_DFLT);
1787         if (table == NULL)
1788                 return;
1789
1790 restart:
1791         read_lock_bh(&table->tb6_lock);
1792         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1793                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1794                         dst_hold(&rt->u.dst);
1795                         read_unlock_bh(&table->tb6_lock);
1796                         ip6_del_rt(rt);
1797                         goto restart;
1798                 }
1799         }
1800         read_unlock_bh(&table->tb6_lock);
1801 }
1802
1803 static void rtmsg_to_fib6_config(struct net *net,
1804                                  struct in6_rtmsg *rtmsg,
1805                                  struct fib6_config *cfg)
1806 {
1807         memset(cfg, 0, sizeof(*cfg));
1808
1809         cfg->fc_table = RT6_TABLE_MAIN;
1810         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1811         cfg->fc_metric = rtmsg->rtmsg_metric;
1812         cfg->fc_expires = rtmsg->rtmsg_info;
1813         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1814         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1815         cfg->fc_flags = rtmsg->rtmsg_flags;
1816
1817         cfg->fc_nlinfo.nl_net = net;
1818
1819         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1820         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1821         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1822 }
1823
1824 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1825 {
1826         struct fib6_config cfg;
1827         struct in6_rtmsg rtmsg;
1828         int err;
1829
1830         switch(cmd) {
1831         case SIOCADDRT:         /* Add a route */
1832         case SIOCDELRT:         /* Delete a route */
1833                 if (!capable(CAP_NET_ADMIN))
1834                         return -EPERM;
1835                 err = copy_from_user(&rtmsg, arg,
1836                                      sizeof(struct in6_rtmsg));
1837                 if (err)
1838                         return -EFAULT;
1839
1840                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1841
1842                 rtnl_lock();
1843                 switch (cmd) {
1844                 case SIOCADDRT:
1845                         err = ip6_route_add(&cfg);
1846                         break;
1847                 case SIOCDELRT:
1848                         err = ip6_route_del(&cfg);
1849                         break;
1850                 default:
1851                         err = -EINVAL;
1852                 }
1853                 rtnl_unlock();
1854
1855                 return err;
1856         }
1857
1858         return -EINVAL;
1859 }
1860
1861 /*
1862  *      Drop the packet on the floor
1863  */
1864
1865 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1866 {
1867         int type;
1868         struct dst_entry *dst = skb->dst;
1869         switch (ipstats_mib_noroutes) {
1870         case IPSTATS_MIB_INNOROUTES:
1871                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1872                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1873                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1874                                       IPSTATS_MIB_INADDRERRORS);
1875                         break;
1876                 }
1877                 /* FALLTHROUGH */
1878         case IPSTATS_MIB_OUTNOROUTES:
1879                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1880                               ipstats_mib_noroutes);
1881                 break;
1882         }
1883         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1884         kfree_skb(skb);
1885         return 0;
1886 }
1887
1888 static int ip6_pkt_discard(struct sk_buff *skb)
1889 {
1890         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1891 }
1892
1893 static int ip6_pkt_discard_out(struct sk_buff *skb)
1894 {
1895         skb->dev = skb->dst->dev;
1896         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1897 }
1898
1899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1900
1901 static int ip6_pkt_prohibit(struct sk_buff *skb)
1902 {
1903         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1904 }
1905
1906 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1907 {
1908         skb->dev = skb->dst->dev;
1909         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1910 }
1911
1912 #endif
1913
1914 /*
1915  *      Allocate a dst for local (unicast / anycast) address.
1916  */
1917
1918 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1919                                     const struct in6_addr *addr,
1920                                     int anycast)
1921 {
1922         struct net *net = dev_net(idev->dev);
1923         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1924         struct neighbour *neigh;
1925
1926         if (rt == NULL)
1927                 return ERR_PTR(-ENOMEM);
1928
1929         dev_hold(net->loopback_dev);
1930         in6_dev_hold(idev);
1931
1932         rt->u.dst.flags = DST_HOST;
1933         rt->u.dst.input = ip6_input;
1934         rt->u.dst.output = ip6_output;
1935         rt->rt6i_dev = net->loopback_dev;
1936         rt->rt6i_idev = idev;
1937         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1938         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1939         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1940         rt->u.dst.obsolete = -1;
1941
1942         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1943         if (anycast)
1944                 rt->rt6i_flags |= RTF_ANYCAST;
1945         else
1946                 rt->rt6i_flags |= RTF_LOCAL;
1947         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1948         if (IS_ERR(neigh)) {
1949                 dst_free(&rt->u.dst);
1950
1951                 /* We are casting this because that is the return
1952                  * value type.  But an errno encoded pointer is the
1953                  * same regardless of the underlying pointer type,
1954                  * and that's what we are returning.  So this is OK.
1955                  */
1956                 return (struct rt6_info *) neigh;
1957         }
1958         rt->rt6i_nexthop = neigh;
1959
1960         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1961         rt->rt6i_dst.plen = 128;
1962         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1963
1964         atomic_set(&rt->u.dst.__refcnt, 1);
1965
1966         return rt;
1967 }
1968
/*
 * Argument bundle passed from rt6_ifdown() to the fib6_ifdown() callback.
 */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
1973
1974 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1975 {
1976         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1977         struct net *net = ((struct arg_dev_net *)arg)->net;
1978
1979         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1980             rt != net->ipv6.ip6_null_entry) {
1981                 RT6_TRACE("deleted by ifdown %p\n", rt);
1982                 return -1;
1983         }
1984         return 0;
1985 }
1986
1987 void rt6_ifdown(struct net *net, struct net_device *dev)
1988 {
1989         struct arg_dev_net adn = {
1990                 .dev = dev,
1991                 .net = net,
1992         };
1993
1994         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1995         icmp6_clean_all(fib6_ifdown, &adn);
1996 }
1997
/*
 * Argument bundle passed from rt6_mtu_change() to the
 * rt6_mtu_change_route() callback.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose routes are examined */
	unsigned mtu;		/* MTU value to apply to matching routes */
};
2003
2004 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2005 {
2006         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2007         struct inet6_dev *idev;
2008         struct net *net = dev_net(arg->dev);
2009
2010         /* In IPv6 pmtu discovery is not optional,
2011            so that RTAX_MTU lock cannot disable it.
2012            We still use this lock to block changes
2013            caused by addrconf/ndisc.
2014         */
2015
2016         idev = __in6_dev_get(arg->dev);
2017         if (idev == NULL)
2018                 return 0;
2019
2020         /* For administrative MTU increase, there is no way to discover
2021            IPv6 PMTU increase, so PMTU increase should be updated here.
2022            Since RFC 1981 doesn't include administrative MTU increase
2023            update PMTU increase is a MUST. (i.e. jumbo frame)
2024          */
2025         /*
2026            If new MTU is less than route PMTU, this new MTU will be the
2027            lowest MTU in the path, update the route PMTU to reflect PMTU
2028            decreases; if new MTU is greater than route PMTU, and the
2029            old MTU is the lowest MTU in the path, update the route PMTU
2030            to reflect the increase. In this case if the other nodes' MTU
2031            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2032            PMTU discouvery.
2033          */
2034         if (rt->rt6i_dev == arg->dev &&
2035             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2036             (dst_mtu(&rt->u.dst) >= arg->mtu ||
2037              (dst_mtu(&rt->u.dst) < arg->mtu &&
2038               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2039                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2040                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2041         }
2042         return 0;
2043 }
2044
2045 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2046 {
2047         struct rt6_mtu_change_arg arg = {
2048                 .dev = dev,
2049                 .mtu = mtu,
2050         };
2051
2052         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2053 }
2054
2055 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2056         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2057         [RTA_OIF]               = { .type = NLA_U32 },
2058         [RTA_IIF]               = { .type = NLA_U32 },
2059         [RTA_PRIORITY]          = { .type = NLA_U32 },
2060         [RTA_METRICS]           = { .type = NLA_NESTED },
2061 };
2062
2063 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2064                               struct fib6_config *cfg)
2065 {
2066         struct rtmsg *rtm;
2067         struct nlattr *tb[RTA_MAX+1];
2068         int err;
2069
2070         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2071         if (err < 0)
2072                 goto errout;
2073
2074         err = -EINVAL;
2075         rtm = nlmsg_data(nlh);
2076         memset(cfg, 0, sizeof(*cfg));
2077
2078         cfg->fc_table = rtm->rtm_table;
2079         cfg->fc_dst_len = rtm->rtm_dst_len;
2080         cfg->fc_src_len = rtm->rtm_src_len;
2081         cfg->fc_flags = RTF_UP;
2082         cfg->fc_protocol = rtm->rtm_protocol;
2083
2084         if (rtm->rtm_type == RTN_UNREACHABLE)
2085                 cfg->fc_flags |= RTF_REJECT;
2086
2087         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2088         cfg->fc_nlinfo.nlh = nlh;
2089         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2090
2091         if (tb[RTA_GATEWAY]) {
2092                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2093                 cfg->fc_flags |= RTF_GATEWAY;
2094         }
2095
2096         if (tb[RTA_DST]) {
2097                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2098
2099                 if (nla_len(tb[RTA_DST]) < plen)
2100                         goto errout;
2101
2102                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2103         }
2104
2105         if (tb[RTA_SRC]) {
2106                 int plen = (rtm->rtm_src_len + 7) >> 3;
2107
2108                 if (nla_len(tb[RTA_SRC]) < plen)
2109                         goto errout;
2110
2111                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2112         }
2113
2114         if (tb[RTA_OIF])
2115                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2116
2117         if (tb[RTA_PRIORITY])
2118                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2119
2120         if (tb[RTA_METRICS]) {
2121                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2122                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2123         }
2124
2125         if (tb[RTA_TABLE])
2126                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2127
2128         err = 0;
2129 errout:
2130         return err;
2131 }
2132
2133 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2134 {
2135         struct fib6_config cfg;
2136         int err;
2137
2138         err = rtm_to_fib6_config(skb, nlh, &cfg);
2139         if (err < 0)
2140                 return err;
2141
2142         return ip6_route_del(&cfg);
2143 }
2144
2145 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2146 {
2147         struct fib6_config cfg;
2148         int err;
2149
2150         err = rtm_to_fib6_config(skb, nlh, &cfg);
2151         if (err < 0)
2152                 return err;
2153
2154         return ip6_route_add(&cfg);
2155 }
2156
2157 static inline size_t rt6_nlmsg_size(void)
2158 {
2159         return NLMSG_ALIGN(sizeof(struct rtmsg))
2160                + nla_total_size(16) /* RTA_SRC */
2161                + nla_total_size(16) /* RTA_DST */
2162                + nla_total_size(16) /* RTA_GATEWAY */
2163                + nla_total_size(16) /* RTA_PREFSRC */
2164                + nla_total_size(4) /* RTA_TABLE */
2165                + nla_total_size(4) /* RTA_IIF */
2166                + nla_total_size(4) /* RTA_OIF */
2167                + nla_total_size(4) /* RTA_PRIORITY */
2168                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2169                + nla_total_size(sizeof(struct rta_cacheinfo));
2170 }
2171
2172 static int rt6_fill_node(struct net *net,
2173                          struct sk_buff *skb, struct rt6_info *rt,
2174                          struct in6_addr *dst, struct in6_addr *src,
2175                          int iif, int type, u32 pid, u32 seq,
2176                          int prefix, int nowait, unsigned int flags)
2177 {
2178         struct rtmsg *rtm;
2179         struct nlmsghdr *nlh;
2180         long expires;
2181         u32 table;
2182
2183         if (prefix) {   /* user wants prefix routes only */
2184                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2185                         /* success since this is not a prefix route */
2186                         return 1;
2187                 }
2188         }
2189
2190         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2191         if (nlh == NULL)
2192                 return -EMSGSIZE;
2193
2194         rtm = nlmsg_data(nlh);
2195         rtm->rtm_family = AF_INET6;
2196         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2197         rtm->rtm_src_len = rt->rt6i_src.plen;
2198         rtm->rtm_tos = 0;
2199         if (rt->rt6i_table)
2200                 table = rt->rt6i_table->tb6_id;
2201         else
2202                 table = RT6_TABLE_UNSPEC;
2203         rtm->rtm_table = table;
2204         NLA_PUT_U32(skb, RTA_TABLE, table);
2205         if (rt->rt6i_flags&RTF_REJECT)
2206                 rtm->rtm_type = RTN_UNREACHABLE;
2207         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2208                 rtm->rtm_type = RTN_LOCAL;
2209         else
2210                 rtm->rtm_type = RTN_UNICAST;
2211         rtm->rtm_flags = 0;
2212         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2213         rtm->rtm_protocol = rt->rt6i_protocol;
2214         if (rt->rt6i_flags&RTF_DYNAMIC)
2215                 rtm->rtm_protocol = RTPROT_REDIRECT;
2216         else if (rt->rt6i_flags & RTF_ADDRCONF)
2217                 rtm->rtm_protocol = RTPROT_KERNEL;
2218         else if (rt->rt6i_flags&RTF_DEFAULT)
2219                 rtm->rtm_protocol = RTPROT_RA;
2220
2221         if (rt->rt6i_flags&RTF_CACHE)
2222                 rtm->rtm_flags |= RTM_F_CLONED;
2223
2224         if (dst) {
2225                 NLA_PUT(skb, RTA_DST, 16, dst);
2226                 rtm->rtm_dst_len = 128;
2227         } else if (rtm->rtm_dst_len)
2228                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2229 #ifdef CONFIG_IPV6_SUBTREES
2230         if (src) {
2231                 NLA_PUT(skb, RTA_SRC, 16, src);
2232                 rtm->rtm_src_len = 128;
2233         } else if (rtm->rtm_src_len)
2234                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2235 #endif
2236         if (iif) {
2237 #ifdef CONFIG_IPV6_MROUTE
2238                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2239                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2240                         if (err <= 0) {
2241                                 if (!nowait) {
2242                                         if (err == 0)
2243                                                 return 0;
2244                                         goto nla_put_failure;
2245                                 } else {
2246                                         if (err == -EMSGSIZE)
2247                                                 goto nla_put_failure;
2248                                 }
2249                         }
2250                 } else
2251 #endif
2252                         NLA_PUT_U32(skb, RTA_IIF, iif);
2253         } else if (dst) {
2254                 struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2255                 struct in6_addr saddr_buf;
2256                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2257                                        dst, 0, &saddr_buf) == 0)
2258                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2259         }
2260
2261         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2262                 goto nla_put_failure;
2263
2264         if (rt->u.dst.neighbour)
2265                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2266
2267         if (rt->u.dst.dev)
2268                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2269
2270         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2271
2272         if (!(rt->rt6i_flags & RTF_EXPIRES))
2273                 expires = 0;
2274         else if (rt->rt6i_expires - jiffies < INT_MAX)
2275                 expires = rt->rt6i_expires - jiffies;
2276         else
2277                 expires = INT_MAX;
2278
2279         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2280                                expires, rt->u.dst.error) < 0)
2281                 goto nla_put_failure;
2282
2283         return nlmsg_end(skb, nlh);
2284
2285 nla_put_failure:
2286         nlmsg_cancel(skb, nlh);
2287         return -EMSGSIZE;
2288 }
2289
2290 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2291 {
2292         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2293         int prefix;
2294
2295         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2296                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2297                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2298         } else
2299                 prefix = 0;
2300
2301         return rt6_fill_node(arg->net,
2302                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2303                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2304                      prefix, 0, NLM_F_MULTI);
2305 }
2306
2307 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2308 {
2309         struct net *net = sock_net(in_skb->sk);
2310         struct nlattr *tb[RTA_MAX+1];
2311         struct rt6_info *rt;
2312         struct sk_buff *skb;
2313         struct rtmsg *rtm;
2314         struct flowi fl;
2315         int err, iif = 0;
2316
2317         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2318         if (err < 0)
2319                 goto errout;
2320
2321         err = -EINVAL;
2322         memset(&fl, 0, sizeof(fl));
2323
2324         if (tb[RTA_SRC]) {
2325                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2326                         goto errout;
2327
2328                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2329         }
2330
2331         if (tb[RTA_DST]) {
2332                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2333                         goto errout;
2334
2335                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2336         }
2337
2338         if (tb[RTA_IIF])
2339                 iif = nla_get_u32(tb[RTA_IIF]);
2340
2341         if (tb[RTA_OIF])
2342                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2343
2344         if (iif) {
2345                 struct net_device *dev;
2346                 dev = __dev_get_by_index(net, iif);
2347                 if (!dev) {
2348                         err = -ENODEV;
2349                         goto errout;
2350                 }
2351         }
2352
2353         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2354         if (skb == NULL) {
2355                 err = -ENOBUFS;
2356                 goto errout;
2357         }
2358
2359         /* Reserve room for dummy headers, this skb can pass
2360            through good chunk of routing engine.
2361          */
2362         skb_reset_mac_header(skb);
2363         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2364
2365         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2366         skb->dst = &rt->u.dst;
2367
2368         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2369                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2370                             nlh->nlmsg_seq, 0, 0, 0);
2371         if (err < 0) {
2372                 kfree_skb(skb);
2373                 goto errout;
2374         }
2375
2376         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2377 errout:
2378         return err;
2379 }
2380
2381 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2382 {
2383         struct sk_buff *skb;
2384         struct net *net = info->nl_net;
2385         u32 seq;
2386         int err;
2387
2388         err = -ENOBUFS;
2389         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2390
2391         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2392         if (skb == NULL)
2393                 goto errout;
2394
2395         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2396                                 event, info->pid, seq, 0, 0, 0);
2397         if (err < 0) {
2398                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2399                 WARN_ON(err == -EMSGSIZE);
2400                 kfree_skb(skb);
2401                 goto errout;
2402         }
2403         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2404                     info->nlh, gfp_any());
2405         return;
2406 errout:
2407         if (err < 0)
2408                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2409 }
2410
2411 static int ip6_route_dev_notify(struct notifier_block *this,
2412                                 unsigned long event, void *data)
2413 {
2414         struct net_device *dev = (struct net_device *)data;
2415         struct net *net = dev_net(dev);
2416
2417         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2418                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2419                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2420 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2421                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2422                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2423                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2424                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2425 #endif
2426         }
2427
2428         return NOTIFY_OK;
2429 }
2430
2431 /*
2432  *      /proc
2433  */
2434
2435 #ifdef CONFIG_PROC_FS
2436
2437 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2438
/*
 * Buffer bookkeeping for /proc output.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the /proc handlers here use seq_file) - confirm whether it
 * is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2447
2448 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2449 {
2450         struct seq_file *m = p_arg;
2451
2452         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2453
2454 #ifdef CONFIG_IPV6_SUBTREES
2455         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2456 #else
2457         seq_puts(m, "00000000000000000000000000000000 00 ");
2458 #endif
2459
2460         if (rt->rt6i_nexthop) {
2461                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2462         } else {
2463                 seq_puts(m, "00000000000000000000000000000000");
2464         }
2465         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2466                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2467                    rt->u.dst.__use, rt->rt6i_flags,
2468                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2469         return 0;
2470 }
2471
2472 static int ipv6_route_show(struct seq_file *m, void *v)
2473 {
2474         struct net *net = (struct net *)m->private;
2475         fib6_clean_all(net, rt6_info_route, 0, m);
2476         return 0;
2477 }
2478
2479 static int ipv6_route_open(struct inode *inode, struct file *file)
2480 {
2481         return single_open_net(inode, file, ipv6_route_show);
2482 }
2483
2484 static const struct file_operations ipv6_route_proc_fops = {
2485         .owner          = THIS_MODULE,
2486         .open           = ipv6_route_open,
2487         .read           = seq_read,
2488         .llseek         = seq_lseek,
2489         .release        = single_release_net,
2490 };
2491
2492 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2493 {
2494         struct net *net = (struct net *)seq->private;
2495         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2496                    net->ipv6.rt6_stats->fib_nodes,
2497                    net->ipv6.rt6_stats->fib_route_nodes,
2498                    net->ipv6.rt6_stats->fib_rt_alloc,
2499                    net->ipv6.rt6_stats->fib_rt_entries,
2500                    net->ipv6.rt6_stats->fib_rt_cache,
2501                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2502                    net->ipv6.rt6_stats->fib_discarded_routes);
2503
2504         return 0;
2505 }
2506
2507 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2508 {
2509         return single_open_net(inode, file, rt6_stats_seq_show);
2510 }
2511
2512 static const struct file_operations rt6_stats_seq_fops = {
2513         .owner   = THIS_MODULE,
2514         .open    = rt6_stats_seq_open,
2515         .read    = seq_read,
2516         .llseek  = seq_lseek,
2517         .release = single_release_net,
2518 };
2519 #endif  /* CONFIG_PROC_FS */
2520
2521 #ifdef CONFIG_SYSCTL
2522
2523 static
2524 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2525                               void __user *buffer, size_t *lenp, loff_t *ppos)
2526 {
2527         struct net *net = current->nsproxy->net_ns;
2528         int delay = net->ipv6.sysctl.flush_delay;
2529         if (write) {
2530                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2531                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2532                 return 0;
2533         } else
2534                 return -EINVAL;
2535 }
2536
2537 ctl_table ipv6_route_table_template[] = {
2538         {
2539                 .procname       =       "flush",
2540                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2541                 .maxlen         =       sizeof(int),
2542                 .mode           =       0200,
2543                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2544         },
2545         {
2546                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2547                 .procname       =       "gc_thresh",
2548                 .data           =       &ip6_dst_ops_template.gc_thresh,
2549                 .maxlen         =       sizeof(int),
2550                 .mode           =       0644,
2551                 .proc_handler   =       proc_dointvec,
2552         },
2553         {
2554                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2555                 .procname       =       "max_size",
2556                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2557                 .maxlen         =       sizeof(int),
2558                 .mode           =       0644,
2559                 .proc_handler   =       proc_dointvec,
2560         },
2561         {
2562                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2563                 .procname       =       "gc_min_interval",
2564                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2565                 .maxlen         =       sizeof(int),
2566                 .mode           =       0644,
2567                 .proc_handler   =       proc_dointvec_jiffies,
2568                 .strategy       =       sysctl_jiffies,
2569         },
2570         {
2571                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2572                 .procname       =       "gc_timeout",
2573                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2574                 .maxlen         =       sizeof(int),
2575                 .mode           =       0644,
2576                 .proc_handler   =       proc_dointvec_jiffies,
2577                 .strategy       =       sysctl_jiffies,
2578         },
2579         {
2580                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2581                 .procname       =       "gc_interval",
2582                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2583                 .maxlen         =       sizeof(int),
2584                 .mode           =       0644,
2585                 .proc_handler   =       proc_dointvec_jiffies,
2586                 .strategy       =       sysctl_jiffies,
2587         },
2588         {
2589                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2590                 .procname       =       "gc_elasticity",
2591                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2592                 .maxlen         =       sizeof(int),
2593                 .mode           =       0644,
2594                 .proc_handler   =       proc_dointvec_jiffies,
2595                 .strategy       =       sysctl_jiffies,
2596         },
2597         {
2598                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2599                 .procname       =       "mtu_expires",
2600                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2601                 .maxlen         =       sizeof(int),
2602                 .mode           =       0644,
2603                 .proc_handler   =       proc_dointvec_jiffies,
2604                 .strategy       =       sysctl_jiffies,
2605         },
2606         {
2607                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2608                 .procname       =       "min_adv_mss",
2609                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2610                 .maxlen         =       sizeof(int),
2611                 .mode           =       0644,
2612                 .proc_handler   =       proc_dointvec_jiffies,
2613                 .strategy       =       sysctl_jiffies,
2614         },
2615         {
2616                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2617                 .procname       =       "gc_min_interval_ms",
2618                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2619                 .maxlen         =       sizeof(int),
2620                 .mode           =       0644,
2621                 .proc_handler   =       proc_dointvec_ms_jiffies,
2622                 .strategy       =       sysctl_ms_jiffies,
2623         },
2624         { .ctl_name = 0 }
2625 };
2626
2627 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2628 {
2629         struct ctl_table *table;
2630
2631         table = kmemdup(ipv6_route_table_template,
2632                         sizeof(ipv6_route_table_template),
2633                         GFP_KERNEL);
2634
2635         if (table) {
2636                 table[0].data = &net->ipv6.sysctl.flush_delay;
2637                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2638                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2639                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2640                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2641                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2642                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2643                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2644                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2645         }
2646
2647         return table;
2648 }
2649 #endif
2650
2651 static int ip6_route_net_init(struct net *net)
2652 {
2653         int ret = -ENOMEM;
2654
2655         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2656                                         sizeof(*net->ipv6.ip6_dst_ops),
2657                                         GFP_KERNEL);
2658         if (!net->ipv6.ip6_dst_ops)
2659                 goto out;
2660         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2661
2662         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2663                                            sizeof(*net->ipv6.ip6_null_entry),
2664                                            GFP_KERNEL);
2665         if (!net->ipv6.ip6_null_entry)
2666                 goto out_ip6_dst_ops;
2667         net->ipv6.ip6_null_entry->u.dst.path =
2668                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2669         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2670
2671 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2672         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2673                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2674                                                GFP_KERNEL);
2675         if (!net->ipv6.ip6_prohibit_entry)
2676                 goto out_ip6_null_entry;
2677         net->ipv6.ip6_prohibit_entry->u.dst.path =
2678                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2679         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2680
2681         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2682                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2683                                                GFP_KERNEL);
2684         if (!net->ipv6.ip6_blk_hole_entry)
2685                 goto out_ip6_prohibit_entry;
2686         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2687                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2688         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2689 #endif
2690
2691         net->ipv6.sysctl.flush_delay = 0;
2692         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2693         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2694         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2695         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2696         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2697         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2698         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2699
2700 #ifdef CONFIG_PROC_FS
2701         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2702         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2703 #endif
2704         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2705
2706         ret = 0;
2707 out:
2708         return ret;
2709
2710 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2711 out_ip6_prohibit_entry:
2712         kfree(net->ipv6.ip6_prohibit_entry);
2713 out_ip6_null_entry:
2714         kfree(net->ipv6.ip6_null_entry);
2715 #endif
2716 out_ip6_dst_ops:
2717         release_net(net->ipv6.ip6_dst_ops->dst_net);
2718         kfree(net->ipv6.ip6_dst_ops);
2719         goto out;
2720 }
2721
2722 static void ip6_route_net_exit(struct net *net)
2723 {
2724 #ifdef CONFIG_PROC_FS
2725         proc_net_remove(net, "ipv6_route");
2726         proc_net_remove(net, "rt6_stats");
2727 #endif
2728         kfree(net->ipv6.ip6_null_entry);
2729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2730         kfree(net->ipv6.ip6_prohibit_entry);
2731         kfree(net->ipv6.ip6_blk_hole_entry);
2732 #endif
2733         release_net(net->ipv6.ip6_dst_ops->dst_net);
2734         kfree(net->ipv6.ip6_dst_ops);
2735 }
2736
2737 static struct pernet_operations ip6_route_net_ops = {
2738         .init = ip6_route_net_init,
2739         .exit = ip6_route_net_exit,
2740 };
2741
2742 static struct notifier_block ip6_route_dev_notifier = {
2743         .notifier_call = ip6_route_dev_notify,
2744         .priority = 0,
2745 };
2746
2747 int __init ip6_route_init(void)
2748 {
2749         int ret;
2750
2751         ret = -ENOMEM;
2752         ip6_dst_ops_template.kmem_cachep =
2753                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2754                                   SLAB_HWCACHE_ALIGN, NULL);
2755         if (!ip6_dst_ops_template.kmem_cachep)
2756                 goto out;
2757
2758         ret = register_pernet_subsys(&ip6_route_net_ops);
2759         if (ret)
2760                 goto out_kmem_cache;
2761
2762         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2763
2764         /* Registering of the loopback is done before this portion of code,
2765          * the loopback reference in rt6_info will not be taken, do it
2766          * manually for init_net */
2767         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2768         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2769   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2770         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2771         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2772         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2773         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2774   #endif
2775         ret = fib6_init();
2776         if (ret)
2777                 goto out_register_subsys;
2778
2779         ret = xfrm6_init();
2780         if (ret)
2781                 goto out_fib6_init;
2782
2783         ret = fib6_rules_init();
2784         if (ret)
2785                 goto xfrm6_init;
2786
2787         ret = -ENOBUFS;
2788         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2789             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2790             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2791                 goto fib6_rules_init;
2792
2793         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2794         if (ret)
2795                 goto fib6_rules_init;
2796
2797 out:
2798         return ret;
2799
2800 fib6_rules_init:
2801         fib6_rules_cleanup();
2802 xfrm6_init:
2803         xfrm6_fini();
2804 out_fib6_init:
2805         fib6_gc_cleanup();
2806 out_register_subsys:
2807         unregister_pernet_subsys(&ip6_route_net_ops);
2808 out_kmem_cache:
2809         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2810         goto out;
2811 }
2812
2813 void ip6_route_cleanup(void)
2814 {
2815         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2816         fib6_rules_cleanup();
2817         xfrm6_fini();
2818         fib6_gc_cleanup();
2819         unregister_pernet_subsys(&ip6_route_net_ops);
2820         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2821 }