Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  Raising RT6_DEBUG to 3 (or more)
 * turns RDBG() and RT6_TRACE() into real printk() calls; with any
 * lower value both macros expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Compile-time switch read by ip6_pol_route(): when non-zero, routes
 * that already have a next hop are cloned with rt6_alloc_clone();
 * when zero they are returned as they are.
 */
#define CLONE_OFFLINK_ROUTE 0
75
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void             ip6_dst_destroy(struct dst_entry *);
80 static void             ip6_dst_ifdown(struct dst_entry *,
81                                        struct net_device *dev, int how);
82 static int               ip6_dst_gc(struct dst_ops *ops);
83
84 static int              ip6_pkt_discard(struct sk_buff *skb);
85 static int              ip6_pkt_discard_out(struct sk_buff *skb);
86 static void             ip6_link_failure(struct sk_buff *skb);
87 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91                                            struct in6_addr *prefix, int prefixlen,
92                                            struct in6_addr *gwaddr, int ifindex,
93                                            unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95                                            struct in6_addr *prefix, int prefixlen,
96                                            struct in6_addr *gwaddr, int ifindex);
97 #endif
98
99 static struct dst_ops ip6_dst_ops_template = {
100         .family                 =       AF_INET6,
101         .protocol               =       __constant_htons(ETH_P_IPV6),
102         .gc                     =       ip6_dst_gc,
103         .gc_thresh              =       1024,
104         .check                  =       ip6_dst_check,
105         .destroy                =       ip6_dst_destroy,
106         .ifdown                 =       ip6_dst_ifdown,
107         .negative_advice        =       ip6_negative_advice,
108         .link_failure           =       ip6_link_failure,
109         .update_pmtu            =       ip6_rt_update_pmtu,
110         .local_out              =       __ip6_local_out,
111         .entries                =       ATOMIC_INIT(0),
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       __constant_htons(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124         .entries                =       ATOMIC_INIT(0),
125 };
126
127 static struct rt6_info ip6_null_entry_template = {
128         .u = {
129                 .dst = {
130                         .__refcnt       = ATOMIC_INIT(1),
131                         .__use          = 1,
132                         .obsolete       = -1,
133                         .error          = -ENETUNREACH,
134                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
135                         .input          = ip6_pkt_discard,
136                         .output         = ip6_pkt_discard_out,
137                 }
138         },
139         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
140         .rt6i_metric    = ~(u32) 0,
141         .rt6i_ref       = ATOMIC_INIT(1),
142 };
143
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145
146 static int ip6_pkt_prohibit(struct sk_buff *skb);
147 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148
149 static struct rt6_info ip6_prohibit_entry_template = {
150         .u = {
151                 .dst = {
152                         .__refcnt       = ATOMIC_INIT(1),
153                         .__use          = 1,
154                         .obsolete       = -1,
155                         .error          = -EACCES,
156                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
157                         .input          = ip6_pkt_prohibit,
158                         .output         = ip6_pkt_prohibit_out,
159                 }
160         },
161         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
162         .rt6i_metric    = ~(u32) 0,
163         .rt6i_ref       = ATOMIC_INIT(1),
164 };
165
166 static struct rt6_info ip6_blk_hole_entry_template = {
167         .u = {
168                 .dst = {
169                         .__refcnt       = ATOMIC_INIT(1),
170                         .__use          = 1,
171                         .obsolete       = -1,
172                         .error          = -EINVAL,
173                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
174                         .input          = dst_discard,
175                         .output         = dst_discard,
176                 }
177         },
178         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
179         .rt6i_metric    = ~(u32) 0,
180         .rt6i_ref       = ATOMIC_INIT(1),
181 };
182
183 #endif
184
/*
 * Allocate a dst entry from @ops and hand it back typed as an IPv6
 * route.  Returns whatever dst_alloc() returned, i.e. possibly NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt = (struct rt6_info *)dst_alloc(ops);

	return rt;
}
190
191 static void ip6_dst_destroy(struct dst_entry *dst)
192 {
193         struct rt6_info *rt = (struct rt6_info *)dst;
194         struct inet6_dev *idev = rt->rt6i_idev;
195
196         if (idev != NULL) {
197                 rt->rt6i_idev = NULL;
198                 in6_dev_put(idev);
199         }
200 }
201
202 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
203                            int how)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207         struct net_device *loopback_dev =
208                 dev_net(dev)->loopback_dev;
209
210         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
211                 struct inet6_dev *loopback_idev =
212                         in6_dev_get(loopback_dev);
213                 if (loopback_idev != NULL) {
214                         rt->rt6i_idev = loopback_idev;
215                         in6_dev_put(idev);
216                 }
217         }
218 }
219
220 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 {
222         return (rt->rt6i_flags & RTF_EXPIRES &&
223                 time_after(jiffies, rt->rt6i_expires));
224 }
225
226 static inline int rt6_need_strict(struct in6_addr *daddr)
227 {
228         return (ipv6_addr_type(daddr) &
229                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
230 }
231
232 /*
233  *      Route lookup. Any table->tb6_lock is implied.
234  */
235
236 static inline struct rt6_info *rt6_device_match(struct net *net,
237                                                     struct rt6_info *rt,
238                                                     struct in6_addr *saddr,
239                                                     int oif,
240                                                     int flags)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (!oif && ipv6_addr_any(saddr))
246                 goto out;
247
248         for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
249                 struct net_device *dev = sprt->rt6i_dev;
250
251                 if (oif) {
252                         if (dev->ifindex == oif)
253                                 return sprt;
254                         if (dev->flags & IFF_LOOPBACK) {
255                                 if (sprt->rt6i_idev == NULL ||
256                                     sprt->rt6i_idev->dev->ifindex != oif) {
257                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
258                                                 continue;
259                                         if (local && (!oif ||
260                                                       local->rt6i_idev->dev->ifindex == oif))
261                                                 continue;
262                                 }
263                                 local = sprt;
264                         }
265                 } else {
266                         if (ipv6_chk_addr(net, saddr, dev,
267                                           flags & RT6_LOOKUP_F_IFACE))
268                                 return sprt;
269                 }
270         }
271
272         if (oif) {
273                 if (local)
274                         return local;
275
276                 if (flags & RT6_LOOKUP_F_IFACE)
277                         return net->ipv6.ip6_null_entry;
278         }
279 out:
280         return rt;
281 }
282
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards
 * the next hop of @rt when its neighbour entry is not in a NUD_VALID
 * state.  Probes are rate-limited through neigh->updated and the
 * interface's rtr_probe_interval setting.  A NULL @rt, or a route
 * without a next-hop neighbour, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	/* Cheap unlocked pre-check; repeated below under the lock. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		/* Reachable by now, or probed too recently. */
		read_unlock_bh(&neigh->lock);
		return;
	}

	/*
	 * NOTE(review): neigh->updated is written while only the read
	 * side of neigh->lock is held - confirm this is intended.
	 */
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	/* Solicit the neighbour on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing does nothing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
318
319 /*
320  * Default Router Selection (RFC 2461 6.3.6)
321  */
322 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
323 {
324         struct net_device *dev = rt->rt6i_dev;
325         if (!oif || dev->ifindex == oif)
326                 return 2;
327         if ((dev->flags & IFF_LOOPBACK) &&
328             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
329                 return 1;
330         return 0;
331 }
332
333 static inline int rt6_check_neigh(struct rt6_info *rt)
334 {
335         struct neighbour *neigh = rt->rt6i_nexthop;
336         int m;
337         if (rt->rt6i_flags & RTF_NONEXTHOP ||
338             !(rt->rt6i_flags & RTF_GATEWAY))
339                 m = 1;
340         else if (neigh) {
341                 read_lock_bh(&neigh->lock);
342                 if (neigh->nud_state & NUD_VALID)
343                         m = 2;
344 #ifdef CONFIG_IPV6_ROUTER_PREF
345                 else if (neigh->nud_state & NUD_FAILED)
346                         m = 0;
347 #endif
348                 else
349                         m = 1;
350                 read_unlock_bh(&neigh->lock);
351         } else
352                 m = 0;
353         return m;
354 }
355
356 static int rt6_score_route(struct rt6_info *rt, int oif,
357                            int strict)
358 {
359         int m, n;
360
361         m = rt6_check_dev(rt, oif);
362         if (!m && (strict & RT6_LOOKUP_F_IFACE))
363                 return -1;
364 #ifdef CONFIG_IPV6_ROUTER_PREF
365         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
366 #endif
367         n = rt6_check_neigh(rt);
368         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
369                 return -1;
370         return m;
371 }
372
373 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
374                                    int *mpri, struct rt6_info *match)
375 {
376         int m;
377
378         if (rt6_check_expired(rt))
379                 goto out;
380
381         m = rt6_score_route(rt, oif, strict);
382         if (m < 0)
383                 goto out;
384
385         if (m > *mpri) {
386                 if (strict & RT6_LOOKUP_F_REACHABLE)
387                         rt6_probe(match);
388                 *mpri = m;
389                 match = rt;
390         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
391                 rt6_probe(rt);
392         }
393
394 out:
395         return match;
396 }
397
398 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
399                                      struct rt6_info *rr_head,
400                                      u32 metric, int oif, int strict)
401 {
402         struct rt6_info *rt, *match;
403         int mpri = -1;
404
405         match = NULL;
406         for (rt = rr_head; rt && rt->rt6i_metric == metric;
407              rt = rt->u.dst.rt6_next)
408                 match = find_match(rt, oif, strict, &mpri, match);
409         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
410              rt = rt->u.dst.rt6_next)
411                 match = find_match(rt, oif, strict, &mpri, match);
412
413         return match;
414 }
415
416 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
417 {
418         struct rt6_info *match, *rt0;
419         struct net *net;
420
421         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
422                   __func__, fn->leaf, oif);
423
424         rt0 = fn->rr_ptr;
425         if (!rt0)
426                 fn->rr_ptr = rt0 = fn->leaf;
427
428         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
429
430         if (!match &&
431             (strict & RT6_LOOKUP_F_REACHABLE)) {
432                 struct rt6_info *next = rt0->u.dst.rt6_next;
433
434                 /* no entries matched; do round-robin */
435                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
436                         next = fn->leaf;
437
438                 if (next != rt0)
439                         fn->rr_ptr = next;
440         }
441
442         RT6_TRACE("%s() => %p\n",
443                   __func__, match);
444
445         net = dev_net(rt0->rt6i_dev);
446         return (match ? match : net->ipv6.ip6_null_entry);
447 }
448
449 #ifdef CONFIG_IPV6_ROUTE_INFO
450 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
451                   struct in6_addr *gwaddr)
452 {
453         struct net *net = dev_net(dev);
454         struct route_info *rinfo = (struct route_info *) opt;
455         struct in6_addr prefix_buf, *prefix;
456         unsigned int pref;
457         unsigned long lifetime;
458         struct rt6_info *rt;
459
460         if (len < sizeof(struct route_info)) {
461                 return -EINVAL;
462         }
463
464         /* Sanity check for prefix_len and length */
465         if (rinfo->length > 3) {
466                 return -EINVAL;
467         } else if (rinfo->prefix_len > 128) {
468                 return -EINVAL;
469         } else if (rinfo->prefix_len > 64) {
470                 if (rinfo->length < 2) {
471                         return -EINVAL;
472                 }
473         } else if (rinfo->prefix_len > 0) {
474                 if (rinfo->length < 1) {
475                         return -EINVAL;
476                 }
477         }
478
479         pref = rinfo->route_pref;
480         if (pref == ICMPV6_ROUTER_PREF_INVALID)
481                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
482
483         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
484
485         if (rinfo->length == 3)
486                 prefix = (struct in6_addr *)rinfo->prefix;
487         else {
488                 /* this function is safe */
489                 ipv6_addr_prefix(&prefix_buf,
490                                  (struct in6_addr *)rinfo->prefix,
491                                  rinfo->prefix_len);
492                 prefix = &prefix_buf;
493         }
494
495         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
496                                 dev->ifindex);
497
498         if (rt && !lifetime) {
499                 ip6_del_rt(rt);
500                 rt = NULL;
501         }
502
503         if (!rt && lifetime)
504                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
505                                         pref);
506         else if (rt)
507                 rt->rt6i_flags = RTF_ROUTEINFO |
508                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
509
510         if (rt) {
511                 if (!addrconf_finite_timeout(lifetime)) {
512                         rt->rt6i_flags &= ~RTF_EXPIRES;
513                 } else {
514                         rt->rt6i_expires = jiffies + HZ * lifetime;
515                         rt->rt6i_flags |= RTF_EXPIRES;
516                 }
517                 dst_release(&rt->u.dst);
518         }
519         return 0;
520 }
521 #endif
522
/*
 * BACKTRACK - climb the fib6 tree after a lookup ended on the null entry.
 *
 * Expects the expanding function to provide the locals "rt" and "fn"
 * and the labels "out" and "restart".  When rt is the namespace's null
 * entry, walk towards the root: step into the parent's source subtree
 * (looked up with saddr) when there is one we did not come from,
 * otherwise to the parent itself.  Jump to "restart" at the first node
 * carrying RTN_RTINFO, or to "out" once the top-level root is reached.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn),	\
						 NULL, saddr);		\
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
540
541 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
542                                              struct fib6_table *table,
543                                              struct flowi *fl, int flags)
544 {
545         struct fib6_node *fn;
546         struct rt6_info *rt;
547
548         read_lock_bh(&table->tb6_lock);
549         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
550 restart:
551         rt = fn->leaf;
552         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
553         BACKTRACK(net, &fl->fl6_src);
554 out:
555         dst_use(&rt->u.dst, jiffies);
556         read_unlock_bh(&table->tb6_lock);
557         return rt;
558
559 }
560
561 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
562                             const struct in6_addr *saddr, int oif, int strict)
563 {
564         struct flowi fl = {
565                 .oif = oif,
566                 .nl_u = {
567                         .ip6_u = {
568                                 .daddr = *daddr,
569                         },
570                 },
571         };
572         struct dst_entry *dst;
573         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
574
575         if (saddr) {
576                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
577                 flags |= RT6_LOOKUP_F_HAS_SADDR;
578         }
579
580         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
581         if (dst->error == 0)
582                 return (struct rt6_info *) dst;
583
584         dst_release(dst);
585
586         return NULL;
587 }
588
589 EXPORT_SYMBOL(rt6_lookup);
590
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold it,
   it may be destroyed.
 */
596
597 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
598 {
599         int err;
600         struct fib6_table *table;
601
602         table = rt->rt6i_table;
603         write_lock_bh(&table->tb6_lock);
604         err = fib6_add(&table->tb6_root, rt, info);
605         write_unlock_bh(&table->tb6_lock);
606
607         return err;
608 }
609
610 int ip6_ins_rt(struct rt6_info *rt)
611 {
612         struct nl_info info = {
613                 .nl_net = dev_net(rt->rt6i_dev),
614         };
615         return __ip6_ins_rt(rt, &info);
616 }
617
618 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
619                                       struct in6_addr *saddr)
620 {
621         struct rt6_info *rt;
622
623         /*
624          *      Clone the route.
625          */
626
627         rt = ip6_rt_copy(ort);
628
629         if (rt) {
630                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631                         if (rt->rt6i_dst.plen != 128 &&
632                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633                                 rt->rt6i_flags |= RTF_ANYCAST;
634                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
635                 }
636
637                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
638                 rt->rt6i_dst.plen = 128;
639                 rt->rt6i_flags |= RTF_CACHE;
640                 rt->u.dst.flags |= DST_HOST;
641
642 #ifdef CONFIG_IPV6_SUBTREES
643                 if (rt->rt6i_src.plen && saddr) {
644                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
645                         rt->rt6i_src.plen = 128;
646                 }
647 #endif
648
649                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
650
651         }
652
653         return rt;
654 }
655
656 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
657 {
658         struct rt6_info *rt = ip6_rt_copy(ort);
659         if (rt) {
660                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
661                 rt->rt6i_dst.plen = 128;
662                 rt->rt6i_flags |= RTF_CACHE;
663                 rt->u.dst.flags |= DST_HOST;
664                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
665         }
666         return rt;
667 }
668
669 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
670                                       struct flowi *fl, int flags)
671 {
672         struct fib6_node *fn;
673         struct rt6_info *rt, *nrt;
674         int strict = 0;
675         int attempts = 3;
676         int err;
677         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
678
679         strict |= flags & RT6_LOOKUP_F_IFACE;
680
681 relookup:
682         read_lock_bh(&table->tb6_lock);
683
684 restart_2:
685         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
686
687 restart:
688         rt = rt6_select(fn, oif, strict | reachable);
689
690         BACKTRACK(net, &fl->fl6_src);
691         if (rt == net->ipv6.ip6_null_entry ||
692             rt->rt6i_flags & RTF_CACHE)
693                 goto out;
694
695         dst_hold(&rt->u.dst);
696         read_unlock_bh(&table->tb6_lock);
697
698         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
699                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
700         else {
701 #if CLONE_OFFLINK_ROUTE
702                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
703 #else
704                 goto out2;
705 #endif
706         }
707
708         dst_release(&rt->u.dst);
709         rt = nrt ? : net->ipv6.ip6_null_entry;
710
711         dst_hold(&rt->u.dst);
712         if (nrt) {
713                 err = ip6_ins_rt(nrt);
714                 if (!err)
715                         goto out2;
716         }
717
718         if (--attempts <= 0)
719                 goto out2;
720
721         /*
722          * Race condition! In the gap, when table->tb6_lock was
723          * released someone could insert this route.  Relookup.
724          */
725         dst_release(&rt->u.dst);
726         goto relookup;
727
728 out:
729         if (reachable) {
730                 reachable = 0;
731                 goto restart_2;
732         }
733         dst_hold(&rt->u.dst);
734         read_unlock_bh(&table->tb6_lock);
735 out2:
736         rt->u.dst.lastuse = jiffies;
737         rt->u.dst.__use++;
738
739         return rt;
740 }
741
742 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
743                                             struct flowi *fl, int flags)
744 {
745         return ip6_pol_route(net, table, fl->iif, fl, flags);
746 }
747
748 void ip6_route_input(struct sk_buff *skb)
749 {
750         struct ipv6hdr *iph = ipv6_hdr(skb);
751         struct net *net = dev_net(skb->dev);
752         int flags = RT6_LOOKUP_F_HAS_SADDR;
753         struct flowi fl = {
754                 .iif = skb->dev->ifindex,
755                 .nl_u = {
756                         .ip6_u = {
757                                 .daddr = iph->daddr,
758                                 .saddr = iph->saddr,
759                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
760                         },
761                 },
762                 .mark = skb->mark,
763                 .proto = iph->nexthdr,
764         };
765
766         if (rt6_need_strict(&iph->daddr))
767                 flags |= RT6_LOOKUP_F_IFACE;
768
769         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
770 }
771
772 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
773                                              struct flowi *fl, int flags)
774 {
775         return ip6_pol_route(net, table, fl->oif, fl, flags);
776 }
777
778 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
779                                     struct flowi *fl)
780 {
781         int flags = 0;
782
783         if (rt6_need_strict(&fl->fl6_dst))
784                 flags |= RT6_LOOKUP_F_IFACE;
785
786         if (!ipv6_addr_any(&fl->fl6_src))
787                 flags |= RT6_LOOKUP_F_HAS_SADDR;
788         else if (sk) {
789                 unsigned int prefs = inet6_sk(sk)->srcprefs;
790                 if (prefs & IPV6_PREFER_SRC_TMP)
791                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
792                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
793                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
794                 if (prefs & IPV6_PREFER_SRC_COA)
795                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
796         }
797
798         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
799 }
800
801 EXPORT_SYMBOL(ip6_route_output);
802
803 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
804 {
805         struct rt6_info *ort = (struct rt6_info *) *dstp;
806         struct rt6_info *rt = (struct rt6_info *)
807                 dst_alloc(&ip6_dst_blackhole_ops);
808         struct dst_entry *new = NULL;
809
810         if (rt) {
811                 new = &rt->u.dst;
812
813                 atomic_set(&new->__refcnt, 1);
814                 new->__use = 1;
815                 new->input = dst_discard;
816                 new->output = dst_discard;
817
818                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
819                 new->dev = ort->u.dst.dev;
820                 if (new->dev)
821                         dev_hold(new->dev);
822                 rt->rt6i_idev = ort->rt6i_idev;
823                 if (rt->rt6i_idev)
824                         in6_dev_hold(rt->rt6i_idev);
825                 rt->rt6i_expires = 0;
826
827                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
828                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
829                 rt->rt6i_metric = 0;
830
831                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
832 #ifdef CONFIG_IPV6_SUBTREES
833                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
834 #endif
835
836                 dst_free(new);
837         }
838
839         dst_release(*dstp);
840         *dstp = new;
841         return (new ? 0 : -ENOMEM);
842 }
843 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
844
845 /*
846  *      Destination cache support functions
847  */
848
849 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
850 {
851         struct rt6_info *rt;
852
853         rt = (struct rt6_info *) dst;
854
855         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
856                 return dst;
857
858         return NULL;
859 }
860
861 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
862 {
863         struct rt6_info *rt = (struct rt6_info *) dst;
864
865         if (rt) {
866                 if (rt->rt6i_flags & RTF_CACHE)
867                         ip6_del_rt(rt);
868                 else
869                         dst_release(dst);
870         }
871         return NULL;
872 }
873
874 static void ip6_link_failure(struct sk_buff *skb)
875 {
876         struct rt6_info *rt;
877
878         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
879
880         rt = (struct rt6_info *) skb->dst;
881         if (rt) {
882                 if (rt->rt6i_flags&RTF_CACHE) {
883                         dst_set_expires(&rt->u.dst, 0);
884                         rt->rt6i_flags |= RTF_EXPIRES;
885                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
886                         rt->rt6i_node->fn_sernum = -1;
887         }
888 }
889
890 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
891 {
892         struct rt6_info *rt6 = (struct rt6_info*)dst;
893
894         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
895                 rt6->rt6i_flags |= RTF_MODIFIED;
896                 if (mtu < IPV6_MIN_MTU) {
897                         mtu = IPV6_MIN_MTU;
898                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
899                 }
900                 dst->metrics[RTAX_MTU-1] = mtu;
901                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
902         }
903 }
904
905 static int ipv6_get_mtu(struct net_device *dev);
906
907 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
908 {
909         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
910
911         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
912                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
913
914         /*
915          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
916          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
917          * IPV6_MAXPLEN is also valid and means: "any MSS,
918          * rely only on pmtu discovery"
919          */
920         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
921                 mtu = IPV6_MAXPLEN;
922         return mtu;
923 }
924
925 static struct dst_entry *icmp6_dst_gc_list;
926 static DEFINE_SPINLOCK(icmp6_dst_lock);
927
928 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
929                                   struct neighbour *neigh,
930                                   const struct in6_addr *addr)
931 {
932         struct rt6_info *rt;
933         struct inet6_dev *idev = in6_dev_get(dev);
934         struct net *net = dev_net(dev);
935
936         if (unlikely(idev == NULL))
937                 return NULL;
938
939         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
940         if (unlikely(rt == NULL)) {
941                 in6_dev_put(idev);
942                 goto out;
943         }
944
945         dev_hold(dev);
946         if (neigh)
947                 neigh_hold(neigh);
948         else
949                 neigh = ndisc_get_neigh(dev, addr);
950
951         rt->rt6i_dev      = dev;
952         rt->rt6i_idev     = idev;
953         rt->rt6i_nexthop  = neigh;
954         atomic_set(&rt->u.dst.__refcnt, 1);
955         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
956         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
957         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
958         rt->u.dst.output  = ip6_output;
959
960 #if 0   /* there's no chance to use these for ndisc */
961         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
962                                 ? DST_HOST
963                                 : 0;
964         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
965         rt->rt6i_dst.plen = 128;
966 #endif
967
968         spin_lock_bh(&icmp6_dst_lock);
969         rt->u.dst.next = icmp6_dst_gc_list;
970         icmp6_dst_gc_list = &rt->u.dst;
971         spin_unlock_bh(&icmp6_dst_lock);
972
973         fib6_force_start_gc(net);
974
975 out:
976         return &rt->u.dst;
977 }
978
979 int icmp6_dst_gc(void)
980 {
981         struct dst_entry *dst, *next, **pprev;
982         int more = 0;
983
984         next = NULL;
985
986         spin_lock_bh(&icmp6_dst_lock);
987         pprev = &icmp6_dst_gc_list;
988
989         while ((dst = *pprev) != NULL) {
990                 if (!atomic_read(&dst->__refcnt)) {
991                         *pprev = dst->next;
992                         dst_free(dst);
993                 } else {
994                         pprev = &dst->next;
995                         ++more;
996                 }
997         }
998
999         spin_unlock_bh(&icmp6_dst_lock);
1000
1001         return more;
1002 }
1003
1004 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1005                             void *arg)
1006 {
1007         struct dst_entry *dst, **pprev;
1008
1009         spin_lock_bh(&icmp6_dst_lock);
1010         pprev = &icmp6_dst_gc_list;
1011         while ((dst = *pprev) != NULL) {
1012                 struct rt6_info *rt = (struct rt6_info *) dst;
1013                 if (func(rt, arg)) {
1014                         *pprev = dst->next;
1015                         dst_free(dst);
1016                 } else {
1017                         pprev = &dst->next;
1018                 }
1019         }
1020         spin_unlock_bh(&icmp6_dst_lock);
1021 }
1022
1023 static int ip6_dst_gc(struct dst_ops *ops)
1024 {
1025         unsigned long now = jiffies;
1026         struct net *net = ops->dst_net;
1027         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1028         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1029         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1030         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1031         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1032
1033         if (time_after(rt_last_gc + rt_min_interval, now) &&
1034             atomic_read(&ops->entries) <= rt_max_size)
1035                 goto out;
1036
1037         net->ipv6.ip6_rt_gc_expire++;
1038         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1039         net->ipv6.ip6_rt_last_gc = now;
1040         if (atomic_read(&ops->entries) < ops->gc_thresh)
1041                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1042 out:
1043         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1044         return (atomic_read(&ops->entries) > rt_max_size);
1045 }
1046
1047 /* Clean host part of a prefix. Not necessary in radix tree,
1048    but results in cleaner routing tables.
1049
1050    Remove it only when all the things will work!
1051  */
1052
1053 static int ipv6_get_mtu(struct net_device *dev)
1054 {
1055         int mtu = IPV6_MIN_MTU;
1056         struct inet6_dev *idev;
1057
1058         idev = in6_dev_get(dev);
1059         if (idev) {
1060                 mtu = idev->cnf.mtu6;
1061                 in6_dev_put(idev);
1062         }
1063         return mtu;
1064 }
1065
1066 int ip6_dst_hoplimit(struct dst_entry *dst)
1067 {
1068         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1069         if (hoplimit < 0) {
1070                 struct net_device *dev = dst->dev;
1071                 struct inet6_dev *idev = in6_dev_get(dev);
1072                 if (idev) {
1073                         hoplimit = idev->cnf.hop_limit;
1074                         in6_dev_put(idev);
1075                 } else
1076                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1077         }
1078         return hoplimit;
1079 }
1080
1081 /*
1082  *
1083  */
1084
1085 int ip6_route_add(struct fib6_config *cfg)
1086 {
1087         int err;
1088         struct net *net = cfg->fc_nlinfo.nl_net;
1089         struct rt6_info *rt = NULL;
1090         struct net_device *dev = NULL;
1091         struct inet6_dev *idev = NULL;
1092         struct fib6_table *table;
1093         int addr_type;
1094
1095         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1096                 return -EINVAL;
1097 #ifndef CONFIG_IPV6_SUBTREES
1098         if (cfg->fc_src_len)
1099                 return -EINVAL;
1100 #endif
1101         if (cfg->fc_ifindex) {
1102                 err = -ENODEV;
1103                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1104                 if (!dev)
1105                         goto out;
1106                 idev = in6_dev_get(dev);
1107                 if (!idev)
1108                         goto out;
1109         }
1110
1111         if (cfg->fc_metric == 0)
1112                 cfg->fc_metric = IP6_RT_PRIO_USER;
1113
1114         table = fib6_new_table(net, cfg->fc_table);
1115         if (table == NULL) {
1116                 err = -ENOBUFS;
1117                 goto out;
1118         }
1119
1120         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1121
1122         if (rt == NULL) {
1123                 err = -ENOMEM;
1124                 goto out;
1125         }
1126
1127         rt->u.dst.obsolete = -1;
1128         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1129                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1130                                 0;
1131
1132         if (cfg->fc_protocol == RTPROT_UNSPEC)
1133                 cfg->fc_protocol = RTPROT_BOOT;
1134         rt->rt6i_protocol = cfg->fc_protocol;
1135
1136         addr_type = ipv6_addr_type(&cfg->fc_dst);
1137
1138         if (addr_type & IPV6_ADDR_MULTICAST)
1139                 rt->u.dst.input = ip6_mc_input;
1140         else
1141                 rt->u.dst.input = ip6_forward;
1142
1143         rt->u.dst.output = ip6_output;
1144
1145         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1146         rt->rt6i_dst.plen = cfg->fc_dst_len;
1147         if (rt->rt6i_dst.plen == 128)
1148                rt->u.dst.flags = DST_HOST;
1149
1150 #ifdef CONFIG_IPV6_SUBTREES
1151         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1152         rt->rt6i_src.plen = cfg->fc_src_len;
1153 #endif
1154
1155         rt->rt6i_metric = cfg->fc_metric;
1156
1157         /* We cannot add true routes via loopback here,
1158            they would result in kernel looping; promote them to reject routes
1159          */
1160         if ((cfg->fc_flags & RTF_REJECT) ||
1161             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1162                 /* hold loopback dev/idev if we haven't done so. */
1163                 if (dev != net->loopback_dev) {
1164                         if (dev) {
1165                                 dev_put(dev);
1166                                 in6_dev_put(idev);
1167                         }
1168                         dev = net->loopback_dev;
1169                         dev_hold(dev);
1170                         idev = in6_dev_get(dev);
1171                         if (!idev) {
1172                                 err = -ENODEV;
1173                                 goto out;
1174                         }
1175                 }
1176                 rt->u.dst.output = ip6_pkt_discard_out;
1177                 rt->u.dst.input = ip6_pkt_discard;
1178                 rt->u.dst.error = -ENETUNREACH;
1179                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1180                 goto install_route;
1181         }
1182
1183         if (cfg->fc_flags & RTF_GATEWAY) {
1184                 struct in6_addr *gw_addr;
1185                 int gwa_type;
1186
1187                 gw_addr = &cfg->fc_gateway;
1188                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1189                 gwa_type = ipv6_addr_type(gw_addr);
1190
1191                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1192                         struct rt6_info *grt;
1193
1194                         /* IPv6 strictly inhibits using not link-local
1195                            addresses as nexthop address.
1196                            Otherwise, router will not able to send redirects.
1197                            It is very good, but in some (rare!) circumstances
1198                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1199                            some exceptions. --ANK
1200                          */
1201                         err = -EINVAL;
1202                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1203                                 goto out;
1204
1205                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1206
1207                         err = -EHOSTUNREACH;
1208                         if (grt == NULL)
1209                                 goto out;
1210                         if (dev) {
1211                                 if (dev != grt->rt6i_dev) {
1212                                         dst_release(&grt->u.dst);
1213                                         goto out;
1214                                 }
1215                         } else {
1216                                 dev = grt->rt6i_dev;
1217                                 idev = grt->rt6i_idev;
1218                                 dev_hold(dev);
1219                                 in6_dev_hold(grt->rt6i_idev);
1220                         }
1221                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1222                                 err = 0;
1223                         dst_release(&grt->u.dst);
1224
1225                         if (err)
1226                                 goto out;
1227                 }
1228                 err = -EINVAL;
1229                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1230                         goto out;
1231         }
1232
1233         err = -ENODEV;
1234         if (dev == NULL)
1235                 goto out;
1236
1237         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1238                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1239                 if (IS_ERR(rt->rt6i_nexthop)) {
1240                         err = PTR_ERR(rt->rt6i_nexthop);
1241                         rt->rt6i_nexthop = NULL;
1242                         goto out;
1243                 }
1244         }
1245
1246         rt->rt6i_flags = cfg->fc_flags;
1247
1248 install_route:
1249         if (cfg->fc_mx) {
1250                 struct nlattr *nla;
1251                 int remaining;
1252
1253                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1254                         int type = nla_type(nla);
1255
1256                         if (type) {
1257                                 if (type > RTAX_MAX) {
1258                                         err = -EINVAL;
1259                                         goto out;
1260                                 }
1261
1262                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1263                         }
1264                 }
1265         }
1266
1267         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1268                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1269         if (!dst_mtu(&rt->u.dst))
1270                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1271         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1272                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1273         rt->u.dst.dev = dev;
1274         rt->rt6i_idev = idev;
1275         rt->rt6i_table = table;
1276
1277         cfg->fc_nlinfo.nl_net = dev_net(dev);
1278
1279         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1280
1281 out:
1282         if (dev)
1283                 dev_put(dev);
1284         if (idev)
1285                 in6_dev_put(idev);
1286         if (rt)
1287                 dst_free(&rt->u.dst);
1288         return err;
1289 }
1290
1291 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1292 {
1293         int err;
1294         struct fib6_table *table;
1295         struct net *net = dev_net(rt->rt6i_dev);
1296
1297         if (rt == net->ipv6.ip6_null_entry)
1298                 return -ENOENT;
1299
1300         table = rt->rt6i_table;
1301         write_lock_bh(&table->tb6_lock);
1302
1303         err = fib6_del(rt, info);
1304         dst_release(&rt->u.dst);
1305
1306         write_unlock_bh(&table->tb6_lock);
1307
1308         return err;
1309 }
1310
1311 int ip6_del_rt(struct rt6_info *rt)
1312 {
1313         struct nl_info info = {
1314                 .nl_net = dev_net(rt->rt6i_dev),
1315         };
1316         return __ip6_del_rt(rt, &info);
1317 }
1318
1319 static int ip6_route_del(struct fib6_config *cfg)
1320 {
1321         struct fib6_table *table;
1322         struct fib6_node *fn;
1323         struct rt6_info *rt;
1324         int err = -ESRCH;
1325
1326         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1327         if (table == NULL)
1328                 return err;
1329
1330         read_lock_bh(&table->tb6_lock);
1331
1332         fn = fib6_locate(&table->tb6_root,
1333                          &cfg->fc_dst, cfg->fc_dst_len,
1334                          &cfg->fc_src, cfg->fc_src_len);
1335
1336         if (fn) {
1337                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1338                         if (cfg->fc_ifindex &&
1339                             (rt->rt6i_dev == NULL ||
1340                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1341                                 continue;
1342                         if (cfg->fc_flags & RTF_GATEWAY &&
1343                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1344                                 continue;
1345                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1346                                 continue;
1347                         dst_hold(&rt->u.dst);
1348                         read_unlock_bh(&table->tb6_lock);
1349
1350                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1351                 }
1352         }
1353         read_unlock_bh(&table->tb6_lock);
1354
1355         return err;
1356 }
1357
1358 /*
1359  *      Handle redirects
1360  */
/*
 * Flow key for redirect lookups: a regular flow plus the gateway address
 * the candidate route must have.
 *
 * 'fl' has to stay the first member: ip6_route_redirect() passes a pointer
 * to this struct cast to struct flowi *, and __ip6_route_redirect() casts
 * it back to reach 'gateway'.
 */
struct ip6rd_flowi {
	struct flowi fl;
	struct in6_addr gateway;
};
1365
1366 static struct rt6_info *__ip6_route_redirect(struct net *net,
1367                                              struct fib6_table *table,
1368                                              struct flowi *fl,
1369                                              int flags)
1370 {
1371         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1372         struct rt6_info *rt;
1373         struct fib6_node *fn;
1374
1375         /*
1376          * Get the "current" route for this destination and
1377          * check if the redirect has come from approriate router.
1378          *
1379          * RFC 2461 specifies that redirects should only be
1380          * accepted if they come from the nexthop to the target.
1381          * Due to the way the routes are chosen, this notion
1382          * is a bit fuzzy and one might need to check all possible
1383          * routes.
1384          */
1385
1386         read_lock_bh(&table->tb6_lock);
1387         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1388 restart:
1389         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1390                 /*
1391                  * Current route is on-link; redirect is always invalid.
1392                  *
1393                  * Seems, previous statement is not true. It could
1394                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1395                  * But then router serving it might decide, that we should
1396                  * know truth 8)8) --ANK (980726).
1397                  */
1398                 if (rt6_check_expired(rt))
1399                         continue;
1400                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1401                         continue;
1402                 if (fl->oif != rt->rt6i_dev->ifindex)
1403                         continue;
1404                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1405                         continue;
1406                 break;
1407         }
1408
1409         if (!rt)
1410                 rt = net->ipv6.ip6_null_entry;
1411         BACKTRACK(net, &fl->fl6_src);
1412 out:
1413         dst_hold(&rt->u.dst);
1414
1415         read_unlock_bh(&table->tb6_lock);
1416
1417         return rt;
1418 };
1419
1420 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1421                                            struct in6_addr *src,
1422                                            struct in6_addr *gateway,
1423                                            struct net_device *dev)
1424 {
1425         int flags = RT6_LOOKUP_F_HAS_SADDR;
1426         struct net *net = dev_net(dev);
1427         struct ip6rd_flowi rdfl = {
1428                 .fl = {
1429                         .oif = dev->ifindex,
1430                         .nl_u = {
1431                                 .ip6_u = {
1432                                         .daddr = *dest,
1433                                         .saddr = *src,
1434                                 },
1435                         },
1436                 },
1437                 .gateway = *gateway,
1438         };
1439
1440         if (rt6_need_strict(dest))
1441                 flags |= RT6_LOOKUP_F_IFACE;
1442
1443         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1444                                                    flags, __ip6_route_redirect);
1445 }
1446
1447 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1448                   struct in6_addr *saddr,
1449                   struct neighbour *neigh, u8 *lladdr, int on_link)
1450 {
1451         struct rt6_info *rt, *nrt = NULL;
1452         struct netevent_redirect netevent;
1453         struct net *net = dev_net(neigh->dev);
1454
1455         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1456
1457         if (rt == net->ipv6.ip6_null_entry) {
1458                 if (net_ratelimit())
1459                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1460                                "for redirect target\n");
1461                 goto out;
1462         }
1463
1464         /*
1465          *      We have finally decided to accept it.
1466          */
1467
1468         neigh_update(neigh, lladdr, NUD_STALE,
1469                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1470                      NEIGH_UPDATE_F_OVERRIDE|
1471                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1472                                      NEIGH_UPDATE_F_ISROUTER))
1473                      );
1474
1475         /*
1476          * Redirect received -> path was valid.
1477          * Look, redirects are sent only in response to data packets,
1478          * so that this nexthop apparently is reachable. --ANK
1479          */
1480         dst_confirm(&rt->u.dst);
1481
1482         /* Duplicate redirect: silently ignore. */
1483         if (neigh == rt->u.dst.neighbour)
1484                 goto out;
1485
1486         nrt = ip6_rt_copy(rt);
1487         if (nrt == NULL)
1488                 goto out;
1489
1490         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1491         if (on_link)
1492                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1493
1494         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1495         nrt->rt6i_dst.plen = 128;
1496         nrt->u.dst.flags |= DST_HOST;
1497
1498         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1499         nrt->rt6i_nexthop = neigh_clone(neigh);
1500         /* Reset pmtu, it may be better */
1501         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1502         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1503                                                         dst_mtu(&nrt->u.dst));
1504
1505         if (ip6_ins_rt(nrt))
1506                 goto out;
1507
1508         netevent.old = &rt->u.dst;
1509         netevent.new = &nrt->u.dst;
1510         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1511
1512         if (rt->rt6i_flags&RTF_CACHE) {
1513                 ip6_del_rt(rt);
1514                 return;
1515         }
1516
1517 out:
1518         dst_release(&rt->u.dst);
1519         return;
1520 }
1521
1522 /*
1523  *      Handle ICMP "packet too big" messages
1524  *      i.e. Path MTU discovery
1525  */
1526
1527 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1528                         struct net_device *dev, u32 pmtu)
1529 {
1530         struct rt6_info *rt, *nrt;
1531         struct net *net = dev_net(dev);
1532         int allfrag = 0;
1533
1534         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1535         if (rt == NULL)
1536                 return;
1537
1538         if (pmtu >= dst_mtu(&rt->u.dst))
1539                 goto out;
1540
1541         if (pmtu < IPV6_MIN_MTU) {
1542                 /*
1543                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1544                  * MTU (1280) and a fragment header should always be included
1545                  * after a node receiving Too Big message reporting PMTU is
1546                  * less than the IPv6 Minimum Link MTU.
1547                  */
1548                 pmtu = IPV6_MIN_MTU;
1549                 allfrag = 1;
1550         }
1551
1552         /* New mtu received -> path was valid.
1553            They are sent only in response to data packets,
1554            so that this nexthop apparently is reachable. --ANK
1555          */
1556         dst_confirm(&rt->u.dst);
1557
1558         /* Host route. If it is static, it would be better
1559            not to override it, but add new one, so that
1560            when cache entry will expire old pmtu
1561            would return automatically.
1562          */
1563         if (rt->rt6i_flags & RTF_CACHE) {
1564                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1565                 if (allfrag)
1566                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1567                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1568                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1569                 goto out;
1570         }
1571
1572         /* Network route.
1573            Two cases are possible:
1574            1. It is connected route. Action: COW
1575            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1576          */
1577         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1578                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1579         else
1580                 nrt = rt6_alloc_clone(rt, daddr);
1581
1582         if (nrt) {
1583                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1584                 if (allfrag)
1585                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1586
1587                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1588                  * happened within 5 mins, the recommended timer is 10 mins.
1589                  * Here this route expiration time is set to ip6_rt_mtu_expires
1590                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1591                  * and detecting PMTU increase will be automatically happened.
1592                  */
1593                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1594                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1595
1596                 ip6_ins_rt(nrt);
1597         }
1598 out:
1599         dst_release(&rt->u.dst);
1600 }
1601
1602 /*
1603  *      Misc support functions
1604  */
1605
1606 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1607 {
1608         struct net *net = dev_net(ort->rt6i_dev);
1609         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1610
1611         if (rt) {
1612                 rt->u.dst.input = ort->u.dst.input;
1613                 rt->u.dst.output = ort->u.dst.output;
1614
1615                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1616                 rt->u.dst.error = ort->u.dst.error;
1617                 rt->u.dst.dev = ort->u.dst.dev;
1618                 if (rt->u.dst.dev)
1619                         dev_hold(rt->u.dst.dev);
1620                 rt->rt6i_idev = ort->rt6i_idev;
1621                 if (rt->rt6i_idev)
1622                         in6_dev_hold(rt->rt6i_idev);
1623                 rt->u.dst.lastuse = jiffies;
1624                 rt->rt6i_expires = 0;
1625
1626                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1627                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1628                 rt->rt6i_metric = 0;
1629
1630                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1631 #ifdef CONFIG_IPV6_SUBTREES
1632                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1633 #endif
1634                 rt->rt6i_table = ort->rt6i_table;
1635         }
1636         return rt;
1637 }
1638
1639 #ifdef CONFIG_IPV6_ROUTE_INFO
1640 static struct rt6_info *rt6_get_route_info(struct net *net,
1641                                            struct in6_addr *prefix, int prefixlen,
1642                                            struct in6_addr *gwaddr, int ifindex)
1643 {
1644         struct fib6_node *fn;
1645         struct rt6_info *rt = NULL;
1646         struct fib6_table *table;
1647
1648         table = fib6_get_table(net, RT6_TABLE_INFO);
1649         if (table == NULL)
1650                 return NULL;
1651
1652         write_lock_bh(&table->tb6_lock);
1653         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1654         if (!fn)
1655                 goto out;
1656
1657         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1658                 if (rt->rt6i_dev->ifindex != ifindex)
1659                         continue;
1660                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1661                         continue;
1662                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1663                         continue;
1664                 dst_hold(&rt->u.dst);
1665                 break;
1666         }
1667 out:
1668         write_unlock_bh(&table->tb6_lock);
1669         return rt;
1670 }
1671
1672 static struct rt6_info *rt6_add_route_info(struct net *net,
1673                                            struct in6_addr *prefix, int prefixlen,
1674                                            struct in6_addr *gwaddr, int ifindex,
1675                                            unsigned pref)
1676 {
1677         struct fib6_config cfg = {
1678                 .fc_table       = RT6_TABLE_INFO,
1679                 .fc_metric      = IP6_RT_PRIO_USER,
1680                 .fc_ifindex     = ifindex,
1681                 .fc_dst_len     = prefixlen,
1682                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1683                                   RTF_UP | RTF_PREF(pref),
1684                 .fc_nlinfo.pid = 0,
1685                 .fc_nlinfo.nlh = NULL,
1686                 .fc_nlinfo.nl_net = net,
1687         };
1688
1689         ipv6_addr_copy(&cfg.fc_dst, prefix);
1690         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1691
1692         /* We should treat it as a default route if prefix length is 0. */
1693         if (!prefixlen)
1694                 cfg.fc_flags |= RTF_DEFAULT;
1695
1696         ip6_route_add(&cfg);
1697
1698         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1699 }
1700 #endif
1701
1702 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1703 {
1704         struct rt6_info *rt;
1705         struct fib6_table *table;
1706
1707         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1708         if (table == NULL)
1709                 return NULL;
1710
1711         write_lock_bh(&table->tb6_lock);
1712         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1713                 if (dev == rt->rt6i_dev &&
1714                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1715                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1716                         break;
1717         }
1718         if (rt)
1719                 dst_hold(&rt->u.dst);
1720         write_unlock_bh(&table->tb6_lock);
1721         return rt;
1722 }
1723
1724 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1725                                      struct net_device *dev,
1726                                      unsigned int pref)
1727 {
1728         struct fib6_config cfg = {
1729                 .fc_table       = RT6_TABLE_DFLT,
1730                 .fc_metric      = IP6_RT_PRIO_USER,
1731                 .fc_ifindex     = dev->ifindex,
1732                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1733                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1734                 .fc_nlinfo.pid = 0,
1735                 .fc_nlinfo.nlh = NULL,
1736                 .fc_nlinfo.nl_net = dev_net(dev),
1737         };
1738
1739         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1740
1741         ip6_route_add(&cfg);
1742
1743         return rt6_get_dflt_router(gwaddr, dev);
1744 }
1745
1746 void rt6_purge_dflt_routers(struct net *net)
1747 {
1748         struct rt6_info *rt;
1749         struct fib6_table *table;
1750
1751         /* NOTE: Keep consistent with rt6_get_dflt_router */
1752         table = fib6_get_table(net, RT6_TABLE_DFLT);
1753         if (table == NULL)
1754                 return;
1755
1756 restart:
1757         read_lock_bh(&table->tb6_lock);
1758         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1759                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1760                         dst_hold(&rt->u.dst);
1761                         read_unlock_bh(&table->tb6_lock);
1762                         ip6_del_rt(rt);
1763                         goto restart;
1764                 }
1765         }
1766         read_unlock_bh(&table->tb6_lock);
1767 }
1768
1769 static void rtmsg_to_fib6_config(struct net *net,
1770                                  struct in6_rtmsg *rtmsg,
1771                                  struct fib6_config *cfg)
1772 {
1773         memset(cfg, 0, sizeof(*cfg));
1774
1775         cfg->fc_table = RT6_TABLE_MAIN;
1776         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1777         cfg->fc_metric = rtmsg->rtmsg_metric;
1778         cfg->fc_expires = rtmsg->rtmsg_info;
1779         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1780         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1781         cfg->fc_flags = rtmsg->rtmsg_flags;
1782
1783         cfg->fc_nlinfo.nl_net = net;
1784
1785         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1786         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1787         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1788 }
1789
1790 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1791 {
1792         struct fib6_config cfg;
1793         struct in6_rtmsg rtmsg;
1794         int err;
1795
1796         switch(cmd) {
1797         case SIOCADDRT:         /* Add a route */
1798         case SIOCDELRT:         /* Delete a route */
1799                 if (!capable(CAP_NET_ADMIN))
1800                         return -EPERM;
1801                 err = copy_from_user(&rtmsg, arg,
1802                                      sizeof(struct in6_rtmsg));
1803                 if (err)
1804                         return -EFAULT;
1805
1806                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1807
1808                 rtnl_lock();
1809                 switch (cmd) {
1810                 case SIOCADDRT:
1811                         err = ip6_route_add(&cfg);
1812                         break;
1813                 case SIOCDELRT:
1814                         err = ip6_route_del(&cfg);
1815                         break;
1816                 default:
1817                         err = -EINVAL;
1818                 }
1819                 rtnl_unlock();
1820
1821                 return err;
1822         }
1823
1824         return -EINVAL;
1825 }
1826
1827 /*
1828  *      Drop the packet on the floor
1829  */
1830
1831 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1832 {
1833         int type;
1834         struct dst_entry *dst = skb->dst;
1835         switch (ipstats_mib_noroutes) {
1836         case IPSTATS_MIB_INNOROUTES:
1837                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1838                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1839                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1840                                       IPSTATS_MIB_INADDRERRORS);
1841                         break;
1842                 }
1843                 /* FALLTHROUGH */
1844         case IPSTATS_MIB_OUTNOROUTES:
1845                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1846                               ipstats_mib_noroutes);
1847                 break;
1848         }
1849         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1850         kfree_skb(skb);
1851         return 0;
1852 }
1853
1854 static int ip6_pkt_discard(struct sk_buff *skb)
1855 {
1856         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1857 }
1858
1859 static int ip6_pkt_discard_out(struct sk_buff *skb)
1860 {
1861         skb->dev = skb->dst->dev;
1862         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1863 }
1864
1865 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1866
1867 static int ip6_pkt_prohibit(struct sk_buff *skb)
1868 {
1869         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1870 }
1871
1872 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1873 {
1874         skb->dev = skb->dst->dev;
1875         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1876 }
1877
1878 #endif
1879
1880 /*
1881  *      Allocate a dst for local (unicast / anycast) address.
1882  */
1883
1884 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1885                                     const struct in6_addr *addr,
1886                                     int anycast)
1887 {
1888         struct net *net = dev_net(idev->dev);
1889         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1890
1891         if (rt == NULL)
1892                 return ERR_PTR(-ENOMEM);
1893
1894         dev_hold(net->loopback_dev);
1895         in6_dev_hold(idev);
1896
1897         rt->u.dst.flags = DST_HOST;
1898         rt->u.dst.input = ip6_input;
1899         rt->u.dst.output = ip6_output;
1900         rt->rt6i_dev = net->loopback_dev;
1901         rt->rt6i_idev = idev;
1902         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1903         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1904         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1905         rt->u.dst.obsolete = -1;
1906
1907         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1908         if (anycast)
1909                 rt->rt6i_flags |= RTF_ANYCAST;
1910         else
1911                 rt->rt6i_flags |= RTF_LOCAL;
1912         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1913         if (rt->rt6i_nexthop == NULL) {
1914                 dst_free(&rt->u.dst);
1915                 return ERR_PTR(-ENOMEM);
1916         }
1917
1918         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1919         rt->rt6i_dst.plen = 128;
1920         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1921
1922         atomic_set(&rt->u.dst.__refcnt, 1);
1923
1924         return rt;
1925 }
1926
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every route */
	struct net *net;	/* namespace whose ip6_null_entry is always skipped */
};
1931
1932 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1933 {
1934         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1935         struct net *net = ((struct arg_dev_net *)arg)->net;
1936
1937         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1938             rt != net->ipv6.ip6_null_entry) {
1939                 RT6_TRACE("deleted by ifdown %p\n", rt);
1940                 return -1;
1941         }
1942         return 0;
1943 }
1944
1945 void rt6_ifdown(struct net *net, struct net_device *dev)
1946 {
1947         struct arg_dev_net adn = {
1948                 .dev = dev,
1949                 .net = net,
1950         };
1951
1952         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1953         icmp6_clean_all(fib6_ifdown, &adn);
1954 }
1955
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1961
1962 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1963 {
1964         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1965         struct inet6_dev *idev;
1966         struct net *net = dev_net(arg->dev);
1967
1968         /* In IPv6 pmtu discovery is not optional,
1969            so that RTAX_MTU lock cannot disable it.
1970            We still use this lock to block changes
1971            caused by addrconf/ndisc.
1972         */
1973
1974         idev = __in6_dev_get(arg->dev);
1975         if (idev == NULL)
1976                 return 0;
1977
1978         /* For administrative MTU increase, there is no way to discover
1979            IPv6 PMTU increase, so PMTU increase should be updated here.
1980            Since RFC 1981 doesn't include administrative MTU increase
1981            update PMTU increase is a MUST. (i.e. jumbo frame)
1982          */
1983         /*
1984            If new MTU is less than route PMTU, this new MTU will be the
1985            lowest MTU in the path, update the route PMTU to reflect PMTU
1986            decreases; if new MTU is greater than route PMTU, and the
1987            old MTU is the lowest MTU in the path, update the route PMTU
1988            to reflect the increase. In this case if the other nodes' MTU
1989            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1990            PMTU discouvery.
1991          */
1992         if (rt->rt6i_dev == arg->dev &&
1993             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1994             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1995              (dst_mtu(&rt->u.dst) < arg->mtu &&
1996               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1997                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1998                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1999         }
2000         return 0;
2001 }
2002
2003 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2004 {
2005         struct rt6_mtu_change_arg arg = {
2006                 .dev = dev,
2007                 .mtu = mtu,
2008         };
2009
2010         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2011 }
2012
2013 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2014         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2015         [RTA_OIF]               = { .type = NLA_U32 },
2016         [RTA_IIF]               = { .type = NLA_U32 },
2017         [RTA_PRIORITY]          = { .type = NLA_U32 },
2018         [RTA_METRICS]           = { .type = NLA_NESTED },
2019 };
2020
2021 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2022                               struct fib6_config *cfg)
2023 {
2024         struct rtmsg *rtm;
2025         struct nlattr *tb[RTA_MAX+1];
2026         int err;
2027
2028         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2029         if (err < 0)
2030                 goto errout;
2031
2032         err = -EINVAL;
2033         rtm = nlmsg_data(nlh);
2034         memset(cfg, 0, sizeof(*cfg));
2035
2036         cfg->fc_table = rtm->rtm_table;
2037         cfg->fc_dst_len = rtm->rtm_dst_len;
2038         cfg->fc_src_len = rtm->rtm_src_len;
2039         cfg->fc_flags = RTF_UP;
2040         cfg->fc_protocol = rtm->rtm_protocol;
2041
2042         if (rtm->rtm_type == RTN_UNREACHABLE)
2043                 cfg->fc_flags |= RTF_REJECT;
2044
2045         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2046         cfg->fc_nlinfo.nlh = nlh;
2047         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2048
2049         if (tb[RTA_GATEWAY]) {
2050                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2051                 cfg->fc_flags |= RTF_GATEWAY;
2052         }
2053
2054         if (tb[RTA_DST]) {
2055                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2056
2057                 if (nla_len(tb[RTA_DST]) < plen)
2058                         goto errout;
2059
2060                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2061         }
2062
2063         if (tb[RTA_SRC]) {
2064                 int plen = (rtm->rtm_src_len + 7) >> 3;
2065
2066                 if (nla_len(tb[RTA_SRC]) < plen)
2067                         goto errout;
2068
2069                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2070         }
2071
2072         if (tb[RTA_OIF])
2073                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2074
2075         if (tb[RTA_PRIORITY])
2076                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2077
2078         if (tb[RTA_METRICS]) {
2079                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2080                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2081         }
2082
2083         if (tb[RTA_TABLE])
2084                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2085
2086         err = 0;
2087 errout:
2088         return err;
2089 }
2090
2091 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2092 {
2093         struct fib6_config cfg;
2094         int err;
2095
2096         err = rtm_to_fib6_config(skb, nlh, &cfg);
2097         if (err < 0)
2098                 return err;
2099
2100         return ip6_route_del(&cfg);
2101 }
2102
2103 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2104 {
2105         struct fib6_config cfg;
2106         int err;
2107
2108         err = rtm_to_fib6_config(skb, nlh, &cfg);
2109         if (err < 0)
2110                 return err;
2111
2112         return ip6_route_add(&cfg);
2113 }
2114
2115 static inline size_t rt6_nlmsg_size(void)
2116 {
2117         return NLMSG_ALIGN(sizeof(struct rtmsg))
2118                + nla_total_size(16) /* RTA_SRC */
2119                + nla_total_size(16) /* RTA_DST */
2120                + nla_total_size(16) /* RTA_GATEWAY */
2121                + nla_total_size(16) /* RTA_PREFSRC */
2122                + nla_total_size(4) /* RTA_TABLE */
2123                + nla_total_size(4) /* RTA_IIF */
2124                + nla_total_size(4) /* RTA_OIF */
2125                + nla_total_size(4) /* RTA_PRIORITY */
2126                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2127                + nla_total_size(sizeof(struct rta_cacheinfo));
2128 }
2129
/*
 *	Append one route message describing @rt to @skb.
 *
 *	@dst/@src: when non-NULL they are emitted as RTA_DST / RTA_SRC with a
 *	           prefix length of 128 instead of the route's own prefixes
 *	@iif:      non-zero input interface index of the query, if any
 *	@type, @pid, @seq, @flags: passed to nlmsg_put() for the header
 *	@prefix:   non-zero to emit only routes flagged RTF_PREFIX_RT
 *	@nowait:   passed to ip6mr_get_route() for multicast destinations
 *
 *	Returns 1 without writing anything when @prefix filters the route
 *	out, -EMSGSIZE when the header cannot be added or the
 *	nla_put_failure path is taken (the partial message is cancelled),
 *	0 in one multicast case below, and otherwise the nlmsg_end() result.
 *
 *	NOTE(review): the NLA_PUT*() macros are defined elsewhere; they
 *	presumably jump to nla_put_failure when @skb has no room -- confirm.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed-size header first. */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* The table id goes both into the header and into RTA_TABLE. */
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* The stored protocol is overridden when one of these flags is set. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit @dst / @src is reported as a full /128 address. */
	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			/* Belongs to the "else" above when MROUTE is enabled. */
			NLA_PUT_U32(skb, RTA_IIF, iif);
	} else if (dst) {
		/* No input interface: report a source address for @dst. */
		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
		struct in6_addr saddr_buf;
		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
				       dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	if (rt->u.dst.neighbour)
		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* Remaining lifetime in jiffies: 0 when not expiring, capped at INT_MAX. */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;
	else
		expires = INT_MAX;

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
			       expires, rt->u.dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* Discard the partially built message. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2247
2248 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2249 {
2250         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2251         int prefix;
2252
2253         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2254                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2255                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2256         } else
2257                 prefix = 0;
2258
2259         return rt6_fill_node(arg->net,
2260                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2261                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2262                      prefix, 0, NLM_F_MULTI);
2263 }
2264
2265 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2266 {
2267         struct net *net = sock_net(in_skb->sk);
2268         struct nlattr *tb[RTA_MAX+1];
2269         struct rt6_info *rt;
2270         struct sk_buff *skb;
2271         struct rtmsg *rtm;
2272         struct flowi fl;
2273         int err, iif = 0;
2274
2275         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2276         if (err < 0)
2277                 goto errout;
2278
2279         err = -EINVAL;
2280         memset(&fl, 0, sizeof(fl));
2281
2282         if (tb[RTA_SRC]) {
2283                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2284                         goto errout;
2285
2286                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2287         }
2288
2289         if (tb[RTA_DST]) {
2290                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2291                         goto errout;
2292
2293                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2294         }
2295
2296         if (tb[RTA_IIF])
2297                 iif = nla_get_u32(tb[RTA_IIF]);
2298
2299         if (tb[RTA_OIF])
2300                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2301
2302         if (iif) {
2303                 struct net_device *dev;
2304                 dev = __dev_get_by_index(net, iif);
2305                 if (!dev) {
2306                         err = -ENODEV;
2307                         goto errout;
2308                 }
2309         }
2310
2311         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2312         if (skb == NULL) {
2313                 err = -ENOBUFS;
2314                 goto errout;
2315         }
2316
2317         /* Reserve room for dummy headers, this skb can pass
2318            through good chunk of routing engine.
2319          */
2320         skb_reset_mac_header(skb);
2321         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2322
2323         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2324         skb->dst = &rt->u.dst;
2325
2326         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2327                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2328                             nlh->nlmsg_seq, 0, 0, 0);
2329         if (err < 0) {
2330                 kfree_skb(skb);
2331                 goto errout;
2332         }
2333
2334         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2335 errout:
2336         return err;
2337 }
2338
2339 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2340 {
2341         struct sk_buff *skb;
2342         struct net *net = info->nl_net;
2343         u32 seq;
2344         int err;
2345
2346         err = -ENOBUFS;
2347         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2348
2349         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2350         if (skb == NULL)
2351                 goto errout;
2352
2353         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2354                                 event, info->pid, seq, 0, 0, 0);
2355         if (err < 0) {
2356                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2357                 WARN_ON(err == -EMSGSIZE);
2358                 kfree_skb(skb);
2359                 goto errout;
2360         }
2361         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2362                           info->nlh, gfp_any());
2363 errout:
2364         if (err < 0)
2365                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2366 }
2367
2368 static int ip6_route_dev_notify(struct notifier_block *this,
2369                                 unsigned long event, void *data)
2370 {
2371         struct net_device *dev = (struct net_device *)data;
2372         struct net *net = dev_net(dev);
2373
2374         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2375                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2376                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2377 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2378                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2379                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2380                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2381                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2382 #endif
2383         }
2384
2385         return NOTIFY_OK;
2386 }
2387
2388 /*
2389  *      /proc
2390  */
2391
2392 #ifdef CONFIG_PROC_FS
2393
2394 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2395
/*
 * Buffer-cursor bookkeeping for /proc output.
 * NOTE(review): no user of this struct is visible in this part of the
 * file -- confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2404
2405 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2406 {
2407         struct seq_file *m = p_arg;
2408
2409         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2410
2411 #ifdef CONFIG_IPV6_SUBTREES
2412         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2413 #else
2414         seq_puts(m, "00000000000000000000000000000000 00 ");
2415 #endif
2416
2417         if (rt->rt6i_nexthop) {
2418                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2419         } else {
2420                 seq_puts(m, "00000000000000000000000000000000");
2421         }
2422         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2423                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2424                    rt->u.dst.__use, rt->rt6i_flags,
2425                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2426         return 0;
2427 }
2428
2429 static int ipv6_route_show(struct seq_file *m, void *v)
2430 {
2431         struct net *net = (struct net *)m->private;
2432         fib6_clean_all(net, rt6_info_route, 0, m);
2433         return 0;
2434 }
2435
2436 static int ipv6_route_open(struct inode *inode, struct file *file)
2437 {
2438         return single_open_net(inode, file, ipv6_route_show);
2439 }
2440
/*
 * File operations for the route listing produced by ipv6_route_show().
 * Opened through single_open_net(), so it is released with
 * single_release_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
2448
2449 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2450 {
2451         struct net *net = (struct net *)seq->private;
2452         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2453                    net->ipv6.rt6_stats->fib_nodes,
2454                    net->ipv6.rt6_stats->fib_route_nodes,
2455                    net->ipv6.rt6_stats->fib_rt_alloc,
2456                    net->ipv6.rt6_stats->fib_rt_entries,
2457                    net->ipv6.rt6_stats->fib_rt_cache,
2458                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2459                    net->ipv6.rt6_stats->fib_discarded_routes);
2460
2461         return 0;
2462 }
2463
2464 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2465 {
2466         return single_open_net(inode, file, rt6_stats_seq_show);
2467 }
2468
/*
 * File operations for the statistics line produced by
 * rt6_stats_seq_show().  Opened through single_open_net(), so it is
 * released with single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
2476 #endif  /* CONFIG_PROC_FS */
2477
2478 #ifdef CONFIG_SYSCTL
2479
2480 static
2481 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2482                               void __user *buffer, size_t *lenp, loff_t *ppos)
2483 {
2484         struct net *net = current->nsproxy->net_ns;
2485         int delay = net->ipv6.sysctl.flush_delay;
2486         if (write) {
2487                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2488                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2489                 return 0;
2490         } else
2491                 return -EINVAL;
2492 }
2493
2494 ctl_table ipv6_route_table_template[] = {
2495         {
2496                 .procname       =       "flush",
2497                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2498                 .maxlen         =       sizeof(int),
2499                 .mode           =       0200,
2500                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2501         },
2502         {
2503                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2504                 .procname       =       "gc_thresh",
2505                 .data           =       &ip6_dst_ops_template.gc_thresh,
2506                 .maxlen         =       sizeof(int),
2507                 .mode           =       0644,
2508                 .proc_handler   =       proc_dointvec,
2509         },
2510         {
2511                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2512                 .procname       =       "max_size",
2513                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2514                 .maxlen         =       sizeof(int),
2515                 .mode           =       0644,
2516                 .proc_handler   =       proc_dointvec,
2517         },
2518         {
2519                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2520                 .procname       =       "gc_min_interval",
2521                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2522                 .maxlen         =       sizeof(int),
2523                 .mode           =       0644,
2524                 .proc_handler   =       proc_dointvec_jiffies,
2525                 .strategy       =       sysctl_jiffies,
2526         },
2527         {
2528                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2529                 .procname       =       "gc_timeout",
2530                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2531                 .maxlen         =       sizeof(int),
2532                 .mode           =       0644,
2533                 .proc_handler   =       proc_dointvec_jiffies,
2534                 .strategy       =       sysctl_jiffies,
2535         },
2536         {
2537                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2538                 .procname       =       "gc_interval",
2539                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2540                 .maxlen         =       sizeof(int),
2541                 .mode           =       0644,
2542                 .proc_handler   =       proc_dointvec_jiffies,
2543                 .strategy       =       sysctl_jiffies,
2544         },
2545         {
2546                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2547                 .procname       =       "gc_elasticity",
2548                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2549                 .maxlen         =       sizeof(int),
2550                 .mode           =       0644,
2551                 .proc_handler   =       proc_dointvec_jiffies,
2552                 .strategy       =       sysctl_jiffies,
2553         },
2554         {
2555                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2556                 .procname       =       "mtu_expires",
2557                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2558                 .maxlen         =       sizeof(int),
2559                 .mode           =       0644,
2560                 .proc_handler   =       proc_dointvec_jiffies,
2561                 .strategy       =       sysctl_jiffies,
2562         },
2563         {
2564                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2565                 .procname       =       "min_adv_mss",
2566                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2567                 .maxlen         =       sizeof(int),
2568                 .mode           =       0644,
2569                 .proc_handler   =       proc_dointvec_jiffies,
2570                 .strategy       =       sysctl_jiffies,
2571         },
2572         {
2573                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2574                 .procname       =       "gc_min_interval_ms",
2575                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2576                 .maxlen         =       sizeof(int),
2577                 .mode           =       0644,
2578                 .proc_handler   =       proc_dointvec_ms_jiffies,
2579                 .strategy       =       sysctl_ms_jiffies,
2580         },
2581         { .ctl_name = 0 }
2582 };
2583
2584 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2585 {
2586         struct ctl_table *table;
2587
2588         table = kmemdup(ipv6_route_table_template,
2589                         sizeof(ipv6_route_table_template),
2590                         GFP_KERNEL);
2591
2592         if (table) {
2593                 table[0].data = &net->ipv6.sysctl.flush_delay;
2594                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2595                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2596                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2597                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2598                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2599                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2600                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2601                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2602         }
2603
2604         return table;
2605 }
2606 #endif
2607
2608 static int ip6_route_net_init(struct net *net)
2609 {
2610         int ret = -ENOMEM;
2611
2612         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2613                                         sizeof(*net->ipv6.ip6_dst_ops),
2614                                         GFP_KERNEL);
2615         if (!net->ipv6.ip6_dst_ops)
2616                 goto out;
2617         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2618
2619         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2620                                            sizeof(*net->ipv6.ip6_null_entry),
2621                                            GFP_KERNEL);
2622         if (!net->ipv6.ip6_null_entry)
2623                 goto out_ip6_dst_ops;
2624         net->ipv6.ip6_null_entry->u.dst.path =
2625                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2626         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2627
2628 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2629         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2630                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2631                                                GFP_KERNEL);
2632         if (!net->ipv6.ip6_prohibit_entry)
2633                 goto out_ip6_null_entry;
2634         net->ipv6.ip6_prohibit_entry->u.dst.path =
2635                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2636         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2637
2638         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2639                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2640                                                GFP_KERNEL);
2641         if (!net->ipv6.ip6_blk_hole_entry)
2642                 goto out_ip6_prohibit_entry;
2643         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2644                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2645         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2646 #endif
2647
2648         net->ipv6.sysctl.flush_delay = 0;
2649         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2650         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2651         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2652         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2653         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2654         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2655         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2656
2657 #ifdef CONFIG_PROC_FS
2658         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2659         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2660 #endif
2661         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2662
2663         ret = 0;
2664 out:
2665         return ret;
2666
2667 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2668 out_ip6_prohibit_entry:
2669         kfree(net->ipv6.ip6_prohibit_entry);
2670 out_ip6_null_entry:
2671         kfree(net->ipv6.ip6_null_entry);
2672 #endif
2673 out_ip6_dst_ops:
2674         release_net(net->ipv6.ip6_dst_ops->dst_net);
2675         kfree(net->ipv6.ip6_dst_ops);
2676         goto out;
2677 }
2678
2679 static void ip6_route_net_exit(struct net *net)
2680 {
2681 #ifdef CONFIG_PROC_FS
2682         proc_net_remove(net, "ipv6_route");
2683         proc_net_remove(net, "rt6_stats");
2684 #endif
2685         kfree(net->ipv6.ip6_null_entry);
2686 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687         kfree(net->ipv6.ip6_prohibit_entry);
2688         kfree(net->ipv6.ip6_blk_hole_entry);
2689 #endif
2690         release_net(net->ipv6.ip6_dst_ops->dst_net);
2691         kfree(net->ipv6.ip6_dst_ops);
2692 }
2693
2694 static struct pernet_operations ip6_route_net_ops = {
2695         .init = ip6_route_net_init,
2696         .exit = ip6_route_net_exit,
2697 };
2698
2699 static struct notifier_block ip6_route_dev_notifier = {
2700         .notifier_call = ip6_route_dev_notify,
2701         .priority = 0,
2702 };
2703
2704 int __init ip6_route_init(void)
2705 {
2706         int ret;
2707
2708         ret = -ENOMEM;
2709         ip6_dst_ops_template.kmem_cachep =
2710                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2711                                   SLAB_HWCACHE_ALIGN, NULL);
2712         if (!ip6_dst_ops_template.kmem_cachep)
2713                 goto out;;
2714
2715         ret = register_pernet_subsys(&ip6_route_net_ops);
2716         if (ret)
2717                 goto out_kmem_cache;
2718
2719         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2720
2721         /* Registering of the loopback is done before this portion of code,
2722          * the loopback reference in rt6_info will not be taken, do it
2723          * manually for init_net */
2724         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2725         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2726   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2727         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2728         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2729         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2730         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2731   #endif
2732         ret = fib6_init();
2733         if (ret)
2734                 goto out_register_subsys;
2735
2736         ret = xfrm6_init();
2737         if (ret)
2738                 goto out_fib6_init;
2739
2740         ret = fib6_rules_init();
2741         if (ret)
2742                 goto xfrm6_init;
2743
2744         ret = -ENOBUFS;
2745         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2746             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2747             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2748                 goto fib6_rules_init;
2749
2750         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2751         if (ret)
2752                 goto fib6_rules_init;
2753
2754 out:
2755         return ret;
2756
2757 fib6_rules_init:
2758         fib6_rules_cleanup();
2759 xfrm6_init:
2760         xfrm6_fini();
2761 out_fib6_init:
2762         fib6_gc_cleanup();
2763 out_register_subsys:
2764         unregister_pernet_subsys(&ip6_route_net_ops);
2765 out_kmem_cache:
2766         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2767         goto out;
2768 }
2769
2770 void ip6_route_cleanup(void)
2771 {
2772         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2773         fib6_rules_cleanup();
2774         xfrm6_fini();
2775         fib6_gc_cleanup();
2776         unregister_pernet_subsys(&ip6_route_net_ops);
2777         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2778 }