Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  Set to 3 to get tracing: at that level
 * RDBG() and RT6_TRACE() expand to printk() calls, below it they expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
/* RDBG() takes one fully parenthesised printk() argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * Compile-time switch read by ip6_pol_route(): when non-zero, routes that
 * do not take the copy-on-write path are cloned with rt6_alloc_clone();
 * when zero that call is compiled out and the route is returned as is.
 */
#define CLONE_OFFLINK_ROUTE 0
75
/*
 * Forward declarations.  Several of these are installed as callbacks in
 * the dst_ops tables and route templates that follow, so they must be
 * declared before those initialisers.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Helpers used by rt6_route_rcv(); only built with route-information support. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
98
/*
 * dst_ops for IPv6 route entries: wires the generic dst callbacks to the
 * ip6_* handlers declared above and sizes each entry as a struct rt6_info.
 *
 * Named a template; code further down allocates from
 * net->ipv6.ip6_dst_ops, presumably a per-namespace copy of this table
 * made elsewhere (the copy is not visible in this section).
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       __constant_htons(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
        .entry_size             =       sizeof(struct rt6_info),
        .entries                =       ATOMIC_INIT(0),
};
114
/* PMTU updates are deliberately ignored for blackhole entries: no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

/*
 * dst_ops used by ip6_dst_blackhole().  Compared with the regular IPv6
 * table it installs only destroy, check and a no-op update_pmtu; no gc,
 * ifdown, negative_advice, link_failure or local_out callbacks are set.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       __constant_htons(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .entry_size             =       sizeof(struct rt6_info),
        .entries                =       ATOMIC_INIT(0),
};
128
/*
 * Template for the "no route" entry: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) with the largest possible metric whose dst reports
 * -ENETUNREACH and hands packets to ip6_pkt_discard() /
 * ip6_pkt_discard_out().
 *
 * Lookups in this file return net->ipv6.ip6_null_entry when nothing
 * matches; presumably that per-namespace entry is initialised from this
 * template elsewhere (not visible in this section).
 */
static struct rt6_info ip6_null_entry_template = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .obsolete       = -1,
                        .error          = -ENETUNREACH,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
145
/*
 * Extra reject-route templates, only built with multiple routing tables.
 * Both mirror the null entry's layout and differ only in the reported
 * error and in the input/output handlers.
 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* "Prohibit" entry: reports -EACCES and uses the ip6_pkt_prohibit handlers. */
static struct rt6_info ip6_prohibit_entry_template = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .obsolete       = -1,
                        .error          = -EACCES,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_prohibit,
                        .output         = ip6_pkt_prohibit_out,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

/* "Blackhole" entry: reports -EINVAL and passes packets to dst_discard(). */
static struct rt6_info ip6_blk_hole_entry_template = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .obsolete       = -1,
                        .error          = -EINVAL,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = dst_discard,
                        .output         = dst_discard,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

#endif
186
/*
 * Allocate a dst entry from @ops and hand it back typed as an rt6_info.
 * Returns whatever dst_alloc() returned, so callers must check for NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct rt6_info *entry;

        entry = (struct rt6_info *)dst_alloc(ops);
        return entry;
}
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197
198         if (idev != NULL) {
199                 rt->rt6i_idev = NULL;
200                 in6_dev_put(idev);
201         }
202 }
203
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205                            int how)
206 {
207         struct rt6_info *rt = (struct rt6_info *)dst;
208         struct inet6_dev *idev = rt->rt6i_idev;
209         struct net_device *loopback_dev =
210                 dev_net(dev)->loopback_dev;
211
212         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
213                 struct inet6_dev *loopback_idev =
214                         in6_dev_get(loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
232 }
233
/*
 *      Route lookup. Any table->tb6_lock is implied.
 */

/*
 * rt6_device_match - choose, from the sibling list that starts at @rt
 * (linked through u.dst.rt6_next), the entry that fits the requested
 * outgoing interface or source address.
 *
 *  - Neither @oif nor a specific @saddr: @rt is returned unchanged.
 *  - @oif set: the first entry whose device has that ifindex wins;
 *    entries on a loopback device are remembered in 'local' as a
 *    fallback.
 *  - Only @saddr set: the first entry for which ipv6_chk_addr() accepts
 *    @saddr on the entry's device wins.
 *
 * If the walk finds nothing: with @oif set, 'local' is returned when one
 * was recorded, otherwise the namespace's ip6_null_entry when
 * RT6_LOOKUP_F_IFACE is set; in every remaining case @rt is returned.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
                                                    struct rt6_info *rt,
                                                    struct in6_addr *saddr,
                                                    int oif,
                                                    int flags)
{
        struct rt6_info *local = NULL;
        struct rt6_info *sprt;

        if (!oif && ipv6_addr_any(saddr))
                goto out;

        for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
                struct net_device *dev = sprt->rt6i_dev;

                if (oif) {
                        /* Exact device match: done. */
                        if (dev->ifindex == oif)
                                return sprt;
                        if (dev->flags & IFF_LOOPBACK) {
                                /*
                                 * Loopback entry whose inet6_dev is absent or
                                 * belongs to another interface.  oif is known
                                 * to be non-zero in this branch, so the
                                 * "&& oif" and "!oif" tests below never change
                                 * the outcome.
                                 *
                                 * NOTE(review): 'local' may have been recorded
                                 * from an entry with a NULL rt6i_idev, yet
                                 * local->rt6i_idev is dereferenced unchecked
                                 * here - confirm rt6i_idev is always set for
                                 * loopback entries.
                                 */
                                if (sprt->rt6i_idev == NULL ||
                                    sprt->rt6i_idev->dev->ifindex != oif) {
                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
                                                continue;
                                        if (local && (!oif ||
                                                      local->rt6i_idev->dev->ifindex == oif))
                                                continue;
                                }
                                local = sprt;
                        }
                } else {
                        if (ipv6_chk_addr(net, saddr, dev,
                                          flags & RT6_LOOKUP_F_IFACE))
                                return sprt;
                }
        }

        if (oif) {
                if (local)
                        return local;

                if (flags & RT6_LOOKUP_F_IFACE)
                        return net->ipv6.ip6_null_entry;
        }
out:
        return rt;
}
284
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for a candidate route.
 * @rt: route whose next hop should be probed; may be NULL.
 *
 * If the route's next-hop neighbour is not in a NUD_VALID state and more
 * than the device's rtr_probe_interval has elapsed since neigh->updated,
 * stamp neigh->updated and send a Neighbour Solicitation to the
 * neighbour's solicited-node multicast address.
 *
 * Router Reachability Probes MUST be rate-limited (no more than one per
 * minute).  The test of neigh->updated and its update therefore happen
 * under the neighbour's *write* lock: doing it under the read lock, as
 * this code used to, let several CPUs pass the time check together, each
 * write the timestamp and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int due;

        /* Cheap unlocked pre-check; re-evaluated under the lock below. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        due = !(neigh->nud_state & NUD_VALID) &&
              time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
        if (due)
                neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        if (!due)
                return;

        /* The solicitation itself is sent with the lock dropped. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without router-preference support probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
320
321 /*
322  * Default Router Selection (RFC 2461 6.3.6)
323  */
324 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
325 {
326         struct net_device *dev = rt->rt6i_dev;
327         if (!oif || dev->ifindex == oif)
328                 return 2;
329         if ((dev->flags & IFF_LOOPBACK) &&
330             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
331                 return 1;
332         return 0;
333 }
334
335 static inline int rt6_check_neigh(struct rt6_info *rt)
336 {
337         struct neighbour *neigh = rt->rt6i_nexthop;
338         int m;
339         if (rt->rt6i_flags & RTF_NONEXTHOP ||
340             !(rt->rt6i_flags & RTF_GATEWAY))
341                 m = 1;
342         else if (neigh) {
343                 read_lock_bh(&neigh->lock);
344                 if (neigh->nud_state & NUD_VALID)
345                         m = 2;
346 #ifdef CONFIG_IPV6_ROUTER_PREF
347                 else if (neigh->nud_state & NUD_FAILED)
348                         m = 0;
349 #endif
350                 else
351                         m = 1;
352                 read_unlock_bh(&neigh->lock);
353         } else
354                 m = 0;
355         return m;
356 }
357
358 static int rt6_score_route(struct rt6_info *rt, int oif,
359                            int strict)
360 {
361         int m, n;
362
363         m = rt6_check_dev(rt, oif);
364         if (!m && (strict & RT6_LOOKUP_F_IFACE))
365                 return -1;
366 #ifdef CONFIG_IPV6_ROUTER_PREF
367         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
368 #endif
369         n = rt6_check_neigh(rt);
370         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
371                 return -1;
372         return m;
373 }
374
375 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
376                                    int *mpri, struct rt6_info *match)
377 {
378         int m;
379
380         if (rt6_check_expired(rt))
381                 goto out;
382
383         m = rt6_score_route(rt, oif, strict);
384         if (m < 0)
385                 goto out;
386
387         if (m > *mpri) {
388                 if (strict & RT6_LOOKUP_F_REACHABLE)
389                         rt6_probe(match);
390                 *mpri = m;
391                 match = rt;
392         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
393                 rt6_probe(rt);
394         }
395
396 out:
397         return match;
398 }
399
400 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
401                                      struct rt6_info *rr_head,
402                                      u32 metric, int oif, int strict)
403 {
404         struct rt6_info *rt, *match;
405         int mpri = -1;
406
407         match = NULL;
408         for (rt = rr_head; rt && rt->rt6i_metric == metric;
409              rt = rt->u.dst.rt6_next)
410                 match = find_match(rt, oif, strict, &mpri, match);
411         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
412              rt = rt->u.dst.rt6_next)
413                 match = find_match(rt, oif, strict, &mpri, match);
414
415         return match;
416 }
417
418 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
419 {
420         struct rt6_info *match, *rt0;
421         struct net *net;
422
423         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
424                   __func__, fn->leaf, oif);
425
426         rt0 = fn->rr_ptr;
427         if (!rt0)
428                 fn->rr_ptr = rt0 = fn->leaf;
429
430         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
431
432         if (!match &&
433             (strict & RT6_LOOKUP_F_REACHABLE)) {
434                 struct rt6_info *next = rt0->u.dst.rt6_next;
435
436                 /* no entries matched; do round-robin */
437                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
438                         next = fn->leaf;
439
440                 if (next != rt0)
441                         fn->rr_ptr = next;
442         }
443
444         RT6_TRACE("%s() => %p\n",
445                   __func__, match);
446
447         net = dev_net(rt0->rt6i_dev);
448         return (match ? match : net->ipv6.ip6_null_entry);
449 }
450
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process one Route Information option.
 * @dev:    device the option arrived on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the router that sent it
 *
 * Validates the option, then looks up the matching route-info route for
 * (prefix, gateway, device):
 *   - an existing route is deleted when the advertised lifetime is zero;
 *   - a missing route is added when the lifetime is non-zero;
 *   - otherwise the existing route's preference bits are refreshed.
 * A surviving route gets its expiry set, or cleared for an infinite
 * lifetime, and the reference held on it is released.
 *
 * Returns 0 on success, or -EINVAL when the option is malformed or
 * carries the reserved preference value.  RFC 4191 section 2.3 requires
 * such an option to be ignored rather than treated as medium preference.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        /*
         * Reject negative lengths explicitly: compared directly against
         * sizeof() they would be converted to a huge unsigned value and
         * slip through.
         */
        if (len < 0 || (size_t)len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /* Reserved preference value: the whole option must be ignored. */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /*
                 * Shorter options do not carry a full 128-bit prefix;
                 * build a masked copy instead of reading it in place.
                 */
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* Zero lifetime withdraws the route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime)) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
524
/*
 * BACKTRACK - climb the fib6 tree when the current node yielded no route.
 *
 * Not hygienic: the expanding function must provide local variables
 * 'rt' (struct rt6_info *) and 'fn' (struct fib6_node *) and the labels
 * 'restart' and 'out'.
 *
 * Does nothing unless rt is __net's ip6_null_entry.  Otherwise it walks
 * upwards from fn:
 *   - at a node flagged RTN_TL_ROOT it gives up and jumps to 'out';
 *   - if the parent has a subtree other than fn, it continues with the
 *     result of looking up saddr in that subtree, else with the parent;
 *   - as soon as the new fn is flagged RTN_RTINFO it jumps to 'restart'
 *     so the caller retries selection on that node.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
542
/*
 * Lookup callback passed to fib6_rule_lookup() by rt6_lookup().
 *
 * Under the table's read lock: find the fib6 node for the flow's
 * destination/source, narrow its leaf list with rt6_device_match(), and
 * let BACKTRACK() retry on an ancestor node (label 'restart') while the
 * result is the null entry.  The chosen route - possibly still the null
 * entry - is passed to dst_use() before the lock is dropped.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
        /* Uses 'rt' and 'fn' implicitly; may jump to 'restart' or 'out'. */
        BACKTRACK(net, &fl->fl6_src);
out:
        dst_use(&rt->u.dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
562
563 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
564                             const struct in6_addr *saddr, int oif, int strict)
565 {
566         struct flowi fl = {
567                 .oif = oif,
568                 .nl_u = {
569                         .ip6_u = {
570                                 .daddr = *daddr,
571                         },
572                 },
573         };
574         struct dst_entry *dst;
575         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
576
577         if (saddr) {
578                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
579                 flags |= RT6_LOOKUP_F_HAS_SADDR;
580         }
581
582         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
583         if (dst->error == 0)
584                 return (struct rt6_info *) dst;
585
586         dst_release(dst);
587
588         return NULL;
589 }
590
591 EXPORT_SYMBOL(rt6_lookup);
592
593 /* ip6_ins_rt is called with FREE table->tb6_lock.
594    It takes new route entry, the addition fails by any reason the
595    route is freed. In any case, if caller does not hold it, it may
596    be destroyed.
597  */
598
599 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
600 {
601         int err;
602         struct fib6_table *table;
603
604         table = rt->rt6i_table;
605         write_lock_bh(&table->tb6_lock);
606         err = fib6_add(&table->tb6_root, rt, info);
607         write_unlock_bh(&table->tb6_lock);
608
609         return err;
610 }
611
612 int ip6_ins_rt(struct rt6_info *rt)
613 {
614         struct nl_info info = {
615                 .nl_net = dev_net(rt->rt6i_dev),
616         };
617         return __ip6_ins_rt(rt, &info);
618 }
619
620 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
621                                       struct in6_addr *saddr)
622 {
623         struct rt6_info *rt;
624
625         /*
626          *      Clone the route.
627          */
628
629         rt = ip6_rt_copy(ort);
630
631         if (rt) {
632                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
633                         if (rt->rt6i_dst.plen != 128 &&
634                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
635                                 rt->rt6i_flags |= RTF_ANYCAST;
636                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
637                 }
638
639                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
640                 rt->rt6i_dst.plen = 128;
641                 rt->rt6i_flags |= RTF_CACHE;
642                 rt->u.dst.flags |= DST_HOST;
643
644 #ifdef CONFIG_IPV6_SUBTREES
645                 if (rt->rt6i_src.plen && saddr) {
646                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
647                         rt->rt6i_src.plen = 128;
648                 }
649 #endif
650
651                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
652
653         }
654
655         return rt;
656 }
657
658 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
659 {
660         struct rt6_info *rt = ip6_rt_copy(ort);
661         if (rt) {
662                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
663                 rt->rt6i_dst.plen = 128;
664                 rt->rt6i_flags |= RTF_CACHE;
665                 rt->u.dst.flags |= DST_HOST;
666                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
667         }
668         return rt;
669 }
670
/*
 * ip6_pol_route - core lookup shared by the input and output paths
 * (ip6_pol_route_input() / ip6_pol_route_output() pass fl->iif / fl->oif
 * as @oif).
 *
 * Outline:
 *  1. Under the table's read lock, find the fib6 node for the flow and
 *     pick a route with rt6_select(), backtracking to ancestor nodes
 *     while the result is the null entry.
 *  2. Unless forwarding is enabled in devconf_all, the first pass demands
 *     RT6_LOOKUP_F_REACHABLE.  Whenever that pass ends at 'out' (null
 *     entry or an RTF_CACHE route) the flag is cleared and the lookup is
 *     repeated once without it.
 *  3. A non-cache route with no next hop (and without RTF_NONEXTHOP) is
 *     copied with rt6_alloc_cow() and the copy inserted into the table.
 *     Other routes are returned as they are unless CLONE_OFFLINK_ROUTE is
 *     enabled.
 *  4. If the insert fails the whole lookup is retried, at most 'attempts'
 *     times in total.
 *
 * Every path reaching 'out2' has done a dst_hold() on the route that is
 * returned; lastuse and __use are updated there.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        /* Uses 'rt' and 'fn' implicitly; may jump to 'restart' or 'out'. */
        BACKTRACK(net, &fl->fl6_src);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route so it survives dropping the table lock. */
        dst_hold(&rt->u.dst);
        read_unlock_bh(&table->tb6_lock);

        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
        else {
#if CLONE_OFFLINK_ROUTE
                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
                /* Return the table route itself, still held from above. */
                goto out2;
#endif
        }

        /* Swap the reference from the original route to the copy
         * (or to the null entry when allocation failed). */
        dst_release(&rt->u.dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->u.dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        /* Out of retries: hand back what we have. */
        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->u.dst);
        goto relookup;

out:
        /* First pass was restricted to reachable routers: retry once
         * without that restriction (table lock is still held). */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->u.dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->u.dst.lastuse = jiffies;
        rt->u.dst.__use++;

        return rt;
}
743
744 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
745                                             struct flowi *fl, int flags)
746 {
747         return ip6_pol_route(net, table, fl->iif, fl, flags);
748 }
749
750 void ip6_route_input(struct sk_buff *skb)
751 {
752         struct ipv6hdr *iph = ipv6_hdr(skb);
753         struct net *net = dev_net(skb->dev);
754         int flags = RT6_LOOKUP_F_HAS_SADDR;
755         struct flowi fl = {
756                 .iif = skb->dev->ifindex,
757                 .nl_u = {
758                         .ip6_u = {
759                                 .daddr = iph->daddr,
760                                 .saddr = iph->saddr,
761                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
762                         },
763                 },
764                 .mark = skb->mark,
765                 .proto = iph->nexthdr,
766         };
767
768         if (rt6_need_strict(&iph->daddr))
769                 flags |= RT6_LOOKUP_F_IFACE;
770
771         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
772 }
773
774 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
775                                              struct flowi *fl, int flags)
776 {
777         return ip6_pol_route(net, table, fl->oif, fl, flags);
778 }
779
780 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
781                                     struct flowi *fl)
782 {
783         int flags = 0;
784
785         if (rt6_need_strict(&fl->fl6_dst))
786                 flags |= RT6_LOOKUP_F_IFACE;
787
788         if (!ipv6_addr_any(&fl->fl6_src))
789                 flags |= RT6_LOOKUP_F_HAS_SADDR;
790         else if (sk) {
791                 unsigned int prefs = inet6_sk(sk)->srcprefs;
792                 if (prefs & IPV6_PREFER_SRC_TMP)
793                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
794                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
795                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
796                 if (prefs & IPV6_PREFER_SRC_COA)
797                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
798         }
799
800         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
801 }
802
803 EXPORT_SYMBOL(ip6_route_output);
804
805 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
806 {
807         struct rt6_info *ort = (struct rt6_info *) *dstp;
808         struct rt6_info *rt = (struct rt6_info *)
809                 dst_alloc(&ip6_dst_blackhole_ops);
810         struct dst_entry *new = NULL;
811
812         if (rt) {
813                 new = &rt->u.dst;
814
815                 atomic_set(&new->__refcnt, 1);
816                 new->__use = 1;
817                 new->input = dst_discard;
818                 new->output = dst_discard;
819
820                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
821                 new->dev = ort->u.dst.dev;
822                 if (new->dev)
823                         dev_hold(new->dev);
824                 rt->rt6i_idev = ort->rt6i_idev;
825                 if (rt->rt6i_idev)
826                         in6_dev_hold(rt->rt6i_idev);
827                 rt->rt6i_expires = 0;
828
829                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
830                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
831                 rt->rt6i_metric = 0;
832
833                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
834 #ifdef CONFIG_IPV6_SUBTREES
835                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
836 #endif
837
838                 dst_free(new);
839         }
840
841         dst_release(*dstp);
842         *dstp = new;
843         return (new ? 0 : -ENOMEM);
844 }
845 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
846
847 /*
848  *      Destination cache support functions
849  */
850
851 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
852 {
853         struct rt6_info *rt;
854
855         rt = (struct rt6_info *) dst;
856
857         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
858                 return dst;
859
860         return NULL;
861 }
862
863 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
864 {
865         struct rt6_info *rt = (struct rt6_info *) dst;
866
867         if (rt) {
868                 if (rt->rt6i_flags & RTF_CACHE)
869                         ip6_del_rt(rt);
870                 else
871                         dst_release(dst);
872         }
873         return NULL;
874 }
875
876 static void ip6_link_failure(struct sk_buff *skb)
877 {
878         struct rt6_info *rt;
879
880         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
881
882         rt = (struct rt6_info *) skb->dst;
883         if (rt) {
884                 if (rt->rt6i_flags&RTF_CACHE) {
885                         dst_set_expires(&rt->u.dst, 0);
886                         rt->rt6i_flags |= RTF_EXPIRES;
887                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
888                         rt->rt6i_node->fn_sernum = -1;
889         }
890 }
891
892 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
893 {
894         struct rt6_info *rt6 = (struct rt6_info*)dst;
895
896         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
897                 rt6->rt6i_flags |= RTF_MODIFIED;
898                 if (mtu < IPV6_MIN_MTU) {
899                         mtu = IPV6_MIN_MTU;
900                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
901                 }
902                 dst->metrics[RTAX_MTU-1] = mtu;
903                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
904         }
905 }
906
907 static int ipv6_get_mtu(struct net_device *dev);
908
909 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
910 {
911         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
912
913         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
914                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
915
916         /*
917          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
918          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
919          * IPV6_MAXPLEN is also valid and means: "any MSS,
920          * rely only on pmtu discovery"
921          */
922         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
923                 mtu = IPV6_MAXPLEN;
924         return mtu;
925 }
926
/*
 * List of the dst entries handed out by icmp6_dst_alloc(), chained through
 * dst_entry->next.  icmp6_dst_lock is taken (with bottom halves disabled)
 * around the list insertions and sweeps in this file.
 */
927 static struct dst_entry *icmp6_dst_gc_list;
928 static DEFINE_SPINLOCK(icmp6_dst_lock);
929
930 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
931                                   struct neighbour *neigh,
932                                   const struct in6_addr *addr)
933 {
934         struct rt6_info *rt;
935         struct inet6_dev *idev = in6_dev_get(dev);
936         struct net *net = dev_net(dev);
937
938         if (unlikely(idev == NULL))
939                 return NULL;
940
941         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
942         if (unlikely(rt == NULL)) {
943                 in6_dev_put(idev);
944                 goto out;
945         }
946
947         dev_hold(dev);
948         if (neigh)
949                 neigh_hold(neigh);
950         else
951                 neigh = ndisc_get_neigh(dev, addr);
952
953         rt->rt6i_dev      = dev;
954         rt->rt6i_idev     = idev;
955         rt->rt6i_nexthop  = neigh;
956         atomic_set(&rt->u.dst.__refcnt, 1);
957         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
958         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
959         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
960         rt->u.dst.output  = ip6_output;
961
962 #if 0   /* there's no chance to use these for ndisc */
963         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
964                                 ? DST_HOST
965                                 : 0;
966         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
967         rt->rt6i_dst.plen = 128;
968 #endif
969
970         spin_lock_bh(&icmp6_dst_lock);
971         rt->u.dst.next = icmp6_dst_gc_list;
972         icmp6_dst_gc_list = &rt->u.dst;
973         spin_unlock_bh(&icmp6_dst_lock);
974
975         fib6_force_start_gc(net);
976
977 out:
978         return &rt->u.dst;
979 }
980
981 int icmp6_dst_gc(void)
982 {
983         struct dst_entry *dst, *next, **pprev;
984         int more = 0;
985
986         next = NULL;
987
988         spin_lock_bh(&icmp6_dst_lock);
989         pprev = &icmp6_dst_gc_list;
990
991         while ((dst = *pprev) != NULL) {
992                 if (!atomic_read(&dst->__refcnt)) {
993                         *pprev = dst->next;
994                         dst_free(dst);
995                 } else {
996                         pprev = &dst->next;
997                         ++more;
998                 }
999         }
1000
1001         spin_unlock_bh(&icmp6_dst_lock);
1002
1003         return more;
1004 }
1005
1006 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1007                             void *arg)
1008 {
1009         struct dst_entry *dst, **pprev;
1010
1011         spin_lock_bh(&icmp6_dst_lock);
1012         pprev = &icmp6_dst_gc_list;
1013         while ((dst = *pprev) != NULL) {
1014                 struct rt6_info *rt = (struct rt6_info *) dst;
1015                 if (func(rt, arg)) {
1016                         *pprev = dst->next;
1017                         dst_free(dst);
1018                 } else {
1019                         pprev = &dst->next;
1020                 }
1021         }
1022         spin_unlock_bh(&icmp6_dst_lock);
1023 }
1024
1025 static int ip6_dst_gc(struct dst_ops *ops)
1026 {
1027         unsigned long now = jiffies;
1028         struct net *net = ops->dst_net;
1029         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1030         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1031         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1032         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1033         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1034
1035         if (time_after(rt_last_gc + rt_min_interval, now) &&
1036             atomic_read(&ops->entries) <= rt_max_size)
1037                 goto out;
1038
1039         net->ipv6.ip6_rt_gc_expire++;
1040         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1041         net->ipv6.ip6_rt_last_gc = now;
1042         if (atomic_read(&ops->entries) < ops->gc_thresh)
1043                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1044 out:
1045         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1046         return (atomic_read(&ops->entries) > rt_max_size);
1047 }
1048
1049 /* Clean host part of a prefix. Not necessary in radix tree,
1050    but results in cleaner routing tables.
1051
1052    Remove it only when all the things will work!
1053  */
1054
1055 static int ipv6_get_mtu(struct net_device *dev)
1056 {
1057         int mtu = IPV6_MIN_MTU;
1058         struct inet6_dev *idev;
1059
1060         idev = in6_dev_get(dev);
1061         if (idev) {
1062                 mtu = idev->cnf.mtu6;
1063                 in6_dev_put(idev);
1064         }
1065         return mtu;
1066 }
1067
1068 int ip6_dst_hoplimit(struct dst_entry *dst)
1069 {
1070         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1071         if (hoplimit < 0) {
1072                 struct net_device *dev = dst->dev;
1073                 struct inet6_dev *idev = in6_dev_get(dev);
1074                 if (idev) {
1075                         hoplimit = idev->cnf.hop_limit;
1076                         in6_dev_put(idev);
1077                 } else
1078                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1079         }
1080         return hoplimit;
1081 }
1082
1083 /*
1084  *
1085  */
1086
/*
 *      Build a route from the request in @cfg and insert it through
 *      __ip6_ins_rt().
 *
 *      Side effects on @cfg: fc_metric is defaulted to IP6_RT_PRIO_USER when
 *      it is 0, fc_protocol to RTPROT_BOOT when it is RTPROT_UNSPEC, and
 *      fc_nlinfo.nl_net is overwritten with the namespace of the device that
 *      was finally selected.
 *
 *      Returns whatever __ip6_ins_rt() returns once the route has been set
 *      up, or a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM,
 *      -EHOSTUNREACH, or the neighbour lookup error).  On those error paths
 *      the dev/idev references taken here are dropped and the half-built
 *      route is freed.
 */
1087 int ip6_route_add(struct fib6_config *cfg)
1088 {
1089         int err;
1090         struct net *net = cfg->fc_nlinfo.nl_net;
1091         struct rt6_info *rt = NULL;
1092         struct net_device *dev = NULL;
1093         struct inet6_dev *idev = NULL;
1094         struct fib6_table *table;
1095         int addr_type;
1096
1097         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1098                 return -EINVAL;
1099 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-specific routes need subtree support. */
1100         if (cfg->fc_src_len)
1101                 return -EINVAL;
1102 #endif
             /*
              * Explicit output interface: take a reference on the device and
              * on its inet6_dev; both are released at "out" on failure.
              */
1103         if (cfg->fc_ifindex) {
1104                 err = -ENODEV;
1105                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1106                 if (!dev)
1107                         goto out;
1108                 idev = in6_dev_get(dev);
1109                 if (!idev)
1110                         goto out;
1111         }
1112
1113         if (cfg->fc_metric == 0)
1114                 cfg->fc_metric = IP6_RT_PRIO_USER;
1115
1116         table = fib6_new_table(net, cfg->fc_table);
1117         if (table == NULL) {
1118                 err = -ENOBUFS;
1119                 goto out;
1120         }
1121
1122         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1123
1124         if (rt == NULL) {
1125                 err = -ENOMEM;
1126                 goto out;
1127         }
1128
1129         rt->u.dst.obsolete = -1;
1130         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1131                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1132                                 0;
1133
1134         if (cfg->fc_protocol == RTPROT_UNSPEC)
1135                 cfg->fc_protocol = RTPROT_BOOT;
1136         rt->rt6i_protocol = cfg->fc_protocol;
1137
1138         addr_type = ipv6_addr_type(&cfg->fc_dst);
1139
             /* Input handler depends on whether the destination is multicast. */
1140         if (addr_type & IPV6_ADDR_MULTICAST)
1141                 rt->u.dst.input = ip6_mc_input;
1142         else
1143                 rt->u.dst.input = ip6_forward;
1144
1145         rt->u.dst.output = ip6_output;
1146
1147         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1148         rt->rt6i_dst.plen = cfg->fc_dst_len;
1149         if (rt->rt6i_dst.plen == 128)
1150                rt->u.dst.flags = DST_HOST;
1151
1152 #ifdef CONFIG_IPV6_SUBTREES
1153         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1154         rt->rt6i_src.plen = cfg->fc_src_len;
1155 #endif
1156
1157         rt->rt6i_metric = cfg->fc_metric;
1158
1159         /* We cannot add true routes via loopback here,
1160            they would result in kernel looping; promote them to reject routes
1161          */
1162         if ((cfg->fc_flags & RTF_REJECT) ||
1163             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1164                 /* hold loopback dev/idev if we haven't done so. */
1165                 if (dev != net->loopback_dev) {
1166                         if (dev) {
1167                                 dev_put(dev);
1168                                 in6_dev_put(idev);
1169                         }
1170                         dev = net->loopback_dev;
1171                         dev_hold(dev);
1172                         idev = in6_dev_get(dev);
1173                         if (!idev) {
1174                                 err = -ENODEV;
1175                                 goto out;
1176                         }
1177                 }
                     /* Reject route: discard in both directions, report -ENETUNREACH. */
1178                 rt->u.dst.output = ip6_pkt_discard_out;
1179                 rt->u.dst.input = ip6_pkt_discard;
1180                 rt->u.dst.error = -ENETUNREACH;
1181                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1182                 goto install_route;
1183         }
1184
1185         if (cfg->fc_flags & RTF_GATEWAY) {
1186                 struct in6_addr *gw_addr;
1187                 int gwa_type;
1188
1189                 gw_addr = &cfg->fc_gateway;
1190                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1191                 gwa_type = ipv6_addr_type(gw_addr);
1192
1193                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1194                         struct rt6_info *grt;
1195
1196                         /* IPv6 strictly inhibits using not link-local
1197                            addresses as nexthop address.
1198                            Otherwise, router will not able to send redirects.
1199                            It is very good, but in some (rare!) circumstances
1200                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1201                            some exceptions. --ANK
1202                          */
1203                         err = -EINVAL;
1204                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1205                                 goto out;
1206
                             /*
                              * Resolve the gateway itself.  It is accepted only
                              * if the route towards it is not a gateway route
                              * (err is cleared below in that case) and, when an
                              * interface was given, it goes out of that same
                              * device.
                              */
1207                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1208
1209                         err = -EHOSTUNREACH;
1210                         if (grt == NULL)
1211                                 goto out;
1212                         if (dev) {
1213                                 if (dev != grt->rt6i_dev) {
1214                                         dst_release(&grt->u.dst);
1215                                         goto out;
1216                                 }
1217                         } else {
                                     /* No interface given: adopt the gateway route's device. */
1218                                 dev = grt->rt6i_dev;
1219                                 idev = grt->rt6i_idev;
1220                                 dev_hold(dev);
1221                                 in6_dev_hold(grt->rt6i_idev);
1222                         }
1223                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1224                                 err = 0;
1225                         dst_release(&grt->u.dst);
1226
1227                         if (err)
1228                                 goto out;
1229                 }
1230                 err = -EINVAL;
1231                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1232                         goto out;
1233         }
1234
1235         err = -ENODEV;
1236         if (dev == NULL)
1237                 goto out;
1238
1239         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1240                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1241                 if (IS_ERR(rt->rt6i_nexthop)) {
1242                         err = PTR_ERR(rt->rt6i_nexthop);
                             /* Do not leave an ERR_PTR in the route that is freed at "out". */
1243                         rt->rt6i_nexthop = NULL;
1244                         goto out;
1245                 }
1246         }
1247
1248         rt->rt6i_flags = cfg->fc_flags;
1249
1250 install_route:
             /*
              * Copy caller-supplied metrics.  Attribute type 0 is skipped and
              * a type above RTAX_MAX fails the whole request.
              */
1251         if (cfg->fc_mx) {
1252                 struct nlattr *nla;
1253                 int remaining;
1254
1255                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1256                         int type = nla_type(nla);
1257
1258                         if (type) {
1259                                 if (type > RTAX_MAX) {
1260                                         err = -EINVAL;
1261                                         goto out;
1262                                 }
1263
1264                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1265                         }
1266                 }
1267         }
1268
             /*
              * Defaults for metrics still at zero.  Hop limit becomes -1, which
              * ip6_dst_hoplimit() treats as "use the per-device value".
              */
1269         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1270                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1271         if (!dst_mtu(&rt->u.dst))
1272                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1273         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1274                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
             /* The dev/idev references held so far are stored in the route. */
1275         rt->u.dst.dev = dev;
1276         rt->rt6i_idev = idev;
1277         rt->rt6i_table = table;
1278
1279         cfg->fc_nlinfo.nl_net = dev_net(dev);
1280
1281         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1282
1283 out:
             /* Error unwind: drop whatever was acquired so far. */
1284         if (dev)
1285                 dev_put(dev);
1286         if (idev)
1287                 in6_dev_put(idev);
1288         if (rt)
1289                 dst_free(&rt->u.dst);
1290         return err;
1291 }
1292
1293 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1294 {
1295         int err;
1296         struct fib6_table *table;
1297         struct net *net = dev_net(rt->rt6i_dev);
1298
1299         if (rt == net->ipv6.ip6_null_entry)
1300                 return -ENOENT;
1301
1302         table = rt->rt6i_table;
1303         write_lock_bh(&table->tb6_lock);
1304
1305         err = fib6_del(rt, info);
1306         dst_release(&rt->u.dst);
1307
1308         write_unlock_bh(&table->tb6_lock);
1309
1310         return err;
1311 }
1312
1313 int ip6_del_rt(struct rt6_info *rt)
1314 {
1315         struct nl_info info = {
1316                 .nl_net = dev_net(rt->rt6i_dev),
1317         };
1318         return __ip6_del_rt(rt, &info);
1319 }
1320
1321 static int ip6_route_del(struct fib6_config *cfg)
1322 {
1323         struct fib6_table *table;
1324         struct fib6_node *fn;
1325         struct rt6_info *rt;
1326         int err = -ESRCH;
1327
1328         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1329         if (table == NULL)
1330                 return err;
1331
1332         read_lock_bh(&table->tb6_lock);
1333
1334         fn = fib6_locate(&table->tb6_root,
1335                          &cfg->fc_dst, cfg->fc_dst_len,
1336                          &cfg->fc_src, cfg->fc_src_len);
1337
1338         if (fn) {
1339                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1340                         if (cfg->fc_ifindex &&
1341                             (rt->rt6i_dev == NULL ||
1342                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1343                                 continue;
1344                         if (cfg->fc_flags & RTF_GATEWAY &&
1345                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1346                                 continue;
1347                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1348                                 continue;
1349                         dst_hold(&rt->u.dst);
1350                         read_unlock_bh(&table->tb6_lock);
1351
1352                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1353                 }
1354         }
1355         read_unlock_bh(&table->tb6_lock);
1356
1357         return err;
1358 }
1359
1360 /*
1361  *      Handle redirects
1362  */
/*
 * Flow key used for redirect lookups: a struct flowi extended with the
 * address of the router that sent the redirect.  "fl" has to stay the first
 * member because the lookup code casts between struct flowi * and
 * struct ip6rd_flowi *.
 */
1363 struct ip6rd_flowi {
1364         struct flowi fl;
1365         struct in6_addr gateway;
1366 };
1367
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_redirect().
 *
 * @fl really points at a struct ip6rd_flowi; its "gateway" member is the
 * router the redirect came from.  Under the table read lock, the routes of
 * the node found for the flow are scanned for one that is not expired, is a
 * gateway route, uses the interface fl->oif and has that router as gateway.
 *
 * Returns the selected route with a reference held; when nothing matches
 * the candidate is the namespace's ip6_null_entry.
 * NOTE(review): BACKTRACK() is a macro defined outside this excerpt; it
 * presumably uses the "restart" and "out" labels below - confirm against
 * its definition before restructuring this function.
 */
1368 static struct rt6_info *__ip6_route_redirect(struct net *net,
1369                                              struct fib6_table *table,
1370                                              struct flowi *fl,
1371                                              int flags)
1372 {
1373         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1374         struct rt6_info *rt;
1375         struct fib6_node *fn;
1376
1377         /*
1378          * Get the "current" route for this destination and
1379          * check if the redirect has come from appropriate router.
1380          *
1381          * RFC 2461 specifies that redirects should only be
1382          * accepted if they come from the nexthop to the target.
1383          * Due to the way the routes are chosen, this notion
1384          * is a bit fuzzy and one might need to check all possible
1385          * routes.
1386          */
1387
1388         read_lock_bh(&table->tb6_lock);
1389         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1390 restart:
1391         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1392                 /*
1393                  * Current route is on-link; redirect is always invalid.
1394                  *
1395                  * Seems, previous statement is not true. It could
1396                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1397                  * But then router serving it might decide, that we should
1398                  * know truth 8)8) --ANK (980726).
1399                  */
1400                 if (rt6_check_expired(rt))
1401                         continue;
1402                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1403                         continue;
1404                 if (fl->oif != rt->rt6i_dev->ifindex)
1405                         continue;
1406                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1407                         continue;
1408                 break;
1409         }
1410
1411         if (!rt)
1412                 rt = net->ipv6.ip6_null_entry;
1413         BACKTRACK(net, &fl->fl6_src);
1414 out:
             /* Take the reference before the table lock is released. */
1415         dst_hold(&rt->u.dst);
1416
1417         read_unlock_bh(&table->tb6_lock);
1418
1419         return rt;
1420 };
1421
1422 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1423                                            struct in6_addr *src,
1424                                            struct in6_addr *gateway,
1425                                            struct net_device *dev)
1426 {
1427         int flags = RT6_LOOKUP_F_HAS_SADDR;
1428         struct net *net = dev_net(dev);
1429         struct ip6rd_flowi rdfl = {
1430                 .fl = {
1431                         .oif = dev->ifindex,
1432                         .nl_u = {
1433                                 .ip6_u = {
1434                                         .daddr = *dest,
1435                                         .saddr = *src,
1436                                 },
1437                         },
1438                 },
1439                 .gateway = *gateway,
1440         };
1441
1442         if (rt6_need_strict(dest))
1443                 flags |= RT6_LOOKUP_F_IFACE;
1444
1445         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1446                                                    flags, __ip6_route_redirect);
1447 }
1448
1449 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1450                   struct in6_addr *saddr,
1451                   struct neighbour *neigh, u8 *lladdr, int on_link)
1452 {
1453         struct rt6_info *rt, *nrt = NULL;
1454         struct netevent_redirect netevent;
1455         struct net *net = dev_net(neigh->dev);
1456
1457         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1458
1459         if (rt == net->ipv6.ip6_null_entry) {
1460                 if (net_ratelimit())
1461                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1462                                "for redirect target\n");
1463                 goto out;
1464         }
1465
1466         /*
1467          *      We have finally decided to accept it.
1468          */
1469
1470         neigh_update(neigh, lladdr, NUD_STALE,
1471                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1472                      NEIGH_UPDATE_F_OVERRIDE|
1473                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1474                                      NEIGH_UPDATE_F_ISROUTER))
1475                      );
1476
1477         /*
1478          * Redirect received -> path was valid.
1479          * Look, redirects are sent only in response to data packets,
1480          * so that this nexthop apparently is reachable. --ANK
1481          */
1482         dst_confirm(&rt->u.dst);
1483
1484         /* Duplicate redirect: silently ignore. */
1485         if (neigh == rt->u.dst.neighbour)
1486                 goto out;
1487
1488         nrt = ip6_rt_copy(rt);
1489         if (nrt == NULL)
1490                 goto out;
1491
1492         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1493         if (on_link)
1494                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1495
1496         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1497         nrt->rt6i_dst.plen = 128;
1498         nrt->u.dst.flags |= DST_HOST;
1499
1500         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1501         nrt->rt6i_nexthop = neigh_clone(neigh);
1502         /* Reset pmtu, it may be better */
1503         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1504         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1505                                                         dst_mtu(&nrt->u.dst));
1506
1507         if (ip6_ins_rt(nrt))
1508                 goto out;
1509
1510         netevent.old = &rt->u.dst;
1511         netevent.new = &nrt->u.dst;
1512         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1513
1514         if (rt->rt6i_flags&RTF_CACHE) {
1515                 ip6_del_rt(rt);
1516                 return;
1517         }
1518
1519 out:
1520         dst_release(&rt->u.dst);
1521         return;
1522 }
1523
1524 /*
1525  *      Handle ICMP "packet too big" messages
1526  *      i.e. Path MTU discovery
1527  */
1528
1529 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1530                         struct net_device *dev, u32 pmtu)
1531 {
1532         struct rt6_info *rt, *nrt;
1533         struct net *net = dev_net(dev);
1534         int allfrag = 0;
1535
1536         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1537         if (rt == NULL)
1538                 return;
1539
1540         if (pmtu >= dst_mtu(&rt->u.dst))
1541                 goto out;
1542
1543         if (pmtu < IPV6_MIN_MTU) {
1544                 /*
1545                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1546                  * MTU (1280) and a fragment header should always be included
1547                  * after a node receiving Too Big message reporting PMTU is
1548                  * less than the IPv6 Minimum Link MTU.
1549                  */
1550                 pmtu = IPV6_MIN_MTU;
1551                 allfrag = 1;
1552         }
1553
1554         /* New mtu received -> path was valid.
1555            They are sent only in response to data packets,
1556            so that this nexthop apparently is reachable. --ANK
1557          */
1558         dst_confirm(&rt->u.dst);
1559
1560         /* Host route. If it is static, it would be better
1561            not to override it, but add new one, so that
1562            when cache entry will expire old pmtu
1563            would return automatically.
1564          */
1565         if (rt->rt6i_flags & RTF_CACHE) {
1566                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1567                 if (allfrag)
1568                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1569                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1570                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1571                 goto out;
1572         }
1573
1574         /* Network route.
1575            Two cases are possible:
1576            1. It is connected route. Action: COW
1577            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1578          */
1579         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1580                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1581         else
1582                 nrt = rt6_alloc_clone(rt, daddr);
1583
1584         if (nrt) {
1585                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1586                 if (allfrag)
1587                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1588
1589                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1590                  * happened within 5 mins, the recommended timer is 10 mins.
1591                  * Here this route expiration time is set to ip6_rt_mtu_expires
1592                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1593                  * and detecting PMTU increase will be automatically happened.
1594                  */
1595                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1596                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1597
1598                 ip6_ins_rt(nrt);
1599         }
1600 out:
1601         dst_release(&rt->u.dst);
1602 }
1603
1604 /*
1605  *      Misc support functions
1606  */
1607
1608 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1609 {
1610         struct net *net = dev_net(ort->rt6i_dev);
1611         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1612
1613         if (rt) {
1614                 rt->u.dst.input = ort->u.dst.input;
1615                 rt->u.dst.output = ort->u.dst.output;
1616
1617                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1618                 rt->u.dst.error = ort->u.dst.error;
1619                 rt->u.dst.dev = ort->u.dst.dev;
1620                 if (rt->u.dst.dev)
1621                         dev_hold(rt->u.dst.dev);
1622                 rt->rt6i_idev = ort->rt6i_idev;
1623                 if (rt->rt6i_idev)
1624                         in6_dev_hold(rt->rt6i_idev);
1625                 rt->u.dst.lastuse = jiffies;
1626                 rt->rt6i_expires = 0;
1627
1628                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1629                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1630                 rt->rt6i_metric = 0;
1631
1632                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1633 #ifdef CONFIG_IPV6_SUBTREES
1634                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1635 #endif
1636                 rt->rt6i_table = ort->rt6i_table;
1637         }
1638         return rt;
1639 }
1640
1641 #ifdef CONFIG_IPV6_ROUTE_INFO
1642 static struct rt6_info *rt6_get_route_info(struct net *net,
1643                                            struct in6_addr *prefix, int prefixlen,
1644                                            struct in6_addr *gwaddr, int ifindex)
1645 {
1646         struct fib6_node *fn;
1647         struct rt6_info *rt = NULL;
1648         struct fib6_table *table;
1649
1650         table = fib6_get_table(net, RT6_TABLE_INFO);
1651         if (table == NULL)
1652                 return NULL;
1653
1654         write_lock_bh(&table->tb6_lock);
1655         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1656         if (!fn)
1657                 goto out;
1658
1659         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1660                 if (rt->rt6i_dev->ifindex != ifindex)
1661                         continue;
1662                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1663                         continue;
1664                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1665                         continue;
1666                 dst_hold(&rt->u.dst);
1667                 break;
1668         }
1669 out:
1670         write_unlock_bh(&table->tb6_lock);
1671         return rt;
1672 }
1673
1674 static struct rt6_info *rt6_add_route_info(struct net *net,
1675                                            struct in6_addr *prefix, int prefixlen,
1676                                            struct in6_addr *gwaddr, int ifindex,
1677                                            unsigned pref)
1678 {
1679         struct fib6_config cfg = {
1680                 .fc_table       = RT6_TABLE_INFO,
1681                 .fc_metric      = IP6_RT_PRIO_USER,
1682                 .fc_ifindex     = ifindex,
1683                 .fc_dst_len     = prefixlen,
1684                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1685                                   RTF_UP | RTF_PREF(pref),
1686                 .fc_nlinfo.pid = 0,
1687                 .fc_nlinfo.nlh = NULL,
1688                 .fc_nlinfo.nl_net = net,
1689         };
1690
1691         ipv6_addr_copy(&cfg.fc_dst, prefix);
1692         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1693
1694         /* We should treat it as a default route if prefix length is 0. */
1695         if (!prefixlen)
1696                 cfg.fc_flags |= RTF_DEFAULT;
1697
1698         ip6_route_add(&cfg);
1699
1700         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1701 }
1702 #endif
1703
1704 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1705 {
1706         struct rt6_info *rt;
1707         struct fib6_table *table;
1708
1709         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1710         if (table == NULL)
1711                 return NULL;
1712
1713         write_lock_bh(&table->tb6_lock);
1714         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1715                 if (dev == rt->rt6i_dev &&
1716                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1717                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1718                         break;
1719         }
1720         if (rt)
1721                 dst_hold(&rt->u.dst);
1722         write_unlock_bh(&table->tb6_lock);
1723         return rt;
1724 }
1725
1726 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1727                                      struct net_device *dev,
1728                                      unsigned int pref)
1729 {
1730         struct fib6_config cfg = {
1731                 .fc_table       = RT6_TABLE_DFLT,
1732                 .fc_metric      = IP6_RT_PRIO_USER,
1733                 .fc_ifindex     = dev->ifindex,
1734                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1735                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1736                 .fc_nlinfo.pid = 0,
1737                 .fc_nlinfo.nlh = NULL,
1738                 .fc_nlinfo.nl_net = dev_net(dev),
1739         };
1740
1741         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1742
1743         ip6_route_add(&cfg);
1744
1745         return rt6_get_dflt_router(gwaddr, dev);
1746 }
1747
1748 void rt6_purge_dflt_routers(struct net *net)
1749 {
1750         struct rt6_info *rt;
1751         struct fib6_table *table;
1752
1753         /* NOTE: Keep consistent with rt6_get_dflt_router */
1754         table = fib6_get_table(net, RT6_TABLE_DFLT);
1755         if (table == NULL)
1756                 return;
1757
1758 restart:
1759         read_lock_bh(&table->tb6_lock);
1760         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1761                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1762                         dst_hold(&rt->u.dst);
1763                         read_unlock_bh(&table->tb6_lock);
1764                         ip6_del_rt(rt);
1765                         goto restart;
1766                 }
1767         }
1768         read_unlock_bh(&table->tb6_lock);
1769 }
1770
1771 static void rtmsg_to_fib6_config(struct net *net,
1772                                  struct in6_rtmsg *rtmsg,
1773                                  struct fib6_config *cfg)
1774 {
1775         memset(cfg, 0, sizeof(*cfg));
1776
1777         cfg->fc_table = RT6_TABLE_MAIN;
1778         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1779         cfg->fc_metric = rtmsg->rtmsg_metric;
1780         cfg->fc_expires = rtmsg->rtmsg_info;
1781         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1782         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1783         cfg->fc_flags = rtmsg->rtmsg_flags;
1784
1785         cfg->fc_nlinfo.nl_net = net;
1786
1787         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1788         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1789         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1790 }
1791
1792 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1793 {
1794         struct fib6_config cfg;
1795         struct in6_rtmsg rtmsg;
1796         int err;
1797
1798         switch(cmd) {
1799         case SIOCADDRT:         /* Add a route */
1800         case SIOCDELRT:         /* Delete a route */
1801                 if (!capable(CAP_NET_ADMIN))
1802                         return -EPERM;
1803                 err = copy_from_user(&rtmsg, arg,
1804                                      sizeof(struct in6_rtmsg));
1805                 if (err)
1806                         return -EFAULT;
1807
1808                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1809
1810                 rtnl_lock();
1811                 switch (cmd) {
1812                 case SIOCADDRT:
1813                         err = ip6_route_add(&cfg);
1814                         break;
1815                 case SIOCDELRT:
1816                         err = ip6_route_del(&cfg);
1817                         break;
1818                 default:
1819                         err = -EINVAL;
1820                 }
1821                 rtnl_unlock();
1822
1823                 return err;
1824         }
1825
1826         return -EINVAL;
1827 }
1828
1829 /*
1830  *      Drop the packet on the floor
1831  */
1832
1833 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1834 {
1835         int type;
1836         struct dst_entry *dst = skb->dst;
1837         switch (ipstats_mib_noroutes) {
1838         case IPSTATS_MIB_INNOROUTES:
1839                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1840                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1841                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1842                                       IPSTATS_MIB_INADDRERRORS);
1843                         break;
1844                 }
1845                 /* FALLTHROUGH */
1846         case IPSTATS_MIB_OUTNOROUTES:
1847                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1848                               ipstats_mib_noroutes);
1849                 break;
1850         }
1851         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1852         kfree_skb(skb);
1853         return 0;
1854 }
1855
1856 static int ip6_pkt_discard(struct sk_buff *skb)
1857 {
1858         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1859 }
1860
1861 static int ip6_pkt_discard_out(struct sk_buff *skb)
1862 {
1863         skb->dev = skb->dst->dev;
1864         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1865 }
1866
1867 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1868
1869 static int ip6_pkt_prohibit(struct sk_buff *skb)
1870 {
1871         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1872 }
1873
1874 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1875 {
1876         skb->dev = skb->dst->dev;
1877         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1878 }
1879
1880 #endif
1881
1882 /*
1883  *      Allocate a dst for local (unicast / anycast) address.
1884  */
1885
1886 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1887                                     const struct in6_addr *addr,
1888                                     int anycast)
1889 {
1890         struct net *net = dev_net(idev->dev);
1891         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1892
1893         if (rt == NULL)
1894                 return ERR_PTR(-ENOMEM);
1895
1896         dev_hold(net->loopback_dev);
1897         in6_dev_hold(idev);
1898
1899         rt->u.dst.flags = DST_HOST;
1900         rt->u.dst.input = ip6_input;
1901         rt->u.dst.output = ip6_output;
1902         rt->rt6i_dev = net->loopback_dev;
1903         rt->rt6i_idev = idev;
1904         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1905         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1906         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1907         rt->u.dst.obsolete = -1;
1908
1909         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1910         if (anycast)
1911                 rt->rt6i_flags |= RTF_ANYCAST;
1912         else
1913                 rt->rt6i_flags |= RTF_LOCAL;
1914         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1915         if (rt->rt6i_nexthop == NULL) {
1916                 dst_free(&rt->u.dst);
1917                 return ERR_PTR(-ENOMEM);
1918         }
1919
1920         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1921         rt->rt6i_dst.plen = 128;
1922         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1923
1924         atomic_set(&rt->u.dst.__refcnt, 1);
1925
1926         return rt;
1927 }
1928
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
1933
1934 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1935 {
1936         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1937         struct net *net = ((struct arg_dev_net *)arg)->net;
1938
1939         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1940             rt != net->ipv6.ip6_null_entry) {
1941                 RT6_TRACE("deleted by ifdown %p\n", rt);
1942                 return -1;
1943         }
1944         return 0;
1945 }
1946
1947 void rt6_ifdown(struct net *net, struct net_device *dev)
1948 {
1949         struct arg_dev_net adn = {
1950                 .dev = dev,
1951                 .net = net,
1952         };
1953
1954         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1955         icmp6_clean_all(fib6_ifdown, &adn);
1956 }
1957
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1963
1964 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1965 {
1966         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1967         struct inet6_dev *idev;
1968         struct net *net = dev_net(arg->dev);
1969
1970         /* In IPv6 pmtu discovery is not optional,
1971            so that RTAX_MTU lock cannot disable it.
1972            We still use this lock to block changes
1973            caused by addrconf/ndisc.
1974         */
1975
1976         idev = __in6_dev_get(arg->dev);
1977         if (idev == NULL)
1978                 return 0;
1979
1980         /* For administrative MTU increase, there is no way to discover
1981            IPv6 PMTU increase, so PMTU increase should be updated here.
1982            Since RFC 1981 doesn't include administrative MTU increase
1983            update PMTU increase is a MUST. (i.e. jumbo frame)
1984          */
1985         /*
1986            If new MTU is less than route PMTU, this new MTU will be the
1987            lowest MTU in the path, update the route PMTU to reflect PMTU
1988            decreases; if new MTU is greater than route PMTU, and the
1989            old MTU is the lowest MTU in the path, update the route PMTU
1990            to reflect the increase. In this case if the other nodes' MTU
1991            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1992            PMTU discouvery.
1993          */
1994         if (rt->rt6i_dev == arg->dev &&
1995             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1996             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1997              (dst_mtu(&rt->u.dst) < arg->mtu &&
1998               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1999                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2000                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2001         }
2002         return 0;
2003 }
2004
2005 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2006 {
2007         struct rt6_mtu_change_arg arg = {
2008                 .dev = dev,
2009                 .mtu = mtu,
2010         };
2011
2012         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2013 }
2014
2015 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2016         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2017         [RTA_OIF]               = { .type = NLA_U32 },
2018         [RTA_IIF]               = { .type = NLA_U32 },
2019         [RTA_PRIORITY]          = { .type = NLA_U32 },
2020         [RTA_METRICS]           = { .type = NLA_NESTED },
2021 };
2022
2023 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2024                               struct fib6_config *cfg)
2025 {
2026         struct rtmsg *rtm;
2027         struct nlattr *tb[RTA_MAX+1];
2028         int err;
2029
2030         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2031         if (err < 0)
2032                 goto errout;
2033
2034         err = -EINVAL;
2035         rtm = nlmsg_data(nlh);
2036         memset(cfg, 0, sizeof(*cfg));
2037
2038         cfg->fc_table = rtm->rtm_table;
2039         cfg->fc_dst_len = rtm->rtm_dst_len;
2040         cfg->fc_src_len = rtm->rtm_src_len;
2041         cfg->fc_flags = RTF_UP;
2042         cfg->fc_protocol = rtm->rtm_protocol;
2043
2044         if (rtm->rtm_type == RTN_UNREACHABLE)
2045                 cfg->fc_flags |= RTF_REJECT;
2046
2047         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2048         cfg->fc_nlinfo.nlh = nlh;
2049         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2050
2051         if (tb[RTA_GATEWAY]) {
2052                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2053                 cfg->fc_flags |= RTF_GATEWAY;
2054         }
2055
2056         if (tb[RTA_DST]) {
2057                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2058
2059                 if (nla_len(tb[RTA_DST]) < plen)
2060                         goto errout;
2061
2062                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2063         }
2064
2065         if (tb[RTA_SRC]) {
2066                 int plen = (rtm->rtm_src_len + 7) >> 3;
2067
2068                 if (nla_len(tb[RTA_SRC]) < plen)
2069                         goto errout;
2070
2071                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2072         }
2073
2074         if (tb[RTA_OIF])
2075                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2076
2077         if (tb[RTA_PRIORITY])
2078                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2079
2080         if (tb[RTA_METRICS]) {
2081                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2082                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2083         }
2084
2085         if (tb[RTA_TABLE])
2086                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2087
2088         err = 0;
2089 errout:
2090         return err;
2091 }
2092
2093 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2094 {
2095         struct fib6_config cfg;
2096         int err;
2097
2098         err = rtm_to_fib6_config(skb, nlh, &cfg);
2099         if (err < 0)
2100                 return err;
2101
2102         return ip6_route_del(&cfg);
2103 }
2104
2105 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2106 {
2107         struct fib6_config cfg;
2108         int err;
2109
2110         err = rtm_to_fib6_config(skb, nlh, &cfg);
2111         if (err < 0)
2112                 return err;
2113
2114         return ip6_route_add(&cfg);
2115 }
2116
2117 static inline size_t rt6_nlmsg_size(void)
2118 {
2119         return NLMSG_ALIGN(sizeof(struct rtmsg))
2120                + nla_total_size(16) /* RTA_SRC */
2121                + nla_total_size(16) /* RTA_DST */
2122                + nla_total_size(16) /* RTA_GATEWAY */
2123                + nla_total_size(16) /* RTA_PREFSRC */
2124                + nla_total_size(4) /* RTA_TABLE */
2125                + nla_total_size(4) /* RTA_IIF */
2126                + nla_total_size(4) /* RTA_OIF */
2127                + nla_total_size(4) /* RTA_PRIORITY */
2128                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2129                + nla_total_size(sizeof(struct rta_cacheinfo));
2130 }
2131
2132 static int rt6_fill_node(struct net *net,
2133                          struct sk_buff *skb, struct rt6_info *rt,
2134                          struct in6_addr *dst, struct in6_addr *src,
2135                          int iif, int type, u32 pid, u32 seq,
2136                          int prefix, int nowait, unsigned int flags)
2137 {
2138         struct rtmsg *rtm;
2139         struct nlmsghdr *nlh;
2140         long expires;
2141         u32 table;
2142
2143         if (prefix) {   /* user wants prefix routes only */
2144                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2145                         /* success since this is not a prefix route */
2146                         return 1;
2147                 }
2148         }
2149
2150         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2151         if (nlh == NULL)
2152                 return -EMSGSIZE;
2153
2154         rtm = nlmsg_data(nlh);
2155         rtm->rtm_family = AF_INET6;
2156         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2157         rtm->rtm_src_len = rt->rt6i_src.plen;
2158         rtm->rtm_tos = 0;
2159         if (rt->rt6i_table)
2160                 table = rt->rt6i_table->tb6_id;
2161         else
2162                 table = RT6_TABLE_UNSPEC;
2163         rtm->rtm_table = table;
2164         NLA_PUT_U32(skb, RTA_TABLE, table);
2165         if (rt->rt6i_flags&RTF_REJECT)
2166                 rtm->rtm_type = RTN_UNREACHABLE;
2167         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2168                 rtm->rtm_type = RTN_LOCAL;
2169         else
2170                 rtm->rtm_type = RTN_UNICAST;
2171         rtm->rtm_flags = 0;
2172         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2173         rtm->rtm_protocol = rt->rt6i_protocol;
2174         if (rt->rt6i_flags&RTF_DYNAMIC)
2175                 rtm->rtm_protocol = RTPROT_REDIRECT;
2176         else if (rt->rt6i_flags & RTF_ADDRCONF)
2177                 rtm->rtm_protocol = RTPROT_KERNEL;
2178         else if (rt->rt6i_flags&RTF_DEFAULT)
2179                 rtm->rtm_protocol = RTPROT_RA;
2180
2181         if (rt->rt6i_flags&RTF_CACHE)
2182                 rtm->rtm_flags |= RTM_F_CLONED;
2183
2184         if (dst) {
2185                 NLA_PUT(skb, RTA_DST, 16, dst);
2186                 rtm->rtm_dst_len = 128;
2187         } else if (rtm->rtm_dst_len)
2188                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2189 #ifdef CONFIG_IPV6_SUBTREES
2190         if (src) {
2191                 NLA_PUT(skb, RTA_SRC, 16, src);
2192                 rtm->rtm_src_len = 128;
2193         } else if (rtm->rtm_src_len)
2194                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2195 #endif
2196         if (iif) {
2197 #ifdef CONFIG_IPV6_MROUTE
2198                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2199                         int err = ip6mr_get_route(skb, rtm, nowait);
2200                         if (err <= 0) {
2201                                 if (!nowait) {
2202                                         if (err == 0)
2203                                                 return 0;
2204                                         goto nla_put_failure;
2205                                 } else {
2206                                         if (err == -EMSGSIZE)
2207                                                 goto nla_put_failure;
2208                                 }
2209                         }
2210                 } else
2211 #endif
2212                         NLA_PUT_U32(skb, RTA_IIF, iif);
2213         } else if (dst) {
2214                 struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2215                 struct in6_addr saddr_buf;
2216                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2217                                        dst, 0, &saddr_buf) == 0)
2218                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2219         }
2220
2221         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2222                 goto nla_put_failure;
2223
2224         if (rt->u.dst.neighbour)
2225                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2226
2227         if (rt->u.dst.dev)
2228                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2229
2230         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2231
2232         if (!(rt->rt6i_flags & RTF_EXPIRES))
2233                 expires = 0;
2234         else if (rt->rt6i_expires - jiffies < INT_MAX)
2235                 expires = rt->rt6i_expires - jiffies;
2236         else
2237                 expires = INT_MAX;
2238
2239         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2240                                expires, rt->u.dst.error) < 0)
2241                 goto nla_put_failure;
2242
2243         return nlmsg_end(skb, nlh);
2244
2245 nla_put_failure:
2246         nlmsg_cancel(skb, nlh);
2247         return -EMSGSIZE;
2248 }
2249
2250 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2251 {
2252         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2253         int prefix;
2254
2255         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2256                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2257                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2258         } else
2259                 prefix = 0;
2260
2261         return rt6_fill_node(arg->net,
2262                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2263                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2264                      prefix, 0, NLM_F_MULTI);
2265 }
2266
2267 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2268 {
2269         struct net *net = sock_net(in_skb->sk);
2270         struct nlattr *tb[RTA_MAX+1];
2271         struct rt6_info *rt;
2272         struct sk_buff *skb;
2273         struct rtmsg *rtm;
2274         struct flowi fl;
2275         int err, iif = 0;
2276
2277         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2278         if (err < 0)
2279                 goto errout;
2280
2281         err = -EINVAL;
2282         memset(&fl, 0, sizeof(fl));
2283
2284         if (tb[RTA_SRC]) {
2285                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2286                         goto errout;
2287
2288                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2289         }
2290
2291         if (tb[RTA_DST]) {
2292                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2293                         goto errout;
2294
2295                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2296         }
2297
2298         if (tb[RTA_IIF])
2299                 iif = nla_get_u32(tb[RTA_IIF]);
2300
2301         if (tb[RTA_OIF])
2302                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2303
2304         if (iif) {
2305                 struct net_device *dev;
2306                 dev = __dev_get_by_index(net, iif);
2307                 if (!dev) {
2308                         err = -ENODEV;
2309                         goto errout;
2310                 }
2311         }
2312
2313         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2314         if (skb == NULL) {
2315                 err = -ENOBUFS;
2316                 goto errout;
2317         }
2318
2319         /* Reserve room for dummy headers, this skb can pass
2320            through good chunk of routing engine.
2321          */
2322         skb_reset_mac_header(skb);
2323         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2324
2325         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2326         skb->dst = &rt->u.dst;
2327
2328         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2329                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2330                             nlh->nlmsg_seq, 0, 0, 0);
2331         if (err < 0) {
2332                 kfree_skb(skb);
2333                 goto errout;
2334         }
2335
2336         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2337 errout:
2338         return err;
2339 }
2340
2341 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2342 {
2343         struct sk_buff *skb;
2344         struct net *net = info->nl_net;
2345         u32 seq;
2346         int err;
2347
2348         err = -ENOBUFS;
2349         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2350
2351         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2352         if (skb == NULL)
2353                 goto errout;
2354
2355         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2356                                 event, info->pid, seq, 0, 0, 0);
2357         if (err < 0) {
2358                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2359                 WARN_ON(err == -EMSGSIZE);
2360                 kfree_skb(skb);
2361                 goto errout;
2362         }
2363         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2364                           info->nlh, gfp_any());
2365 errout:
2366         if (err < 0)
2367                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2368 }
2369
2370 static int ip6_route_dev_notify(struct notifier_block *this,
2371                                 unsigned long event, void *data)
2372 {
2373         struct net_device *dev = (struct net_device *)data;
2374         struct net *net = dev_net(dev);
2375
2376         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2377                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2378                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2379 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2380                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2381                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2382                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2383                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2384 #endif
2385         }
2386
2387         return NOTIFY_OK;
2388 }
2389
2390 /*
2391  *      /proc
2392  */
2393
2394 #ifdef CONFIG_PROC_FS
2395
/*
 * Sum of field widths for one route line.
 * NOTE(review): neither this macro nor struct rt6_proc_arg is referenced
 * in the visible part of the file - confirm whether they are still needed.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2406
2407 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2408 {
2409         struct seq_file *m = p_arg;
2410
2411         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2412
2413 #ifdef CONFIG_IPV6_SUBTREES
2414         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2415 #else
2416         seq_puts(m, "00000000000000000000000000000000 00 ");
2417 #endif
2418
2419         if (rt->rt6i_nexthop) {
2420                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2421         } else {
2422                 seq_puts(m, "00000000000000000000000000000000");
2423         }
2424         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2425                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2426                    rt->u.dst.__use, rt->rt6i_flags,
2427                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2428         return 0;
2429 }
2430
2431 static int ipv6_route_show(struct seq_file *m, void *v)
2432 {
2433         struct net *net = (struct net *)m->private;
2434         fib6_clean_all(net, rt6_info_route, 0, m);
2435         return 0;
2436 }
2437
2438 static int ipv6_route_open(struct inode *inode, struct file *file)
2439 {
2440         return single_open_net(inode, file, ipv6_route_show);
2441 }
2442
2443 static const struct file_operations ipv6_route_proc_fops = {
2444         .owner          = THIS_MODULE,
2445         .open           = ipv6_route_open,
2446         .read           = seq_read,
2447         .llseek         = seq_lseek,
2448         .release        = single_release_net,
2449 };
2450
2451 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2452 {
2453         struct net *net = (struct net *)seq->private;
2454         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2455                    net->ipv6.rt6_stats->fib_nodes,
2456                    net->ipv6.rt6_stats->fib_route_nodes,
2457                    net->ipv6.rt6_stats->fib_rt_alloc,
2458                    net->ipv6.rt6_stats->fib_rt_entries,
2459                    net->ipv6.rt6_stats->fib_rt_cache,
2460                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2461                    net->ipv6.rt6_stats->fib_discarded_routes);
2462
2463         return 0;
2464 }
2465
2466 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2467 {
2468         return single_open_net(inode, file, rt6_stats_seq_show);
2469 }
2470
2471 static const struct file_operations rt6_stats_seq_fops = {
2472         .owner   = THIS_MODULE,
2473         .open    = rt6_stats_seq_open,
2474         .read    = seq_read,
2475         .llseek  = seq_lseek,
2476         .release = single_release_net,
2477 };
2478 #endif  /* CONFIG_PROC_FS */
2479
2480 #ifdef CONFIG_SYSCTL
2481
2482 static
2483 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2484                               void __user *buffer, size_t *lenp, loff_t *ppos)
2485 {
2486         struct net *net = current->nsproxy->net_ns;
2487         int delay = net->ipv6.sysctl.flush_delay;
2488         if (write) {
2489                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2490                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2491                 return 0;
2492         } else
2493                 return -EINVAL;
2494 }
2495
2496 ctl_table ipv6_route_table_template[] = {
2497         {
2498                 .procname       =       "flush",
2499                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2500                 .maxlen         =       sizeof(int),
2501                 .mode           =       0200,
2502                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2503         },
2504         {
2505                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2506                 .procname       =       "gc_thresh",
2507                 .data           =       &ip6_dst_ops_template.gc_thresh,
2508                 .maxlen         =       sizeof(int),
2509                 .mode           =       0644,
2510                 .proc_handler   =       &proc_dointvec,
2511         },
2512         {
2513                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2514                 .procname       =       "max_size",
2515                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2516                 .maxlen         =       sizeof(int),
2517                 .mode           =       0644,
2518                 .proc_handler   =       &proc_dointvec,
2519         },
2520         {
2521                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2522                 .procname       =       "gc_min_interval",
2523                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2524                 .maxlen         =       sizeof(int),
2525                 .mode           =       0644,
2526                 .proc_handler   =       &proc_dointvec_jiffies,
2527                 .strategy       =       &sysctl_jiffies,
2528         },
2529         {
2530                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2531                 .procname       =       "gc_timeout",
2532                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2533                 .maxlen         =       sizeof(int),
2534                 .mode           =       0644,
2535                 .proc_handler   =       &proc_dointvec_jiffies,
2536                 .strategy       =       &sysctl_jiffies,
2537         },
2538         {
2539                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2540                 .procname       =       "gc_interval",
2541                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2542                 .maxlen         =       sizeof(int),
2543                 .mode           =       0644,
2544                 .proc_handler   =       &proc_dointvec_jiffies,
2545                 .strategy       =       &sysctl_jiffies,
2546         },
2547         {
2548                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2549                 .procname       =       "gc_elasticity",
2550                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2551                 .maxlen         =       sizeof(int),
2552                 .mode           =       0644,
2553                 .proc_handler   =       &proc_dointvec_jiffies,
2554                 .strategy       =       &sysctl_jiffies,
2555         },
2556         {
2557                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2558                 .procname       =       "mtu_expires",
2559                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2560                 .maxlen         =       sizeof(int),
2561                 .mode           =       0644,
2562                 .proc_handler   =       &proc_dointvec_jiffies,
2563                 .strategy       =       &sysctl_jiffies,
2564         },
2565         {
2566                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2567                 .procname       =       "min_adv_mss",
2568                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2569                 .maxlen         =       sizeof(int),
2570                 .mode           =       0644,
2571                 .proc_handler   =       &proc_dointvec_jiffies,
2572                 .strategy       =       &sysctl_jiffies,
2573         },
2574         {
2575                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2576                 .procname       =       "gc_min_interval_ms",
2577                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2578                 .maxlen         =       sizeof(int),
2579                 .mode           =       0644,
2580                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2581                 .strategy       =       &sysctl_ms_jiffies,
2582         },
2583         { .ctl_name = 0 }
2584 };
2585
2586 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2587 {
2588         struct ctl_table *table;
2589
2590         table = kmemdup(ipv6_route_table_template,
2591                         sizeof(ipv6_route_table_template),
2592                         GFP_KERNEL);
2593
2594         if (table) {
2595                 table[0].data = &net->ipv6.sysctl.flush_delay;
2596                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2597                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2598                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2599                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2600                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2601                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2602                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2603                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2604         }
2605
2606         return table;
2607 }
2608 #endif
2609
2610 static int ip6_route_net_init(struct net *net)
2611 {
2612         int ret = -ENOMEM;
2613
2614         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2615                                         sizeof(*net->ipv6.ip6_dst_ops),
2616                                         GFP_KERNEL);
2617         if (!net->ipv6.ip6_dst_ops)
2618                 goto out;
2619         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2620
2621         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2622                                            sizeof(*net->ipv6.ip6_null_entry),
2623                                            GFP_KERNEL);
2624         if (!net->ipv6.ip6_null_entry)
2625                 goto out_ip6_dst_ops;
2626         net->ipv6.ip6_null_entry->u.dst.path =
2627                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2628         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2629
2630 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2631         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2632                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2633                                                GFP_KERNEL);
2634         if (!net->ipv6.ip6_prohibit_entry)
2635                 goto out_ip6_null_entry;
2636         net->ipv6.ip6_prohibit_entry->u.dst.path =
2637                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2638         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2639
2640         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2641                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2642                                                GFP_KERNEL);
2643         if (!net->ipv6.ip6_blk_hole_entry)
2644                 goto out_ip6_prohibit_entry;
2645         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2646                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2647         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2648 #endif
2649
2650         net->ipv6.sysctl.flush_delay = 0;
2651         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2652         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2653         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2654         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2655         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2656         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2657         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2658
2659 #ifdef CONFIG_PROC_FS
2660         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2661         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2662 #endif
2663         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2664
2665         ret = 0;
2666 out:
2667         return ret;
2668
2669 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2670 out_ip6_prohibit_entry:
2671         kfree(net->ipv6.ip6_prohibit_entry);
2672 out_ip6_null_entry:
2673         kfree(net->ipv6.ip6_null_entry);
2674 #endif
2675 out_ip6_dst_ops:
2676         release_net(net->ipv6.ip6_dst_ops->dst_net);
2677         kfree(net->ipv6.ip6_dst_ops);
2678         goto out;
2679 }
2680
2681 static void ip6_route_net_exit(struct net *net)
2682 {
2683 #ifdef CONFIG_PROC_FS
2684         proc_net_remove(net, "ipv6_route");
2685         proc_net_remove(net, "rt6_stats");
2686 #endif
2687         kfree(net->ipv6.ip6_null_entry);
2688 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2689         kfree(net->ipv6.ip6_prohibit_entry);
2690         kfree(net->ipv6.ip6_blk_hole_entry);
2691 #endif
2692         release_net(net->ipv6.ip6_dst_ops->dst_net);
2693         kfree(net->ipv6.ip6_dst_ops);
2694 }
2695
/*
 * Per-namespace init/exit hooks for IPv6 routing; registered with
 * register_pernet_subsys() in ip6_route_init() and removed again in
 * ip6_route_cleanup().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
2700
/*
 * Netdevice notifier that dispatches to ip6_route_dev_notify(); registered
 * as the last step of ip6_route_init() and unregistered first in
 * ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
2705
2706 int __init ip6_route_init(void)
2707 {
2708         int ret;
2709
2710         ret = -ENOMEM;
2711         ip6_dst_ops_template.kmem_cachep =
2712                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2713                                   SLAB_HWCACHE_ALIGN, NULL);
2714         if (!ip6_dst_ops_template.kmem_cachep)
2715                 goto out;;
2716
2717         ret = register_pernet_subsys(&ip6_route_net_ops);
2718         if (ret)
2719                 goto out_kmem_cache;
2720
2721         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2722
2723         /* Registering of the loopback is done before this portion of code,
2724          * the loopback reference in rt6_info will not be taken, do it
2725          * manually for init_net */
2726         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2727         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2728   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2729         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2730         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2731         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2732         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2733   #endif
2734         ret = fib6_init();
2735         if (ret)
2736                 goto out_register_subsys;
2737
2738         ret = xfrm6_init();
2739         if (ret)
2740                 goto out_fib6_init;
2741
2742         ret = fib6_rules_init();
2743         if (ret)
2744                 goto xfrm6_init;
2745
2746         ret = -ENOBUFS;
2747         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2748             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2749             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2750                 goto fib6_rules_init;
2751
2752         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2753         if (ret)
2754                 goto fib6_rules_init;
2755
2756 out:
2757         return ret;
2758
2759 fib6_rules_init:
2760         fib6_rules_cleanup();
2761 xfrm6_init:
2762         xfrm6_fini();
2763 out_fib6_init:
2764         fib6_gc_cleanup();
2765 out_register_subsys:
2766         unregister_pernet_subsys(&ip6_route_net_ops);
2767 out_kmem_cache:
2768         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2769         goto out;
2770 }
2771
/*
 * Tear down what ip6_route_init() set up, in the reverse order of its
 * setup steps: netdevice notifier, fib6 rules, xfrm6, fib6, the
 * per-namespace operations and finally the dst slab cache.
 *
 * NOTE(review): the rtnetlink handlers registered with __rtnl_register()
 * in ip6_route_init() are not unregistered here - confirm that this is
 * handled elsewhere.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	/* Runs ip6_route_net_ops.exit for the registered namespaces. */
	unregister_pernet_subsys(&ip6_route_net_ops);
	/* Last: the cache is shared by the dst_ops template and blackhole ops. */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}