[NETNS][IPV6] rt6_info - make rt6_info accessed as a pointer
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  Raise to 3 (or more) to compile in
 * the RDBG()/RT6_TRACE() output.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both helpers expand to nothing. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG() takes a complete, parenthesised printk() argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones routes that already have a
 * nexthop via rt6_alloc_clone(); when zero it returns them unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops = {
101         .family                 =       AF_INET6,
102         .protocol               =       __constant_htons(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       ip6_local_out,
112         .entry_size             =       sizeof(struct rt6_info),
113         .entries                =       ATOMIC_INIT(0),
114 };
115
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
117 {
118 }
119
120 static struct dst_ops ip6_dst_blackhole_ops = {
121         .family                 =       AF_INET6,
122         .protocol               =       __constant_htons(ETH_P_IPV6),
123         .destroy                =       ip6_dst_destroy,
124         .check                  =       ip6_dst_check,
125         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
126         .entry_size             =       sizeof(struct rt6_info),
127         .entries                =       ATOMIC_INIT(0),
128 };
129
130 static struct rt6_info ip6_null_entry_template = {
131         .u = {
132                 .dst = {
133                         .__refcnt       = ATOMIC_INIT(1),
134                         .__use          = 1,
135                         .obsolete       = -1,
136                         .error          = -ENETUNREACH,
137                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
138                         .input          = ip6_pkt_discard,
139                         .output         = ip6_pkt_discard_out,
140                         .ops            = &ip6_dst_ops,
141                 }
142         },
143         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
144         .rt6i_metric    = ~(u32) 0,
145         .rt6i_ref       = ATOMIC_INIT(1),
146 };
147
148 struct rt6_info *ip6_null_entry;
149
150 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
151
152 static int ip6_pkt_prohibit(struct sk_buff *skb);
153 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
154
155 struct rt6_info ip6_prohibit_entry_template = {
156         .u = {
157                 .dst = {
158                         .__refcnt       = ATOMIC_INIT(1),
159                         .__use          = 1,
160                         .obsolete       = -1,
161                         .error          = -EACCES,
162                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
163                         .input          = ip6_pkt_prohibit,
164                         .output         = ip6_pkt_prohibit_out,
165                         .ops            = &ip6_dst_ops,
166                 }
167         },
168         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
169         .rt6i_metric    = ~(u32) 0,
170         .rt6i_ref       = ATOMIC_INIT(1),
171 };
172
173 struct rt6_info *ip6_prohibit_entry;
174
175 static struct rt6_info ip6_blk_hole_entry_template = {
176         .u = {
177                 .dst = {
178                         .__refcnt       = ATOMIC_INIT(1),
179                         .__use          = 1,
180                         .obsolete       = -1,
181                         .error          = -EINVAL,
182                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
183                         .input          = dst_discard,
184                         .output         = dst_discard,
185                         .ops            = &ip6_dst_ops,
186                 }
187         },
188         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
189         .rt6i_metric    = ~(u32) 0,
190         .rt6i_ref       = ATOMIC_INIT(1),
191 };
192
193 struct rt6_info *ip6_blk_hole_entry;
194
195 #endif
196
197 /* allocate dst with ip6_dst_ops */
198 static __inline__ struct rt6_info *ip6_dst_alloc(void)
199 {
200         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
201 }
202
203 static void ip6_dst_destroy(struct dst_entry *dst)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207
208         if (idev != NULL) {
209                 rt->rt6i_idev = NULL;
210                 in6_dev_put(idev);
211         }
212 }
213
214 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
215                            int how)
216 {
217         struct rt6_info *rt = (struct rt6_info *)dst;
218         struct inet6_dev *idev = rt->rt6i_idev;
219         struct net_device *loopback_dev =
220                 dev->nd_net->loopback_dev;
221
222         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
223                 struct inet6_dev *loopback_idev =
224                         in6_dev_get(loopback_dev);
225                 if (loopback_idev != NULL) {
226                         rt->rt6i_idev = loopback_idev;
227                         in6_dev_put(idev);
228                 }
229         }
230 }
231
232 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
233 {
234         return (rt->rt6i_flags & RTF_EXPIRES &&
235                 time_after(jiffies, rt->rt6i_expires));
236 }
237
238 static inline int rt6_need_strict(struct in6_addr *daddr)
239 {
240         return (ipv6_addr_type(daddr) &
241                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
242 }
243
244 /*
245  *      Route lookup. Any table->tb6_lock is implied.
246  */
247
248 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
249                                                     int oif,
250                                                     int strict)
251 {
252         struct rt6_info *local = NULL;
253         struct rt6_info *sprt;
254
255         if (oif) {
256                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
257                         struct net_device *dev = sprt->rt6i_dev;
258                         if (dev->ifindex == oif)
259                                 return sprt;
260                         if (dev->flags & IFF_LOOPBACK) {
261                                 if (sprt->rt6i_idev == NULL ||
262                                     sprt->rt6i_idev->dev->ifindex != oif) {
263                                         if (strict && oif)
264                                                 continue;
265                                         if (local && (!oif ||
266                                                       local->rt6i_idev->dev->ifindex == oif))
267                                                 continue;
268                                 }
269                                 local = sprt;
270                         }
271                 }
272
273                 if (local)
274                         return local;
275
276                 if (strict)
277                         return ip6_null_entry;
278         }
279         return rt;
280 }
281
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation for the
 * nexthop of @rt when its neighbour entry is not NUD_VALID and the
 * device's rtr_probe_interval has passed since neigh->updated.
 *
 * The original author notes that probing may not be fully appropriate
 * yet, and that probes MUST be rate-limited to no more than one per
 * minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr solicited_mc;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;
	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	/* Re-test the state now that the lock is held. */
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* The solicitation is sent with the neighbour lock dropped. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &solicited_mc);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &solicited_mc, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
317
318 /*
319  * Default Router Selection (RFC 2461 6.3.6)
320  */
321 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
322 {
323         struct net_device *dev = rt->rt6i_dev;
324         if (!oif || dev->ifindex == oif)
325                 return 2;
326         if ((dev->flags & IFF_LOOPBACK) &&
327             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
328                 return 1;
329         return 0;
330 }
331
332 static inline int rt6_check_neigh(struct rt6_info *rt)
333 {
334         struct neighbour *neigh = rt->rt6i_nexthop;
335         int m;
336         if (rt->rt6i_flags & RTF_NONEXTHOP ||
337             !(rt->rt6i_flags & RTF_GATEWAY))
338                 m = 1;
339         else if (neigh) {
340                 read_lock_bh(&neigh->lock);
341                 if (neigh->nud_state & NUD_VALID)
342                         m = 2;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344                 else if (neigh->nud_state & NUD_FAILED)
345                         m = 0;
346 #endif
347                 else
348                         m = 1;
349                 read_unlock_bh(&neigh->lock);
350         } else
351                 m = 0;
352         return m;
353 }
354
355 static int rt6_score_route(struct rt6_info *rt, int oif,
356                            int strict)
357 {
358         int m, n;
359
360         m = rt6_check_dev(rt, oif);
361         if (!m && (strict & RT6_LOOKUP_F_IFACE))
362                 return -1;
363 #ifdef CONFIG_IPV6_ROUTER_PREF
364         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
365 #endif
366         n = rt6_check_neigh(rt);
367         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
368                 return -1;
369         return m;
370 }
371
372 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
373                                    int *mpri, struct rt6_info *match)
374 {
375         int m;
376
377         if (rt6_check_expired(rt))
378                 goto out;
379
380         m = rt6_score_route(rt, oif, strict);
381         if (m < 0)
382                 goto out;
383
384         if (m > *mpri) {
385                 if (strict & RT6_LOOKUP_F_REACHABLE)
386                         rt6_probe(match);
387                 *mpri = m;
388                 match = rt;
389         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
390                 rt6_probe(rt);
391         }
392
393 out:
394         return match;
395 }
396
397 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
398                                      struct rt6_info *rr_head,
399                                      u32 metric, int oif, int strict)
400 {
401         struct rt6_info *rt, *match;
402         int mpri = -1;
403
404         match = NULL;
405         for (rt = rr_head; rt && rt->rt6i_metric == metric;
406              rt = rt->u.dst.rt6_next)
407                 match = find_match(rt, oif, strict, &mpri, match);
408         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
409              rt = rt->u.dst.rt6_next)
410                 match = find_match(rt, oif, strict, &mpri, match);
411
412         return match;
413 }
414
415 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
416 {
417         struct rt6_info *match, *rt0;
418
419         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
420                   __FUNCTION__, fn->leaf, oif);
421
422         rt0 = fn->rr_ptr;
423         if (!rt0)
424                 fn->rr_ptr = rt0 = fn->leaf;
425
426         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
427
428         if (!match &&
429             (strict & RT6_LOOKUP_F_REACHABLE)) {
430                 struct rt6_info *next = rt0->u.dst.rt6_next;
431
432                 /* no entries matched; do round-robin */
433                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
434                         next = fn->leaf;
435
436                 if (next != rt0)
437                         fn->rr_ptr = next;
438         }
439
440         RT6_TRACE("%s() => %p\n",
441                   __FUNCTION__, match);
442
443         return (match ? match : ip6_null_entry);
444 }
445
446 #ifdef CONFIG_IPV6_ROUTE_INFO
447 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
448                   struct in6_addr *gwaddr)
449 {
450         struct net *net = dev->nd_net;
451         struct route_info *rinfo = (struct route_info *) opt;
452         struct in6_addr prefix_buf, *prefix;
453         unsigned int pref;
454         u32 lifetime;
455         struct rt6_info *rt;
456
457         if (len < sizeof(struct route_info)) {
458                 return -EINVAL;
459         }
460
461         /* Sanity check for prefix_len and length */
462         if (rinfo->length > 3) {
463                 return -EINVAL;
464         } else if (rinfo->prefix_len > 128) {
465                 return -EINVAL;
466         } else if (rinfo->prefix_len > 64) {
467                 if (rinfo->length < 2) {
468                         return -EINVAL;
469                 }
470         } else if (rinfo->prefix_len > 0) {
471                 if (rinfo->length < 1) {
472                         return -EINVAL;
473                 }
474         }
475
476         pref = rinfo->route_pref;
477         if (pref == ICMPV6_ROUTER_PREF_INVALID)
478                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
479
480         lifetime = ntohl(rinfo->lifetime);
481         if (lifetime == 0xffffffff) {
482                 /* infinity */
483         } else if (lifetime > 0x7fffffff/HZ) {
484                 /* Avoid arithmetic overflow */
485                 lifetime = 0x7fffffff/HZ - 1;
486         }
487
488         if (rinfo->length == 3)
489                 prefix = (struct in6_addr *)rinfo->prefix;
490         else {
491                 /* this function is safe */
492                 ipv6_addr_prefix(&prefix_buf,
493                                  (struct in6_addr *)rinfo->prefix,
494                                  rinfo->prefix_len);
495                 prefix = &prefix_buf;
496         }
497
498         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
499                                 dev->ifindex);
500
501         if (rt && !lifetime) {
502                 ip6_del_rt(rt);
503                 rt = NULL;
504         }
505
506         if (!rt && lifetime)
507                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
508                                         pref);
509         else if (rt)
510                 rt->rt6i_flags = RTF_ROUTEINFO |
511                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
512
513         if (rt) {
514                 if (lifetime == 0xffffffff) {
515                         rt->rt6i_flags &= ~RTF_EXPIRES;
516                 } else {
517                         rt->rt6i_expires = jiffies + HZ * lifetime;
518                         rt->rt6i_flags |= RTF_EXPIRES;
519                 }
520                 dst_release(&rt->u.dst);
521         }
522         return 0;
523 }
524 #endif
525
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed selection.
 *
 * Must be expanded inside a function that has local variables `rt'
 * and `fn' and labels `out' and `restart'.  When `rt' is
 * ip6_null_entry it walks from `fn' towards the root: it jumps to
 * `out' on reaching a RTN_TL_ROOT node and to `restart' on the first
 * node carrying RTN_RTINFO.  If the parent has a subtree other than
 * `fn', that subtree is searched with @saddr instead of stepping up.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn; \
			else \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
543
544 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
545                                              struct flowi *fl, int flags)
546 {
547         struct fib6_node *fn;
548         struct rt6_info *rt;
549
550         read_lock_bh(&table->tb6_lock);
551         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
552 restart:
553         rt = fn->leaf;
554         rt = rt6_device_match(rt, fl->oif, flags);
555         BACKTRACK(&fl->fl6_src);
556 out:
557         dst_use(&rt->u.dst, jiffies);
558         read_unlock_bh(&table->tb6_lock);
559         return rt;
560
561 }
562
563 struct rt6_info *rt6_lookup(struct net *net, struct in6_addr *daddr,
564                             struct in6_addr *saddr, int oif, int strict)
565 {
566         struct flowi fl = {
567                 .oif = oif,
568                 .nl_u = {
569                         .ip6_u = {
570                                 .daddr = *daddr,
571                         },
572                 },
573         };
574         struct dst_entry *dst;
575         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
576
577         if (saddr) {
578                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
579                 flags |= RT6_LOOKUP_F_HAS_SADDR;
580         }
581
582         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
583         if (dst->error == 0)
584                 return (struct rt6_info *) dst;
585
586         dst_release(dst);
587
588         return NULL;
589 }
590
591 EXPORT_SYMBOL(rt6_lookup);
592
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed.  In any case, if the caller does not hold a
   reference to the route, it may be destroyed.
 */
598
599 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
600 {
601         int err;
602         struct fib6_table *table;
603
604         table = rt->rt6i_table;
605         write_lock_bh(&table->tb6_lock);
606         err = fib6_add(&table->tb6_root, rt, info);
607         write_unlock_bh(&table->tb6_lock);
608
609         return err;
610 }
611
612 int ip6_ins_rt(struct rt6_info *rt)
613 {
614         struct nl_info info = {
615                 .nl_net = rt->rt6i_dev->nd_net,
616         };
617         return __ip6_ins_rt(rt, &info);
618 }
619
620 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
621                                       struct in6_addr *saddr)
622 {
623         struct rt6_info *rt;
624
625         /*
626          *      Clone the route.
627          */
628
629         rt = ip6_rt_copy(ort);
630
631         if (rt) {
632                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
633                         if (rt->rt6i_dst.plen != 128 &&
634                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
635                                 rt->rt6i_flags |= RTF_ANYCAST;
636                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
637                 }
638
639                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
640                 rt->rt6i_dst.plen = 128;
641                 rt->rt6i_flags |= RTF_CACHE;
642                 rt->u.dst.flags |= DST_HOST;
643
644 #ifdef CONFIG_IPV6_SUBTREES
645                 if (rt->rt6i_src.plen && saddr) {
646                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
647                         rt->rt6i_src.plen = 128;
648                 }
649 #endif
650
651                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
652
653         }
654
655         return rt;
656 }
657
658 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
659 {
660         struct rt6_info *rt = ip6_rt_copy(ort);
661         if (rt) {
662                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
663                 rt->rt6i_dst.plen = 128;
664                 rt->rt6i_flags |= RTF_CACHE;
665                 rt->u.dst.flags |= DST_HOST;
666                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
667         }
668         return rt;
669 }
670
671 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
672                                             struct flowi *fl, int flags)
673 {
674         struct fib6_node *fn;
675         struct rt6_info *rt, *nrt;
676         int strict = 0;
677         int attempts = 3;
678         int err;
679         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
680
681         strict |= flags & RT6_LOOKUP_F_IFACE;
682
683 relookup:
684         read_lock_bh(&table->tb6_lock);
685
686 restart_2:
687         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
688
689 restart:
690         rt = rt6_select(fn, oif, strict | reachable);
691         BACKTRACK(&fl->fl6_src);
692         if (rt == ip6_null_entry ||
693             rt->rt6i_flags & RTF_CACHE)
694                 goto out;
695
696         dst_hold(&rt->u.dst);
697         read_unlock_bh(&table->tb6_lock);
698
699         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
700                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
701         else {
702 #if CLONE_OFFLINK_ROUTE
703                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
704 #else
705                 goto out2;
706 #endif
707         }
708
709         dst_release(&rt->u.dst);
710         rt = nrt ? : ip6_null_entry;
711
712         dst_hold(&rt->u.dst);
713         if (nrt) {
714                 err = ip6_ins_rt(nrt);
715                 if (!err)
716                         goto out2;
717         }
718
719         if (--attempts <= 0)
720                 goto out2;
721
722         /*
723          * Race condition! In the gap, when table->tb6_lock was
724          * released someone could insert this route.  Relookup.
725          */
726         dst_release(&rt->u.dst);
727         goto relookup;
728
729 out:
730         if (reachable) {
731                 reachable = 0;
732                 goto restart_2;
733         }
734         dst_hold(&rt->u.dst);
735         read_unlock_bh(&table->tb6_lock);
736 out2:
737         rt->u.dst.lastuse = jiffies;
738         rt->u.dst.__use++;
739
740         return rt;
741 }
742
743 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
744                                             struct flowi *fl, int flags)
745 {
746         return ip6_pol_route(table, fl->iif, fl, flags);
747 }
748
749 void ip6_route_input(struct sk_buff *skb)
750 {
751         struct ipv6hdr *iph = ipv6_hdr(skb);
752         struct net *net = skb->dev->nd_net;
753         int flags = RT6_LOOKUP_F_HAS_SADDR;
754         struct flowi fl = {
755                 .iif = skb->dev->ifindex,
756                 .nl_u = {
757                         .ip6_u = {
758                                 .daddr = iph->daddr,
759                                 .saddr = iph->saddr,
760                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
761                         },
762                 },
763                 .mark = skb->mark,
764                 .proto = iph->nexthdr,
765         };
766
767         if (rt6_need_strict(&iph->daddr))
768                 flags |= RT6_LOOKUP_F_IFACE;
769
770         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
771 }
772
773 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
774                                              struct flowi *fl, int flags)
775 {
776         return ip6_pol_route(table, fl->oif, fl, flags);
777 }
778
779 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
780 {
781         int flags = 0;
782
783         if (rt6_need_strict(&fl->fl6_dst))
784                 flags |= RT6_LOOKUP_F_IFACE;
785
786         if (!ipv6_addr_any(&fl->fl6_src))
787                 flags |= RT6_LOOKUP_F_HAS_SADDR;
788
789         return fib6_rule_lookup(&init_net, fl, flags, ip6_pol_route_output);
790 }
791
792 EXPORT_SYMBOL(ip6_route_output);
793
794 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
795 {
796         struct rt6_info *ort = (struct rt6_info *) *dstp;
797         struct rt6_info *rt = (struct rt6_info *)
798                 dst_alloc(&ip6_dst_blackhole_ops);
799         struct dst_entry *new = NULL;
800
801         if (rt) {
802                 new = &rt->u.dst;
803
804                 atomic_set(&new->__refcnt, 1);
805                 new->__use = 1;
806                 new->input = dst_discard;
807                 new->output = dst_discard;
808
809                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
810                 new->dev = ort->u.dst.dev;
811                 if (new->dev)
812                         dev_hold(new->dev);
813                 rt->rt6i_idev = ort->rt6i_idev;
814                 if (rt->rt6i_idev)
815                         in6_dev_hold(rt->rt6i_idev);
816                 rt->rt6i_expires = 0;
817
818                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
819                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
820                 rt->rt6i_metric = 0;
821
822                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
823 #ifdef CONFIG_IPV6_SUBTREES
824                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
825 #endif
826
827                 dst_free(new);
828         }
829
830         dst_release(*dstp);
831         *dstp = new;
832         return (new ? 0 : -ENOMEM);
833 }
834 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
835
836 /*
837  *      Destination cache support functions
838  */
839
840 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
841 {
842         struct rt6_info *rt;
843
844         rt = (struct rt6_info *) dst;
845
846         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
847                 return dst;
848
849         return NULL;
850 }
851
852 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
853 {
854         struct rt6_info *rt = (struct rt6_info *) dst;
855
856         if (rt) {
857                 if (rt->rt6i_flags & RTF_CACHE)
858                         ip6_del_rt(rt);
859                 else
860                         dst_release(dst);
861         }
862         return NULL;
863 }
864
865 static void ip6_link_failure(struct sk_buff *skb)
866 {
867         struct rt6_info *rt;
868
869         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
870
871         rt = (struct rt6_info *) skb->dst;
872         if (rt) {
873                 if (rt->rt6i_flags&RTF_CACHE) {
874                         dst_set_expires(&rt->u.dst, 0);
875                         rt->rt6i_flags |= RTF_EXPIRES;
876                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
877                         rt->rt6i_node->fn_sernum = -1;
878         }
879 }
880
881 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
882 {
883         struct rt6_info *rt6 = (struct rt6_info*)dst;
884
885         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
886                 rt6->rt6i_flags |= RTF_MODIFIED;
887                 if (mtu < IPV6_MIN_MTU) {
888                         mtu = IPV6_MIN_MTU;
889                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
890                 }
891                 dst->metrics[RTAX_MTU-1] = mtu;
892                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
893         }
894 }
895
896 static int ipv6_get_mtu(struct net_device *dev);
897
898 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
899 {
900         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
901
902         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
903                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
904
905         /*
906          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
907          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
908          * IPV6_MAXPLEN is also valid and means: "any MSS,
909          * rely only on pmtu discovery"
910          */
911         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
912                 mtu = IPV6_MAXPLEN;
913         return mtu;
914 }
915
/*
 * Singly linked list (chained through dst->next) of the entries created
 * by icmp6_dst_alloc() and reaped by icmp6_dst_gc().  Both functions
 * take icmp6_dst_lock around every access to the list.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
918
919 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
920                                   struct neighbour *neigh,
921                                   struct in6_addr *addr)
922 {
923         struct rt6_info *rt;
924         struct inet6_dev *idev = in6_dev_get(dev);
925         struct net *net = dev->nd_net;
926
927         if (unlikely(idev == NULL))
928                 return NULL;
929
930         rt = ip6_dst_alloc();
931         if (unlikely(rt == NULL)) {
932                 in6_dev_put(idev);
933                 goto out;
934         }
935
936         dev_hold(dev);
937         if (neigh)
938                 neigh_hold(neigh);
939         else
940                 neigh = ndisc_get_neigh(dev, addr);
941
942         rt->rt6i_dev      = dev;
943         rt->rt6i_idev     = idev;
944         rt->rt6i_nexthop  = neigh;
945         atomic_set(&rt->u.dst.__refcnt, 1);
946         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
947         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
948         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
949         rt->u.dst.output  = ip6_output;
950
951 #if 0   /* there's no chance to use these for ndisc */
952         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
953                                 ? DST_HOST
954                                 : 0;
955         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
956         rt->rt6i_dst.plen = 128;
957 #endif
958
959         spin_lock_bh(&icmp6_dst_lock);
960         rt->u.dst.next = icmp6_dst_gc_list;
961         icmp6_dst_gc_list = &rt->u.dst;
962         spin_unlock_bh(&icmp6_dst_lock);
963
964         fib6_force_start_gc(net);
965
966 out:
967         return &rt->u.dst;
968 }
969
970 int icmp6_dst_gc(int *more)
971 {
972         struct dst_entry *dst, *next, **pprev;
973         int freed;
974
975         next = NULL;
976         freed = 0;
977
978         spin_lock_bh(&icmp6_dst_lock);
979         pprev = &icmp6_dst_gc_list;
980
981         while ((dst = *pprev) != NULL) {
982                 if (!atomic_read(&dst->__refcnt)) {
983                         *pprev = dst->next;
984                         dst_free(dst);
985                         freed++;
986                 } else {
987                         pprev = &dst->next;
988                         (*more)++;
989                 }
990         }
991
992         spin_unlock_bh(&icmp6_dst_lock);
993
994         return freed;
995 }
996
997 static int ip6_dst_gc(struct dst_ops *ops)
998 {
999         static unsigned expire = 30*HZ;
1000         static unsigned long last_gc;
1001         unsigned long now = jiffies;
1002
1003         if (time_after(last_gc + init_net.ipv6.sysctl.ip6_rt_gc_min_interval, now) &&
1004             atomic_read(&ip6_dst_ops.entries) <= init_net.ipv6.sysctl.ip6_rt_max_size)
1005                 goto out;
1006
1007         expire++;
1008         fib6_run_gc(expire, &init_net);
1009         last_gc = now;
1010         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1011                 expire = init_net.ipv6.sysctl.ip6_rt_gc_timeout>>1;
1012
1013 out:
1014         expire -= expire>>init_net.ipv6.sysctl.ip6_rt_gc_elasticity;
1015         return (atomic_read(&ip6_dst_ops.entries) > init_net.ipv6.sysctl.ip6_rt_max_size);
1016 }
1017
1018 /* Clean host part of a prefix. Not necessary in radix tree,
1019    but results in cleaner routing tables.
1020
1021    Remove it only when all the things will work!
1022  */
1023
1024 static int ipv6_get_mtu(struct net_device *dev)
1025 {
1026         int mtu = IPV6_MIN_MTU;
1027         struct inet6_dev *idev;
1028
1029         idev = in6_dev_get(dev);
1030         if (idev) {
1031                 mtu = idev->cnf.mtu6;
1032                 in6_dev_put(idev);
1033         }
1034         return mtu;
1035 }
1036
1037 int ipv6_get_hoplimit(struct net_device *dev)
1038 {
1039         int hoplimit = ipv6_devconf.hop_limit;
1040         struct inet6_dev *idev;
1041
1042         idev = in6_dev_get(dev);
1043         if (idev) {
1044                 hoplimit = idev->cnf.hop_limit;
1045                 in6_dev_put(idev);
1046         }
1047         return hoplimit;
1048 }
1049
1050 /*
1051  *
1052  */
1053
/*
 *      Build a route from cfg and insert it into fib table cfg->fc_table
 *      of the namespace cfg->fc_nlinfo.nl_net.
 *
 *      Returns the result of __ip6_ins_rt() once the route has been
 *      fully set up.  Before that point it fails with -EINVAL, -ENODEV,
 *      -ENOBUFS, -ENOMEM, -EHOSTUNREACH or the error reported by the
 *      neighbour lookup.
 *
 *      Side effects on cfg: fc_metric is defaulted to IP6_RT_PRIO_USER
 *      when 0, fc_protocol to RTPROT_BOOT when RTPROT_UNSPEC, and
 *      fc_nlinfo.nl_net is overwritten with the namespace of the device
 *      finally used.
 *
 *      Reference handling: the dev and idev references taken here are
 *      stored in the route on the success path; every "goto out" drops
 *      them again and frees the half-built route.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source prefixes are only accepted with CONFIG_IPV6_SUBTREES. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* An explicit interface was given: hold it and its inet6_dev. */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        rt = ip6_dst_alloc();

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->u.dst.obsolete = -1;
        /*
         * NOTE(review): the expiry is computed even when cfg->fc_flags
         * lacks RTF_EXPIRES -- confirm that readers of rt6i_expires
         * only trust it when that flag is set.
         */
        rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Default handlers; the reject path below replaces both. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->u.dst.input = ip6_mc_input;
        else
                rt->u.dst.input = ip6_forward;

        rt->u.dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->u.dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* Swap any references held so far for loopback ones. */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->u.dst.output = ip6_pkt_discard_out;
                rt->u.dst.input = ip6_pkt_discard;
                rt->u.dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using non-link-local
                           addresses as the nexthop address.
                           Otherwise, the router will not be able to send
                           redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;

                        /* Look up the existing route towards the gateway. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        /*
                         * err stays -EHOSTUNREACH unless the route to the
                         * gateway exists and is not itself gatewayed.
                         */
                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                /* The gateway must be reached via the requested device. */
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->u.dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: adopt (and hold) the gateway route's. */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        if (!(grt->rt6i_flags&RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->u.dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(rt->rt6i_nexthop)) {
                        err = PTR_ERR(rt->rt6i_nexthop);
                        /* Do not leave an error pointer behind for dst_free(). */
                        rt->rt6i_nexthop = NULL;
                        goto out;
                }
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Copy caller-supplied metrics; attribute type N lands in metrics[N - 1]. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
                        }
                }
        }

        /* Fill in the metrics the caller left at zero. */
        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
        if (!rt->u.dst.metrics[RTAX_MTU-1])
                rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
        if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
                rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
        /* From here on the route stores the dev/idev references. */
        rt->u.dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev->nd_net;

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* Error exit: undo every reference and allocation made above. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->u.dst);
        return err;
}
1257
1258 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1259 {
1260         int err;
1261         struct fib6_table *table;
1262
1263         if (rt == ip6_null_entry)
1264                 return -ENOENT;
1265
1266         table = rt->rt6i_table;
1267         write_lock_bh(&table->tb6_lock);
1268
1269         err = fib6_del(rt, info);
1270         dst_release(&rt->u.dst);
1271
1272         write_unlock_bh(&table->tb6_lock);
1273
1274         return err;
1275 }
1276
1277 int ip6_del_rt(struct rt6_info *rt)
1278 {
1279         struct nl_info info = {
1280                 .nl_net = rt->rt6i_dev->nd_net,
1281         };
1282         return __ip6_del_rt(rt, &info);
1283 }
1284
1285 static int ip6_route_del(struct fib6_config *cfg)
1286 {
1287         struct fib6_table *table;
1288         struct fib6_node *fn;
1289         struct rt6_info *rt;
1290         int err = -ESRCH;
1291
1292         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1293         if (table == NULL)
1294                 return err;
1295
1296         read_lock_bh(&table->tb6_lock);
1297
1298         fn = fib6_locate(&table->tb6_root,
1299                          &cfg->fc_dst, cfg->fc_dst_len,
1300                          &cfg->fc_src, cfg->fc_src_len);
1301
1302         if (fn) {
1303                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1304                         if (cfg->fc_ifindex &&
1305                             (rt->rt6i_dev == NULL ||
1306                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1307                                 continue;
1308                         if (cfg->fc_flags & RTF_GATEWAY &&
1309                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1310                                 continue;
1311                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1312                                 continue;
1313                         dst_hold(&rt->u.dst);
1314                         read_unlock_bh(&table->tb6_lock);
1315
1316                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1317                 }
1318         }
1319         read_unlock_bh(&table->tb6_lock);
1320
1321         return err;
1322 }
1323
1324 /*
1325  *      Handle redirects
1326  */
/*
 * Lookup key for redirect handling: a flowi followed by the address of
 * the gateway the redirect came from.  fl is the first member, and
 * ip6_route_redirect() / __ip6_route_redirect() cast between
 * struct ip6rd_flowi * and struct flowi * on that basis.
 */
struct ip6rd_flowi {
        struct flowi fl;
        struct in6_addr gateway;
};
1331
1332 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1333                                              struct flowi *fl,
1334                                              int flags)
1335 {
1336         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1337         struct rt6_info *rt;
1338         struct fib6_node *fn;
1339
1340         /*
1341          * Get the "current" route for this destination and
1342          * check if the redirect has come from approriate router.
1343          *
1344          * RFC 2461 specifies that redirects should only be
1345          * accepted if they come from the nexthop to the target.
1346          * Due to the way the routes are chosen, this notion
1347          * is a bit fuzzy and one might need to check all possible
1348          * routes.
1349          */
1350
1351         read_lock_bh(&table->tb6_lock);
1352         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1353 restart:
1354         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1355                 /*
1356                  * Current route is on-link; redirect is always invalid.
1357                  *
1358                  * Seems, previous statement is not true. It could
1359                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1360                  * But then router serving it might decide, that we should
1361                  * know truth 8)8) --ANK (980726).
1362                  */
1363                 if (rt6_check_expired(rt))
1364                         continue;
1365                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1366                         continue;
1367                 if (fl->oif != rt->rt6i_dev->ifindex)
1368                         continue;
1369                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1370                         continue;
1371                 break;
1372         }
1373
1374         if (!rt)
1375                 rt = ip6_null_entry;
1376         BACKTRACK(&fl->fl6_src);
1377 out:
1378         dst_hold(&rt->u.dst);
1379
1380         read_unlock_bh(&table->tb6_lock);
1381
1382         return rt;
1383 };
1384
1385 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1386                                            struct in6_addr *src,
1387                                            struct in6_addr *gateway,
1388                                            struct net_device *dev)
1389 {
1390         int flags = RT6_LOOKUP_F_HAS_SADDR;
1391         struct net *net = dev->nd_net;
1392         struct ip6rd_flowi rdfl = {
1393                 .fl = {
1394                         .oif = dev->ifindex,
1395                         .nl_u = {
1396                                 .ip6_u = {
1397                                         .daddr = *dest,
1398                                         .saddr = *src,
1399                                 },
1400                         },
1401                 },
1402                 .gateway = *gateway,
1403         };
1404
1405         if (rt6_need_strict(dest))
1406                 flags |= RT6_LOOKUP_F_IFACE;
1407
1408         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1409                                                    flags, __ip6_route_redirect);
1410 }
1411
1412 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1413                   struct in6_addr *saddr,
1414                   struct neighbour *neigh, u8 *lladdr, int on_link)
1415 {
1416         struct rt6_info *rt, *nrt = NULL;
1417         struct netevent_redirect netevent;
1418
1419         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1420
1421         if (rt == ip6_null_entry) {
1422                 if (net_ratelimit())
1423                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1424                                "for redirect target\n");
1425                 goto out;
1426         }
1427
1428         /*
1429          *      We have finally decided to accept it.
1430          */
1431
1432         neigh_update(neigh, lladdr, NUD_STALE,
1433                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1434                      NEIGH_UPDATE_F_OVERRIDE|
1435                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1436                                      NEIGH_UPDATE_F_ISROUTER))
1437                      );
1438
1439         /*
1440          * Redirect received -> path was valid.
1441          * Look, redirects are sent only in response to data packets,
1442          * so that this nexthop apparently is reachable. --ANK
1443          */
1444         dst_confirm(&rt->u.dst);
1445
1446         /* Duplicate redirect: silently ignore. */
1447         if (neigh == rt->u.dst.neighbour)
1448                 goto out;
1449
1450         nrt = ip6_rt_copy(rt);
1451         if (nrt == NULL)
1452                 goto out;
1453
1454         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1455         if (on_link)
1456                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1457
1458         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1459         nrt->rt6i_dst.plen = 128;
1460         nrt->u.dst.flags |= DST_HOST;
1461
1462         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1463         nrt->rt6i_nexthop = neigh_clone(neigh);
1464         /* Reset pmtu, it may be better */
1465         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1466         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(neigh->dev->nd_net,
1467                                                         dst_mtu(&nrt->u.dst));
1468
1469         if (ip6_ins_rt(nrt))
1470                 goto out;
1471
1472         netevent.old = &rt->u.dst;
1473         netevent.new = &nrt->u.dst;
1474         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1475
1476         if (rt->rt6i_flags&RTF_CACHE) {
1477                 ip6_del_rt(rt);
1478                 return;
1479         }
1480
1481 out:
1482         dst_release(&rt->u.dst);
1483         return;
1484 }
1485
1486 /*
1487  *      Handle ICMP "packet too big" messages
1488  *      i.e. Path MTU discovery
1489  */
1490
1491 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1492                         struct net_device *dev, u32 pmtu)
1493 {
1494         struct rt6_info *rt, *nrt;
1495         struct net *net = dev->nd_net;
1496         int allfrag = 0;
1497
1498         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1499         if (rt == NULL)
1500                 return;
1501
1502         if (pmtu >= dst_mtu(&rt->u.dst))
1503                 goto out;
1504
1505         if (pmtu < IPV6_MIN_MTU) {
1506                 /*
1507                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1508                  * MTU (1280) and a fragment header should always be included
1509                  * after a node receiving Too Big message reporting PMTU is
1510                  * less than the IPv6 Minimum Link MTU.
1511                  */
1512                 pmtu = IPV6_MIN_MTU;
1513                 allfrag = 1;
1514         }
1515
1516         /* New mtu received -> path was valid.
1517            They are sent only in response to data packets,
1518            so that this nexthop apparently is reachable. --ANK
1519          */
1520         dst_confirm(&rt->u.dst);
1521
1522         /* Host route. If it is static, it would be better
1523            not to override it, but add new one, so that
1524            when cache entry will expire old pmtu
1525            would return automatically.
1526          */
1527         if (rt->rt6i_flags & RTF_CACHE) {
1528                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1529                 if (allfrag)
1530                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1531                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1532                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1533                 goto out;
1534         }
1535
1536         /* Network route.
1537            Two cases are possible:
1538            1. It is connected route. Action: COW
1539            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1540          */
1541         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1542                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1543         else
1544                 nrt = rt6_alloc_clone(rt, daddr);
1545
1546         if (nrt) {
1547                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1548                 if (allfrag)
1549                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1550
1551                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1552                  * happened within 5 mins, the recommended timer is 10 mins.
1553                  * Here this route expiration time is set to ip6_rt_mtu_expires
1554                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1555                  * and detecting PMTU increase will be automatically happened.
1556                  */
1557                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1558                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1559
1560                 ip6_ins_rt(nrt);
1561         }
1562 out:
1563         dst_release(&rt->u.dst);
1564 }
1565
1566 /*
1567  *      Misc support functions
1568  */
1569
1570 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1571 {
1572         struct rt6_info *rt = ip6_dst_alloc();
1573
1574         if (rt) {
1575                 rt->u.dst.input = ort->u.dst.input;
1576                 rt->u.dst.output = ort->u.dst.output;
1577
1578                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1579                 rt->u.dst.error = ort->u.dst.error;
1580                 rt->u.dst.dev = ort->u.dst.dev;
1581                 if (rt->u.dst.dev)
1582                         dev_hold(rt->u.dst.dev);
1583                 rt->rt6i_idev = ort->rt6i_idev;
1584                 if (rt->rt6i_idev)
1585                         in6_dev_hold(rt->rt6i_idev);
1586                 rt->u.dst.lastuse = jiffies;
1587                 rt->rt6i_expires = 0;
1588
1589                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1590                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1591                 rt->rt6i_metric = 0;
1592
1593                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1594 #ifdef CONFIG_IPV6_SUBTREES
1595                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1596 #endif
1597                 rt->rt6i_table = ort->rt6i_table;
1598         }
1599         return rt;
1600 }
1601
1602 #ifdef CONFIG_IPV6_ROUTE_INFO
1603 static struct rt6_info *rt6_get_route_info(struct net *net,
1604                                            struct in6_addr *prefix, int prefixlen,
1605                                            struct in6_addr *gwaddr, int ifindex)
1606 {
1607         struct fib6_node *fn;
1608         struct rt6_info *rt = NULL;
1609         struct fib6_table *table;
1610
1611         table = fib6_get_table(net, RT6_TABLE_INFO);
1612         if (table == NULL)
1613                 return NULL;
1614
1615         write_lock_bh(&table->tb6_lock);
1616         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1617         if (!fn)
1618                 goto out;
1619
1620         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1621                 if (rt->rt6i_dev->ifindex != ifindex)
1622                         continue;
1623                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1624                         continue;
1625                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1626                         continue;
1627                 dst_hold(&rt->u.dst);
1628                 break;
1629         }
1630 out:
1631         write_unlock_bh(&table->tb6_lock);
1632         return rt;
1633 }
1634
1635 static struct rt6_info *rt6_add_route_info(struct net *net,
1636                                            struct in6_addr *prefix, int prefixlen,
1637                                            struct in6_addr *gwaddr, int ifindex,
1638                                            unsigned pref)
1639 {
1640         struct fib6_config cfg = {
1641                 .fc_table       = RT6_TABLE_INFO,
1642                 .fc_metric      = IP6_RT_PRIO_USER,
1643                 .fc_ifindex     = ifindex,
1644                 .fc_dst_len     = prefixlen,
1645                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1646                                   RTF_UP | RTF_PREF(pref),
1647                 .fc_nlinfo.pid = 0,
1648                 .fc_nlinfo.nlh = NULL,
1649                 .fc_nlinfo.nl_net = net,
1650         };
1651
1652         ipv6_addr_copy(&cfg.fc_dst, prefix);
1653         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1654
1655         /* We should treat it as a default route if prefix length is 0. */
1656         if (!prefixlen)
1657                 cfg.fc_flags |= RTF_DEFAULT;
1658
1659         ip6_route_add(&cfg);
1660
1661         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1662 }
1663 #endif
1664
1665 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1666 {
1667         struct rt6_info *rt;
1668         struct fib6_table *table;
1669
1670         table = fib6_get_table(dev->nd_net, RT6_TABLE_DFLT);
1671         if (table == NULL)
1672                 return NULL;
1673
1674         write_lock_bh(&table->tb6_lock);
1675         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1676                 if (dev == rt->rt6i_dev &&
1677                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1678                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1679                         break;
1680         }
1681         if (rt)
1682                 dst_hold(&rt->u.dst);
1683         write_unlock_bh(&table->tb6_lock);
1684         return rt;
1685 }
1686
1687 EXPORT_SYMBOL(rt6_get_dflt_router);
1688
1689 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1690                                      struct net_device *dev,
1691                                      unsigned int pref)
1692 {
1693         struct fib6_config cfg = {
1694                 .fc_table       = RT6_TABLE_DFLT,
1695                 .fc_metric      = IP6_RT_PRIO_USER,
1696                 .fc_ifindex     = dev->ifindex,
1697                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1698                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1699                 .fc_nlinfo.pid = 0,
1700                 .fc_nlinfo.nlh = NULL,
1701                 .fc_nlinfo.nl_net = dev->nd_net,
1702         };
1703
1704         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1705
1706         ip6_route_add(&cfg);
1707
1708         return rt6_get_dflt_router(gwaddr, dev);
1709 }
1710
1711 void rt6_purge_dflt_routers(struct net *net)
1712 {
1713         struct rt6_info *rt;
1714         struct fib6_table *table;
1715
1716         /* NOTE: Keep consistent with rt6_get_dflt_router */
1717         table = fib6_get_table(net, RT6_TABLE_DFLT);
1718         if (table == NULL)
1719                 return;
1720
1721 restart:
1722         read_lock_bh(&table->tb6_lock);
1723         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1724                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1725                         dst_hold(&rt->u.dst);
1726                         read_unlock_bh(&table->tb6_lock);
1727                         ip6_del_rt(rt);
1728                         goto restart;
1729                 }
1730         }
1731         read_unlock_bh(&table->tb6_lock);
1732 }
1733
1734 static void rtmsg_to_fib6_config(struct net *net,
1735                                  struct in6_rtmsg *rtmsg,
1736                                  struct fib6_config *cfg)
1737 {
1738         memset(cfg, 0, sizeof(*cfg));
1739
1740         cfg->fc_table = RT6_TABLE_MAIN;
1741         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1742         cfg->fc_metric = rtmsg->rtmsg_metric;
1743         cfg->fc_expires = rtmsg->rtmsg_info;
1744         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1745         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1746         cfg->fc_flags = rtmsg->rtmsg_flags;
1747
1748         cfg->fc_nlinfo.nl_net = net;
1749
1750         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1751         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1752         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1753 }
1754
1755 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1756 {
1757         struct fib6_config cfg;
1758         struct in6_rtmsg rtmsg;
1759         int err;
1760
1761         switch(cmd) {
1762         case SIOCADDRT:         /* Add a route */
1763         case SIOCDELRT:         /* Delete a route */
1764                 if (!capable(CAP_NET_ADMIN))
1765                         return -EPERM;
1766                 err = copy_from_user(&rtmsg, arg,
1767                                      sizeof(struct in6_rtmsg));
1768                 if (err)
1769                         return -EFAULT;
1770
1771                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1772
1773                 rtnl_lock();
1774                 switch (cmd) {
1775                 case SIOCADDRT:
1776                         err = ip6_route_add(&cfg);
1777                         break;
1778                 case SIOCDELRT:
1779                         err = ip6_route_del(&cfg);
1780                         break;
1781                 default:
1782                         err = -EINVAL;
1783                 }
1784                 rtnl_unlock();
1785
1786                 return err;
1787         }
1788
1789         return -EINVAL;
1790 }
1791
1792 /*
1793  *      Drop the packet on the floor
1794  */
1795
1796 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1797 {
1798         int type;
1799         switch (ipstats_mib_noroutes) {
1800         case IPSTATS_MIB_INNOROUTES:
1801                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1802                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1803                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1804                         break;
1805                 }
1806                 /* FALLTHROUGH */
1807         case IPSTATS_MIB_OUTNOROUTES:
1808                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1809                 break;
1810         }
1811         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1812         kfree_skb(skb);
1813         return 0;
1814 }
1815
1816 static int ip6_pkt_discard(struct sk_buff *skb)
1817 {
1818         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1819 }
1820
1821 static int ip6_pkt_discard_out(struct sk_buff *skb)
1822 {
1823         skb->dev = skb->dst->dev;
1824         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1825 }
1826
1827 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1828
1829 static int ip6_pkt_prohibit(struct sk_buff *skb)
1830 {
1831         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1832 }
1833
1834 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1835 {
1836         skb->dev = skb->dst->dev;
1837         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1838 }
1839
1840 #endif
1841
1842 /*
1843  *      Allocate a dst for local (unicast / anycast) address.
1844  */
1845
1846 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1847                                     const struct in6_addr *addr,
1848                                     int anycast)
1849 {
1850         struct net *net = idev->dev->nd_net;
1851         struct rt6_info *rt = ip6_dst_alloc();
1852
1853         if (rt == NULL)
1854                 return ERR_PTR(-ENOMEM);
1855
1856         dev_hold(net->loopback_dev);
1857         in6_dev_hold(idev);
1858
1859         rt->u.dst.flags = DST_HOST;
1860         rt->u.dst.input = ip6_input;
1861         rt->u.dst.output = ip6_output;
1862         rt->rt6i_dev = net->loopback_dev;
1863         rt->rt6i_idev = idev;
1864         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1865         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1866         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1867         rt->u.dst.obsolete = -1;
1868
1869         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1870         if (anycast)
1871                 rt->rt6i_flags |= RTF_ANYCAST;
1872         else
1873                 rt->rt6i_flags |= RTF_LOCAL;
1874         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1875         if (rt->rt6i_nexthop == NULL) {
1876                 dst_free(&rt->u.dst);
1877                 return ERR_PTR(-ENOMEM);
1878         }
1879
1880         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1881         rt->rt6i_dst.plen = 128;
1882         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1883
1884         atomic_set(&rt->u.dst.__refcnt, 1);
1885
1886         return rt;
1887 }
1888
1889 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1890 {
1891         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1892             rt != ip6_null_entry) {
1893                 RT6_TRACE("deleted by ifdown %p\n", rt);
1894                 return -1;
1895         }
1896         return 0;
1897 }
1898
1899 void rt6_ifdown(struct net *net, struct net_device *dev)
1900 {
1901         fib6_clean_all(net, fib6_ifdown, 0, dev);
1902 }
1903
/* Argument bundle passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device the MTU change applies to */
	unsigned int		mtu;	/* new MTU value */
};
1909
1910 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1911 {
1912         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1913         struct inet6_dev *idev;
1914         struct net *net = arg->dev->nd_net;
1915
1916         /* In IPv6 pmtu discovery is not optional,
1917            so that RTAX_MTU lock cannot disable it.
1918            We still use this lock to block changes
1919            caused by addrconf/ndisc.
1920         */
1921
1922         idev = __in6_dev_get(arg->dev);
1923         if (idev == NULL)
1924                 return 0;
1925
1926         /* For administrative MTU increase, there is no way to discover
1927            IPv6 PMTU increase, so PMTU increase should be updated here.
1928            Since RFC 1981 doesn't include administrative MTU increase
1929            update PMTU increase is a MUST. (i.e. jumbo frame)
1930          */
1931         /*
1932            If new MTU is less than route PMTU, this new MTU will be the
1933            lowest MTU in the path, update the route PMTU to reflect PMTU
1934            decreases; if new MTU is greater than route PMTU, and the
1935            old MTU is the lowest MTU in the path, update the route PMTU
1936            to reflect the increase. In this case if the other nodes' MTU
1937            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1938            PMTU discouvery.
1939          */
1940         if (rt->rt6i_dev == arg->dev &&
1941             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1942             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1943              (dst_mtu(&rt->u.dst) < arg->mtu &&
1944               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1945                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1946                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1947         }
1948         return 0;
1949 }
1950
1951 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1952 {
1953         struct rt6_mtu_change_arg arg = {
1954                 .dev = dev,
1955                 .mtu = mtu,
1956         };
1957
1958         fib6_clean_all(dev->nd_net, rt6_mtu_change_route, 0, &arg);
1959 }
1960
1961 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1962         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1963         [RTA_OIF]               = { .type = NLA_U32 },
1964         [RTA_IIF]               = { .type = NLA_U32 },
1965         [RTA_PRIORITY]          = { .type = NLA_U32 },
1966         [RTA_METRICS]           = { .type = NLA_NESTED },
1967 };
1968
1969 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1970                               struct fib6_config *cfg)
1971 {
1972         struct rtmsg *rtm;
1973         struct nlattr *tb[RTA_MAX+1];
1974         int err;
1975
1976         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1977         if (err < 0)
1978                 goto errout;
1979
1980         err = -EINVAL;
1981         rtm = nlmsg_data(nlh);
1982         memset(cfg, 0, sizeof(*cfg));
1983
1984         cfg->fc_table = rtm->rtm_table;
1985         cfg->fc_dst_len = rtm->rtm_dst_len;
1986         cfg->fc_src_len = rtm->rtm_src_len;
1987         cfg->fc_flags = RTF_UP;
1988         cfg->fc_protocol = rtm->rtm_protocol;
1989
1990         if (rtm->rtm_type == RTN_UNREACHABLE)
1991                 cfg->fc_flags |= RTF_REJECT;
1992
1993         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1994         cfg->fc_nlinfo.nlh = nlh;
1995         cfg->fc_nlinfo.nl_net = skb->sk->sk_net;
1996
1997         if (tb[RTA_GATEWAY]) {
1998                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1999                 cfg->fc_flags |= RTF_GATEWAY;
2000         }
2001
2002         if (tb[RTA_DST]) {
2003                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2004
2005                 if (nla_len(tb[RTA_DST]) < plen)
2006                         goto errout;
2007
2008                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2009         }
2010
2011         if (tb[RTA_SRC]) {
2012                 int plen = (rtm->rtm_src_len + 7) >> 3;
2013
2014                 if (nla_len(tb[RTA_SRC]) < plen)
2015                         goto errout;
2016
2017                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2018         }
2019
2020         if (tb[RTA_OIF])
2021                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2022
2023         if (tb[RTA_PRIORITY])
2024                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2025
2026         if (tb[RTA_METRICS]) {
2027                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2028                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2029         }
2030
2031         if (tb[RTA_TABLE])
2032                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2033
2034         err = 0;
2035 errout:
2036         return err;
2037 }
2038
2039 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2040 {
2041         struct fib6_config cfg;
2042         int err;
2043
2044         err = rtm_to_fib6_config(skb, nlh, &cfg);
2045         if (err < 0)
2046                 return err;
2047
2048         return ip6_route_del(&cfg);
2049 }
2050
2051 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2052 {
2053         struct fib6_config cfg;
2054         int err;
2055
2056         err = rtm_to_fib6_config(skb, nlh, &cfg);
2057         if (err < 0)
2058                 return err;
2059
2060         return ip6_route_add(&cfg);
2061 }
2062
2063 static inline size_t rt6_nlmsg_size(void)
2064 {
2065         return NLMSG_ALIGN(sizeof(struct rtmsg))
2066                + nla_total_size(16) /* RTA_SRC */
2067                + nla_total_size(16) /* RTA_DST */
2068                + nla_total_size(16) /* RTA_GATEWAY */
2069                + nla_total_size(16) /* RTA_PREFSRC */
2070                + nla_total_size(4) /* RTA_TABLE */
2071                + nla_total_size(4) /* RTA_IIF */
2072                + nla_total_size(4) /* RTA_OIF */
2073                + nla_total_size(4) /* RTA_PRIORITY */
2074                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2075                + nla_total_size(sizeof(struct rta_cacheinfo));
2076 }
2077
2078 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2079                          struct in6_addr *dst, struct in6_addr *src,
2080                          int iif, int type, u32 pid, u32 seq,
2081                          int prefix, unsigned int flags)
2082 {
2083         struct rtmsg *rtm;
2084         struct nlmsghdr *nlh;
2085         long expires;
2086         u32 table;
2087
2088         if (prefix) {   /* user wants prefix routes only */
2089                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2090                         /* success since this is not a prefix route */
2091                         return 1;
2092                 }
2093         }
2094
2095         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2096         if (nlh == NULL)
2097                 return -EMSGSIZE;
2098
2099         rtm = nlmsg_data(nlh);
2100         rtm->rtm_family = AF_INET6;
2101         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2102         rtm->rtm_src_len = rt->rt6i_src.plen;
2103         rtm->rtm_tos = 0;
2104         if (rt->rt6i_table)
2105                 table = rt->rt6i_table->tb6_id;
2106         else
2107                 table = RT6_TABLE_UNSPEC;
2108         rtm->rtm_table = table;
2109         NLA_PUT_U32(skb, RTA_TABLE, table);
2110         if (rt->rt6i_flags&RTF_REJECT)
2111                 rtm->rtm_type = RTN_UNREACHABLE;
2112         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2113                 rtm->rtm_type = RTN_LOCAL;
2114         else
2115                 rtm->rtm_type = RTN_UNICAST;
2116         rtm->rtm_flags = 0;
2117         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2118         rtm->rtm_protocol = rt->rt6i_protocol;
2119         if (rt->rt6i_flags&RTF_DYNAMIC)
2120                 rtm->rtm_protocol = RTPROT_REDIRECT;
2121         else if (rt->rt6i_flags & RTF_ADDRCONF)
2122                 rtm->rtm_protocol = RTPROT_KERNEL;
2123         else if (rt->rt6i_flags&RTF_DEFAULT)
2124                 rtm->rtm_protocol = RTPROT_RA;
2125
2126         if (rt->rt6i_flags&RTF_CACHE)
2127                 rtm->rtm_flags |= RTM_F_CLONED;
2128
2129         if (dst) {
2130                 NLA_PUT(skb, RTA_DST, 16, dst);
2131                 rtm->rtm_dst_len = 128;
2132         } else if (rtm->rtm_dst_len)
2133                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2134 #ifdef CONFIG_IPV6_SUBTREES
2135         if (src) {
2136                 NLA_PUT(skb, RTA_SRC, 16, src);
2137                 rtm->rtm_src_len = 128;
2138         } else if (rtm->rtm_src_len)
2139                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2140 #endif
2141         if (iif)
2142                 NLA_PUT_U32(skb, RTA_IIF, iif);
2143         else if (dst) {
2144                 struct in6_addr saddr_buf;
2145                 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2146                                        dst, &saddr_buf) == 0)
2147                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2148         }
2149
2150         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2151                 goto nla_put_failure;
2152
2153         if (rt->u.dst.neighbour)
2154                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2155
2156         if (rt->u.dst.dev)
2157                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2158
2159         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2160
2161         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2162         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2163                                expires, rt->u.dst.error) < 0)
2164                 goto nla_put_failure;
2165
2166         return nlmsg_end(skb, nlh);
2167
2168 nla_put_failure:
2169         nlmsg_cancel(skb, nlh);
2170         return -EMSGSIZE;
2171 }
2172
2173 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2174 {
2175         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2176         int prefix;
2177
2178         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2179                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2180                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2181         } else
2182                 prefix = 0;
2183
2184         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2185                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2186                      prefix, NLM_F_MULTI);
2187 }
2188
2189 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2190 {
2191         struct net *net = in_skb->sk->sk_net;
2192         struct nlattr *tb[RTA_MAX+1];
2193         struct rt6_info *rt;
2194         struct sk_buff *skb;
2195         struct rtmsg *rtm;
2196         struct flowi fl;
2197         int err, iif = 0;
2198
2199         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2200         if (err < 0)
2201                 goto errout;
2202
2203         err = -EINVAL;
2204         memset(&fl, 0, sizeof(fl));
2205
2206         if (tb[RTA_SRC]) {
2207                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2208                         goto errout;
2209
2210                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2211         }
2212
2213         if (tb[RTA_DST]) {
2214                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2215                         goto errout;
2216
2217                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2218         }
2219
2220         if (tb[RTA_IIF])
2221                 iif = nla_get_u32(tb[RTA_IIF]);
2222
2223         if (tb[RTA_OIF])
2224                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2225
2226         if (iif) {
2227                 struct net_device *dev;
2228                 dev = __dev_get_by_index(net, iif);
2229                 if (!dev) {
2230                         err = -ENODEV;
2231                         goto errout;
2232                 }
2233         }
2234
2235         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2236         if (skb == NULL) {
2237                 err = -ENOBUFS;
2238                 goto errout;
2239         }
2240
2241         /* Reserve room for dummy headers, this skb can pass
2242            through good chunk of routing engine.
2243          */
2244         skb_reset_mac_header(skb);
2245         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2246
2247         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2248         skb->dst = &rt->u.dst;
2249
2250         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2251                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2252                             nlh->nlmsg_seq, 0, 0);
2253         if (err < 0) {
2254                 kfree_skb(skb);
2255                 goto errout;
2256         }
2257
2258         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2259 errout:
2260         return err;
2261 }
2262
2263 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2264 {
2265         struct sk_buff *skb;
2266         struct net *net = info->nl_net;
2267         u32 seq;
2268         int err;
2269
2270         err = -ENOBUFS;
2271         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2272
2273         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2274         if (skb == NULL)
2275                 goto errout;
2276
2277         err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2278                                 event, info->pid, seq, 0, 0);
2279         if (err < 0) {
2280                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2281                 WARN_ON(err == -EMSGSIZE);
2282                 kfree_skb(skb);
2283                 goto errout;
2284         }
2285         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2286                           info->nlh, gfp_any());
2287 errout:
2288         if (err < 0)
2289                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2290 }
2291
2292 /*
2293  *      /proc
2294  */
2295
2296 #ifdef CONFIG_PROC_FS
2297
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/*
 * NOTE(review): neither RT6_INFO_LEN nor this struct is referenced by the
 * seq_file based /proc code below; they look like leftovers -- confirm
 * before removing.
 */
struct rt6_proc_arg {
	char	*buffer;
	int	offset;
	int	length;
	int	skip;
	int	len;
};
2308
2309 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2310 {
2311         struct seq_file *m = p_arg;
2312
2313         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2314                    rt->rt6i_dst.plen);
2315
2316 #ifdef CONFIG_IPV6_SUBTREES
2317         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2318                    rt->rt6i_src.plen);
2319 #else
2320         seq_puts(m, "00000000000000000000000000000000 00 ");
2321 #endif
2322
2323         if (rt->rt6i_nexthop) {
2324                 seq_printf(m, NIP6_SEQFMT,
2325                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2326         } else {
2327                 seq_puts(m, "00000000000000000000000000000000");
2328         }
2329         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2330                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2331                    rt->u.dst.__use, rt->rt6i_flags,
2332                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2333         return 0;
2334 }
2335
2336 static int ipv6_route_show(struct seq_file *m, void *v)
2337 {
2338         struct net *net = (struct net *)m->private;
2339         fib6_clean_all(net, rt6_info_route, 0, m);
2340         return 0;
2341 }
2342
2343 static int ipv6_route_open(struct inode *inode, struct file *file)
2344 {
2345         struct net *net = get_proc_net(inode);
2346         if (!net)
2347                 return -ENXIO;
2348         return single_open(file, ipv6_route_show, net);
2349 }
2350
2351 static int ipv6_route_release(struct inode *inode, struct file *file)
2352 {
2353         struct seq_file *seq = file->private_data;
2354         struct net *net = seq->private;
2355         put_net(net);
2356         return single_release(inode, file);
2357 }
2358
2359 static const struct file_operations ipv6_route_proc_fops = {
2360         .owner          = THIS_MODULE,
2361         .open           = ipv6_route_open,
2362         .read           = seq_read,
2363         .llseek         = seq_lseek,
2364         .release        = ipv6_route_release,
2365 };
2366
2367 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2368 {
2369         struct net *net = (struct net *)seq->private;
2370         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2371                    net->ipv6.rt6_stats->fib_nodes,
2372                    net->ipv6.rt6_stats->fib_route_nodes,
2373                    net->ipv6.rt6_stats->fib_rt_alloc,
2374                    net->ipv6.rt6_stats->fib_rt_entries,
2375                    net->ipv6.rt6_stats->fib_rt_cache,
2376                    atomic_read(&ip6_dst_ops.entries),
2377                    net->ipv6.rt6_stats->fib_discarded_routes);
2378
2379         return 0;
2380 }
2381
2382 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2383 {
2384         struct net *net = get_proc_net(inode);
2385         return single_open(file, rt6_stats_seq_show, net);
2386 }
2387
2388 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2389 {
2390         struct seq_file *seq = file->private_data;
2391         struct net *net = (struct net *)seq->private;
2392         put_net(net);
2393         return single_release(inode, file);
2394 }
2395
2396 static const struct file_operations rt6_stats_seq_fops = {
2397         .owner   = THIS_MODULE,
2398         .open    = rt6_stats_seq_open,
2399         .read    = seq_read,
2400         .llseek  = seq_lseek,
2401         .release = rt6_stats_seq_release,
2402 };
2403 #endif  /* CONFIG_PROC_FS */
2404
2405 #ifdef CONFIG_SYSCTL
2406
2407 static
2408 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2409                               void __user *buffer, size_t *lenp, loff_t *ppos)
2410 {
2411         struct net *net = current->nsproxy->net_ns;
2412         int delay = net->ipv6.sysctl.flush_delay;
2413         if (write) {
2414                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2415                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2416                 return 0;
2417         } else
2418                 return -EINVAL;
2419 }
2420
2421 ctl_table ipv6_route_table_template[] = {
2422         {
2423                 .procname       =       "flush",
2424                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2425                 .maxlen         =       sizeof(int),
2426                 .mode           =       0200,
2427                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2428         },
2429         {
2430                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2431                 .procname       =       "gc_thresh",
2432                 .data           =       &ip6_dst_ops.gc_thresh,
2433                 .maxlen         =       sizeof(int),
2434                 .mode           =       0644,
2435                 .proc_handler   =       &proc_dointvec,
2436         },
2437         {
2438                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2439                 .procname       =       "max_size",
2440                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2441                 .maxlen         =       sizeof(int),
2442                 .mode           =       0644,
2443                 .proc_handler   =       &proc_dointvec,
2444         },
2445         {
2446                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2447                 .procname       =       "gc_min_interval",
2448                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2449                 .maxlen         =       sizeof(int),
2450                 .mode           =       0644,
2451                 .proc_handler   =       &proc_dointvec_jiffies,
2452                 .strategy       =       &sysctl_jiffies,
2453         },
2454         {
2455                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2456                 .procname       =       "gc_timeout",
2457                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2458                 .maxlen         =       sizeof(int),
2459                 .mode           =       0644,
2460                 .proc_handler   =       &proc_dointvec_jiffies,
2461                 .strategy       =       &sysctl_jiffies,
2462         },
2463         {
2464                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2465                 .procname       =       "gc_interval",
2466                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2467                 .maxlen         =       sizeof(int),
2468                 .mode           =       0644,
2469                 .proc_handler   =       &proc_dointvec_jiffies,
2470                 .strategy       =       &sysctl_jiffies,
2471         },
2472         {
2473                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2474                 .procname       =       "gc_elasticity",
2475                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2476                 .maxlen         =       sizeof(int),
2477                 .mode           =       0644,
2478                 .proc_handler   =       &proc_dointvec_jiffies,
2479                 .strategy       =       &sysctl_jiffies,
2480         },
2481         {
2482                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2483                 .procname       =       "mtu_expires",
2484                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2485                 .maxlen         =       sizeof(int),
2486                 .mode           =       0644,
2487                 .proc_handler   =       &proc_dointvec_jiffies,
2488                 .strategy       =       &sysctl_jiffies,
2489         },
2490         {
2491                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2492                 .procname       =       "min_adv_mss",
2493                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2494                 .maxlen         =       sizeof(int),
2495                 .mode           =       0644,
2496                 .proc_handler   =       &proc_dointvec_jiffies,
2497                 .strategy       =       &sysctl_jiffies,
2498         },
2499         {
2500                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2501                 .procname       =       "gc_min_interval_ms",
2502                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2503                 .maxlen         =       sizeof(int),
2504                 .mode           =       0644,
2505                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2506                 .strategy       =       &sysctl_ms_jiffies,
2507         },
2508         { .ctl_name = 0 }
2509 };
2510
2511 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2512 {
2513         struct ctl_table *table;
2514
2515         table = kmemdup(ipv6_route_table_template,
2516                         sizeof(ipv6_route_table_template),
2517                         GFP_KERNEL);
2518
2519         if (table) {
2520                 table[0].data = &net->ipv6.sysctl.flush_delay;
2521                 /* table[1].data will be handled when we have
2522                    routes per namespace */
2523                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2524                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2525                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2526                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2527                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2528                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2529                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2530         }
2531
2532         return table;
2533 }
2534 #endif
2535
/*
 *	Per-namespace init: with CONFIG_PROC_FS, create the "ipv6_route" and
 *	"rt6_stats" proc entries for @net. The creation results are not
 *	checked; the function always returns 0.
 */
static int ip6_route_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	/* route dump */
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	/* routing statistics */
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	return 0;
}
2544
2545 static void ip6_route_net_exit(struct net *net)
2546 {
2547 #ifdef CONFIG_PROC_FS
2548         proc_net_remove(net, "ipv6_route");
2549         proc_net_remove(net, "rt6_stats");
2550 #endif
2551         rt6_ifdown(net, NULL);
2552 }
2553
2554 static struct pernet_operations ip6_route_net_ops = {
2555         .init = ip6_route_net_init,
2556         .exit = ip6_route_net_exit,
2557 };
2558
2559 int __init ip6_route_init(void)
2560 {
2561         int ret;
2562
2563         ip6_dst_ops.kmem_cachep =
2564                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2565                                   SLAB_HWCACHE_ALIGN, NULL);
2566         if (!ip6_dst_ops.kmem_cachep)
2567                 return -ENOMEM;
2568
2569         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2570
2571         ret = -ENOMEM;
2572         ip6_null_entry = kmemdup(&ip6_null_entry_template,
2573                                  sizeof(*ip6_null_entry), GFP_KERNEL);
2574         if (!ip6_null_entry)
2575                 goto out_kmem_cache;
2576         ip6_null_entry->u.dst.path = (struct dst_entry *)ip6_null_entry;
2577
2578 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2579         ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2580                                      sizeof(*ip6_prohibit_entry), GFP_KERNEL);
2581         if (!ip6_prohibit_entry)
2582                 goto out_ip6_null_entry;
2583         ip6_prohibit_entry->u.dst.path = (struct dst_entry *)ip6_prohibit_entry;
2584
2585         ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2586                                      sizeof(*ip6_blk_hole_entry), GFP_KERNEL);
2587         if (!ip6_blk_hole_entry)
2588                 goto out_ip6_prohibit_entry;
2589         ip6_blk_hole_entry->u.dst.path = (struct dst_entry *)ip6_blk_hole_entry;
2590 #endif
2591
2592         ret = fib6_init();
2593         if (ret)
2594                 goto out_ip6_blk_hole_entry;
2595
2596         ret = xfrm6_init();
2597         if (ret)
2598                 goto out_fib6_init;
2599
2600         ret = fib6_rules_init();
2601         if (ret)
2602                 goto xfrm6_init;
2603
2604         ret = -ENOBUFS;
2605         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2606             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2607             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2608                 goto fib6_rules_init;
2609
2610         ret = register_pernet_subsys(&ip6_route_net_ops);
2611         if (ret)
2612                 goto fib6_rules_init;
2613 out:
2614         return ret;
2615
2616 fib6_rules_init:
2617         fib6_rules_cleanup();
2618 xfrm6_init:
2619         xfrm6_fini();
2620 out_fib6_init:
2621         fib6_gc_cleanup();
2622 out_ip6_blk_hole_entry:
2623 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2624         kfree(ip6_blk_hole_entry);
2625 out_ip6_prohibit_entry:
2626         kfree(ip6_prohibit_entry);
2627 out_ip6_null_entry:
2628 #endif
2629         kfree(ip6_null_entry);
2630 out_kmem_cache:
2631         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2632         goto out;
2633 }
2634
2635 void ip6_route_cleanup(void)
2636 {
2637         unregister_pernet_subsys(&ip6_route_net_ops);
2638         fib6_rules_cleanup();
2639         xfrm6_fini();
2640         fib6_gc_cleanup();
2641         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2642
2643         kfree(ip6_null_entry);
2644 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2645         kfree(ip6_prohibit_entry);
2646         kfree(ip6_blk_hole_entry);
2647 #endif
2648 }