ipv4: Make inet_sock.h independent of route.h
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  Raising RT6_DEBUG to 3 (or more) turns
 * RDBG() and RT6_TRACE() into real printk() calls; at lower levels both
 * expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones routes that already have a next hop
 * through rt6_alloc_clone(); when zero it hands such routes back unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0
75
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void             ip6_dst_destroy(struct dst_entry *);
80 static void             ip6_dst_ifdown(struct dst_entry *,
81                                        struct net_device *dev, int how);
82 static int               ip6_dst_gc(struct dst_ops *ops);
83
84 static int              ip6_pkt_discard(struct sk_buff *skb);
85 static int              ip6_pkt_discard_out(struct sk_buff *skb);
86 static void             ip6_link_failure(struct sk_buff *skb);
87 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91                                            struct in6_addr *prefix, int prefixlen,
92                                            struct in6_addr *gwaddr, int ifindex,
93                                            unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95                                            struct in6_addr *prefix, int prefixlen,
96                                            struct in6_addr *gwaddr, int ifindex);
97 #endif
98
99 static struct dst_ops ip6_dst_ops_template = {
100         .family                 =       AF_INET6,
101         .protocol               =       __constant_htons(ETH_P_IPV6),
102         .gc                     =       ip6_dst_gc,
103         .gc_thresh              =       1024,
104         .check                  =       ip6_dst_check,
105         .destroy                =       ip6_dst_destroy,
106         .ifdown                 =       ip6_dst_ifdown,
107         .negative_advice        =       ip6_negative_advice,
108         .link_failure           =       ip6_link_failure,
109         .update_pmtu            =       ip6_rt_update_pmtu,
110         .local_out              =       __ip6_local_out,
111         .entry_size             =       sizeof(struct rt6_info),
112         .entries                =       ATOMIC_INIT(0),
113 };
114
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118
119 static struct dst_ops ip6_dst_blackhole_ops = {
120         .family                 =       AF_INET6,
121         .protocol               =       __constant_htons(ETH_P_IPV6),
122         .destroy                =       ip6_dst_destroy,
123         .check                  =       ip6_dst_check,
124         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
125         .entry_size             =       sizeof(struct rt6_info),
126         .entries                =       ATOMIC_INIT(0),
127 };
128
129 static struct rt6_info ip6_null_entry_template = {
130         .u = {
131                 .dst = {
132                         .__refcnt       = ATOMIC_INIT(1),
133                         .__use          = 1,
134                         .obsolete       = -1,
135                         .error          = -ENETUNREACH,
136                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
137                         .input          = ip6_pkt_discard,
138                         .output         = ip6_pkt_discard_out,
139                 }
140         },
141         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
142         .rt6i_metric    = ~(u32) 0,
143         .rt6i_ref       = ATOMIC_INIT(1),
144 };
145
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers for the prohibit template, defined later in this file. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: same shape as the null entry, but the
 * dst error is -EACCES and packets go through ip6_pkt_prohibit() /
 * ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .rt6i_ref       = ATOMIC_INIT(1),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .u = {
                .dst = {
                        .input          = ip6_pkt_prohibit,
                        .output         = ip6_pkt_prohibit_out,
                        .error          = -EACCES,
                        .obsolete       = -1,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                }
        },
};

/*
 * Template for the "blackhole" route: the dst error is -EINVAL and both
 * directions are handled by dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .rt6i_ref       = ATOMIC_INIT(1),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .u = {
                .dst = {
                        .input          = dst_discard,
                        .output         = dst_discard,
                        .error          = -EINVAL,
                        .obsolete       = -1,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                }
        },
};

#endif
186
/*
 * Allocate a dst entry from @ops via dst_alloc() and hand it back typed as
 * a struct rt6_info.  The result of dst_alloc() is passed through as is.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct dst_entry *entry = dst_alloc(ops);

        return (struct rt6_info *)entry;
}
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197
198         if (idev != NULL) {
199                 rt->rt6i_idev = NULL;
200                 in6_dev_put(idev);
201         }
202 }
203
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205                            int how)
206 {
207         struct rt6_info *rt = (struct rt6_info *)dst;
208         struct inet6_dev *idev = rt->rt6i_idev;
209         struct net_device *loopback_dev =
210                 dev_net(dev)->loopback_dev;
211
212         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
213                 struct inet6_dev *loopback_idev =
214                         in6_dev_get(loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static inline struct rt6_info *rt6_device_match(struct net *net,
239                                                     struct rt6_info *rt,
240                                                     struct in6_addr *saddr,
241                                                     int oif,
242                                                     int flags)
243 {
244         struct rt6_info *local = NULL;
245         struct rt6_info *sprt;
246
247         if (!oif && ipv6_addr_any(saddr))
248                 goto out;
249
250         for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
251                 struct net_device *dev = sprt->rt6i_dev;
252
253                 if (oif) {
254                         if (dev->ifindex == oif)
255                                 return sprt;
256                         if (dev->flags & IFF_LOOPBACK) {
257                                 if (sprt->rt6i_idev == NULL ||
258                                     sprt->rt6i_idev->dev->ifindex != oif) {
259                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
260                                                 continue;
261                                         if (local && (!oif ||
262                                                       local->rt6i_idev->dev->ifindex == oif))
263                                                 continue;
264                                 }
265                                 local = sprt;
266                         }
267                 } else {
268                         if (ipv6_chk_addr(net, saddr, dev,
269                                           flags & RT6_LOOKUP_F_IFACE))
270                                 return sprt;
271                 }
272         }
273
274         if (oif) {
275                 if (local)
276                         return local;
277
278                 if (flags & RT6_LOOKUP_F_IFACE)
279                         return net->ipv6.ip6_null_entry;
280         }
281 out:
282         return rt;
283 }
284
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation for the next
 * hop of @rt when that neighbour is not in a NUD_VALID state and the last
 * update is older than the interface's rtr_probe_interval.
 *
 * Does nothing when @rt is NULL, has no next hop, or the neighbour is
 * already valid.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        /*
         * neigh->updated is modified below, so the write side of the lock
         * is required.  With only the read side held, several CPUs could
         * pass the rate-limit test at once and each send a probe.
         */
        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                /* solicit the neighbour via its solicited-node multicast address */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                write_unlock_bh(&neigh->lock);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
320
321 /*
322  * Default Router Selection (RFC 2461 6.3.6)
323  */
324 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
325 {
326         struct net_device *dev = rt->rt6i_dev;
327         if (!oif || dev->ifindex == oif)
328                 return 2;
329         if ((dev->flags & IFF_LOOPBACK) &&
330             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
331                 return 1;
332         return 0;
333 }
334
335 static inline int rt6_check_neigh(struct rt6_info *rt)
336 {
337         struct neighbour *neigh = rt->rt6i_nexthop;
338         int m;
339         if (rt->rt6i_flags & RTF_NONEXTHOP ||
340             !(rt->rt6i_flags & RTF_GATEWAY))
341                 m = 1;
342         else if (neigh) {
343                 read_lock_bh(&neigh->lock);
344                 if (neigh->nud_state & NUD_VALID)
345                         m = 2;
346 #ifdef CONFIG_IPV6_ROUTER_PREF
347                 else if (neigh->nud_state & NUD_FAILED)
348                         m = 0;
349 #endif
350                 else
351                         m = 1;
352                 read_unlock_bh(&neigh->lock);
353         } else
354                 m = 0;
355         return m;
356 }
357
358 static int rt6_score_route(struct rt6_info *rt, int oif,
359                            int strict)
360 {
361         int m, n;
362
363         m = rt6_check_dev(rt, oif);
364         if (!m && (strict & RT6_LOOKUP_F_IFACE))
365                 return -1;
366 #ifdef CONFIG_IPV6_ROUTER_PREF
367         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
368 #endif
369         n = rt6_check_neigh(rt);
370         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
371                 return -1;
372         return m;
373 }
374
375 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
376                                    int *mpri, struct rt6_info *match)
377 {
378         int m;
379
380         if (rt6_check_expired(rt))
381                 goto out;
382
383         m = rt6_score_route(rt, oif, strict);
384         if (m < 0)
385                 goto out;
386
387         if (m > *mpri) {
388                 if (strict & RT6_LOOKUP_F_REACHABLE)
389                         rt6_probe(match);
390                 *mpri = m;
391                 match = rt;
392         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
393                 rt6_probe(rt);
394         }
395
396 out:
397         return match;
398 }
399
400 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
401                                      struct rt6_info *rr_head,
402                                      u32 metric, int oif, int strict)
403 {
404         struct rt6_info *rt, *match;
405         int mpri = -1;
406
407         match = NULL;
408         for (rt = rr_head; rt && rt->rt6i_metric == metric;
409              rt = rt->u.dst.rt6_next)
410                 match = find_match(rt, oif, strict, &mpri, match);
411         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
412              rt = rt->u.dst.rt6_next)
413                 match = find_match(rt, oif, strict, &mpri, match);
414
415         return match;
416 }
417
418 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
419 {
420         struct rt6_info *match, *rt0;
421         struct net *net;
422
423         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
424                   __func__, fn->leaf, oif);
425
426         rt0 = fn->rr_ptr;
427         if (!rt0)
428                 fn->rr_ptr = rt0 = fn->leaf;
429
430         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
431
432         if (!match &&
433             (strict & RT6_LOOKUP_F_REACHABLE)) {
434                 struct rt6_info *next = rt0->u.dst.rt6_next;
435
436                 /* no entries matched; do round-robin */
437                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
438                         next = fn->leaf;
439
440                 if (next != rt0)
441                         fn->rr_ptr = next;
442         }
443
444         RT6_TRACE("%s() => %p\n",
445                   __func__, match);
446
447         net = dev_net(rt0->rt6i_dev);
448         return (match ? match : net->ipv6.ip6_null_entry);
449 }
450
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Apply a route information option to the routing table.
 *
 * @dev:    device the option is associated with
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * The matching route-info route is looked up; it is deleted when the
 * lifetime is zero, created when it is missing and the lifetime is
 * non-zero, and otherwise has its preference refreshed.  A surviving route
 * gets its expiry set (or cleared for an infinite lifetime).
 *
 * Returns 0, or -EINVAL when the option fails the size or prefix-length
 * sanity checks.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr masked_prefix;
        struct in6_addr *prefix;
        struct rt6_info *rt;
        unsigned long lifetime;
        unsigned int pref;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length != 3) {
                /* mask the prefix into a local buffer; this function is safe */
                ipv6_addr_prefix(&masked_prefix,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &masked_prefix;
        } else {
                prefix = (struct in6_addr *)rinfo->prefix;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* a zero lifetime withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->u.dst);

        return 0;
}
#endif
524
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed lookup.
 *
 * Relies on the expanding function providing local variables `rt` and `fn`
 * and the labels `out` and `restart`.  When `rt` is the namespace's null
 * entry, the macro walks towards the root (descending into a parent's
 * source subtree where there is one) until it reaches a node with
 * RTN_RTINFO, then jumps to `restart`; on hitting the tree root it jumps
 * to `out`.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *parent; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        parent = fn->parent; \
                        if (FIB6_SUBTREE(parent) && FIB6_SUBTREE(parent) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(parent), NULL, saddr); \
                        else \
                                fn = parent; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
542
543 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
544                                              struct fib6_table *table,
545                                              struct flowi *fl, int flags)
546 {
547         struct fib6_node *fn;
548         struct rt6_info *rt;
549
550         read_lock_bh(&table->tb6_lock);
551         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
552 restart:
553         rt = fn->leaf;
554         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
555         BACKTRACK(net, &fl->fl6_src);
556 out:
557         dst_use(&rt->u.dst, jiffies);
558         read_unlock_bh(&table->tb6_lock);
559         return rt;
560
561 }
562
563 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
564                             const struct in6_addr *saddr, int oif, int strict)
565 {
566         struct flowi fl = {
567                 .oif = oif,
568                 .nl_u = {
569                         .ip6_u = {
570                                 .daddr = *daddr,
571                         },
572                 },
573         };
574         struct dst_entry *dst;
575         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
576
577         if (saddr) {
578                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
579                 flags |= RT6_LOOKUP_F_HAS_SADDR;
580         }
581
582         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
583         if (dst->error == 0)
584                 return (struct rt6_info *) dst;
585
586         dst_release(dst);
587
588         return NULL;
589 }
590
591 EXPORT_SYMBOL(rt6_lookup);
592
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In either case, unless the caller holds its own
   reference, the route may be destroyed.
 */
598
599 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
600 {
601         int err;
602         struct fib6_table *table;
603
604         table = rt->rt6i_table;
605         write_lock_bh(&table->tb6_lock);
606         err = fib6_add(&table->tb6_root, rt, info);
607         write_unlock_bh(&table->tb6_lock);
608
609         return err;
610 }
611
612 int ip6_ins_rt(struct rt6_info *rt)
613 {
614         struct nl_info info = {
615                 .nl_net = dev_net(rt->rt6i_dev),
616         };
617         return __ip6_ins_rt(rt, &info);
618 }
619
620 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
621                                       struct in6_addr *saddr)
622 {
623         struct rt6_info *rt;
624
625         /*
626          *      Clone the route.
627          */
628
629         rt = ip6_rt_copy(ort);
630
631         if (rt) {
632                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
633                         if (rt->rt6i_dst.plen != 128 &&
634                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
635                                 rt->rt6i_flags |= RTF_ANYCAST;
636                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
637                 }
638
639                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
640                 rt->rt6i_dst.plen = 128;
641                 rt->rt6i_flags |= RTF_CACHE;
642                 rt->u.dst.flags |= DST_HOST;
643
644 #ifdef CONFIG_IPV6_SUBTREES
645                 if (rt->rt6i_src.plen && saddr) {
646                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
647                         rt->rt6i_src.plen = 128;
648                 }
649 #endif
650
651                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
652
653         }
654
655         return rt;
656 }
657
658 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
659 {
660         struct rt6_info *rt = ip6_rt_copy(ort);
661         if (rt) {
662                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
663                 rt->rt6i_dst.plen = 128;
664                 rt->rt6i_flags |= RTF_CACHE;
665                 rt->u.dst.flags |= DST_HOST;
666                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
667         }
668         return rt;
669 }
670
671 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
672                                       struct flowi *fl, int flags)
673 {
674         struct fib6_node *fn;
675         struct rt6_info *rt, *nrt;
676         int strict = 0;
677         int attempts = 3;
678         int err;
679         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
680
681         strict |= flags & RT6_LOOKUP_F_IFACE;
682
683 relookup:
684         read_lock_bh(&table->tb6_lock);
685
686 restart_2:
687         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
688
689 restart:
690         rt = rt6_select(fn, oif, strict | reachable);
691
692         BACKTRACK(net, &fl->fl6_src);
693         if (rt == net->ipv6.ip6_null_entry ||
694             rt->rt6i_flags & RTF_CACHE)
695                 goto out;
696
697         dst_hold(&rt->u.dst);
698         read_unlock_bh(&table->tb6_lock);
699
700         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
701                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
702         else {
703 #if CLONE_OFFLINK_ROUTE
704                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
705 #else
706                 goto out2;
707 #endif
708         }
709
710         dst_release(&rt->u.dst);
711         rt = nrt ? : net->ipv6.ip6_null_entry;
712
713         dst_hold(&rt->u.dst);
714         if (nrt) {
715                 err = ip6_ins_rt(nrt);
716                 if (!err)
717                         goto out2;
718         }
719
720         if (--attempts <= 0)
721                 goto out2;
722
723         /*
724          * Race condition! In the gap, when table->tb6_lock was
725          * released someone could insert this route.  Relookup.
726          */
727         dst_release(&rt->u.dst);
728         goto relookup;
729
730 out:
731         if (reachable) {
732                 reachable = 0;
733                 goto restart_2;
734         }
735         dst_hold(&rt->u.dst);
736         read_unlock_bh(&table->tb6_lock);
737 out2:
738         rt->u.dst.lastuse = jiffies;
739         rt->u.dst.__use++;
740
741         return rt;
742 }
743
744 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
745                                             struct flowi *fl, int flags)
746 {
747         return ip6_pol_route(net, table, fl->iif, fl, flags);
748 }
749
750 void ip6_route_input(struct sk_buff *skb)
751 {
752         struct ipv6hdr *iph = ipv6_hdr(skb);
753         struct net *net = dev_net(skb->dev);
754         int flags = RT6_LOOKUP_F_HAS_SADDR;
755         struct flowi fl = {
756                 .iif = skb->dev->ifindex,
757                 .nl_u = {
758                         .ip6_u = {
759                                 .daddr = iph->daddr,
760                                 .saddr = iph->saddr,
761                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
762                         },
763                 },
764                 .mark = skb->mark,
765                 .proto = iph->nexthdr,
766         };
767
768         if (rt6_need_strict(&iph->daddr))
769                 flags |= RT6_LOOKUP_F_IFACE;
770
771         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
772 }
773
774 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
775                                              struct flowi *fl, int flags)
776 {
777         return ip6_pol_route(net, table, fl->oif, fl, flags);
778 }
779
780 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
781                                     struct flowi *fl)
782 {
783         int flags = 0;
784
785         if (rt6_need_strict(&fl->fl6_dst))
786                 flags |= RT6_LOOKUP_F_IFACE;
787
788         if (!ipv6_addr_any(&fl->fl6_src))
789                 flags |= RT6_LOOKUP_F_HAS_SADDR;
790         else if (sk) {
791                 unsigned int prefs = inet6_sk(sk)->srcprefs;
792                 if (prefs & IPV6_PREFER_SRC_TMP)
793                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
794                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
795                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
796                 if (prefs & IPV6_PREFER_SRC_COA)
797                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
798         }
799
800         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
801 }
802
803 EXPORT_SYMBOL(ip6_route_output);
804
805 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
806 {
807         struct rt6_info *ort = (struct rt6_info *) *dstp;
808         struct rt6_info *rt = (struct rt6_info *)
809                 dst_alloc(&ip6_dst_blackhole_ops);
810         struct dst_entry *new = NULL;
811
812         if (rt) {
813                 new = &rt->u.dst;
814
815                 atomic_set(&new->__refcnt, 1);
816                 new->__use = 1;
817                 new->input = dst_discard;
818                 new->output = dst_discard;
819
820                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
821                 new->dev = ort->u.dst.dev;
822                 if (new->dev)
823                         dev_hold(new->dev);
824                 rt->rt6i_idev = ort->rt6i_idev;
825                 if (rt->rt6i_idev)
826                         in6_dev_hold(rt->rt6i_idev);
827                 rt->rt6i_expires = 0;
828
829                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
830                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
831                 rt->rt6i_metric = 0;
832
833                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
834 #ifdef CONFIG_IPV6_SUBTREES
835                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
836 #endif
837
838                 dst_free(new);
839         }
840
841         dst_release(*dstp);
842         *dstp = new;
843         return (new ? 0 : -ENOMEM);
844 }
845 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
846
847 /*
848  *      Destination cache support functions
849  */
850
851 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
852 {
853         struct rt6_info *rt;
854
855         rt = (struct rt6_info *) dst;
856
857         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
858                 return dst;
859
860         return NULL;
861 }
862
863 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
864 {
865         struct rt6_info *rt = (struct rt6_info *) dst;
866
867         if (rt) {
868                 if (rt->rt6i_flags & RTF_CACHE)
869                         ip6_del_rt(rt);
870                 else
871                         dst_release(dst);
872         }
873         return NULL;
874 }
875
876 static void ip6_link_failure(struct sk_buff *skb)
877 {
878         struct rt6_info *rt;
879
880         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
881
882         rt = (struct rt6_info *) skb->dst;
883         if (rt) {
884                 if (rt->rt6i_flags&RTF_CACHE) {
885                         dst_set_expires(&rt->u.dst, 0);
886                         rt->rt6i_flags |= RTF_EXPIRES;
887                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
888                         rt->rt6i_node->fn_sernum = -1;
889         }
890 }
891
892 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
893 {
894         struct rt6_info *rt6 = (struct rt6_info*)dst;
895
896         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
897                 rt6->rt6i_flags |= RTF_MODIFIED;
898                 if (mtu < IPV6_MIN_MTU) {
899                         mtu = IPV6_MIN_MTU;
900                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
901                 }
902                 dst->metrics[RTAX_MTU-1] = mtu;
903                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
904         }
905 }
906
907 static int ipv6_get_mtu(struct net_device *dev);
908
909 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
910 {
911         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
912
913         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
914                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
915
916         /*
917          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
918          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
919          * IPV6_MAXPLEN is also valid and means: "any MSS,
920          * rely only on pmtu discovery"
921          */
922         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
923                 mtu = IPV6_MAXPLEN;
924         return mtu;
925 }
926
/*
 * Head of a singly linked list (chained through dst->next) holding the
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
927 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (with bottom halves disabled) around every access to icmp6_dst_gc_list. */
928 static DEFINE_SPINLOCK(icmp6_dst_lock);
929
930 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
931                                   struct neighbour *neigh,
932                                   const struct in6_addr *addr)
933 {
934         struct rt6_info *rt;
935         struct inet6_dev *idev = in6_dev_get(dev);
936         struct net *net = dev_net(dev);
937
938         if (unlikely(idev == NULL))
939                 return NULL;
940
941         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
942         if (unlikely(rt == NULL)) {
943                 in6_dev_put(idev);
944                 goto out;
945         }
946
947         dev_hold(dev);
948         if (neigh)
949                 neigh_hold(neigh);
950         else
951                 neigh = ndisc_get_neigh(dev, addr);
952
953         rt->rt6i_dev      = dev;
954         rt->rt6i_idev     = idev;
955         rt->rt6i_nexthop  = neigh;
956         atomic_set(&rt->u.dst.__refcnt, 1);
957         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
958         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
959         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
960         rt->u.dst.output  = ip6_output;
961
962 #if 0   /* there's no chance to use these for ndisc */
963         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
964                                 ? DST_HOST
965                                 : 0;
966         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
967         rt->rt6i_dst.plen = 128;
968 #endif
969
970         spin_lock_bh(&icmp6_dst_lock);
971         rt->u.dst.next = icmp6_dst_gc_list;
972         icmp6_dst_gc_list = &rt->u.dst;
973         spin_unlock_bh(&icmp6_dst_lock);
974
975         fib6_force_start_gc(net);
976
977 out:
978         return &rt->u.dst;
979 }
980
981 int icmp6_dst_gc(void)
982 {
983         struct dst_entry *dst, *next, **pprev;
984         int more = 0;
985
986         next = NULL;
987
988         spin_lock_bh(&icmp6_dst_lock);
989         pprev = &icmp6_dst_gc_list;
990
991         while ((dst = *pprev) != NULL) {
992                 if (!atomic_read(&dst->__refcnt)) {
993                         *pprev = dst->next;
994                         dst_free(dst);
995                 } else {
996                         pprev = &dst->next;
997                         ++more;
998                 }
999         }
1000
1001         spin_unlock_bh(&icmp6_dst_lock);
1002
1003         return more;
1004 }
1005
1006 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1007                             void *arg)
1008 {
1009         struct dst_entry *dst, **pprev;
1010
1011         spin_lock_bh(&icmp6_dst_lock);
1012         pprev = &icmp6_dst_gc_list;
1013         while ((dst = *pprev) != NULL) {
1014                 struct rt6_info *rt = (struct rt6_info *) dst;
1015                 if (func(rt, arg)) {
1016                         *pprev = dst->next;
1017                         dst_free(dst);
1018                 } else {
1019                         pprev = &dst->next;
1020                 }
1021         }
1022         spin_unlock_bh(&icmp6_dst_lock);
1023 }
1024
1025 static int ip6_dst_gc(struct dst_ops *ops)
1026 {
1027         unsigned long now = jiffies;
1028         struct net *net = ops->dst_net;
1029         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1030         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1031         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1032         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1033         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1034
1035         if (time_after(rt_last_gc + rt_min_interval, now) &&
1036             atomic_read(&ops->entries) <= rt_max_size)
1037                 goto out;
1038
1039         net->ipv6.ip6_rt_gc_expire++;
1040         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1041         net->ipv6.ip6_rt_last_gc = now;
1042         if (atomic_read(&ops->entries) < ops->gc_thresh)
1043                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1044 out:
1045         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1046         return (atomic_read(&ops->entries) > rt_max_size);
1047 }
1048
1049 /* Clean host part of a prefix. Not necessary in radix tree,
1050    but results in cleaner routing tables.
1051
1052    Remove it only when all the things will work!
1053  */
1054
1055 static int ipv6_get_mtu(struct net_device *dev)
1056 {
1057         int mtu = IPV6_MIN_MTU;
1058         struct inet6_dev *idev;
1059
1060         idev = in6_dev_get(dev);
1061         if (idev) {
1062                 mtu = idev->cnf.mtu6;
1063                 in6_dev_put(idev);
1064         }
1065         return mtu;
1066 }
1067
1068 int ip6_dst_hoplimit(struct dst_entry *dst)
1069 {
1070         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1071         if (hoplimit < 0) {
1072                 struct net_device *dev = dst->dev;
1073                 struct inet6_dev *idev = in6_dev_get(dev);
1074                 if (idev) {
1075                         hoplimit = idev->cnf.hop_limit;
1076                         in6_dev_put(idev);
1077                 } else
1078                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1079         }
1080         return hoplimit;
1081 }
1082
1083 /*
1084  *
1085  */
1086
/*
 * Build a route from @cfg and insert it into the FIB.
 *
 * @cfg: route description.  It is modified in place: a zero fc_metric
 *       becomes IP6_RT_PRIO_USER, an RTPROT_UNSPEC fc_protocol becomes
 *       RTPROT_BOOT, and fc_nlinfo.nl_net is overwritten with the
 *       namespace of the device finally chosen.
 *
 * Returns the result of __ip6_ins_rt() on the insertion path, or a
 * negative errno set by one of the checks below:
 *   -EINVAL       prefix length above 128, source prefix given without
 *                 CONFIG_IPV6_SUBTREES, non-unicast gateway, gateway
 *                 route without a device or on a loopback device, or a
 *                 metric attribute type above RTAX_MAX
 *   -ENODEV       fc_ifindex does not resolve to a device with an
 *                 inet6_dev, or no device could be determined
 *   -ENOBUFS      the routing table could not be obtained
 *   -ENOMEM       route allocation failed
 *   -EHOSTUNREACH a non-link-local gateway has no route, resolves via
 *                 a different device, or is itself behind a gateway
 *
 * On the insertion path the device and inet6_dev references taken here
 * are stored in the route and not dropped by this function; on every
 * error path they are released at the "out" label together with the
 * partially built route.
 */
1087 int ip6_route_add(struct fib6_config *cfg)
1088 {
1089         int err;
1090         struct net *net = cfg->fc_nlinfo.nl_net;
1091         struct rt6_info *rt = NULL;
1092         struct net_device *dev = NULL;
1093         struct inet6_dev *idev = NULL;
1094         struct fib6_table *table;
1095         int addr_type;
1096
1097         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1098                 return -EINVAL;
1099 #ifndef CONFIG_IPV6_SUBTREES
1100         if (cfg->fc_src_len)
1101                 return -EINVAL;
1102 #endif
             /* An explicit interface index pins both dev and idev (one reference each). */
1103         if (cfg->fc_ifindex) {
1104                 err = -ENODEV;
1105                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1106                 if (!dev)
1107                         goto out;
1108                 idev = in6_dev_get(dev);
1109                 if (!idev)
1110                         goto out;
1111         }
1112
1113         if (cfg->fc_metric == 0)
1114                 cfg->fc_metric = IP6_RT_PRIO_USER;
1115
1116         table = fib6_new_table(net, cfg->fc_table);
1117         if (table == NULL) {
1118                 err = -ENOBUFS;
1119                 goto out;
1120         }
1121
1122         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1123
1124         if (rt == NULL) {
1125                 err = -ENOMEM;
1126                 goto out;
1127         }
1128
1129         rt->u.dst.obsolete = -1;
             /* fc_expires is converted from clock_t units to jiffies; 0 when RTF_EXPIRES is not requested. */
1130         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1131                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1132                                 0;
1133
1134         if (cfg->fc_protocol == RTPROT_UNSPEC)
1135                 cfg->fc_protocol = RTPROT_BOOT;
1136         rt->rt6i_protocol = cfg->fc_protocol;
1137
1138         addr_type = ipv6_addr_type(&cfg->fc_dst);
1139
             /* Input handler depends on whether the destination is multicast. */
1140         if (addr_type & IPV6_ADDR_MULTICAST)
1141                 rt->u.dst.input = ip6_mc_input;
1142         else
1143                 rt->u.dst.input = ip6_forward;
1144
1145         rt->u.dst.output = ip6_output;
1146
             /* Store only the prefix bits of the destination; a /128 is marked as a host route. */
1147         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1148         rt->rt6i_dst.plen = cfg->fc_dst_len;
1149         if (rt->rt6i_dst.plen == 128)
1150                rt->u.dst.flags = DST_HOST;
1151
1152 #ifdef CONFIG_IPV6_SUBTREES
1153         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1154         rt->rt6i_src.plen = cfg->fc_src_len;
1155 #endif
1156
1157         rt->rt6i_metric = cfg->fc_metric;
1158
1159         /* We cannot add true routes via loopback here,
1160            they would result in kernel looping; promote them to reject routes
1161          */
1162         if ((cfg->fc_flags & RTF_REJECT) ||
1163             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1164                 /* hold loopback dev/idev if we haven't done so. */
1165                 if (dev != net->loopback_dev) {
                             /* Swap any previously pinned device for the namespace loopback device. */
1166                         if (dev) {
1167                                 dev_put(dev);
1168                                 in6_dev_put(idev);
1169                         }
1170                         dev = net->loopback_dev;
1171                         dev_hold(dev);
1172                         idev = in6_dev_get(dev);
1173                         if (!idev) {
1174                                 err = -ENODEV;
1175                                 goto out;
1176                         }
1177                 }
1178                 rt->u.dst.output = ip6_pkt_discard_out;
1179                 rt->u.dst.input = ip6_pkt_discard;
1180                 rt->u.dst.error = -ENETUNREACH;
                     /* The caller's fc_flags are not copied for reject routes. */
1181                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1182                 goto install_route;
1183         }
1184
1185         if (cfg->fc_flags & RTF_GATEWAY) {
1186                 struct in6_addr *gw_addr;
1187                 int gwa_type;
1188
1189                 gw_addr = &cfg->fc_gateway;
1190                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1191                 gwa_type = ipv6_addr_type(gw_addr);
1192
1193                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1194                         struct rt6_info *grt;
1195
1196                         /* IPv6 strictly inhibits using not link-local
1197                            addresses as nexthop address.
1198                            Otherwise, router will not able to send redirects.
1199                            It is very good, but in some (rare!) circumstances
1200                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1201                            some exceptions. --ANK
1202                          */
1203                         err = -EINVAL;
1204                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1205                                 goto out;
1206
1207                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1208
1209                         err = -EHOSTUNREACH;
1210                         if (grt == NULL)
1211                                 goto out;
1212                         if (dev) {
                                     /* The gateway must be reached through the requested device. */
1213                                 if (dev != grt->rt6i_dev) {
1214                                         dst_release(&grt->u.dst);
1215                                         goto out;
1216                                 }
1217                         } else {
                                     /* No device requested: adopt the one the gateway route uses. */
1218                                 dev = grt->rt6i_dev;
1219                                 idev = grt->rt6i_idev;
1220                                 dev_hold(dev);
1221                                 in6_dev_hold(grt->rt6i_idev);
1222                         }
                             /* Accept only if the gateway's own route is not via another gateway; err stays -EHOSTUNREACH otherwise. */
1223                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1224                                 err = 0;
1225                         dst_release(&grt->u.dst);
1226
1227                         if (err)
1228                                 goto out;
1229                 }
1230                 err = -EINVAL;
1231                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1232                         goto out;
1233         }
1234
1235         err = -ENODEV;
1236         if (dev == NULL)
1237                 goto out;
1238
1239         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                     /* The lookup returns an ERR_PTR on failure; clear the field so it is never treated as a neighbour. */
1240                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1241                 if (IS_ERR(rt->rt6i_nexthop)) {
1242                         err = PTR_ERR(rt->rt6i_nexthop);
1243                         rt->rt6i_nexthop = NULL;
1244                         goto out;
1245                 }
1246         }
1247
1248         rt->rt6i_flags = cfg->fc_flags;
1249
1250 install_route:
             /* Copy caller-supplied metrics; attribute type N lands in metrics[N - 1], type 0 is ignored. */
1251         if (cfg->fc_mx) {
1252                 struct nlattr *nla;
1253                 int remaining;
1254
1255                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1256                         int type = nla_type(nla);
1257
1258                         if (type) {
1259                                 if (type > RTAX_MAX) {
1260                                         err = -EINVAL;
1261                                         goto out;
1262                                 }
1263
1264                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1265                         }
1266                 }
1267         }
1268
             /* Fill in defaults for metrics left at zero; a hop limit of -1 is resolved per device by ip6_dst_hoplimit(). */
1269         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1270                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1271         if (!dst_mtu(&rt->u.dst))
1272                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1273         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1274                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
             /* The dev/idev references held so far are stored in the route here. */
1275         rt->u.dst.dev = dev;
1276         rt->rt6i_idev = idev;
1277         rt->rt6i_table = table;
1278
1279         cfg->fc_nlinfo.nl_net = dev_net(dev);
1280
1281         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1282
1283 out:
             /* Error unwind: drop whatever references were taken and free the unfinished route. */
1284         if (dev)
1285                 dev_put(dev);
1286         if (idev)
1287                 in6_dev_put(idev);
1288         if (rt)
1289                 dst_free(&rt->u.dst);
1290         return err;
1291 }
1292
1293 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1294 {
1295         int err;
1296         struct fib6_table *table;
1297         struct net *net = dev_net(rt->rt6i_dev);
1298
1299         if (rt == net->ipv6.ip6_null_entry)
1300                 return -ENOENT;
1301
1302         table = rt->rt6i_table;
1303         write_lock_bh(&table->tb6_lock);
1304
1305         err = fib6_del(rt, info);
1306         dst_release(&rt->u.dst);
1307
1308         write_unlock_bh(&table->tb6_lock);
1309
1310         return err;
1311 }
1312
1313 int ip6_del_rt(struct rt6_info *rt)
1314 {
1315         struct nl_info info = {
1316                 .nl_net = dev_net(rt->rt6i_dev),
1317         };
1318         return __ip6_del_rt(rt, &info);
1319 }
1320
1321 static int ip6_route_del(struct fib6_config *cfg)
1322 {
1323         struct fib6_table *table;
1324         struct fib6_node *fn;
1325         struct rt6_info *rt;
1326         int err = -ESRCH;
1327
1328         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1329         if (table == NULL)
1330                 return err;
1331
1332         read_lock_bh(&table->tb6_lock);
1333
1334         fn = fib6_locate(&table->tb6_root,
1335                          &cfg->fc_dst, cfg->fc_dst_len,
1336                          &cfg->fc_src, cfg->fc_src_len);
1337
1338         if (fn) {
1339                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1340                         if (cfg->fc_ifindex &&
1341                             (rt->rt6i_dev == NULL ||
1342                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1343                                 continue;
1344                         if (cfg->fc_flags & RTF_GATEWAY &&
1345                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1346                                 continue;
1347                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1348                                 continue;
1349                         dst_hold(&rt->u.dst);
1350                         read_unlock_bh(&table->tb6_lock);
1351
1352                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1353                 }
1354         }
1355         read_unlock_bh(&table->tb6_lock);
1356
1357         return err;
1358 }
1359
1360 /*
1361  *      Handle redirects
1362  */
/*
 * Flow key used for redirect lookups: a regular flow extended with the
 * address of the router that sent the redirect.
 *
 * ip6_route_redirect() passes a pointer to this structure cast to
 * struct flowi *, and __ip6_route_redirect() casts it back, so "fl"
 * has to remain the first member.
 */
1363 struct ip6rd_flowi {
1364         struct flowi fl;              /* the ordinary lookup key */
1365         struct in6_addr gateway;      /* compared against each candidate's rt6i_gateway */
1366 };
1367
1368 static struct rt6_info *__ip6_route_redirect(struct net *net,
1369                                              struct fib6_table *table,
1370                                              struct flowi *fl,
1371                                              int flags)
1372 {
1373         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1374         struct rt6_info *rt;
1375         struct fib6_node *fn;
1376
1377         /*
1378          * Get the "current" route for this destination and
1379          * check if the redirect has come from approriate router.
1380          *
1381          * RFC 2461 specifies that redirects should only be
1382          * accepted if they come from the nexthop to the target.
1383          * Due to the way the routes are chosen, this notion
1384          * is a bit fuzzy and one might need to check all possible
1385          * routes.
1386          */
1387
1388         read_lock_bh(&table->tb6_lock);
1389         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1390 restart:
1391         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1392                 /*
1393                  * Current route is on-link; redirect is always invalid.
1394                  *
1395                  * Seems, previous statement is not true. It could
1396                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1397                  * But then router serving it might decide, that we should
1398                  * know truth 8)8) --ANK (980726).
1399                  */
1400                 if (rt6_check_expired(rt))
1401                         continue;
1402                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1403                         continue;
1404                 if (fl->oif != rt->rt6i_dev->ifindex)
1405                         continue;
1406                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1407                         continue;
1408                 break;
1409         }
1410
1411         if (!rt)
1412                 rt = net->ipv6.ip6_null_entry;
1413         BACKTRACK(net, &fl->fl6_src);
1414 out:
1415         dst_hold(&rt->u.dst);
1416
1417         read_unlock_bh(&table->tb6_lock);
1418
1419         return rt;
1420 };
1421
1422 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1423                                            struct in6_addr *src,
1424                                            struct in6_addr *gateway,
1425                                            struct net_device *dev)
1426 {
1427         int flags = RT6_LOOKUP_F_HAS_SADDR;
1428         struct net *net = dev_net(dev);
1429         struct ip6rd_flowi rdfl = {
1430                 .fl = {
1431                         .oif = dev->ifindex,
1432                         .nl_u = {
1433                                 .ip6_u = {
1434                                         .daddr = *dest,
1435                                         .saddr = *src,
1436                                 },
1437                         },
1438                 },
1439                 .gateway = *gateway,
1440         };
1441
1442         if (rt6_need_strict(dest))
1443                 flags |= RT6_LOOKUP_F_IFACE;
1444
1445         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1446                                                    flags, __ip6_route_redirect);
1447 }
1448
1449 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1450                   struct in6_addr *saddr,
1451                   struct neighbour *neigh, u8 *lladdr, int on_link)
1452 {
1453         struct rt6_info *rt, *nrt = NULL;
1454         struct netevent_redirect netevent;
1455         struct net *net = dev_net(neigh->dev);
1456
1457         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1458
1459         if (rt == net->ipv6.ip6_null_entry) {
1460                 if (net_ratelimit())
1461                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1462                                "for redirect target\n");
1463                 goto out;
1464         }
1465
1466         /*
1467          *      We have finally decided to accept it.
1468          */
1469
1470         neigh_update(neigh, lladdr, NUD_STALE,
1471                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1472                      NEIGH_UPDATE_F_OVERRIDE|
1473                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1474                                      NEIGH_UPDATE_F_ISROUTER))
1475                      );
1476
1477         /*
1478          * Redirect received -> path was valid.
1479          * Look, redirects are sent only in response to data packets,
1480          * so that this nexthop apparently is reachable. --ANK
1481          */
1482         dst_confirm(&rt->u.dst);
1483
1484         /* Duplicate redirect: silently ignore. */
1485         if (neigh == rt->u.dst.neighbour)
1486                 goto out;
1487
1488         nrt = ip6_rt_copy(rt);
1489         if (nrt == NULL)
1490                 goto out;
1491
1492         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1493         if (on_link)
1494                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1495
1496         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1497         nrt->rt6i_dst.plen = 128;
1498         nrt->u.dst.flags |= DST_HOST;
1499
1500         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1501         nrt->rt6i_nexthop = neigh_clone(neigh);
1502         /* Reset pmtu, it may be better */
1503         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1504         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1505                                                         dst_mtu(&nrt->u.dst));
1506
1507         if (ip6_ins_rt(nrt))
1508                 goto out;
1509
1510         netevent.old = &rt->u.dst;
1511         netevent.new = &nrt->u.dst;
1512         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1513
1514         if (rt->rt6i_flags&RTF_CACHE) {
1515                 ip6_del_rt(rt);
1516                 return;
1517         }
1518
1519 out:
1520         dst_release(&rt->u.dst);
1521         return;
1522 }
1523
1524 /*
1525  *      Handle ICMP "packet too big" messages
1526  *      i.e. Path MTU discovery
1527  */
1528
1529 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1530                         struct net_device *dev, u32 pmtu)
1531 {
1532         struct rt6_info *rt, *nrt;
1533         struct net *net = dev_net(dev);
1534         int allfrag = 0;
1535
1536         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1537         if (rt == NULL)
1538                 return;
1539
1540         if (pmtu >= dst_mtu(&rt->u.dst))
1541                 goto out;
1542
1543         if (pmtu < IPV6_MIN_MTU) {
1544                 /*
1545                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1546                  * MTU (1280) and a fragment header should always be included
1547                  * after a node receiving Too Big message reporting PMTU is
1548                  * less than the IPv6 Minimum Link MTU.
1549                  */
1550                 pmtu = IPV6_MIN_MTU;
1551                 allfrag = 1;
1552         }
1553
1554         /* New mtu received -> path was valid.
1555            They are sent only in response to data packets,
1556            so that this nexthop apparently is reachable. --ANK
1557          */
1558         dst_confirm(&rt->u.dst);
1559
1560         /* Host route. If it is static, it would be better
1561            not to override it, but add new one, so that
1562            when cache entry will expire old pmtu
1563            would return automatically.
1564          */
1565         if (rt->rt6i_flags & RTF_CACHE) {
1566                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1567                 if (allfrag)
1568                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1569                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1570                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1571                 goto out;
1572         }
1573
1574         /* Network route.
1575            Two cases are possible:
1576            1. It is connected route. Action: COW
1577            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1578          */
1579         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1580                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1581         else
1582                 nrt = rt6_alloc_clone(rt, daddr);
1583
1584         if (nrt) {
1585                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1586                 if (allfrag)
1587                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1588
1589                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1590                  * happened within 5 mins, the recommended timer is 10 mins.
1591                  * Here this route expiration time is set to ip6_rt_mtu_expires
1592                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1593                  * and detecting PMTU increase will be automatically happened.
1594                  */
1595                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1596                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1597
1598                 ip6_ins_rt(nrt);
1599         }
1600 out:
1601         dst_release(&rt->u.dst);
1602 }
1603
1604 /*
1605  *      Misc support functions
1606  */
1607
1608 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1609 {
1610         struct net *net = dev_net(ort->rt6i_dev);
1611         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1612
1613         if (rt) {
1614                 rt->u.dst.input = ort->u.dst.input;
1615                 rt->u.dst.output = ort->u.dst.output;
1616
1617                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1618                 rt->u.dst.error = ort->u.dst.error;
1619                 rt->u.dst.dev = ort->u.dst.dev;
1620                 if (rt->u.dst.dev)
1621                         dev_hold(rt->u.dst.dev);
1622                 rt->rt6i_idev = ort->rt6i_idev;
1623                 if (rt->rt6i_idev)
1624                         in6_dev_hold(rt->rt6i_idev);
1625                 rt->u.dst.lastuse = jiffies;
1626                 rt->rt6i_expires = 0;
1627
1628                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1629                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1630                 rt->rt6i_metric = 0;
1631
1632                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1633 #ifdef CONFIG_IPV6_SUBTREES
1634                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1635 #endif
1636                 rt->rt6i_table = ort->rt6i_table;
1637         }
1638         return rt;
1639 }
1640
1641 #ifdef CONFIG_IPV6_ROUTE_INFO
1642 static struct rt6_info *rt6_get_route_info(struct net *net,
1643                                            struct in6_addr *prefix, int prefixlen,
1644                                            struct in6_addr *gwaddr, int ifindex)
1645 {
1646         struct fib6_node *fn;
1647         struct rt6_info *rt = NULL;
1648         struct fib6_table *table;
1649
1650         table = fib6_get_table(net, RT6_TABLE_INFO);
1651         if (table == NULL)
1652                 return NULL;
1653
1654         write_lock_bh(&table->tb6_lock);
1655         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1656         if (!fn)
1657                 goto out;
1658
1659         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1660                 if (rt->rt6i_dev->ifindex != ifindex)
1661                         continue;
1662                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1663                         continue;
1664                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1665                         continue;
1666                 dst_hold(&rt->u.dst);
1667                 break;
1668         }
1669 out:
1670         write_unlock_bh(&table->tb6_lock);
1671         return rt;
1672 }
1673
1674 static struct rt6_info *rt6_add_route_info(struct net *net,
1675                                            struct in6_addr *prefix, int prefixlen,
1676                                            struct in6_addr *gwaddr, int ifindex,
1677                                            unsigned pref)
1678 {
1679         struct fib6_config cfg = {
1680                 .fc_table       = RT6_TABLE_INFO,
1681                 .fc_metric      = IP6_RT_PRIO_USER,
1682                 .fc_ifindex     = ifindex,
1683                 .fc_dst_len     = prefixlen,
1684                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1685                                   RTF_UP | RTF_PREF(pref),
1686                 .fc_nlinfo.pid = 0,
1687                 .fc_nlinfo.nlh = NULL,
1688                 .fc_nlinfo.nl_net = net,
1689         };
1690
1691         ipv6_addr_copy(&cfg.fc_dst, prefix);
1692         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1693
1694         /* We should treat it as a default route if prefix length is 0. */
1695         if (!prefixlen)
1696                 cfg.fc_flags |= RTF_DEFAULT;
1697
1698         ip6_route_add(&cfg);
1699
1700         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1701 }
1702 #endif
1703
1704 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1705 {
1706         struct rt6_info *rt;
1707         struct fib6_table *table;
1708
1709         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1710         if (table == NULL)
1711                 return NULL;
1712
1713         write_lock_bh(&table->tb6_lock);
1714         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1715                 if (dev == rt->rt6i_dev &&
1716                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1717                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1718                         break;
1719         }
1720         if (rt)
1721                 dst_hold(&rt->u.dst);
1722         write_unlock_bh(&table->tb6_lock);
1723         return rt;
1724 }
1725
1726 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1727                                      struct net_device *dev,
1728                                      unsigned int pref)
1729 {
1730         struct fib6_config cfg = {
1731                 .fc_table       = RT6_TABLE_DFLT,
1732                 .fc_metric      = IP6_RT_PRIO_USER,
1733                 .fc_ifindex     = dev->ifindex,
1734                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1735                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1736                 .fc_nlinfo.pid = 0,
1737                 .fc_nlinfo.nlh = NULL,
1738                 .fc_nlinfo.nl_net = dev_net(dev),
1739         };
1740
1741         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1742
1743         ip6_route_add(&cfg);
1744
1745         return rt6_get_dflt_router(gwaddr, dev);
1746 }
1747
1748 void rt6_purge_dflt_routers(struct net *net)
1749 {
1750         struct rt6_info *rt;
1751         struct fib6_table *table;
1752
1753         /* NOTE: Keep consistent with rt6_get_dflt_router */
1754         table = fib6_get_table(net, RT6_TABLE_DFLT);
1755         if (table == NULL)
1756                 return;
1757
1758 restart:
1759         read_lock_bh(&table->tb6_lock);
1760         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1761                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1762                         dst_hold(&rt->u.dst);
1763                         read_unlock_bh(&table->tb6_lock);
1764                         ip6_del_rt(rt);
1765                         goto restart;
1766                 }
1767         }
1768         read_unlock_bh(&table->tb6_lock);
1769 }
1770
1771 static void rtmsg_to_fib6_config(struct net *net,
1772                                  struct in6_rtmsg *rtmsg,
1773                                  struct fib6_config *cfg)
1774 {
1775         memset(cfg, 0, sizeof(*cfg));
1776
1777         cfg->fc_table = RT6_TABLE_MAIN;
1778         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1779         cfg->fc_metric = rtmsg->rtmsg_metric;
1780         cfg->fc_expires = rtmsg->rtmsg_info;
1781         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1782         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1783         cfg->fc_flags = rtmsg->rtmsg_flags;
1784
1785         cfg->fc_nlinfo.nl_net = net;
1786
1787         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1788         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1789         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1790 }
1791
1792 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1793 {
1794         struct fib6_config cfg;
1795         struct in6_rtmsg rtmsg;
1796         int err;
1797
1798         switch(cmd) {
1799         case SIOCADDRT:         /* Add a route */
1800         case SIOCDELRT:         /* Delete a route */
1801                 if (!capable(CAP_NET_ADMIN))
1802                         return -EPERM;
1803                 err = copy_from_user(&rtmsg, arg,
1804                                      sizeof(struct in6_rtmsg));
1805                 if (err)
1806                         return -EFAULT;
1807
1808                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1809
1810                 rtnl_lock();
1811                 switch (cmd) {
1812                 case SIOCADDRT:
1813                         err = ip6_route_add(&cfg);
1814                         break;
1815                 case SIOCDELRT:
1816                         err = ip6_route_del(&cfg);
1817                         break;
1818                 default:
1819                         err = -EINVAL;
1820                 }
1821                 rtnl_unlock();
1822
1823                 return err;
1824         }
1825
1826         return -EINVAL;
1827 }
1828
1829 /*
1830  *      Drop the packet on the floor
1831  */
1832
1833 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1834 {
1835         int type;
1836         switch (ipstats_mib_noroutes) {
1837         case IPSTATS_MIB_INNOROUTES:
1838                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1839                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1840                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1841                         break;
1842                 }
1843                 /* FALLTHROUGH */
1844         case IPSTATS_MIB_OUTNOROUTES:
1845                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1846                 break;
1847         }
1848         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1849         kfree_skb(skb);
1850         return 0;
1851 }
1852
1853 static int ip6_pkt_discard(struct sk_buff *skb)
1854 {
1855         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1856 }
1857
1858 static int ip6_pkt_discard_out(struct sk_buff *skb)
1859 {
1860         skb->dev = skb->dst->dev;
1861         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1862 }
1863
1864 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1865
1866 static int ip6_pkt_prohibit(struct sk_buff *skb)
1867 {
1868         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1869 }
1870
1871 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1872 {
1873         skb->dev = skb->dst->dev;
1874         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1875 }
1876
1877 #endif
1878
1879 /*
1880  *      Allocate a dst for local (unicast / anycast) address.
1881  */
1882
1883 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1884                                     const struct in6_addr *addr,
1885                                     int anycast)
1886 {
1887         struct net *net = dev_net(idev->dev);
1888         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1889
1890         if (rt == NULL)
1891                 return ERR_PTR(-ENOMEM);
1892
1893         dev_hold(net->loopback_dev);
1894         in6_dev_hold(idev);
1895
1896         rt->u.dst.flags = DST_HOST;
1897         rt->u.dst.input = ip6_input;
1898         rt->u.dst.output = ip6_output;
1899         rt->rt6i_dev = net->loopback_dev;
1900         rt->rt6i_idev = idev;
1901         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1902         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1903         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1904         rt->u.dst.obsolete = -1;
1905
1906         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1907         if (anycast)
1908                 rt->rt6i_flags |= RTF_ANYCAST;
1909         else
1910                 rt->rt6i_flags |= RTF_LOCAL;
1911         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1912         if (rt->rt6i_nexthop == NULL) {
1913                 dst_free(&rt->u.dst);
1914                 return ERR_PTR(-ENOMEM);
1915         }
1916
1917         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1918         rt->rt6i_dst.plen = 128;
1919         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1920
1921         atomic_set(&rt->u.dst.__refcnt, 1);
1922
1923         return rt;
1924 }
1925
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	/* device whose routes are removed; NULL matches every device */
	struct net_device *dev;
	/* namespace whose ip6_null_entry must never be removed */
	struct net *net;
};
1930
1931 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1932 {
1933         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1934         struct net *net = ((struct arg_dev_net *)arg)->net;
1935
1936         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1937             rt != net->ipv6.ip6_null_entry) {
1938                 RT6_TRACE("deleted by ifdown %p\n", rt);
1939                 return -1;
1940         }
1941         return 0;
1942 }
1943
1944 void rt6_ifdown(struct net *net, struct net_device *dev)
1945 {
1946         struct arg_dev_net adn = {
1947                 .dev = dev,
1948                 .net = net,
1949         };
1950
1951         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1952         icmp6_clean_all(fib6_ifdown, &adn);
1953 }
1954
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* MTU value to propagate to its routes */
};
1960
1961 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1962 {
1963         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1964         struct inet6_dev *idev;
1965         struct net *net = dev_net(arg->dev);
1966
1967         /* In IPv6 pmtu discovery is not optional,
1968            so that RTAX_MTU lock cannot disable it.
1969            We still use this lock to block changes
1970            caused by addrconf/ndisc.
1971         */
1972
1973         idev = __in6_dev_get(arg->dev);
1974         if (idev == NULL)
1975                 return 0;
1976
1977         /* For administrative MTU increase, there is no way to discover
1978            IPv6 PMTU increase, so PMTU increase should be updated here.
1979            Since RFC 1981 doesn't include administrative MTU increase
1980            update PMTU increase is a MUST. (i.e. jumbo frame)
1981          */
1982         /*
1983            If new MTU is less than route PMTU, this new MTU will be the
1984            lowest MTU in the path, update the route PMTU to reflect PMTU
1985            decreases; if new MTU is greater than route PMTU, and the
1986            old MTU is the lowest MTU in the path, update the route PMTU
1987            to reflect the increase. In this case if the other nodes' MTU
1988            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1989            PMTU discouvery.
1990          */
1991         if (rt->rt6i_dev == arg->dev &&
1992             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1993             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1994              (dst_mtu(&rt->u.dst) < arg->mtu &&
1995               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1996                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1997                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1998         }
1999         return 0;
2000 }
2001
2002 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2003 {
2004         struct rt6_mtu_change_arg arg = {
2005                 .dev = dev,
2006                 .mtu = mtu,
2007         };
2008
2009         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2010 }
2011
2012 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2013         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2014         [RTA_OIF]               = { .type = NLA_U32 },
2015         [RTA_IIF]               = { .type = NLA_U32 },
2016         [RTA_PRIORITY]          = { .type = NLA_U32 },
2017         [RTA_METRICS]           = { .type = NLA_NESTED },
2018 };
2019
2020 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2021                               struct fib6_config *cfg)
2022 {
2023         struct rtmsg *rtm;
2024         struct nlattr *tb[RTA_MAX+1];
2025         int err;
2026
2027         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2028         if (err < 0)
2029                 goto errout;
2030
2031         err = -EINVAL;
2032         rtm = nlmsg_data(nlh);
2033         memset(cfg, 0, sizeof(*cfg));
2034
2035         cfg->fc_table = rtm->rtm_table;
2036         cfg->fc_dst_len = rtm->rtm_dst_len;
2037         cfg->fc_src_len = rtm->rtm_src_len;
2038         cfg->fc_flags = RTF_UP;
2039         cfg->fc_protocol = rtm->rtm_protocol;
2040
2041         if (rtm->rtm_type == RTN_UNREACHABLE)
2042                 cfg->fc_flags |= RTF_REJECT;
2043
2044         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2045         cfg->fc_nlinfo.nlh = nlh;
2046         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2047
2048         if (tb[RTA_GATEWAY]) {
2049                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2050                 cfg->fc_flags |= RTF_GATEWAY;
2051         }
2052
2053         if (tb[RTA_DST]) {
2054                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2055
2056                 if (nla_len(tb[RTA_DST]) < plen)
2057                         goto errout;
2058
2059                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2060         }
2061
2062         if (tb[RTA_SRC]) {
2063                 int plen = (rtm->rtm_src_len + 7) >> 3;
2064
2065                 if (nla_len(tb[RTA_SRC]) < plen)
2066                         goto errout;
2067
2068                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2069         }
2070
2071         if (tb[RTA_OIF])
2072                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2073
2074         if (tb[RTA_PRIORITY])
2075                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2076
2077         if (tb[RTA_METRICS]) {
2078                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2079                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2080         }
2081
2082         if (tb[RTA_TABLE])
2083                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2084
2085         err = 0;
2086 errout:
2087         return err;
2088 }
2089
2090 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2091 {
2092         struct fib6_config cfg;
2093         int err;
2094
2095         err = rtm_to_fib6_config(skb, nlh, &cfg);
2096         if (err < 0)
2097                 return err;
2098
2099         return ip6_route_del(&cfg);
2100 }
2101
2102 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2103 {
2104         struct fib6_config cfg;
2105         int err;
2106
2107         err = rtm_to_fib6_config(skb, nlh, &cfg);
2108         if (err < 0)
2109                 return err;
2110
2111         return ip6_route_add(&cfg);
2112 }
2113
2114 static inline size_t rt6_nlmsg_size(void)
2115 {
2116         return NLMSG_ALIGN(sizeof(struct rtmsg))
2117                + nla_total_size(16) /* RTA_SRC */
2118                + nla_total_size(16) /* RTA_DST */
2119                + nla_total_size(16) /* RTA_GATEWAY */
2120                + nla_total_size(16) /* RTA_PREFSRC */
2121                + nla_total_size(4) /* RTA_TABLE */
2122                + nla_total_size(4) /* RTA_IIF */
2123                + nla_total_size(4) /* RTA_OIF */
2124                + nla_total_size(4) /* RTA_PRIORITY */
2125                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2126                + nla_total_size(sizeof(struct rta_cacheinfo));
2127 }
2128
2129 static int rt6_fill_node(struct net *net,
2130                          struct sk_buff *skb, struct rt6_info *rt,
2131                          struct in6_addr *dst, struct in6_addr *src,
2132                          int iif, int type, u32 pid, u32 seq,
2133                          int prefix, int nowait, unsigned int flags)
2134 {
2135         struct rtmsg *rtm;
2136         struct nlmsghdr *nlh;
2137         long expires;
2138         u32 table;
2139
2140         if (prefix) {   /* user wants prefix routes only */
2141                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2142                         /* success since this is not a prefix route */
2143                         return 1;
2144                 }
2145         }
2146
2147         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2148         if (nlh == NULL)
2149                 return -EMSGSIZE;
2150
2151         rtm = nlmsg_data(nlh);
2152         rtm->rtm_family = AF_INET6;
2153         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2154         rtm->rtm_src_len = rt->rt6i_src.plen;
2155         rtm->rtm_tos = 0;
2156         if (rt->rt6i_table)
2157                 table = rt->rt6i_table->tb6_id;
2158         else
2159                 table = RT6_TABLE_UNSPEC;
2160         rtm->rtm_table = table;
2161         NLA_PUT_U32(skb, RTA_TABLE, table);
2162         if (rt->rt6i_flags&RTF_REJECT)
2163                 rtm->rtm_type = RTN_UNREACHABLE;
2164         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2165                 rtm->rtm_type = RTN_LOCAL;
2166         else
2167                 rtm->rtm_type = RTN_UNICAST;
2168         rtm->rtm_flags = 0;
2169         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2170         rtm->rtm_protocol = rt->rt6i_protocol;
2171         if (rt->rt6i_flags&RTF_DYNAMIC)
2172                 rtm->rtm_protocol = RTPROT_REDIRECT;
2173         else if (rt->rt6i_flags & RTF_ADDRCONF)
2174                 rtm->rtm_protocol = RTPROT_KERNEL;
2175         else if (rt->rt6i_flags&RTF_DEFAULT)
2176                 rtm->rtm_protocol = RTPROT_RA;
2177
2178         if (rt->rt6i_flags&RTF_CACHE)
2179                 rtm->rtm_flags |= RTM_F_CLONED;
2180
2181         if (dst) {
2182                 NLA_PUT(skb, RTA_DST, 16, dst);
2183                 rtm->rtm_dst_len = 128;
2184         } else if (rtm->rtm_dst_len)
2185                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2186 #ifdef CONFIG_IPV6_SUBTREES
2187         if (src) {
2188                 NLA_PUT(skb, RTA_SRC, 16, src);
2189                 rtm->rtm_src_len = 128;
2190         } else if (rtm->rtm_src_len)
2191                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2192 #endif
2193         if (iif) {
2194 #ifdef CONFIG_IPV6_MROUTE
2195                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2196                         int err = ip6mr_get_route(skb, rtm, nowait);
2197                         if (err <= 0) {
2198                                 if (!nowait) {
2199                                         if (err == 0)
2200                                                 return 0;
2201                                         goto nla_put_failure;
2202                                 } else {
2203                                         if (err == -EMSGSIZE)
2204                                                 goto nla_put_failure;
2205                                 }
2206                         }
2207                 } else
2208 #endif
2209                         NLA_PUT_U32(skb, RTA_IIF, iif);
2210         } else if (dst) {
2211                 struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2212                 struct in6_addr saddr_buf;
2213                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2214                                        dst, 0, &saddr_buf) == 0)
2215                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2216         }
2217
2218         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2219                 goto nla_put_failure;
2220
2221         if (rt->u.dst.neighbour)
2222                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2223
2224         if (rt->u.dst.dev)
2225                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2226
2227         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2228
2229         if (!(rt->rt6i_flags & RTF_EXPIRES))
2230                 expires = 0;
2231         else if (rt->rt6i_expires - jiffies < INT_MAX)
2232                 expires = rt->rt6i_expires - jiffies;
2233         else
2234                 expires = INT_MAX;
2235
2236         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2237                                expires, rt->u.dst.error) < 0)
2238                 goto nla_put_failure;
2239
2240         return nlmsg_end(skb, nlh);
2241
2242 nla_put_failure:
2243         nlmsg_cancel(skb, nlh);
2244         return -EMSGSIZE;
2245 }
2246
2247 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2248 {
2249         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2250         int prefix;
2251
2252         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2253                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2254                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2255         } else
2256                 prefix = 0;
2257
2258         return rt6_fill_node(arg->net,
2259                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2260                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2261                      prefix, 0, NLM_F_MULTI);
2262 }
2263
2264 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2265 {
2266         struct net *net = sock_net(in_skb->sk);
2267         struct nlattr *tb[RTA_MAX+1];
2268         struct rt6_info *rt;
2269         struct sk_buff *skb;
2270         struct rtmsg *rtm;
2271         struct flowi fl;
2272         int err, iif = 0;
2273
2274         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2275         if (err < 0)
2276                 goto errout;
2277
2278         err = -EINVAL;
2279         memset(&fl, 0, sizeof(fl));
2280
2281         if (tb[RTA_SRC]) {
2282                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2283                         goto errout;
2284
2285                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2286         }
2287
2288         if (tb[RTA_DST]) {
2289                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2290                         goto errout;
2291
2292                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2293         }
2294
2295         if (tb[RTA_IIF])
2296                 iif = nla_get_u32(tb[RTA_IIF]);
2297
2298         if (tb[RTA_OIF])
2299                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2300
2301         if (iif) {
2302                 struct net_device *dev;
2303                 dev = __dev_get_by_index(net, iif);
2304                 if (!dev) {
2305                         err = -ENODEV;
2306                         goto errout;
2307                 }
2308         }
2309
2310         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2311         if (skb == NULL) {
2312                 err = -ENOBUFS;
2313                 goto errout;
2314         }
2315
2316         /* Reserve room for dummy headers, this skb can pass
2317            through good chunk of routing engine.
2318          */
2319         skb_reset_mac_header(skb);
2320         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2321
2322         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2323         skb->dst = &rt->u.dst;
2324
2325         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2326                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2327                             nlh->nlmsg_seq, 0, 0, 0);
2328         if (err < 0) {
2329                 kfree_skb(skb);
2330                 goto errout;
2331         }
2332
2333         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2334 errout:
2335         return err;
2336 }
2337
2338 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2339 {
2340         struct sk_buff *skb;
2341         struct net *net = info->nl_net;
2342         u32 seq;
2343         int err;
2344
2345         err = -ENOBUFS;
2346         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2347
2348         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2349         if (skb == NULL)
2350                 goto errout;
2351
2352         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2353                                 event, info->pid, seq, 0, 0, 0);
2354         if (err < 0) {
2355                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2356                 WARN_ON(err == -EMSGSIZE);
2357                 kfree_skb(skb);
2358                 goto errout;
2359         }
2360         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2361                           info->nlh, gfp_any());
2362 errout:
2363         if (err < 0)
2364                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2365 }
2366
2367 static int ip6_route_dev_notify(struct notifier_block *this,
2368                                 unsigned long event, void *data)
2369 {
2370         struct net_device *dev = (struct net_device *)data;
2371         struct net *net = dev_net(dev);
2372
2373         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2374                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2375                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2376 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2377                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2378                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2379                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2380                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2381 #endif
2382         }
2383
2384         return NOTIFY_OK;
2385 }
2386
2387 /*
2388  *      /proc
2389  */
2390
2391 #ifdef CONFIG_PROC_FS
2392
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is referenced
 * by the /proc code visible below, which is seq_file based -- presumably
 * leftovers of an older buffer-based implementation; confirm before removal.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2403
2404 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2405 {
2406         struct seq_file *m = p_arg;
2407
2408         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2409                    rt->rt6i_dst.plen);
2410
2411 #ifdef CONFIG_IPV6_SUBTREES
2412         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2413                    rt->rt6i_src.plen);
2414 #else
2415         seq_puts(m, "00000000000000000000000000000000 00 ");
2416 #endif
2417
2418         if (rt->rt6i_nexthop) {
2419                 seq_printf(m, NIP6_SEQFMT,
2420                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2421         } else {
2422                 seq_puts(m, "00000000000000000000000000000000");
2423         }
2424         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2425                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2426                    rt->u.dst.__use, rt->rt6i_flags,
2427                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2428         return 0;
2429 }
2430
2431 static int ipv6_route_show(struct seq_file *m, void *v)
2432 {
2433         struct net *net = (struct net *)m->private;
2434         fib6_clean_all(net, rt6_info_route, 0, m);
2435         return 0;
2436 }
2437
2438 static int ipv6_route_open(struct inode *inode, struct file *file)
2439 {
2440         return single_open_net(inode, file, ipv6_route_show);
2441 }
2442
2443 static const struct file_operations ipv6_route_proc_fops = {
2444         .owner          = THIS_MODULE,
2445         .open           = ipv6_route_open,
2446         .read           = seq_read,
2447         .llseek         = seq_lseek,
2448         .release        = single_release_net,
2449 };
2450
2451 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2452 {
2453         struct net *net = (struct net *)seq->private;
2454         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2455                    net->ipv6.rt6_stats->fib_nodes,
2456                    net->ipv6.rt6_stats->fib_route_nodes,
2457                    net->ipv6.rt6_stats->fib_rt_alloc,
2458                    net->ipv6.rt6_stats->fib_rt_entries,
2459                    net->ipv6.rt6_stats->fib_rt_cache,
2460                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2461                    net->ipv6.rt6_stats->fib_discarded_routes);
2462
2463         return 0;
2464 }
2465
2466 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2467 {
2468         return single_open_net(inode, file, rt6_stats_seq_show);
2469 }
2470
2471 static const struct file_operations rt6_stats_seq_fops = {
2472         .owner   = THIS_MODULE,
2473         .open    = rt6_stats_seq_open,
2474         .read    = seq_read,
2475         .llseek  = seq_lseek,
2476         .release = single_release_net,
2477 };
2478 #endif  /* CONFIG_PROC_FS */
2479
2480 #ifdef CONFIG_SYSCTL
2481
2482 static
2483 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2484                               void __user *buffer, size_t *lenp, loff_t *ppos)
2485 {
2486         struct net *net = current->nsproxy->net_ns;
2487         int delay = net->ipv6.sysctl.flush_delay;
2488         if (write) {
2489                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2490                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2491                 return 0;
2492         } else
2493                 return -EINVAL;
2494 }
2495
2496 ctl_table ipv6_route_table_template[] = {
2497         {
2498                 .procname       =       "flush",
2499                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2500                 .maxlen         =       sizeof(int),
2501                 .mode           =       0200,
2502                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2503         },
2504         {
2505                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2506                 .procname       =       "gc_thresh",
2507                 .data           =       &ip6_dst_ops_template.gc_thresh,
2508                 .maxlen         =       sizeof(int),
2509                 .mode           =       0644,
2510                 .proc_handler   =       &proc_dointvec,
2511         },
2512         {
2513                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2514                 .procname       =       "max_size",
2515                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2516                 .maxlen         =       sizeof(int),
2517                 .mode           =       0644,
2518                 .proc_handler   =       &proc_dointvec,
2519         },
2520         {
2521                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2522                 .procname       =       "gc_min_interval",
2523                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2524                 .maxlen         =       sizeof(int),
2525                 .mode           =       0644,
2526                 .proc_handler   =       &proc_dointvec_jiffies,
2527                 .strategy       =       &sysctl_jiffies,
2528         },
2529         {
2530                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2531                 .procname       =       "gc_timeout",
2532                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2533                 .maxlen         =       sizeof(int),
2534                 .mode           =       0644,
2535                 .proc_handler   =       &proc_dointvec_jiffies,
2536                 .strategy       =       &sysctl_jiffies,
2537         },
2538         {
2539                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2540                 .procname       =       "gc_interval",
2541                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2542                 .maxlen         =       sizeof(int),
2543                 .mode           =       0644,
2544                 .proc_handler   =       &proc_dointvec_jiffies,
2545                 .strategy       =       &sysctl_jiffies,
2546         },
2547         {
2548                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2549                 .procname       =       "gc_elasticity",
2550                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2551                 .maxlen         =       sizeof(int),
2552                 .mode           =       0644,
2553                 .proc_handler   =       &proc_dointvec_jiffies,
2554                 .strategy       =       &sysctl_jiffies,
2555         },
2556         {
2557                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2558                 .procname       =       "mtu_expires",
2559                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2560                 .maxlen         =       sizeof(int),
2561                 .mode           =       0644,
2562                 .proc_handler   =       &proc_dointvec_jiffies,
2563                 .strategy       =       &sysctl_jiffies,
2564         },
2565         {
2566                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2567                 .procname       =       "min_adv_mss",
2568                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2569                 .maxlen         =       sizeof(int),
2570                 .mode           =       0644,
2571                 .proc_handler   =       &proc_dointvec_jiffies,
2572                 .strategy       =       &sysctl_jiffies,
2573         },
2574         {
2575                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2576                 .procname       =       "gc_min_interval_ms",
2577                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2578                 .maxlen         =       sizeof(int),
2579                 .mode           =       0644,
2580                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2581                 .strategy       =       &sysctl_ms_jiffies,
2582         },
2583         { .ctl_name = 0 }
2584 };
2585
2586 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2587 {
2588         struct ctl_table *table;
2589
2590         table = kmemdup(ipv6_route_table_template,
2591                         sizeof(ipv6_route_table_template),
2592                         GFP_KERNEL);
2593
2594         if (table) {
2595                 table[0].data = &net->ipv6.sysctl.flush_delay;
2596                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2597                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2598                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2599                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2600                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2601                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2602                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2603                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2604         }
2605
2606         return table;
2607 }
2608 #endif
2609
2610 static int ip6_route_net_init(struct net *net)
2611 {
2612         int ret = -ENOMEM;
2613
2614         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2615                                         sizeof(*net->ipv6.ip6_dst_ops),
2616                                         GFP_KERNEL);
2617         if (!net->ipv6.ip6_dst_ops)
2618                 goto out;
2619         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2620
2621         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2622                                            sizeof(*net->ipv6.ip6_null_entry),
2623                                            GFP_KERNEL);
2624         if (!net->ipv6.ip6_null_entry)
2625                 goto out_ip6_dst_ops;
2626         net->ipv6.ip6_null_entry->u.dst.path =
2627                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2628         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2629
2630 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2631         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2632                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2633                                                GFP_KERNEL);
2634         if (!net->ipv6.ip6_prohibit_entry) {
2635                 kfree(net->ipv6.ip6_null_entry);
2636                 goto out;
2637         }
2638         net->ipv6.ip6_prohibit_entry->u.dst.path =
2639                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2640         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2641
2642         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2643                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2644                                                GFP_KERNEL);
2645         if (!net->ipv6.ip6_blk_hole_entry) {
2646                 kfree(net->ipv6.ip6_null_entry);
2647                 kfree(net->ipv6.ip6_prohibit_entry);
2648                 goto out;
2649         }
2650         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2651                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2652         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2653 #endif
2654
2655 #ifdef CONFIG_PROC_FS
2656         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2657         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2658 #endif
2659         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2660
2661         ret = 0;
2662 out:
2663         return ret;
2664
2665 out_ip6_dst_ops:
2666         release_net(net->ipv6.ip6_dst_ops->dst_net);
2667         kfree(net->ipv6.ip6_dst_ops);
2668         goto out;
2669 }
2670
2671 static void ip6_route_net_exit(struct net *net)
2672 {
2673 #ifdef CONFIG_PROC_FS
2674         proc_net_remove(net, "ipv6_route");
2675         proc_net_remove(net, "rt6_stats");
2676 #endif
2677         kfree(net->ipv6.ip6_null_entry);
2678 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2679         kfree(net->ipv6.ip6_prohibit_entry);
2680         kfree(net->ipv6.ip6_blk_hole_entry);
2681 #endif
2682         release_net(net->ipv6.ip6_dst_ops->dst_net);
2683         kfree(net->ipv6.ip6_dst_ops);
2684 }
2685
2686 static struct pernet_operations ip6_route_net_ops = {
2687         .init = ip6_route_net_init,
2688         .exit = ip6_route_net_exit,
2689 };
2690
2691 static struct notifier_block ip6_route_dev_notifier = {
2692         .notifier_call = ip6_route_dev_notify,
2693         .priority = 0,
2694 };
2695
2696 int __init ip6_route_init(void)
2697 {
2698         int ret;
2699
2700         ret = -ENOMEM;
2701         ip6_dst_ops_template.kmem_cachep =
2702                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2703                                   SLAB_HWCACHE_ALIGN, NULL);
2704         if (!ip6_dst_ops_template.kmem_cachep)
2705                 goto out;;
2706
2707         ret = register_pernet_subsys(&ip6_route_net_ops);
2708         if (ret)
2709                 goto out_kmem_cache;
2710
2711         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2712
2713         /* Registering of the loopback is done before this portion of code,
2714          * the loopback reference in rt6_info will not be taken, do it
2715          * manually for init_net */
2716         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2717         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2718   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2719         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2720         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2721         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2722         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2723   #endif
2724         ret = fib6_init();
2725         if (ret)
2726                 goto out_register_subsys;
2727
2728         ret = xfrm6_init();
2729         if (ret)
2730                 goto out_fib6_init;
2731
2732         ret = fib6_rules_init();
2733         if (ret)
2734                 goto xfrm6_init;
2735
2736         ret = -ENOBUFS;
2737         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2738             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2739             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2740                 goto fib6_rules_init;
2741
2742         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2743         if (ret)
2744                 goto fib6_rules_init;
2745
2746 out:
2747         return ret;
2748
2749 fib6_rules_init:
2750         fib6_rules_cleanup();
2751 xfrm6_init:
2752         xfrm6_fini();
2753 out_fib6_init:
2754         fib6_gc_cleanup();
2755 out_register_subsys:
2756         unregister_pernet_subsys(&ip6_route_net_ops);
2757 out_kmem_cache:
2758         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2759         goto out;
2760 }
2761
2762 void ip6_route_cleanup(void)
2763 {
2764         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2765         fib6_rules_cleanup();
2766         xfrm6_fini();
2767         fib6_gc_cleanup();
2768         unregister_pernet_subsys(&ip6_route_net_ops);
2769         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2770 }