[NETNS][IPV6] tcp6 - make socket control per namespace
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() become printk() calls; below that they expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones off-link routes through
 * rt6_alloc_clone(); when zero it returns the matched route unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       __constant_htons(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       ip6_local_out,
112         .entry_size             =       sizeof(struct rt6_info),
113         .entries                =       ATOMIC_INIT(0),
114 };
115
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
117 {
118 }
119
120 static struct dst_ops ip6_dst_blackhole_ops = {
121         .family                 =       AF_INET6,
122         .protocol               =       __constant_htons(ETH_P_IPV6),
123         .destroy                =       ip6_dst_destroy,
124         .check                  =       ip6_dst_check,
125         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
126         .entry_size             =       sizeof(struct rt6_info),
127         .entries                =       ATOMIC_INIT(0),
128 };
129
130 static struct rt6_info ip6_null_entry_template = {
131         .u = {
132                 .dst = {
133                         .__refcnt       = ATOMIC_INIT(1),
134                         .__use          = 1,
135                         .obsolete       = -1,
136                         .error          = -ENETUNREACH,
137                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
138                         .input          = ip6_pkt_discard,
139                         .output         = ip6_pkt_discard_out,
140                 }
141         },
142         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
143         .rt6i_metric    = ~(u32) 0,
144         .rt6i_ref       = ATOMIC_INIT(1),
145 };
146
147 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
148
149 static int ip6_pkt_prohibit(struct sk_buff *skb);
150 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
151
152 struct rt6_info ip6_prohibit_entry_template = {
153         .u = {
154                 .dst = {
155                         .__refcnt       = ATOMIC_INIT(1),
156                         .__use          = 1,
157                         .obsolete       = -1,
158                         .error          = -EACCES,
159                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
160                         .input          = ip6_pkt_prohibit,
161                         .output         = ip6_pkt_prohibit_out,
162                 }
163         },
164         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
165         .rt6i_metric    = ~(u32) 0,
166         .rt6i_ref       = ATOMIC_INIT(1),
167 };
168
169 static struct rt6_info ip6_blk_hole_entry_template = {
170         .u = {
171                 .dst = {
172                         .__refcnt       = ATOMIC_INIT(1),
173                         .__use          = 1,
174                         .obsolete       = -1,
175                         .error          = -EINVAL,
176                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
177                         .input          = dst_discard,
178                         .output         = dst_discard,
179                 }
180         },
181         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
182         .rt6i_metric    = ~(u32) 0,
183         .rt6i_ref       = ATOMIC_INIT(1),
184 };
185
186 #endif
187
/*
 * Allocate a dst entry from @ops via dst_alloc() and return it typed as
 * a struct rt6_info.  Whatever dst_alloc() returns (including a failed
 * allocation) is passed straight through.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
193
194 static void ip6_dst_destroy(struct dst_entry *dst)
195 {
196         struct rt6_info *rt = (struct rt6_info *)dst;
197         struct inet6_dev *idev = rt->rt6i_idev;
198
199         if (idev != NULL) {
200                 rt->rt6i_idev = NULL;
201                 in6_dev_put(idev);
202         }
203 }
204
205 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206                            int how)
207 {
208         struct rt6_info *rt = (struct rt6_info *)dst;
209         struct inet6_dev *idev = rt->rt6i_idev;
210         struct net_device *loopback_dev =
211                 dev->nd_net->loopback_dev;
212
213         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
214                 struct inet6_dev *loopback_idev =
215                         in6_dev_get(loopback_dev);
216                 if (loopback_idev != NULL) {
217                         rt->rt6i_idev = loopback_idev;
218                         in6_dev_put(idev);
219                 }
220         }
221 }
222
223 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
224 {
225         return (rt->rt6i_flags & RTF_EXPIRES &&
226                 time_after(jiffies, rt->rt6i_expires));
227 }
228
229 static inline int rt6_need_strict(struct in6_addr *daddr)
230 {
231         return (ipv6_addr_type(daddr) &
232                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
233 }
234
235 /*
236  *      Route lookup. Any table->tb6_lock is implied.
237  */
238
239 static inline struct rt6_info *rt6_device_match(struct net *net,
240                                                     struct rt6_info *rt,
241                                                     int oif,
242                                                     int strict)
243 {
244         struct rt6_info *local = NULL;
245         struct rt6_info *sprt;
246
247         if (oif) {
248                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
249                         struct net_device *dev = sprt->rt6i_dev;
250                         if (dev->ifindex == oif)
251                                 return sprt;
252                         if (dev->flags & IFF_LOOPBACK) {
253                                 if (sprt->rt6i_idev == NULL ||
254                                     sprt->rt6i_idev->dev->ifindex != oif) {
255                                         if (strict && oif)
256                                                 continue;
257                                         if (local && (!oif ||
258                                                       local->rt6i_idev->dev->ifindex == oif))
259                                                 continue;
260                                 }
261                                 local = sprt;
262                         }
263                 }
264
265                 if (local)
266                         return local;
267
268                 if (strict)
269                         return net->ipv6.ip6_null_entry;
270         }
271         return rt;
272 }
273
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards
 * the nexthop of @rt when its neighbour entry is not NUD_VALID and the
 * last update is older than the interface's rtr_probe_interval.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute; the rate limit here is the rtr_probe_interval check.
 *
 * A NULL @rt or a route without nexthop is ignored.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held -- confirm this is intended.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked pre-check. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies,
			neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		read_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	/* Solicit the nexthop on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
309
310 /*
311  * Default Router Selection (RFC 2461 6.3.6)
312  */
313 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
314 {
315         struct net_device *dev = rt->rt6i_dev;
316         if (!oif || dev->ifindex == oif)
317                 return 2;
318         if ((dev->flags & IFF_LOOPBACK) &&
319             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
320                 return 1;
321         return 0;
322 }
323
324 static inline int rt6_check_neigh(struct rt6_info *rt)
325 {
326         struct neighbour *neigh = rt->rt6i_nexthop;
327         int m;
328         if (rt->rt6i_flags & RTF_NONEXTHOP ||
329             !(rt->rt6i_flags & RTF_GATEWAY))
330                 m = 1;
331         else if (neigh) {
332                 read_lock_bh(&neigh->lock);
333                 if (neigh->nud_state & NUD_VALID)
334                         m = 2;
335 #ifdef CONFIG_IPV6_ROUTER_PREF
336                 else if (neigh->nud_state & NUD_FAILED)
337                         m = 0;
338 #endif
339                 else
340                         m = 1;
341                 read_unlock_bh(&neigh->lock);
342         } else
343                 m = 0;
344         return m;
345 }
346
347 static int rt6_score_route(struct rt6_info *rt, int oif,
348                            int strict)
349 {
350         int m, n;
351
352         m = rt6_check_dev(rt, oif);
353         if (!m && (strict & RT6_LOOKUP_F_IFACE))
354                 return -1;
355 #ifdef CONFIG_IPV6_ROUTER_PREF
356         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
357 #endif
358         n = rt6_check_neigh(rt);
359         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
360                 return -1;
361         return m;
362 }
363
364 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
365                                    int *mpri, struct rt6_info *match)
366 {
367         int m;
368
369         if (rt6_check_expired(rt))
370                 goto out;
371
372         m = rt6_score_route(rt, oif, strict);
373         if (m < 0)
374                 goto out;
375
376         if (m > *mpri) {
377                 if (strict & RT6_LOOKUP_F_REACHABLE)
378                         rt6_probe(match);
379                 *mpri = m;
380                 match = rt;
381         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
382                 rt6_probe(rt);
383         }
384
385 out:
386         return match;
387 }
388
389 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
390                                      struct rt6_info *rr_head,
391                                      u32 metric, int oif, int strict)
392 {
393         struct rt6_info *rt, *match;
394         int mpri = -1;
395
396         match = NULL;
397         for (rt = rr_head; rt && rt->rt6i_metric == metric;
398              rt = rt->u.dst.rt6_next)
399                 match = find_match(rt, oif, strict, &mpri, match);
400         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
401              rt = rt->u.dst.rt6_next)
402                 match = find_match(rt, oif, strict, &mpri, match);
403
404         return match;
405 }
406
407 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
408 {
409         struct rt6_info *match, *rt0;
410         struct net *net;
411
412         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
413                   __func__, fn->leaf, oif);
414
415         rt0 = fn->rr_ptr;
416         if (!rt0)
417                 fn->rr_ptr = rt0 = fn->leaf;
418
419         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
420
421         if (!match &&
422             (strict & RT6_LOOKUP_F_REACHABLE)) {
423                 struct rt6_info *next = rt0->u.dst.rt6_next;
424
425                 /* no entries matched; do round-robin */
426                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
427                         next = fn->leaf;
428
429                 if (next != rt0)
430                         fn->rr_ptr = next;
431         }
432
433         RT6_TRACE("%s() => %p\n",
434                   __func__, match);
435
436         net = rt0->rt6i_dev->nd_net;
437         return (match ? match : net->ipv6.ip6_null_entry);
438 }
439
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a route_info option (@opt, @len bytes) received on @dev from
 * gateway @gwaddr.
 *
 * After validating the option, the matching route-info route is looked
 * up and then deleted (lifetime 0), created (none exists and lifetime
 * is non-zero) or refreshed (flags and preference updated).  The
 * resulting route gets RTF_EXPIRES and an expiry time unless the
 * lifetime is 0xffffffff (infinite).
 *
 * Returns 0, or -EINVAL when the option is shorter than struct
 * route_info or its length / prefix_len fields are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev->nd_net;
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* 0xffffffff means infinity; clamp the rest so that
	 * HZ * lifetime below cannot overflow. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
		lifetime = 0x7fffffff/HZ - 1;

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (lifetime == 0xffffffff) {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	} else {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
519
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed match.
 *
 * Expects the locals `rt` and `fn` and the labels `out` and `restart`
 * in the calling function.  Does nothing unless `rt` is the namespace's
 * null entry.  Otherwise it walks from `fn` towards the root (looking
 * into a parent's subtree with @saddr where one exists) and jumps to
 * `restart` at the first node flagged RTN_RTINFO, or to `out` once a
 * node flagged RTN_TL_ROOT is reached.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while(0)
537
538 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
539                                              struct fib6_table *table,
540                                              struct flowi *fl, int flags)
541 {
542         struct fib6_node *fn;
543         struct rt6_info *rt;
544
545         read_lock_bh(&table->tb6_lock);
546         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
547 restart:
548         rt = fn->leaf;
549         rt = rt6_device_match(net, rt, fl->oif, flags);
550         BACKTRACK(net, &fl->fl6_src);
551 out:
552         dst_use(&rt->u.dst, jiffies);
553         read_unlock_bh(&table->tb6_lock);
554         return rt;
555
556 }
557
558 struct rt6_info *rt6_lookup(struct net *net, struct in6_addr *daddr,
559                             struct in6_addr *saddr, int oif, int strict)
560 {
561         struct flowi fl = {
562                 .oif = oif,
563                 .nl_u = {
564                         .ip6_u = {
565                                 .daddr = *daddr,
566                         },
567                 },
568         };
569         struct dst_entry *dst;
570         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571
572         if (saddr) {
573                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574                 flags |= RT6_LOOKUP_F_HAS_SADDR;
575         }
576
577         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
578         if (dst->error == 0)
579                 return (struct rt6_info *) dst;
580
581         dst_release(dst);
582
583         return NULL;
584 }
585
586 EXPORT_SYMBOL(rt6_lookup);
587
588 /* ip6_ins_rt is called with FREE table->tb6_lock.
589    It takes new route entry, the addition fails by any reason the
590    route is freed. In any case, if caller does not hold it, it may
591    be destroyed.
592  */
593
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596         int err;
597         struct fib6_table *table;
598
599         table = rt->rt6i_table;
600         write_lock_bh(&table->tb6_lock);
601         err = fib6_add(&table->tb6_root, rt, info);
602         write_unlock_bh(&table->tb6_lock);
603
604         return err;
605 }
606
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609         struct nl_info info = {
610                 .nl_net = rt->rt6i_dev->nd_net,
611         };
612         return __ip6_ins_rt(rt, &info);
613 }
614
615 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
616                                       struct in6_addr *saddr)
617 {
618         struct rt6_info *rt;
619
620         /*
621          *      Clone the route.
622          */
623
624         rt = ip6_rt_copy(ort);
625
626         if (rt) {
627                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
628                         if (rt->rt6i_dst.plen != 128 &&
629                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
630                                 rt->rt6i_flags |= RTF_ANYCAST;
631                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
632                 }
633
634                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
635                 rt->rt6i_dst.plen = 128;
636                 rt->rt6i_flags |= RTF_CACHE;
637                 rt->u.dst.flags |= DST_HOST;
638
639 #ifdef CONFIG_IPV6_SUBTREES
640                 if (rt->rt6i_src.plen && saddr) {
641                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
642                         rt->rt6i_src.plen = 128;
643                 }
644 #endif
645
646                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
647
648         }
649
650         return rt;
651 }
652
653 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
654 {
655         struct rt6_info *rt = ip6_rt_copy(ort);
656         if (rt) {
657                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
658                 rt->rt6i_dst.plen = 128;
659                 rt->rt6i_flags |= RTF_CACHE;
660                 rt->u.dst.flags |= DST_HOST;
661                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
662         }
663         return rt;
664 }
665
666 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
667                                       struct flowi *fl, int flags)
668 {
669         struct fib6_node *fn;
670         struct rt6_info *rt, *nrt;
671         int strict = 0;
672         int attempts = 3;
673         int err;
674         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
675
676         strict |= flags & RT6_LOOKUP_F_IFACE;
677
678 relookup:
679         read_lock_bh(&table->tb6_lock);
680
681 restart_2:
682         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
683
684 restart:
685         rt = rt6_select(fn, oif, strict | reachable);
686
687         BACKTRACK(net, &fl->fl6_src);
688         if (rt == net->ipv6.ip6_null_entry ||
689             rt->rt6i_flags & RTF_CACHE)
690                 goto out;
691
692         dst_hold(&rt->u.dst);
693         read_unlock_bh(&table->tb6_lock);
694
695         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
696                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
697         else {
698 #if CLONE_OFFLINK_ROUTE
699                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
700 #else
701                 goto out2;
702 #endif
703         }
704
705         dst_release(&rt->u.dst);
706         rt = nrt ? : net->ipv6.ip6_null_entry;
707
708         dst_hold(&rt->u.dst);
709         if (nrt) {
710                 err = ip6_ins_rt(nrt);
711                 if (!err)
712                         goto out2;
713         }
714
715         if (--attempts <= 0)
716                 goto out2;
717
718         /*
719          * Race condition! In the gap, when table->tb6_lock was
720          * released someone could insert this route.  Relookup.
721          */
722         dst_release(&rt->u.dst);
723         goto relookup;
724
725 out:
726         if (reachable) {
727                 reachable = 0;
728                 goto restart_2;
729         }
730         dst_hold(&rt->u.dst);
731         read_unlock_bh(&table->tb6_lock);
732 out2:
733         rt->u.dst.lastuse = jiffies;
734         rt->u.dst.__use++;
735
736         return rt;
737 }
738
739 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
740                                             struct flowi *fl, int flags)
741 {
742         return ip6_pol_route(net, table, fl->iif, fl, flags);
743 }
744
745 void ip6_route_input(struct sk_buff *skb)
746 {
747         struct ipv6hdr *iph = ipv6_hdr(skb);
748         struct net *net = skb->dev->nd_net;
749         int flags = RT6_LOOKUP_F_HAS_SADDR;
750         struct flowi fl = {
751                 .iif = skb->dev->ifindex,
752                 .nl_u = {
753                         .ip6_u = {
754                                 .daddr = iph->daddr,
755                                 .saddr = iph->saddr,
756                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
757                         },
758                 },
759                 .mark = skb->mark,
760                 .proto = iph->nexthdr,
761         };
762
763         if (rt6_need_strict(&iph->daddr))
764                 flags |= RT6_LOOKUP_F_IFACE;
765
766         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
767 }
768
769 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
770                                              struct flowi *fl, int flags)
771 {
772         return ip6_pol_route(net, table, fl->oif, fl, flags);
773 }
774
775 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
776                                     struct flowi *fl)
777 {
778         int flags = 0;
779
780         if (rt6_need_strict(&fl->fl6_dst))
781                 flags |= RT6_LOOKUP_F_IFACE;
782
783         if (!ipv6_addr_any(&fl->fl6_src))
784                 flags |= RT6_LOOKUP_F_HAS_SADDR;
785
786         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
787 }
788
789 EXPORT_SYMBOL(ip6_route_output);
790
791 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
792 {
793         struct rt6_info *ort = (struct rt6_info *) *dstp;
794         struct rt6_info *rt = (struct rt6_info *)
795                 dst_alloc(&ip6_dst_blackhole_ops);
796         struct dst_entry *new = NULL;
797
798         if (rt) {
799                 new = &rt->u.dst;
800
801                 atomic_set(&new->__refcnt, 1);
802                 new->__use = 1;
803                 new->input = dst_discard;
804                 new->output = dst_discard;
805
806                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
807                 new->dev = ort->u.dst.dev;
808                 if (new->dev)
809                         dev_hold(new->dev);
810                 rt->rt6i_idev = ort->rt6i_idev;
811                 if (rt->rt6i_idev)
812                         in6_dev_hold(rt->rt6i_idev);
813                 rt->rt6i_expires = 0;
814
815                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
816                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
817                 rt->rt6i_metric = 0;
818
819                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
820 #ifdef CONFIG_IPV6_SUBTREES
821                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
822 #endif
823
824                 dst_free(new);
825         }
826
827         dst_release(*dstp);
828         *dstp = new;
829         return (new ? 0 : -ENOMEM);
830 }
831 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
832
833 /*
834  *      Destination cache support functions
835  */
836
837 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
838 {
839         struct rt6_info *rt;
840
841         rt = (struct rt6_info *) dst;
842
843         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
844                 return dst;
845
846         return NULL;
847 }
848
849 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
850 {
851         struct rt6_info *rt = (struct rt6_info *) dst;
852
853         if (rt) {
854                 if (rt->rt6i_flags & RTF_CACHE)
855                         ip6_del_rt(rt);
856                 else
857                         dst_release(dst);
858         }
859         return NULL;
860 }
861
862 static void ip6_link_failure(struct sk_buff *skb)
863 {
864         struct rt6_info *rt;
865
866         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
867
868         rt = (struct rt6_info *) skb->dst;
869         if (rt) {
870                 if (rt->rt6i_flags&RTF_CACHE) {
871                         dst_set_expires(&rt->u.dst, 0);
872                         rt->rt6i_flags |= RTF_EXPIRES;
873                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
874                         rt->rt6i_node->fn_sernum = -1;
875         }
876 }
877
878 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
879 {
880         struct rt6_info *rt6 = (struct rt6_info*)dst;
881
882         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
883                 rt6->rt6i_flags |= RTF_MODIFIED;
884                 if (mtu < IPV6_MIN_MTU) {
885                         mtu = IPV6_MIN_MTU;
886                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
887                 }
888                 dst->metrics[RTAX_MTU-1] = mtu;
889                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
890         }
891 }
892
893 static int ipv6_get_mtu(struct net_device *dev);
894
895 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
896 {
897         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
898
899         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
900                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
901
902         /*
903          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
904          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
905          * IPV6_MAXPLEN is also valid and means: "any MSS,
906          * rely only on pmtu discovery"
907          */
908         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
909                 mtu = IPV6_MAXPLEN;
910         return mtu;
911 }
912
913 static struct dst_entry *icmp6_dst_gc_list;
914 static DEFINE_SPINLOCK(icmp6_dst_lock);
915
916 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
917                                   struct neighbour *neigh,
918                                   struct in6_addr *addr)
919 {
920         struct rt6_info *rt;
921         struct inet6_dev *idev = in6_dev_get(dev);
922         struct net *net = dev->nd_net;
923
924         if (unlikely(idev == NULL))
925                 return NULL;
926
927         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
928         if (unlikely(rt == NULL)) {
929                 in6_dev_put(idev);
930                 goto out;
931         }
932
933         dev_hold(dev);
934         if (neigh)
935                 neigh_hold(neigh);
936         else
937                 neigh = ndisc_get_neigh(dev, addr);
938
939         rt->rt6i_dev      = dev;
940         rt->rt6i_idev     = idev;
941         rt->rt6i_nexthop  = neigh;
942         atomic_set(&rt->u.dst.__refcnt, 1);
943         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
944         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
945         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
946         rt->u.dst.output  = ip6_output;
947
948 #if 0   /* there's no chance to use these for ndisc */
949         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
950                                 ? DST_HOST
951                                 : 0;
952         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
953         rt->rt6i_dst.plen = 128;
954 #endif
955
956         spin_lock_bh(&icmp6_dst_lock);
957         rt->u.dst.next = icmp6_dst_gc_list;
958         icmp6_dst_gc_list = &rt->u.dst;
959         spin_unlock_bh(&icmp6_dst_lock);
960
961         fib6_force_start_gc(net);
962
963 out:
964         return &rt->u.dst;
965 }
966
967 int icmp6_dst_gc(int *more)
968 {
969         struct dst_entry *dst, *next, **pprev;
970         int freed;
971
972         next = NULL;
973         freed = 0;
974
975         spin_lock_bh(&icmp6_dst_lock);
976         pprev = &icmp6_dst_gc_list;
977
978         while ((dst = *pprev) != NULL) {
979                 if (!atomic_read(&dst->__refcnt)) {
980                         *pprev = dst->next;
981                         dst_free(dst);
982                         freed++;
983                 } else {
984                         pprev = &dst->next;
985                         (*more)++;
986                 }
987         }
988
989         spin_unlock_bh(&icmp6_dst_lock);
990
991         return freed;
992 }
993
994 static int ip6_dst_gc(struct dst_ops *ops)
995 {
996         unsigned long now = jiffies;
997         struct net *net = ops->dst_net;
998         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
999         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1000         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1001         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1002         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1003
1004         if (time_after(rt_last_gc + rt_min_interval, now) &&
1005             atomic_read(&ops->entries) <= rt_max_size)
1006                 goto out;
1007
1008         net->ipv6.ip6_rt_gc_expire++;
1009         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1010         net->ipv6.ip6_rt_last_gc = now;
1011         if (atomic_read(&ops->entries) < ops->gc_thresh)
1012                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1013 out:
1014         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1015         return (atomic_read(&ops->entries) > rt_max_size);
1016 }
1017
1018 /* Clean host part of a prefix. Not necessary in radix tree,
1019    but results in cleaner routing tables.
1020
1021    Remove it only when all the things will work!
1022  */
1023
1024 static int ipv6_get_mtu(struct net_device *dev)
1025 {
1026         int mtu = IPV6_MIN_MTU;
1027         struct inet6_dev *idev;
1028
1029         idev = in6_dev_get(dev);
1030         if (idev) {
1031                 mtu = idev->cnf.mtu6;
1032                 in6_dev_put(idev);
1033         }
1034         return mtu;
1035 }
1036
1037 int ipv6_get_hoplimit(struct net_device *dev)
1038 {
1039         int hoplimit = ipv6_devconf.hop_limit;
1040         struct inet6_dev *idev;
1041
1042         idev = in6_dev_get(dev);
1043         if (idev) {
1044                 hoplimit = idev->cnf.hop_limit;
1045                 in6_dev_put(idev);
1046         }
1047         return hoplimit;
1048 }
1049
1050 /*
1051  *
1052  */
1053
/*
 * Build a route from @cfg and insert it into the FIB.
 *
 * Steps, in order:
 *  - reject prefix lengths above 128 (and any source prefix when
 *    CONFIG_IPV6_SUBTREES is off) with -EINVAL;
 *  - resolve cfg->fc_ifindex to a device and its inet6_dev (-ENODEV);
 *  - default the metric to IP6_RT_PRIO_USER and get/create the table
 *    (-ENOBUFS), then allocate the rt6_info (-ENOMEM);
 *  - fill in handlers, destination/source keys and metric;
 *  - turn RTF_REJECT routes, and non-loopback destinations routed via a
 *    loopback device, into reject routes bound to the loopback device;
 *  - for RTF_GATEWAY routes validate the gateway and, if needed, derive the
 *    device from the route towards it;
 *  - look up the next-hop neighbour, apply the netlink metrics and insert
 *    the route with __ip6_ins_rt().
 *
 * Side effects on @cfg: fc_metric and fc_protocol may be replaced by their
 * defaults and fc_nlinfo.nl_net is set to the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() on the success path, otherwise a
 * negative errno after dropping the dev/idev references and freeing the
 * route (label "out").
 */
1054 int ip6_route_add(struct fib6_config *cfg)
1055 {
1056         int err;
1057         struct net *net = cfg->fc_nlinfo.nl_net;
1058         struct rt6_info *rt = NULL;
1059         struct net_device *dev = NULL;
1060         struct inet6_dev *idev = NULL;
1061         struct fib6_table *table;
1062         int addr_type;
1063
1064         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1065                 return -EINVAL;
1066 #ifndef CONFIG_IPV6_SUBTREES
1067         if (cfg->fc_src_len)
1068                 return -EINVAL;
1069 #endif
             /* An explicit interface was given: take references on dev and idev. */
1070         if (cfg->fc_ifindex) {
1071                 err = -ENODEV;
1072                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1073                 if (!dev)
1074                         goto out;
1075                 idev = in6_dev_get(dev);
1076                 if (!idev)
1077                         goto out;
1078         }
1079
1080         if (cfg->fc_metric == 0)
1081                 cfg->fc_metric = IP6_RT_PRIO_USER;
1082
1083         table = fib6_new_table(net, cfg->fc_table);
1084         if (table == NULL) {
1085                 err = -ENOBUFS;
1086                 goto out;
1087         }
1088
1089         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1090
1091         if (rt == NULL) {
1092                 err = -ENOMEM;
1093                 goto out;
1094         }
1095
1096         rt->u.dst.obsolete = -1;
1097         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1098
1099         if (cfg->fc_protocol == RTPROT_UNSPEC)
1100                 cfg->fc_protocol = RTPROT_BOOT;
1101         rt->rt6i_protocol = cfg->fc_protocol;
1102
1103         addr_type = ipv6_addr_type(&cfg->fc_dst);
1104
             /* Input handler depends on whether the destination is multicast. */
1105         if (addr_type & IPV6_ADDR_MULTICAST)
1106                 rt->u.dst.input = ip6_mc_input;
1107         else
1108                 rt->u.dst.input = ip6_forward;
1109
1110         rt->u.dst.output = ip6_output;
1111
1112         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1113         rt->rt6i_dst.plen = cfg->fc_dst_len;
1114         if (rt->rt6i_dst.plen == 128)
1115                rt->u.dst.flags = DST_HOST;
1116
1117 #ifdef CONFIG_IPV6_SUBTREES
1118         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1119         rt->rt6i_src.plen = cfg->fc_src_len;
1120 #endif
1121
1122         rt->rt6i_metric = cfg->fc_metric;
1123
1124         /* We cannot add true routes via loopback here,
1125            they would result in kernel looping; promote them to reject routes
1126          */
1127         if ((cfg->fc_flags & RTF_REJECT) ||
1128             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1129                 /* hold loopback dev/idev if we haven't done so. */
1130                 if (dev != net->loopback_dev) {
                             /* Swap the references taken above for ones on the loopback device. */
1131                         if (dev) {
1132                                 dev_put(dev);
1133                                 in6_dev_put(idev);
1134                         }
1135                         dev = net->loopback_dev;
1136                         dev_hold(dev);
1137                         idev = in6_dev_get(dev);
1138                         if (!idev) {
1139                                 err = -ENODEV;
1140                                 goto out;
1141                         }
1142                 }
1143                 rt->u.dst.output = ip6_pkt_discard_out;
1144                 rt->u.dst.input = ip6_pkt_discard;
1145                 rt->u.dst.error = -ENETUNREACH;
1146                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1147                 goto install_route;
1148         }
1149
1150         if (cfg->fc_flags & RTF_GATEWAY) {
1151                 struct in6_addr *gw_addr;
1152                 int gwa_type;
1153
1154                 gw_addr = &cfg->fc_gateway;
1155                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1156                 gwa_type = ipv6_addr_type(gw_addr);
1157
1158                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1159                         struct rt6_info *grt;
1160
1161                         /* IPv6 strictly inhibits using not link-local
1162                            addresses as nexthop address.
1163                            Otherwise, router will not able to send redirects.
1164                            It is very good, but in some (rare!) circumstances
1165                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1166                            some exceptions. --ANK
1167                          */
1168                         err = -EINVAL;
1169                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1170                                 goto out;
1171
1172                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1173
1174                         err = -EHOSTUNREACH;
1175                         if (grt == NULL)
1176                                 goto out;
                             /*
                              * The route to the gateway must use the requested
                              * device; without a requested device, adopt the one
                              * that route uses (taking our own references).
                              */
1177                         if (dev) {
1178                                 if (dev != grt->rt6i_dev) {
1179                                         dst_release(&grt->u.dst);
1180                                         goto out;
1181                                 }
1182                         } else {
1183                                 dev = grt->rt6i_dev;
1184                                 idev = grt->rt6i_idev;
1185                                 dev_hold(dev);
1186                                 in6_dev_hold(grt->rt6i_idev);
1187                         }
                             /* Accepted only if the gateway is reached without another gateway. */
1188                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1189                                 err = 0;
1190                         dst_release(&grt->u.dst);
1191
1192                         if (err)
1193                                 goto out;
1194                 }
1195                 err = -EINVAL;
1196                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1197                         goto out;
1198         }
1199
1200         err = -ENODEV;
1201         if (dev == NULL)
1202                 goto out;
1203
1204         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1205                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1206                 if (IS_ERR(rt->rt6i_nexthop)) {
1207                         err = PTR_ERR(rt->rt6i_nexthop);
1208                         rt->rt6i_nexthop = NULL;
1209                         goto out;
1210                 }
1211         }
1212
1213         rt->rt6i_flags = cfg->fc_flags;
1214
1215 install_route:
             /* Copy user supplied metrics; attribute type N lands in metrics[N - 1]. */
1216         if (cfg->fc_mx) {
1217                 struct nlattr *nla;
1218                 int remaining;
1219
1220                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1221                         int type = nla_type(nla);
1222
1223                         if (type) {
1224                                 if (type > RTAX_MAX) {
1225                                         err = -EINVAL;
1226                                         goto out;
1227                                 }
1228
1229                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1230                         }
1231                 }
1232         }
1233
             /* Fill in defaults for metrics that are still zero. */
1234         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1235                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1236         if (!rt->u.dst.metrics[RTAX_MTU-1])
1237                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1238         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1239                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
             /* The dev/idev references held so far are stored in the route. */
1240         rt->u.dst.dev = dev;
1241         rt->rt6i_idev = idev;
1242         rt->rt6i_table = table;
1243
1244         cfg->fc_nlinfo.nl_net = dev->nd_net;
1245
1246         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1247
1248 out:
             /* Error exit: undo whatever was acquired before the failure. */
1249         if (dev)
1250                 dev_put(dev);
1251         if (idev)
1252                 in6_dev_put(idev);
1253         if (rt)
1254                 dst_free(&rt->u.dst);
1255         return err;
1256 }
1257
1258 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1259 {
1260         int err;
1261         struct fib6_table *table;
1262         struct net *net = rt->rt6i_dev->nd_net;
1263
1264         if (rt == net->ipv6.ip6_null_entry)
1265                 return -ENOENT;
1266
1267         table = rt->rt6i_table;
1268         write_lock_bh(&table->tb6_lock);
1269
1270         err = fib6_del(rt, info);
1271         dst_release(&rt->u.dst);
1272
1273         write_unlock_bh(&table->tb6_lock);
1274
1275         return err;
1276 }
1277
1278 int ip6_del_rt(struct rt6_info *rt)
1279 {
1280         struct nl_info info = {
1281                 .nl_net = rt->rt6i_dev->nd_net,
1282         };
1283         return __ip6_del_rt(rt, &info);
1284 }
1285
1286 static int ip6_route_del(struct fib6_config *cfg)
1287 {
1288         struct fib6_table *table;
1289         struct fib6_node *fn;
1290         struct rt6_info *rt;
1291         int err = -ESRCH;
1292
1293         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1294         if (table == NULL)
1295                 return err;
1296
1297         read_lock_bh(&table->tb6_lock);
1298
1299         fn = fib6_locate(&table->tb6_root,
1300                          &cfg->fc_dst, cfg->fc_dst_len,
1301                          &cfg->fc_src, cfg->fc_src_len);
1302
1303         if (fn) {
1304                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1305                         if (cfg->fc_ifindex &&
1306                             (rt->rt6i_dev == NULL ||
1307                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1308                                 continue;
1309                         if (cfg->fc_flags & RTF_GATEWAY &&
1310                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1311                                 continue;
1312                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1313                                 continue;
1314                         dst_hold(&rt->u.dst);
1315                         read_unlock_bh(&table->tb6_lock);
1316
1317                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1318                 }
1319         }
1320         read_unlock_bh(&table->tb6_lock);
1321
1322         return err;
1323 }
1324
1325 /*
1326  *      Handle redirects
1327  */
1328 struct ip6rd_flowi {
1329         struct flowi fl;
1330         struct in6_addr gateway;
1331 };
1332
1333 static struct rt6_info *__ip6_route_redirect(struct net *net,
1334                                              struct fib6_table *table,
1335                                              struct flowi *fl,
1336                                              int flags)
1337 {
1338         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1339         struct rt6_info *rt;
1340         struct fib6_node *fn;
1341
1342         /*
1343          * Get the "current" route for this destination and
1344          * check if the redirect has come from approriate router.
1345          *
1346          * RFC 2461 specifies that redirects should only be
1347          * accepted if they come from the nexthop to the target.
1348          * Due to the way the routes are chosen, this notion
1349          * is a bit fuzzy and one might need to check all possible
1350          * routes.
1351          */
1352
1353         read_lock_bh(&table->tb6_lock);
1354         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1355 restart:
1356         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1357                 /*
1358                  * Current route is on-link; redirect is always invalid.
1359                  *
1360                  * Seems, previous statement is not true. It could
1361                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1362                  * But then router serving it might decide, that we should
1363                  * know truth 8)8) --ANK (980726).
1364                  */
1365                 if (rt6_check_expired(rt))
1366                         continue;
1367                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1368                         continue;
1369                 if (fl->oif != rt->rt6i_dev->ifindex)
1370                         continue;
1371                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1372                         continue;
1373                 break;
1374         }
1375
1376         if (!rt)
1377                 rt = net->ipv6.ip6_null_entry;
1378         BACKTRACK(net, &fl->fl6_src);
1379 out:
1380         dst_hold(&rt->u.dst);
1381
1382         read_unlock_bh(&table->tb6_lock);
1383
1384         return rt;
1385 };
1386
1387 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1388                                            struct in6_addr *src,
1389                                            struct in6_addr *gateway,
1390                                            struct net_device *dev)
1391 {
1392         int flags = RT6_LOOKUP_F_HAS_SADDR;
1393         struct net *net = dev->nd_net;
1394         struct ip6rd_flowi rdfl = {
1395                 .fl = {
1396                         .oif = dev->ifindex,
1397                         .nl_u = {
1398                                 .ip6_u = {
1399                                         .daddr = *dest,
1400                                         .saddr = *src,
1401                                 },
1402                         },
1403                 },
1404                 .gateway = *gateway,
1405         };
1406
1407         if (rt6_need_strict(dest))
1408                 flags |= RT6_LOOKUP_F_IFACE;
1409
1410         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1411                                                    flags, __ip6_route_redirect);
1412 }
1413
1414 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1415                   struct in6_addr *saddr,
1416                   struct neighbour *neigh, u8 *lladdr, int on_link)
1417 {
1418         struct rt6_info *rt, *nrt = NULL;
1419         struct netevent_redirect netevent;
1420         struct net *net = neigh->dev->nd_net;
1421
1422         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1423
1424         if (rt == net->ipv6.ip6_null_entry) {
1425                 if (net_ratelimit())
1426                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1427                                "for redirect target\n");
1428                 goto out;
1429         }
1430
1431         /*
1432          *      We have finally decided to accept it.
1433          */
1434
1435         neigh_update(neigh, lladdr, NUD_STALE,
1436                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1437                      NEIGH_UPDATE_F_OVERRIDE|
1438                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1439                                      NEIGH_UPDATE_F_ISROUTER))
1440                      );
1441
1442         /*
1443          * Redirect received -> path was valid.
1444          * Look, redirects are sent only in response to data packets,
1445          * so that this nexthop apparently is reachable. --ANK
1446          */
1447         dst_confirm(&rt->u.dst);
1448
1449         /* Duplicate redirect: silently ignore. */
1450         if (neigh == rt->u.dst.neighbour)
1451                 goto out;
1452
1453         nrt = ip6_rt_copy(rt);
1454         if (nrt == NULL)
1455                 goto out;
1456
1457         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1458         if (on_link)
1459                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1460
1461         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1462         nrt->rt6i_dst.plen = 128;
1463         nrt->u.dst.flags |= DST_HOST;
1464
1465         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1466         nrt->rt6i_nexthop = neigh_clone(neigh);
1467         /* Reset pmtu, it may be better */
1468         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1469         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(neigh->dev->nd_net,
1470                                                         dst_mtu(&nrt->u.dst));
1471
1472         if (ip6_ins_rt(nrt))
1473                 goto out;
1474
1475         netevent.old = &rt->u.dst;
1476         netevent.new = &nrt->u.dst;
1477         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1478
1479         if (rt->rt6i_flags&RTF_CACHE) {
1480                 ip6_del_rt(rt);
1481                 return;
1482         }
1483
1484 out:
1485         dst_release(&rt->u.dst);
1486         return;
1487 }
1488
1489 /*
1490  *      Handle ICMP "packet too big" messages
1491  *      i.e. Path MTU discovery
1492  */
1493
1494 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1495                         struct net_device *dev, u32 pmtu)
1496 {
1497         struct rt6_info *rt, *nrt;
1498         struct net *net = dev->nd_net;
1499         int allfrag = 0;
1500
1501         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1502         if (rt == NULL)
1503                 return;
1504
1505         if (pmtu >= dst_mtu(&rt->u.dst))
1506                 goto out;
1507
1508         if (pmtu < IPV6_MIN_MTU) {
1509                 /*
1510                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1511                  * MTU (1280) and a fragment header should always be included
1512                  * after a node receiving Too Big message reporting PMTU is
1513                  * less than the IPv6 Minimum Link MTU.
1514                  */
1515                 pmtu = IPV6_MIN_MTU;
1516                 allfrag = 1;
1517         }
1518
1519         /* New mtu received -> path was valid.
1520            They are sent only in response to data packets,
1521            so that this nexthop apparently is reachable. --ANK
1522          */
1523         dst_confirm(&rt->u.dst);
1524
1525         /* Host route. If it is static, it would be better
1526            not to override it, but add new one, so that
1527            when cache entry will expire old pmtu
1528            would return automatically.
1529          */
1530         if (rt->rt6i_flags & RTF_CACHE) {
1531                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1532                 if (allfrag)
1533                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1534                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1535                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1536                 goto out;
1537         }
1538
1539         /* Network route.
1540            Two cases are possible:
1541            1. It is connected route. Action: COW
1542            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1543          */
1544         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1545                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1546         else
1547                 nrt = rt6_alloc_clone(rt, daddr);
1548
1549         if (nrt) {
1550                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1551                 if (allfrag)
1552                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1553
1554                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1555                  * happened within 5 mins, the recommended timer is 10 mins.
1556                  * Here this route expiration time is set to ip6_rt_mtu_expires
1557                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1558                  * and detecting PMTU increase will be automatically happened.
1559                  */
1560                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1561                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1562
1563                 ip6_ins_rt(nrt);
1564         }
1565 out:
1566         dst_release(&rt->u.dst);
1567 }
1568
1569 /*
1570  *      Misc support functions
1571  */
1572
1573 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1574 {
1575         struct net *net = ort->rt6i_dev->nd_net;
1576         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1577
1578         if (rt) {
1579                 rt->u.dst.input = ort->u.dst.input;
1580                 rt->u.dst.output = ort->u.dst.output;
1581
1582                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1583                 rt->u.dst.error = ort->u.dst.error;
1584                 rt->u.dst.dev = ort->u.dst.dev;
1585                 if (rt->u.dst.dev)
1586                         dev_hold(rt->u.dst.dev);
1587                 rt->rt6i_idev = ort->rt6i_idev;
1588                 if (rt->rt6i_idev)
1589                         in6_dev_hold(rt->rt6i_idev);
1590                 rt->u.dst.lastuse = jiffies;
1591                 rt->rt6i_expires = 0;
1592
1593                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1594                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1595                 rt->rt6i_metric = 0;
1596
1597                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1598 #ifdef CONFIG_IPV6_SUBTREES
1599                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1600 #endif
1601                 rt->rt6i_table = ort->rt6i_table;
1602         }
1603         return rt;
1604 }
1605
1606 #ifdef CONFIG_IPV6_ROUTE_INFO
1607 static struct rt6_info *rt6_get_route_info(struct net *net,
1608                                            struct in6_addr *prefix, int prefixlen,
1609                                            struct in6_addr *gwaddr, int ifindex)
1610 {
1611         struct fib6_node *fn;
1612         struct rt6_info *rt = NULL;
1613         struct fib6_table *table;
1614
1615         table = fib6_get_table(net, RT6_TABLE_INFO);
1616         if (table == NULL)
1617                 return NULL;
1618
1619         write_lock_bh(&table->tb6_lock);
1620         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1621         if (!fn)
1622                 goto out;
1623
1624         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1625                 if (rt->rt6i_dev->ifindex != ifindex)
1626                         continue;
1627                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1628                         continue;
1629                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1630                         continue;
1631                 dst_hold(&rt->u.dst);
1632                 break;
1633         }
1634 out:
1635         write_unlock_bh(&table->tb6_lock);
1636         return rt;
1637 }
1638
1639 static struct rt6_info *rt6_add_route_info(struct net *net,
1640                                            struct in6_addr *prefix, int prefixlen,
1641                                            struct in6_addr *gwaddr, int ifindex,
1642                                            unsigned pref)
1643 {
1644         struct fib6_config cfg = {
1645                 .fc_table       = RT6_TABLE_INFO,
1646                 .fc_metric      = IP6_RT_PRIO_USER,
1647                 .fc_ifindex     = ifindex,
1648                 .fc_dst_len     = prefixlen,
1649                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1650                                   RTF_UP | RTF_PREF(pref),
1651                 .fc_nlinfo.pid = 0,
1652                 .fc_nlinfo.nlh = NULL,
1653                 .fc_nlinfo.nl_net = net,
1654         };
1655
1656         ipv6_addr_copy(&cfg.fc_dst, prefix);
1657         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1658
1659         /* We should treat it as a default route if prefix length is 0. */
1660         if (!prefixlen)
1661                 cfg.fc_flags |= RTF_DEFAULT;
1662
1663         ip6_route_add(&cfg);
1664
1665         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1666 }
1667 #endif
1668
1669 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1670 {
1671         struct rt6_info *rt;
1672         struct fib6_table *table;
1673
1674         table = fib6_get_table(dev->nd_net, RT6_TABLE_DFLT);
1675         if (table == NULL)
1676                 return NULL;
1677
1678         write_lock_bh(&table->tb6_lock);
1679         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1680                 if (dev == rt->rt6i_dev &&
1681                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1682                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1683                         break;
1684         }
1685         if (rt)
1686                 dst_hold(&rt->u.dst);
1687         write_unlock_bh(&table->tb6_lock);
1688         return rt;
1689 }
1690
1691 EXPORT_SYMBOL(rt6_get_dflt_router);
1692
1693 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1694                                      struct net_device *dev,
1695                                      unsigned int pref)
1696 {
1697         struct fib6_config cfg = {
1698                 .fc_table       = RT6_TABLE_DFLT,
1699                 .fc_metric      = IP6_RT_PRIO_USER,
1700                 .fc_ifindex     = dev->ifindex,
1701                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1702                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1703                 .fc_nlinfo.pid = 0,
1704                 .fc_nlinfo.nlh = NULL,
1705                 .fc_nlinfo.nl_net = dev->nd_net,
1706         };
1707
1708         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1709
1710         ip6_route_add(&cfg);
1711
1712         return rt6_get_dflt_router(gwaddr, dev);
1713 }
1714
1715 void rt6_purge_dflt_routers(struct net *net)
1716 {
1717         struct rt6_info *rt;
1718         struct fib6_table *table;
1719
1720         /* NOTE: Keep consistent with rt6_get_dflt_router */
1721         table = fib6_get_table(net, RT6_TABLE_DFLT);
1722         if (table == NULL)
1723                 return;
1724
1725 restart:
1726         read_lock_bh(&table->tb6_lock);
1727         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1728                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1729                         dst_hold(&rt->u.dst);
1730                         read_unlock_bh(&table->tb6_lock);
1731                         ip6_del_rt(rt);
1732                         goto restart;
1733                 }
1734         }
1735         read_unlock_bh(&table->tb6_lock);
1736 }
1737
1738 static void rtmsg_to_fib6_config(struct net *net,
1739                                  struct in6_rtmsg *rtmsg,
1740                                  struct fib6_config *cfg)
1741 {
1742         memset(cfg, 0, sizeof(*cfg));
1743
1744         cfg->fc_table = RT6_TABLE_MAIN;
1745         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1746         cfg->fc_metric = rtmsg->rtmsg_metric;
1747         cfg->fc_expires = rtmsg->rtmsg_info;
1748         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1749         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1750         cfg->fc_flags = rtmsg->rtmsg_flags;
1751
1752         cfg->fc_nlinfo.nl_net = net;
1753
1754         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1755         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1756         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1757 }
1758
1759 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1760 {
1761         struct fib6_config cfg;
1762         struct in6_rtmsg rtmsg;
1763         int err;
1764
1765         switch(cmd) {
1766         case SIOCADDRT:         /* Add a route */
1767         case SIOCDELRT:         /* Delete a route */
1768                 if (!capable(CAP_NET_ADMIN))
1769                         return -EPERM;
1770                 err = copy_from_user(&rtmsg, arg,
1771                                      sizeof(struct in6_rtmsg));
1772                 if (err)
1773                         return -EFAULT;
1774
1775                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1776
1777                 rtnl_lock();
1778                 switch (cmd) {
1779                 case SIOCADDRT:
1780                         err = ip6_route_add(&cfg);
1781                         break;
1782                 case SIOCDELRT:
1783                         err = ip6_route_del(&cfg);
1784                         break;
1785                 default:
1786                         err = -EINVAL;
1787                 }
1788                 rtnl_unlock();
1789
1790                 return err;
1791         }
1792
1793         return -EINVAL;
1794 }
1795
1796 /*
1797  *      Drop the packet on the floor
1798  */
1799
1800 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1801 {
1802         int type;
1803         switch (ipstats_mib_noroutes) {
1804         case IPSTATS_MIB_INNOROUTES:
1805                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1806                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1807                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1808                         break;
1809                 }
1810                 /* FALLTHROUGH */
1811         case IPSTATS_MIB_OUTNOROUTES:
1812                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1813                 break;
1814         }
1815         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1816         kfree_skb(skb);
1817         return 0;
1818 }
1819
1820 static int ip6_pkt_discard(struct sk_buff *skb)
1821 {
1822         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1823 }
1824
1825 static int ip6_pkt_discard_out(struct sk_buff *skb)
1826 {
1827         skb->dev = skb->dst->dev;
1828         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1829 }
1830
1831 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1832
1833 static int ip6_pkt_prohibit(struct sk_buff *skb)
1834 {
1835         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1836 }
1837
1838 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1839 {
1840         skb->dev = skb->dst->dev;
1841         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1842 }
1843
1844 #endif
1845
1846 /*
1847  *      Allocate a dst for local (unicast / anycast) address.
1848  */
1849
1850 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1851                                     const struct in6_addr *addr,
1852                                     int anycast)
1853 {
1854         struct net *net = idev->dev->nd_net;
1855         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1856
1857         if (rt == NULL)
1858                 return ERR_PTR(-ENOMEM);
1859
1860         dev_hold(net->loopback_dev);
1861         in6_dev_hold(idev);
1862
1863         rt->u.dst.flags = DST_HOST;
1864         rt->u.dst.input = ip6_input;
1865         rt->u.dst.output = ip6_output;
1866         rt->rt6i_dev = net->loopback_dev;
1867         rt->rt6i_idev = idev;
1868         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1869         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1870         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1871         rt->u.dst.obsolete = -1;
1872
1873         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1874         if (anycast)
1875                 rt->rt6i_flags |= RTF_ANYCAST;
1876         else
1877                 rt->rt6i_flags |= RTF_LOCAL;
1878         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1879         if (rt->rt6i_nexthop == NULL) {
1880                 dst_free(&rt->u.dst);
1881                 return ERR_PTR(-ENOMEM);
1882         }
1883
1884         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1885         rt->rt6i_dst.plen = 128;
1886         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1887
1888         atomic_set(&rt->u.dst.__refcnt, 1);
1889
1890         return rt;
1891 }
1892
/* Argument bundle handed to fib6_ifdown() through fib6_clean_all(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose null entry is never deleted */
};
1897
1898 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1899 {
1900         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1901         struct net *net = ((struct arg_dev_net *)arg)->net;
1902
1903         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1904             rt != net->ipv6.ip6_null_entry) {
1905                 RT6_TRACE("deleted by ifdown %p\n", rt);
1906                 return -1;
1907         }
1908         return 0;
1909 }
1910
1911 void rt6_ifdown(struct net *net, struct net_device *dev)
1912 {
1913         struct arg_dev_net adn = {
1914                 .dev = dev,
1915                 .net = net,
1916         };
1917
1918         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1919 }
1920
/* Argument bundle handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1926
1927 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1928 {
1929         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1930         struct inet6_dev *idev;
1931         struct net *net = arg->dev->nd_net;
1932
1933         /* In IPv6 pmtu discovery is not optional,
1934            so that RTAX_MTU lock cannot disable it.
1935            We still use this lock to block changes
1936            caused by addrconf/ndisc.
1937         */
1938
1939         idev = __in6_dev_get(arg->dev);
1940         if (idev == NULL)
1941                 return 0;
1942
1943         /* For administrative MTU increase, there is no way to discover
1944            IPv6 PMTU increase, so PMTU increase should be updated here.
1945            Since RFC 1981 doesn't include administrative MTU increase
1946            update PMTU increase is a MUST. (i.e. jumbo frame)
1947          */
1948         /*
1949            If new MTU is less than route PMTU, this new MTU will be the
1950            lowest MTU in the path, update the route PMTU to reflect PMTU
1951            decreases; if new MTU is greater than route PMTU, and the
1952            old MTU is the lowest MTU in the path, update the route PMTU
1953            to reflect the increase. In this case if the other nodes' MTU
1954            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1955            PMTU discouvery.
1956          */
1957         if (rt->rt6i_dev == arg->dev &&
1958             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1959             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1960              (dst_mtu(&rt->u.dst) < arg->mtu &&
1961               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1962                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1963                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1964         }
1965         return 0;
1966 }
1967
1968 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1969 {
1970         struct rt6_mtu_change_arg arg = {
1971                 .dev = dev,
1972                 .mtu = mtu,
1973         };
1974
1975         fib6_clean_all(dev->nd_net, rt6_mtu_change_route, 0, &arg);
1976 }
1977
1978 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1979         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1980         [RTA_OIF]               = { .type = NLA_U32 },
1981         [RTA_IIF]               = { .type = NLA_U32 },
1982         [RTA_PRIORITY]          = { .type = NLA_U32 },
1983         [RTA_METRICS]           = { .type = NLA_NESTED },
1984 };
1985
1986 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1987                               struct fib6_config *cfg)
1988 {
1989         struct rtmsg *rtm;
1990         struct nlattr *tb[RTA_MAX+1];
1991         int err;
1992
1993         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1994         if (err < 0)
1995                 goto errout;
1996
1997         err = -EINVAL;
1998         rtm = nlmsg_data(nlh);
1999         memset(cfg, 0, sizeof(*cfg));
2000
2001         cfg->fc_table = rtm->rtm_table;
2002         cfg->fc_dst_len = rtm->rtm_dst_len;
2003         cfg->fc_src_len = rtm->rtm_src_len;
2004         cfg->fc_flags = RTF_UP;
2005         cfg->fc_protocol = rtm->rtm_protocol;
2006
2007         if (rtm->rtm_type == RTN_UNREACHABLE)
2008                 cfg->fc_flags |= RTF_REJECT;
2009
2010         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2011         cfg->fc_nlinfo.nlh = nlh;
2012         cfg->fc_nlinfo.nl_net = skb->sk->sk_net;
2013
2014         if (tb[RTA_GATEWAY]) {
2015                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2016                 cfg->fc_flags |= RTF_GATEWAY;
2017         }
2018
2019         if (tb[RTA_DST]) {
2020                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2021
2022                 if (nla_len(tb[RTA_DST]) < plen)
2023                         goto errout;
2024
2025                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2026         }
2027
2028         if (tb[RTA_SRC]) {
2029                 int plen = (rtm->rtm_src_len + 7) >> 3;
2030
2031                 if (nla_len(tb[RTA_SRC]) < plen)
2032                         goto errout;
2033
2034                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2035         }
2036
2037         if (tb[RTA_OIF])
2038                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2039
2040         if (tb[RTA_PRIORITY])
2041                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2042
2043         if (tb[RTA_METRICS]) {
2044                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2045                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2046         }
2047
2048         if (tb[RTA_TABLE])
2049                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2050
2051         err = 0;
2052 errout:
2053         return err;
2054 }
2055
2056 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2057 {
2058         struct fib6_config cfg;
2059         int err;
2060
2061         err = rtm_to_fib6_config(skb, nlh, &cfg);
2062         if (err < 0)
2063                 return err;
2064
2065         return ip6_route_del(&cfg);
2066 }
2067
2068 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2069 {
2070         struct fib6_config cfg;
2071         int err;
2072
2073         err = rtm_to_fib6_config(skb, nlh, &cfg);
2074         if (err < 0)
2075                 return err;
2076
2077         return ip6_route_add(&cfg);
2078 }
2079
2080 static inline size_t rt6_nlmsg_size(void)
2081 {
2082         return NLMSG_ALIGN(sizeof(struct rtmsg))
2083                + nla_total_size(16) /* RTA_SRC */
2084                + nla_total_size(16) /* RTA_DST */
2085                + nla_total_size(16) /* RTA_GATEWAY */
2086                + nla_total_size(16) /* RTA_PREFSRC */
2087                + nla_total_size(4) /* RTA_TABLE */
2088                + nla_total_size(4) /* RTA_IIF */
2089                + nla_total_size(4) /* RTA_OIF */
2090                + nla_total_size(4) /* RTA_PRIORITY */
2091                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2092                + nla_total_size(sizeof(struct rta_cacheinfo));
2093 }
2094
2095 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2096                          struct in6_addr *dst, struct in6_addr *src,
2097                          int iif, int type, u32 pid, u32 seq,
2098                          int prefix, unsigned int flags)
2099 {
2100         struct rtmsg *rtm;
2101         struct nlmsghdr *nlh;
2102         long expires;
2103         u32 table;
2104
2105         if (prefix) {   /* user wants prefix routes only */
2106                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2107                         /* success since this is not a prefix route */
2108                         return 1;
2109                 }
2110         }
2111
2112         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2113         if (nlh == NULL)
2114                 return -EMSGSIZE;
2115
2116         rtm = nlmsg_data(nlh);
2117         rtm->rtm_family = AF_INET6;
2118         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2119         rtm->rtm_src_len = rt->rt6i_src.plen;
2120         rtm->rtm_tos = 0;
2121         if (rt->rt6i_table)
2122                 table = rt->rt6i_table->tb6_id;
2123         else
2124                 table = RT6_TABLE_UNSPEC;
2125         rtm->rtm_table = table;
2126         NLA_PUT_U32(skb, RTA_TABLE, table);
2127         if (rt->rt6i_flags&RTF_REJECT)
2128                 rtm->rtm_type = RTN_UNREACHABLE;
2129         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2130                 rtm->rtm_type = RTN_LOCAL;
2131         else
2132                 rtm->rtm_type = RTN_UNICAST;
2133         rtm->rtm_flags = 0;
2134         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2135         rtm->rtm_protocol = rt->rt6i_protocol;
2136         if (rt->rt6i_flags&RTF_DYNAMIC)
2137                 rtm->rtm_protocol = RTPROT_REDIRECT;
2138         else if (rt->rt6i_flags & RTF_ADDRCONF)
2139                 rtm->rtm_protocol = RTPROT_KERNEL;
2140         else if (rt->rt6i_flags&RTF_DEFAULT)
2141                 rtm->rtm_protocol = RTPROT_RA;
2142
2143         if (rt->rt6i_flags&RTF_CACHE)
2144                 rtm->rtm_flags |= RTM_F_CLONED;
2145
2146         if (dst) {
2147                 NLA_PUT(skb, RTA_DST, 16, dst);
2148                 rtm->rtm_dst_len = 128;
2149         } else if (rtm->rtm_dst_len)
2150                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2151 #ifdef CONFIG_IPV6_SUBTREES
2152         if (src) {
2153                 NLA_PUT(skb, RTA_SRC, 16, src);
2154                 rtm->rtm_src_len = 128;
2155         } else if (rtm->rtm_src_len)
2156                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2157 #endif
2158         if (iif)
2159                 NLA_PUT_U32(skb, RTA_IIF, iif);
2160         else if (dst) {
2161                 struct in6_addr saddr_buf;
2162                 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2163                                        dst, &saddr_buf) == 0)
2164                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2165         }
2166
2167         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2168                 goto nla_put_failure;
2169
2170         if (rt->u.dst.neighbour)
2171                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2172
2173         if (rt->u.dst.dev)
2174                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2175
2176         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2177
2178         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2179         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2180                                expires, rt->u.dst.error) < 0)
2181                 goto nla_put_failure;
2182
2183         return nlmsg_end(skb, nlh);
2184
2185 nla_put_failure:
2186         nlmsg_cancel(skb, nlh);
2187         return -EMSGSIZE;
2188 }
2189
2190 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2191 {
2192         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2193         int prefix;
2194
2195         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2196                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2197                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2198         } else
2199                 prefix = 0;
2200
2201         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2202                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2203                      prefix, NLM_F_MULTI);
2204 }
2205
2206 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2207 {
2208         struct net *net = in_skb->sk->sk_net;
2209         struct nlattr *tb[RTA_MAX+1];
2210         struct rt6_info *rt;
2211         struct sk_buff *skb;
2212         struct rtmsg *rtm;
2213         struct flowi fl;
2214         int err, iif = 0;
2215
2216         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2217         if (err < 0)
2218                 goto errout;
2219
2220         err = -EINVAL;
2221         memset(&fl, 0, sizeof(fl));
2222
2223         if (tb[RTA_SRC]) {
2224                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2225                         goto errout;
2226
2227                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2228         }
2229
2230         if (tb[RTA_DST]) {
2231                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2232                         goto errout;
2233
2234                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2235         }
2236
2237         if (tb[RTA_IIF])
2238                 iif = nla_get_u32(tb[RTA_IIF]);
2239
2240         if (tb[RTA_OIF])
2241                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2242
2243         if (iif) {
2244                 struct net_device *dev;
2245                 dev = __dev_get_by_index(net, iif);
2246                 if (!dev) {
2247                         err = -ENODEV;
2248                         goto errout;
2249                 }
2250         }
2251
2252         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2253         if (skb == NULL) {
2254                 err = -ENOBUFS;
2255                 goto errout;
2256         }
2257
2258         /* Reserve room for dummy headers, this skb can pass
2259            through good chunk of routing engine.
2260          */
2261         skb_reset_mac_header(skb);
2262         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2263
2264         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2265         skb->dst = &rt->u.dst;
2266
2267         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2268                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2269                             nlh->nlmsg_seq, 0, 0);
2270         if (err < 0) {
2271                 kfree_skb(skb);
2272                 goto errout;
2273         }
2274
2275         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2276 errout:
2277         return err;
2278 }
2279
2280 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2281 {
2282         struct sk_buff *skb;
2283         struct net *net = info->nl_net;
2284         u32 seq;
2285         int err;
2286
2287         err = -ENOBUFS;
2288         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2289
2290         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2291         if (skb == NULL)
2292                 goto errout;
2293
2294         err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2295                                 event, info->pid, seq, 0, 0);
2296         if (err < 0) {
2297                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2298                 WARN_ON(err == -EMSGSIZE);
2299                 kfree_skb(skb);
2300                 goto errout;
2301         }
2302         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2303                           info->nlh, gfp_any());
2304 errout:
2305         if (err < 0)
2306                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2307 }
2308
2309 static int ip6_route_dev_notify(struct notifier_block *this,
2310                                 unsigned long event, void *data)
2311 {
2312         struct net_device *dev = (struct net_device *)data;
2313         struct net *net = dev->nd_net;
2314
2315         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2316                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2317                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2318 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2319                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2320                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2321                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2322                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2323 #endif
2324         }
2325
2326         return NOTIFY_OK;
2327 }
2328
2329 /*
2330  *      /proc
2331  */
2332
2333 #ifdef CONFIG_PROC_FS
2334
/*
 * Length budget for one route line, written as a sum of field widths.
 * NOTE(review): no user of this macro is visible in this part of the
 * file - confirm before removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/*
 * Buffer bookkeeping for a /proc read.
 * NOTE(review): rt6_info_route() takes a seq_file instead, and no user of
 * this struct is visible in this part of the file - confirm it is still
 * needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2345
2346 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2347 {
2348         struct seq_file *m = p_arg;
2349
2350         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2351                    rt->rt6i_dst.plen);
2352
2353 #ifdef CONFIG_IPV6_SUBTREES
2354         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2355                    rt->rt6i_src.plen);
2356 #else
2357         seq_puts(m, "00000000000000000000000000000000 00 ");
2358 #endif
2359
2360         if (rt->rt6i_nexthop) {
2361                 seq_printf(m, NIP6_SEQFMT,
2362                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2363         } else {
2364                 seq_puts(m, "00000000000000000000000000000000");
2365         }
2366         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2367                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2368                    rt->u.dst.__use, rt->rt6i_flags,
2369                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2370         return 0;
2371 }
2372
2373 static int ipv6_route_show(struct seq_file *m, void *v)
2374 {
2375         struct net *net = (struct net *)m->private;
2376         fib6_clean_all(net, rt6_info_route, 0, m);
2377         return 0;
2378 }
2379
2380 static int ipv6_route_open(struct inode *inode, struct file *file)
2381 {
2382         struct net *net = get_proc_net(inode);
2383         if (!net)
2384                 return -ENXIO;
2385         return single_open(file, ipv6_route_show, net);
2386 }
2387
2388 static int ipv6_route_release(struct inode *inode, struct file *file)
2389 {
2390         struct seq_file *seq = file->private_data;
2391         struct net *net = seq->private;
2392         put_net(net);
2393         return single_release(inode, file);
2394 }
2395
2396 static const struct file_operations ipv6_route_proc_fops = {
2397         .owner          = THIS_MODULE,
2398         .open           = ipv6_route_open,
2399         .read           = seq_read,
2400         .llseek         = seq_lseek,
2401         .release        = ipv6_route_release,
2402 };
2403
2404 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2405 {
2406         struct net *net = (struct net *)seq->private;
2407         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2408                    net->ipv6.rt6_stats->fib_nodes,
2409                    net->ipv6.rt6_stats->fib_route_nodes,
2410                    net->ipv6.rt6_stats->fib_rt_alloc,
2411                    net->ipv6.rt6_stats->fib_rt_entries,
2412                    net->ipv6.rt6_stats->fib_rt_cache,
2413                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2414                    net->ipv6.rt6_stats->fib_discarded_routes);
2415
2416         return 0;
2417 }
2418
2419 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2420 {
2421         struct net *net = get_proc_net(inode);
2422         return single_open(file, rt6_stats_seq_show, net);
2423 }
2424
2425 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2426 {
2427         struct seq_file *seq = file->private_data;
2428         struct net *net = (struct net *)seq->private;
2429         put_net(net);
2430         return single_release(inode, file);
2431 }
2432
2433 static const struct file_operations rt6_stats_seq_fops = {
2434         .owner   = THIS_MODULE,
2435         .open    = rt6_stats_seq_open,
2436         .read    = seq_read,
2437         .llseek  = seq_lseek,
2438         .release = rt6_stats_seq_release,
2439 };
2440 #endif  /* CONFIG_PROC_FS */
2441
2442 #ifdef CONFIG_SYSCTL
2443
2444 static
2445 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2446                               void __user *buffer, size_t *lenp, loff_t *ppos)
2447 {
2448         struct net *net = current->nsproxy->net_ns;
2449         int delay = net->ipv6.sysctl.flush_delay;
2450         if (write) {
2451                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2452                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2453                 return 0;
2454         } else
2455                 return -EINVAL;
2456 }
2457
2458 ctl_table ipv6_route_table_template[] = {
2459         {
2460                 .procname       =       "flush",
2461                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2462                 .maxlen         =       sizeof(int),
2463                 .mode           =       0200,
2464                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2465         },
2466         {
2467                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2468                 .procname       =       "gc_thresh",
2469                 .data           =       &ip6_dst_ops_template.gc_thresh,
2470                 .maxlen         =       sizeof(int),
2471                 .mode           =       0644,
2472                 .proc_handler   =       &proc_dointvec,
2473         },
2474         {
2475                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2476                 .procname       =       "max_size",
2477                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2478                 .maxlen         =       sizeof(int),
2479                 .mode           =       0644,
2480                 .proc_handler   =       &proc_dointvec,
2481         },
2482         {
2483                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2484                 .procname       =       "gc_min_interval",
2485                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2486                 .maxlen         =       sizeof(int),
2487                 .mode           =       0644,
2488                 .proc_handler   =       &proc_dointvec_jiffies,
2489                 .strategy       =       &sysctl_jiffies,
2490         },
2491         {
2492                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2493                 .procname       =       "gc_timeout",
2494                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2495                 .maxlen         =       sizeof(int),
2496                 .mode           =       0644,
2497                 .proc_handler   =       &proc_dointvec_jiffies,
2498                 .strategy       =       &sysctl_jiffies,
2499         },
2500         {
2501                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2502                 .procname       =       "gc_interval",
2503                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2504                 .maxlen         =       sizeof(int),
2505                 .mode           =       0644,
2506                 .proc_handler   =       &proc_dointvec_jiffies,
2507                 .strategy       =       &sysctl_jiffies,
2508         },
2509         {
2510                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2511                 .procname       =       "gc_elasticity",
2512                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2513                 .maxlen         =       sizeof(int),
2514                 .mode           =       0644,
2515                 .proc_handler   =       &proc_dointvec_jiffies,
2516                 .strategy       =       &sysctl_jiffies,
2517         },
2518         {
2519                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2520                 .procname       =       "mtu_expires",
2521                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2522                 .maxlen         =       sizeof(int),
2523                 .mode           =       0644,
2524                 .proc_handler   =       &proc_dointvec_jiffies,
2525                 .strategy       =       &sysctl_jiffies,
2526         },
2527         {
2528                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2529                 .procname       =       "min_adv_mss",
2530                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2531                 .maxlen         =       sizeof(int),
2532                 .mode           =       0644,
2533                 .proc_handler   =       &proc_dointvec_jiffies,
2534                 .strategy       =       &sysctl_jiffies,
2535         },
2536         {
2537                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2538                 .procname       =       "gc_min_interval_ms",
2539                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2540                 .maxlen         =       sizeof(int),
2541                 .mode           =       0644,
2542                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2543                 .strategy       =       &sysctl_ms_jiffies,
2544         },
2545         { .ctl_name = 0 }
2546 };
2547
2548 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2549 {
2550         struct ctl_table *table;
2551
2552         table = kmemdup(ipv6_route_table_template,
2553                         sizeof(ipv6_route_table_template),
2554                         GFP_KERNEL);
2555
2556         if (table) {
2557                 table[0].data = &net->ipv6.sysctl.flush_delay;
2558                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2559                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2560                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2561                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2562                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2563                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2564                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2565                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2566         }
2567
2568         return table;
2569 }
2570 #endif
2571
2572 static int ip6_route_net_init(struct net *net)
2573 {
2574         int ret = 0;
2575
2576         ret = -ENOMEM;
2577         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2578                                         sizeof(*net->ipv6.ip6_dst_ops),
2579                                         GFP_KERNEL);
2580         if (!net->ipv6.ip6_dst_ops)
2581                 goto out;
2582         net->ipv6.ip6_dst_ops->dst_net = net;
2583
2584         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2585                                            sizeof(*net->ipv6.ip6_null_entry),
2586                                            GFP_KERNEL);
2587         if (!net->ipv6.ip6_null_entry)
2588                 goto out_ip6_dst_ops;
2589         net->ipv6.ip6_null_entry->u.dst.path =
2590                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2591         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2592
2593 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2594         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2595                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2596                                                GFP_KERNEL);
2597         if (!net->ipv6.ip6_prohibit_entry) {
2598                 kfree(net->ipv6.ip6_null_entry);
2599                 goto out;
2600         }
2601         net->ipv6.ip6_prohibit_entry->u.dst.path =
2602                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2603         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2604
2605         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2606                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2607                                                GFP_KERNEL);
2608         if (!net->ipv6.ip6_blk_hole_entry) {
2609                 kfree(net->ipv6.ip6_null_entry);
2610                 kfree(net->ipv6.ip6_prohibit_entry);
2611                 goto out;
2612         }
2613         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2614                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2615         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2616 #endif
2617
2618 #ifdef CONFIG_PROC_FS
2619         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2620         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2621 #endif
2622         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2623
2624         ret = 0;
2625 out:
2626         return ret;
2627
2628 out_ip6_dst_ops:
2629         kfree(net->ipv6.ip6_dst_ops);
2630         goto out;
2631 }
2632
2633 static void ip6_route_net_exit(struct net *net)
2634 {
2635 #ifdef CONFIG_PROC_FS
2636         proc_net_remove(net, "ipv6_route");
2637         proc_net_remove(net, "rt6_stats");
2638 #endif
2639         kfree(net->ipv6.ip6_null_entry);
2640 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2641         kfree(net->ipv6.ip6_prohibit_entry);
2642         kfree(net->ipv6.ip6_blk_hole_entry);
2643 #endif
2644         kfree(net->ipv6.ip6_dst_ops);
2645 }
2646
2647 static struct pernet_operations ip6_route_net_ops = {
2648         .init = ip6_route_net_init,
2649         .exit = ip6_route_net_exit,
2650 };
2651
2652 static struct notifier_block ip6_route_dev_notifier = {
2653         .notifier_call = ip6_route_dev_notify,
2654         .priority = 0,
2655 };
2656
2657 int __init ip6_route_init(void)
2658 {
2659         int ret;
2660
2661         ret = -ENOMEM;
2662         ip6_dst_ops_template.kmem_cachep =
2663                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2664                                   SLAB_HWCACHE_ALIGN, NULL);
2665         if (!ip6_dst_ops_template.kmem_cachep)
2666                 goto out;;
2667
2668         ret = register_pernet_subsys(&ip6_route_net_ops);
2669         if (ret)
2670                 goto out_kmem_cache;
2671
2672         /* Registering of the loopback is done before this portion of code,
2673          * the loopback reference in rt6_info will not be taken, do it
2674          * manually for init_net */
2675         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2676         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2677   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2678         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2679         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2680         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2681         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2682   #endif
2683         ret = fib6_init();
2684         if (ret)
2685                 goto out_register_subsys;
2686
2687         ret = xfrm6_init();
2688         if (ret)
2689                 goto out_fib6_init;
2690
2691         ret = fib6_rules_init();
2692         if (ret)
2693                 goto xfrm6_init;
2694
2695         ret = -ENOBUFS;
2696         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2697             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2698             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2699                 goto fib6_rules_init;
2700
2701         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2702         if (ret)
2703                 goto fib6_rules_init;
2704
2705 out:
2706         return ret;
2707
2708 fib6_rules_init:
2709         fib6_rules_cleanup();
2710 xfrm6_init:
2711         xfrm6_fini();
2712 out_fib6_init:
2713         fib6_gc_cleanup();
2714 out_register_subsys:
2715         unregister_pernet_subsys(&ip6_route_net_ops);
2716 out_kmem_cache:
2717         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2718         goto out;
2719 }
2720
2721 void ip6_route_cleanup(void)
2722 {
2723         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2724         fib6_rules_cleanup();
2725         xfrm6_fini();
2726         fib6_gc_cleanup();
2727         unregister_pernet_subsys(&ip6_route_net_ops);
2728         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2729 }