Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shaggy...
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/* Debug verbosity; tracing macros become active once this is 3 or more. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both macros compile away. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones routes that already have a
 * nexthop (via rt6_alloc_clone()) instead of returning them directly.
 */
#define CLONE_OFFLINK_ROUTE 0
75
76 static int ip6_rt_max_size = 4096;
77 static int ip6_rt_gc_min_interval = HZ / 2;
78 static int ip6_rt_gc_timeout = 60*HZ;
79 int ip6_rt_gc_interval = 30*HZ;
80 static int ip6_rt_gc_elasticity = 9;
81 static int ip6_rt_mtu_expires = 10*60*HZ;
82 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
83
84 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
85 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(void);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
99                                            struct in6_addr *gwaddr, int ifindex,
100                                            unsigned pref);
101 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex);
103 #endif
104
105 static struct dst_ops ip6_dst_ops = {
106         .family                 =       AF_INET6,
107         .protocol               =       __constant_htons(ETH_P_IPV6),
108         .gc                     =       ip6_dst_gc,
109         .gc_thresh              =       1024,
110         .check                  =       ip6_dst_check,
111         .destroy                =       ip6_dst_destroy,
112         .ifdown                 =       ip6_dst_ifdown,
113         .negative_advice        =       ip6_negative_advice,
114         .link_failure           =       ip6_link_failure,
115         .update_pmtu            =       ip6_rt_update_pmtu,
116         .entry_size             =       sizeof(struct rt6_info),
117 };
118
119 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
120 {
121 }
122
123 static struct dst_ops ip6_dst_blackhole_ops = {
124         .family                 =       AF_INET6,
125         .protocol               =       __constant_htons(ETH_P_IPV6),
126         .destroy                =       ip6_dst_destroy,
127         .check                  =       ip6_dst_check,
128         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
129         .entry_size             =       sizeof(struct rt6_info),
130 };
131
132 struct rt6_info ip6_null_entry = {
133         .u = {
134                 .dst = {
135                         .__refcnt       = ATOMIC_INIT(1),
136                         .__use          = 1,
137                         .obsolete       = -1,
138                         .error          = -ENETUNREACH,
139                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
140                         .input          = ip6_pkt_discard,
141                         .output         = ip6_pkt_discard_out,
142                         .ops            = &ip6_dst_ops,
143                         .path           = (struct dst_entry*)&ip6_null_entry,
144                 }
145         },
146         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
147         .rt6i_metric    = ~(u32) 0,
148         .rt6i_ref       = ATOMIC_INIT(1),
149 };
150
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Packet handlers used by the two static entries below. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
static int ip6_pkt_blk_hole(struct sk_buff *skb);

/* Static "prohibit" entry: dst error is -EACCES. */
struct rt6_info ip6_prohibit_entry = {
        .u = {
                .dst = {
                        .__refcnt = ATOMIC_INIT(1),
                        .__use    = 1,
                        .obsolete = -1,
                        .error    = -EACCES,
                        .metrics  = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input    = ip6_pkt_prohibit,
                        .output   = ip6_pkt_prohibit_out,
                        .ops      = &ip6_dst_ops,
                        .path     = (struct dst_entry *)&ip6_prohibit_entry,
                }
        },
        .rt6i_flags  = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric = ~(u32)0,
        .rt6i_ref    = ATOMIC_INIT(1),
};

/* Static "blackhole" entry: dst error is -EINVAL, same handler both ways. */
struct rt6_info ip6_blk_hole_entry = {
        .u = {
                .dst = {
                        .__refcnt = ATOMIC_INIT(1),
                        .__use    = 1,
                        .obsolete = -1,
                        .error    = -EINVAL,
                        .metrics  = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input    = ip6_pkt_blk_hole,
                        .output   = ip6_pkt_blk_hole,
                        .ops      = &ip6_dst_ops,
                        .path     = (struct dst_entry *)&ip6_blk_hole_entry,
                }
        },
        .rt6i_flags  = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric = ~(u32)0,
        .rt6i_ref    = ATOMIC_INIT(1),
};

#endif
196
197 /* allocate dst with ip6_dst_ops */
198 static __inline__ struct rt6_info *ip6_dst_alloc(void)
199 {
200         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
201 }
202
203 static void ip6_dst_destroy(struct dst_entry *dst)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207
208         if (idev != NULL) {
209                 rt->rt6i_idev = NULL;
210                 in6_dev_put(idev);
211         }
212 }
213
214 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
215                            int how)
216 {
217         struct rt6_info *rt = (struct rt6_info *)dst;
218         struct inet6_dev *idev = rt->rt6i_idev;
219
220         if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) {
221                 struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev);
222                 if (loopback_idev != NULL) {
223                         rt->rt6i_idev = loopback_idev;
224                         in6_dev_put(idev);
225                 }
226         }
227 }
228
229 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
230 {
231         return (rt->rt6i_flags & RTF_EXPIRES &&
232                 time_after(jiffies, rt->rt6i_expires));
233 }
234
235 static inline int rt6_need_strict(struct in6_addr *daddr)
236 {
237         return (ipv6_addr_type(daddr) &
238                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
239 }
240
241 /*
242  *      Route lookup. Any table->tb6_lock is implied.
243  */
244
245 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
246                                                     int oif,
247                                                     int strict)
248 {
249         struct rt6_info *local = NULL;
250         struct rt6_info *sprt;
251
252         if (oif) {
253                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
254                         struct net_device *dev = sprt->rt6i_dev;
255                         if (dev->ifindex == oif)
256                                 return sprt;
257                         if (dev->flags & IFF_LOOPBACK) {
258                                 if (sprt->rt6i_idev == NULL ||
259                                     sprt->rt6i_idev->dev->ifindex != oif) {
260                                         if (strict && oif)
261                                                 continue;
262                                         if (local && (!oif ||
263                                                       local->rt6i_idev->dev->ifindex == oif))
264                                                 continue;
265                                 }
266                                 local = sprt;
267                         }
268                 }
269
270                 if (local)
271                         return local;
272
273                 if (strict)
274                         return &ip6_null_entry;
275         }
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * route's nexthop when its neighbour entry is not in a NUD_VALID state
 * and the last update is older than the device's rtr_probe_interval.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        int probe_due = 0;

        /* Cheap unlocked pre-check; repeated under the lock below. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        read_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies,
                       neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                /* Stamp now so the next probe waits a full interval. */
                neigh->updated = jiffies;
                probe_due = 1;
        }
        read_unlock_bh(&neigh->lock);

        if (probe_due) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        }
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
314
315 /*
316  * Default Router Selection (RFC 2461 6.3.6)
317  */
318 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
319 {
320         struct net_device *dev = rt->rt6i_dev;
321         if (!oif || dev->ifindex == oif)
322                 return 2;
323         if ((dev->flags & IFF_LOOPBACK) &&
324             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
325                 return 1;
326         return 0;
327 }
328
329 static inline int rt6_check_neigh(struct rt6_info *rt)
330 {
331         struct neighbour *neigh = rt->rt6i_nexthop;
332         int m;
333         if (rt->rt6i_flags & RTF_NONEXTHOP ||
334             !(rt->rt6i_flags & RTF_GATEWAY))
335                 m = 1;
336         else if (neigh) {
337                 read_lock_bh(&neigh->lock);
338                 if (neigh->nud_state & NUD_VALID)
339                         m = 2;
340 #ifdef CONFIG_IPV6_ROUTER_PREF
341                 else if (neigh->nud_state & NUD_FAILED)
342                         m = 0;
343 #endif
344                 else
345                         m = 1;
346                 read_unlock_bh(&neigh->lock);
347         } else
348                 m = 0;
349         return m;
350 }
351
352 static int rt6_score_route(struct rt6_info *rt, int oif,
353                            int strict)
354 {
355         int m, n;
356
357         m = rt6_check_dev(rt, oif);
358         if (!m && (strict & RT6_LOOKUP_F_IFACE))
359                 return -1;
360 #ifdef CONFIG_IPV6_ROUTER_PREF
361         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
362 #endif
363         n = rt6_check_neigh(rt);
364         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
365                 return -1;
366         return m;
367 }
368
369 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
370                                    int *mpri, struct rt6_info *match)
371 {
372         int m;
373
374         if (rt6_check_expired(rt))
375                 goto out;
376
377         m = rt6_score_route(rt, oif, strict);
378         if (m < 0)
379                 goto out;
380
381         if (m > *mpri) {
382                 if (strict & RT6_LOOKUP_F_REACHABLE)
383                         rt6_probe(match);
384                 *mpri = m;
385                 match = rt;
386         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
387                 rt6_probe(rt);
388         }
389
390 out:
391         return match;
392 }
393
394 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
395                                      struct rt6_info *rr_head,
396                                      u32 metric, int oif, int strict)
397 {
398         struct rt6_info *rt, *match;
399         int mpri = -1;
400
401         match = NULL;
402         for (rt = rr_head; rt && rt->rt6i_metric == metric;
403              rt = rt->u.dst.rt6_next)
404                 match = find_match(rt, oif, strict, &mpri, match);
405         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
406              rt = rt->u.dst.rt6_next)
407                 match = find_match(rt, oif, strict, &mpri, match);
408
409         return match;
410 }
411
412 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
413 {
414         struct rt6_info *match, *rt0;
415
416         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417                   __FUNCTION__, fn->leaf, oif);
418
419         rt0 = fn->rr_ptr;
420         if (!rt0)
421                 fn->rr_ptr = rt0 = fn->leaf;
422
423         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424
425         if (!match &&
426             (strict & RT6_LOOKUP_F_REACHABLE)) {
427                 struct rt6_info *next = rt0->u.dst.rt6_next;
428
429                 /* no entries matched; do round-robin */
430                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
431                         next = fn->leaf;
432
433                 if (next != rt0)
434                         fn->rr_ptr = next;
435         }
436
437         RT6_TRACE("%s() => %p\n",
438                   __FUNCTION__, match);
439
440         return (match ? match : &ip6_null_entry);
441 }
442
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option received from a router.
 *
 * @dev:    device the option arrived on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * Depending on the advertised lifetime the matching route is deleted
 * (lifetime 0), created, or refreshed with the new preference and
 * expiry.  Returns 0 on success, or -EINVAL when the option is too
 * short, its length/prefix_len fields are inconsistent, or it carries
 * the reserved preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /*
         * RFC 4191 section 2.3: an option carrying the reserved
         * preference value MUST be ignored, not treated as "medium".
         */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                /* Full 128-bit prefix present: use it in place. */
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                /* Existing route: replace only the preference bits. */
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
520
/*
 * Shared by the policy lookup functions.  Expects locals `rt` and `fn`
 * and labels `out` and `restart` in the expanding function.  When `rt`
 * is the null entry, climb towards the tree root (descending into a
 * parent's source subtree via fib6_lookup() where one exists) until a
 * node flagged RTN_RTINFO is found, then jump to `restart`; jump to
 * `out` once a node flagged RTN_TL_ROOT is reached.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *pn; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
                                fn = pn; \
                        else \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
538
539 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
540                                              struct flowi *fl, int flags)
541 {
542         struct fib6_node *fn;
543         struct rt6_info *rt;
544
545         read_lock_bh(&table->tb6_lock);
546         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
547 restart:
548         rt = fn->leaf;
549         rt = rt6_device_match(rt, fl->oif, flags);
550         BACKTRACK(&fl->fl6_src);
551 out:
552         dst_use(&rt->u.dst, jiffies);
553         read_unlock_bh(&table->tb6_lock);
554         return rt;
555
556 }
557
558 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
559                             int oif, int strict)
560 {
561         struct flowi fl = {
562                 .oif = oif,
563                 .nl_u = {
564                         .ip6_u = {
565                                 .daddr = *daddr,
566                         },
567                 },
568         };
569         struct dst_entry *dst;
570         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571
572         if (saddr) {
573                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574                 flags |= RT6_LOOKUP_F_HAS_SADDR;
575         }
576
577         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
578         if (dst->error == 0)
579                 return (struct rt6_info *) dst;
580
581         dst_release(dst);
582
583         return NULL;
584 }
585
586 EXPORT_SYMBOL(rt6_lookup);
587
588 /* ip6_ins_rt is called with FREE table->tb6_lock.
589    It takes new route entry, the addition fails by any reason the
590    route is freed. In any case, if caller does not hold it, it may
591    be destroyed.
592  */
593
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596         int err;
597         struct fib6_table *table;
598
599         table = rt->rt6i_table;
600         write_lock_bh(&table->tb6_lock);
601         err = fib6_add(&table->tb6_root, rt, info);
602         write_unlock_bh(&table->tb6_lock);
603
604         return err;
605 }
606
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609         return __ip6_ins_rt(rt, NULL);
610 }
611
612 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
613                                       struct in6_addr *saddr)
614 {
615         struct rt6_info *rt;
616
617         /*
618          *      Clone the route.
619          */
620
621         rt = ip6_rt_copy(ort);
622
623         if (rt) {
624                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
625                         if (rt->rt6i_dst.plen != 128 &&
626                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
627                                 rt->rt6i_flags |= RTF_ANYCAST;
628                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
629                 }
630
631                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
632                 rt->rt6i_dst.plen = 128;
633                 rt->rt6i_flags |= RTF_CACHE;
634                 rt->u.dst.flags |= DST_HOST;
635
636 #ifdef CONFIG_IPV6_SUBTREES
637                 if (rt->rt6i_src.plen && saddr) {
638                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
639                         rt->rt6i_src.plen = 128;
640                 }
641 #endif
642
643                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
644
645         }
646
647         return rt;
648 }
649
650 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
651 {
652         struct rt6_info *rt = ip6_rt_copy(ort);
653         if (rt) {
654                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
655                 rt->rt6i_dst.plen = 128;
656                 rt->rt6i_flags |= RTF_CACHE;
657                 rt->u.dst.flags |= DST_HOST;
658                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
659         }
660         return rt;
661 }
662
663 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
664                                             struct flowi *fl, int flags)
665 {
666         struct fib6_node *fn;
667         struct rt6_info *rt, *nrt;
668         int strict = 0;
669         int attempts = 3;
670         int err;
671         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
672
673         strict |= flags & RT6_LOOKUP_F_IFACE;
674
675 relookup:
676         read_lock_bh(&table->tb6_lock);
677
678 restart_2:
679         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
680
681 restart:
682         rt = rt6_select(fn, oif, strict | reachable);
683         BACKTRACK(&fl->fl6_src);
684         if (rt == &ip6_null_entry ||
685             rt->rt6i_flags & RTF_CACHE)
686                 goto out;
687
688         dst_hold(&rt->u.dst);
689         read_unlock_bh(&table->tb6_lock);
690
691         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
692                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
693         else {
694 #if CLONE_OFFLINK_ROUTE
695                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
696 #else
697                 goto out2;
698 #endif
699         }
700
701         dst_release(&rt->u.dst);
702         rt = nrt ? : &ip6_null_entry;
703
704         dst_hold(&rt->u.dst);
705         if (nrt) {
706                 err = ip6_ins_rt(nrt);
707                 if (!err)
708                         goto out2;
709         }
710
711         if (--attempts <= 0)
712                 goto out2;
713
714         /*
715          * Race condition! In the gap, when table->tb6_lock was
716          * released someone could insert this route.  Relookup.
717          */
718         dst_release(&rt->u.dst);
719         goto relookup;
720
721 out:
722         if (reachable) {
723                 reachable = 0;
724                 goto restart_2;
725         }
726         dst_hold(&rt->u.dst);
727         read_unlock_bh(&table->tb6_lock);
728 out2:
729         rt->u.dst.lastuse = jiffies;
730         rt->u.dst.__use++;
731
732         return rt;
733 }
734
735 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
736                                             struct flowi *fl, int flags)
737 {
738         return ip6_pol_route(table, fl->iif, fl, flags);
739 }
740
741 void ip6_route_input(struct sk_buff *skb)
742 {
743         struct ipv6hdr *iph = ipv6_hdr(skb);
744         int flags = RT6_LOOKUP_F_HAS_SADDR;
745         struct flowi fl = {
746                 .iif = skb->dev->ifindex,
747                 .nl_u = {
748                         .ip6_u = {
749                                 .daddr = iph->daddr,
750                                 .saddr = iph->saddr,
751                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
752                         },
753                 },
754                 .mark = skb->mark,
755                 .proto = iph->nexthdr,
756         };
757
758         if (rt6_need_strict(&iph->daddr))
759                 flags |= RT6_LOOKUP_F_IFACE;
760
761         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
762 }
763
764 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
765                                              struct flowi *fl, int flags)
766 {
767         return ip6_pol_route(table, fl->oif, fl, flags);
768 }
769
770 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
771 {
772         int flags = 0;
773
774         if (rt6_need_strict(&fl->fl6_dst))
775                 flags |= RT6_LOOKUP_F_IFACE;
776
777         if (!ipv6_addr_any(&fl->fl6_src))
778                 flags |= RT6_LOOKUP_F_HAS_SADDR;
779
780         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
781 }
782
783 EXPORT_SYMBOL(ip6_route_output);
784
/* Input/output hook of blackhole dst entries: free the packet, report success. */
static int ip6_blackhole_output(struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}
790
791 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
792 {
793         struct rt6_info *ort = (struct rt6_info *) *dstp;
794         struct rt6_info *rt = (struct rt6_info *)
795                 dst_alloc(&ip6_dst_blackhole_ops);
796         struct dst_entry *new = NULL;
797
798         if (rt) {
799                 new = &rt->u.dst;
800
801                 atomic_set(&new->__refcnt, 1);
802                 new->__use = 1;
803                 new->input = ip6_blackhole_output;
804                 new->output = ip6_blackhole_output;
805
806                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
807                 new->dev = ort->u.dst.dev;
808                 if (new->dev)
809                         dev_hold(new->dev);
810                 rt->rt6i_idev = ort->rt6i_idev;
811                 if (rt->rt6i_idev)
812                         in6_dev_hold(rt->rt6i_idev);
813                 rt->rt6i_expires = 0;
814
815                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
816                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
817                 rt->rt6i_metric = 0;
818
819                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
820 #ifdef CONFIG_IPV6_SUBTREES
821                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
822 #endif
823
824                 dst_free(new);
825         }
826
827         dst_release(*dstp);
828         *dstp = new;
829         return (new ? 0 : -ENOMEM);
830 }
831 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
832
833 /*
834  *      Destination cache support functions
835  */
836
837 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
838 {
839         struct rt6_info *rt;
840
841         rt = (struct rt6_info *) dst;
842
843         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
844                 return dst;
845
846         return NULL;
847 }
848
849 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
850 {
851         struct rt6_info *rt = (struct rt6_info *) dst;
852
853         if (rt) {
854                 if (rt->rt6i_flags & RTF_CACHE)
855                         ip6_del_rt(rt);
856                 else
857                         dst_release(dst);
858         }
859         return NULL;
860 }
861
862 static void ip6_link_failure(struct sk_buff *skb)
863 {
864         struct rt6_info *rt;
865
866         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
867
868         rt = (struct rt6_info *) skb->dst;
869         if (rt) {
870                 if (rt->rt6i_flags&RTF_CACHE) {
871                         dst_set_expires(&rt->u.dst, 0);
872                         rt->rt6i_flags |= RTF_EXPIRES;
873                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
874                         rt->rt6i_node->fn_sernum = -1;
875         }
876 }
877
878 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
879 {
880         struct rt6_info *rt6 = (struct rt6_info*)dst;
881
882         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
883                 rt6->rt6i_flags |= RTF_MODIFIED;
884                 if (mtu < IPV6_MIN_MTU) {
885                         mtu = IPV6_MIN_MTU;
886                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
887                 }
888                 dst->metrics[RTAX_MTU-1] = mtu;
889                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
890         }
891 }
892
893 static int ipv6_get_mtu(struct net_device *dev);
894
895 static inline unsigned int ipv6_advmss(unsigned int mtu)
896 {
897         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
898
899         if (mtu < ip6_rt_min_advmss)
900                 mtu = ip6_rt_min_advmss;
901
902         /*
903          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
904          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
905          * IPV6_MAXPLEN is also valid and means: "any MSS,
906          * rely only on pmtu discovery"
907          */
908         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
909                 mtu = IPV6_MAXPLEN;
910         return mtu;
911 }
912
/* Head of the list (linked through dst->next) of entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() unlinks and frees unreferenced ones. */
913 static struct dst_entry *ndisc_dst_gc_list;
/* Held, with bottom halves disabled, by ndisc_dst_alloc() and
 * ndisc_dst_gc() while they touch ndisc_dst_gc_list. */
914 static DEFINE_SPINLOCK(ndisc_lock);
915
916 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
917                                   struct neighbour *neigh,
918                                   struct in6_addr *addr,
919                                   int (*output)(struct sk_buff *))
920 {
921         struct rt6_info *rt;
922         struct inet6_dev *idev = in6_dev_get(dev);
923
924         if (unlikely(idev == NULL))
925                 return NULL;
926
927         rt = ip6_dst_alloc();
928         if (unlikely(rt == NULL)) {
929                 in6_dev_put(idev);
930                 goto out;
931         }
932
933         dev_hold(dev);
934         if (neigh)
935                 neigh_hold(neigh);
936         else
937                 neigh = ndisc_get_neigh(dev, addr);
938
939         rt->rt6i_dev      = dev;
940         rt->rt6i_idev     = idev;
941         rt->rt6i_nexthop  = neigh;
942         atomic_set(&rt->u.dst.__refcnt, 1);
943         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
944         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
945         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
946         rt->u.dst.output  = output;
947
948 #if 0   /* there's no chance to use these for ndisc */
949         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
950                                 ? DST_HOST
951                                 : 0;
952         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
953         rt->rt6i_dst.plen = 128;
954 #endif
955
956         spin_lock_bh(&ndisc_lock);
957         rt->u.dst.next = ndisc_dst_gc_list;
958         ndisc_dst_gc_list = &rt->u.dst;
959         spin_unlock_bh(&ndisc_lock);
960
961         fib6_force_start_gc();
962
963 out:
964         return &rt->u.dst;
965 }
966
967 int ndisc_dst_gc(int *more)
968 {
969         struct dst_entry *dst, *next, **pprev;
970         int freed;
971
972         next = NULL;
973         freed = 0;
974
975         spin_lock_bh(&ndisc_lock);
976         pprev = &ndisc_dst_gc_list;
977
978         while ((dst = *pprev) != NULL) {
979                 if (!atomic_read(&dst->__refcnt)) {
980                         *pprev = dst->next;
981                         dst_free(dst);
982                         freed++;
983                 } else {
984                         pprev = &dst->next;
985                         (*more)++;
986                 }
987         }
988
989         spin_unlock_bh(&ndisc_lock);
990
991         return freed;
992 }
993
994 static int ip6_dst_gc(void)
995 {
996         static unsigned expire = 30*HZ;
997         static unsigned long last_gc;
998         unsigned long now = jiffies;
999
1000         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
1001             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
1002                 goto out;
1003
1004         expire++;
1005         fib6_run_gc(expire);
1006         last_gc = now;
1007         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1008                 expire = ip6_rt_gc_timeout>>1;
1009
1010 out:
1011         expire -= expire>>ip6_rt_gc_elasticity;
1012         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1013 }
1014
1015 /* Clean host part of a prefix. Not necessary in radix tree,
1016    but results in cleaner routing tables.
1017
1018    Remove it only when all the things will work!
1019  */
1020
1021 static int ipv6_get_mtu(struct net_device *dev)
1022 {
1023         int mtu = IPV6_MIN_MTU;
1024         struct inet6_dev *idev;
1025
1026         idev = in6_dev_get(dev);
1027         if (idev) {
1028                 mtu = idev->cnf.mtu6;
1029                 in6_dev_put(idev);
1030         }
1031         return mtu;
1032 }
1033
1034 int ipv6_get_hoplimit(struct net_device *dev)
1035 {
1036         int hoplimit = ipv6_devconf.hop_limit;
1037         struct inet6_dev *idev;
1038
1039         idev = in6_dev_get(dev);
1040         if (idev) {
1041                 hoplimit = idev->cnf.hop_limit;
1042                 in6_dev_put(idev);
1043         }
1044         return hoplimit;
1045 }
1046
1047 /*
1048  *
1049  */
1050
/*
 * ip6_route_add - build a route from @cfg and insert it into its table.
 *
 * @cfg is modified in place: a zero fc_metric becomes IP6_RT_PRIO_USER
 * and an RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT.
 *
 * Returns the result of __ip6_ins_rt() on the success path, or a
 * negative errno set along the way:
 *   -EINVAL       prefix length above 128, source prefix given without
 *                 CONFIG_IPV6_SUBTREES, non-unicast gateway, gateway
 *                 route without a usable (non-loopback) device, or a
 *                 metric attribute type above RTAX_MAX
 *   -ENODEV       device or its inet6_dev not found / no device at all
 *   -ENOBUFS      fib6_new_table() failed
 *   -ENOMEM       ip6_dst_alloc() failed
 *   -EHOSTUNREACH gateway is not reachable via a non-gateway route on
 *                 the requested device
 *
 * Every "goto out" drops the dev/idev references held at that point and
 * frees the partially built route.
 */
1051 int ip6_route_add(struct fib6_config *cfg)
1052 {
1053         int err;
1054         struct rt6_info *rt = NULL;
1055         struct net_device *dev = NULL;
1056         struct inet6_dev *idev = NULL;
1057         struct fib6_table *table;
1058         int addr_type;
1059
1060         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1061                 return -EINVAL;
1062 #ifndef CONFIG_IPV6_SUBTREES
1063         if (cfg->fc_src_len)
1064                 return -EINVAL;
1065 #endif
        /* An explicit interface pins both dev and idev (references taken). */
1066         if (cfg->fc_ifindex) {
1067                 err = -ENODEV;
1068                 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1069                 if (!dev)
1070                         goto out;
1071                 idev = in6_dev_get(dev);
1072                 if (!idev)
1073                         goto out;
1074         }
1075
1076         if (cfg->fc_metric == 0)
1077                 cfg->fc_metric = IP6_RT_PRIO_USER;
1078
1079         table = fib6_new_table(cfg->fc_table);
1080         if (table == NULL) {
1081                 err = -ENOBUFS;
1082                 goto out;
1083         }
1084
1085         rt = ip6_dst_alloc();
1086
1087         if (rt == NULL) {
1088                 err = -ENOMEM;
1089                 goto out;
1090         }
1091
1092         rt->u.dst.obsolete = -1;
1093         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1094
1095         if (cfg->fc_protocol == RTPROT_UNSPEC)
1096                 cfg->fc_protocol = RTPROT_BOOT;
1097         rt->rt6i_protocol = cfg->fc_protocol;
1098
1099         addr_type = ipv6_addr_type(&cfg->fc_dst);
1100
        /* Default handlers; the reject path below replaces both of them. */
1101         if (addr_type & IPV6_ADDR_MULTICAST)
1102                 rt->u.dst.input = ip6_mc_input;
1103         else
1104                 rt->u.dst.input = ip6_forward;
1105
1106         rt->u.dst.output = ip6_output;
1107
1108         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1109         rt->rt6i_dst.plen = cfg->fc_dst_len;
1110         if (rt->rt6i_dst.plen == 128)
1111                rt->u.dst.flags = DST_HOST;
1112
1113 #ifdef CONFIG_IPV6_SUBTREES
1114         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1115         rt->rt6i_src.plen = cfg->fc_src_len;
1116 #endif
1117
1118         rt->rt6i_metric = cfg->fc_metric;
1119
1120         /* We cannot add true routes via loopback here,
1121            they would result in kernel looping; promote them to reject routes
1122          */
1123         if ((cfg->fc_flags & RTF_REJECT) ||
1124             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1125                 /* hold loopback dev/idev if we haven't done so. */
1126                 if (dev != init_net.loopback_dev) {
                        /* Swap the references over to the loopback device. */
1127                         if (dev) {
1128                                 dev_put(dev);
1129                                 in6_dev_put(idev);
1130                         }
1131                         dev = init_net.loopback_dev;
1132                         dev_hold(dev);
1133                         idev = in6_dev_get(dev);
1134                         if (!idev) {
1135                                 err = -ENODEV;
1136                                 goto out;
1137                         }
1138                 }
1139                 rt->u.dst.output = ip6_pkt_discard_out;
1140                 rt->u.dst.input = ip6_pkt_discard;
1141                 rt->u.dst.error = -ENETUNREACH;
                /* The caller's fc_flags are not copied for reject routes. */
1142                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1143                 goto install_route;
1144         }
1145
1146         if (cfg->fc_flags & RTF_GATEWAY) {
1147                 struct in6_addr *gw_addr;
1148                 int gwa_type;
1149
1150                 gw_addr = &cfg->fc_gateway;
1151                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1152                 gwa_type = ipv6_addr_type(gw_addr);
1153
1154                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1155                         struct rt6_info *grt;
1156
1157                         /* IPv6 strictly inhibits using not link-local
1158                            addresses as nexthop address.
1159                            Otherwise, router will not able to send redirects.
1160                            It is very good, but in some (rare!) circumstances
1161                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1162                            some exceptions. --ANK
1163                          */
1164                         err = -EINVAL;
1165                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1166                                 goto out;
1167
1168                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1169
1170                         err = -EHOSTUNREACH;
1171                         if (grt == NULL)
1172                                 goto out;
1173                         if (dev) {
1174                                 if (dev != grt->rt6i_dev) {
1175                                         dst_release(&grt->u.dst);
1176                                         goto out;
1177                                 }
1178                         } else {
                                /* No interface requested: adopt the one the
                                   gateway route uses, taking our own refs. */
1179                                 dev = grt->rt6i_dev;
1180                                 idev = grt->rt6i_idev;
1181                                 dev_hold(dev);
1182                                 in6_dev_hold(grt->rt6i_idev);
1183                         }
                        /* err stays -EHOSTUNREACH if the gateway is itself
                           only reachable through another gateway. */
1184                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1185                                 err = 0;
1186                         dst_release(&grt->u.dst);
1187
1188                         if (err)
1189                                 goto out;
1190                 }
1191                 err = -EINVAL;
1192                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1193                         goto out;
1194         }
1195
1196         err = -ENODEV;
1197         if (dev == NULL)
1198                 goto out;
1199
1200         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1201                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1202                 if (IS_ERR(rt->rt6i_nexthop)) {
1203                         err = PTR_ERR(rt->rt6i_nexthop);
                        /* Clear the ERR_PTR value before the route is freed. */
1204                         rt->rt6i_nexthop = NULL;
1205                         goto out;
1206                 }
1207         }
1208
1209         rt->rt6i_flags = cfg->fc_flags;
1210
1211 install_route:
        /* Copy caller-supplied metrics; attribute type N fills metrics[N-1]
           and type 0 is skipped. */
1212         if (cfg->fc_mx) {
1213                 struct nlattr *nla;
1214                 int remaining;
1215
1216                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1217                         int type = nla_type(nla);
1218
1219                         if (type) {
1220                                 if (type > RTAX_MAX) {
1221                                         err = -EINVAL;
1222                                         goto out;
1223                                 }
1224
1225                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1226                         }
1227                 }
1228         }
1229
        /* Fill in defaults for the metrics the caller left at zero. */
1230         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1231                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1232         if (!rt->u.dst.metrics[RTAX_MTU-1])
1233                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1234         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1235                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
        /* The dev/idev references held so far are handed to the route here. */
1236         rt->u.dst.dev = dev;
1237         rt->rt6i_idev = idev;
1238         rt->rt6i_table = table;
1239         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1240
1241 out:
1242         if (dev)
1243                 dev_put(dev);
1244         if (idev)
1245                 in6_dev_put(idev);
1246         if (rt)
1247                 dst_free(&rt->u.dst);
1248         return err;
1249 }
1250
1251 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1252 {
1253         int err;
1254         struct fib6_table *table;
1255
1256         if (rt == &ip6_null_entry)
1257                 return -ENOENT;
1258
1259         table = rt->rt6i_table;
1260         write_lock_bh(&table->tb6_lock);
1261
1262         err = fib6_del(rt, info);
1263         dst_release(&rt->u.dst);
1264
1265         write_unlock_bh(&table->tb6_lock);
1266
1267         return err;
1268 }
1269
1270 int ip6_del_rt(struct rt6_info *rt)
1271 {
1272         return __ip6_del_rt(rt, NULL);
1273 }
1274
1275 static int ip6_route_del(struct fib6_config *cfg)
1276 {
1277         struct fib6_table *table;
1278         struct fib6_node *fn;
1279         struct rt6_info *rt;
1280         int err = -ESRCH;
1281
1282         table = fib6_get_table(cfg->fc_table);
1283         if (table == NULL)
1284                 return err;
1285
1286         read_lock_bh(&table->tb6_lock);
1287
1288         fn = fib6_locate(&table->tb6_root,
1289                          &cfg->fc_dst, cfg->fc_dst_len,
1290                          &cfg->fc_src, cfg->fc_src_len);
1291
1292         if (fn) {
1293                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1294                         if (cfg->fc_ifindex &&
1295                             (rt->rt6i_dev == NULL ||
1296                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1297                                 continue;
1298                         if (cfg->fc_flags & RTF_GATEWAY &&
1299                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1300                                 continue;
1301                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1302                                 continue;
1303                         dst_hold(&rt->u.dst);
1304                         read_unlock_bh(&table->tb6_lock);
1305
1306                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1307                 }
1308         }
1309         read_unlock_bh(&table->tb6_lock);
1310
1311         return err;
1312 }
1313
1314 /*
1315  *      Handle redirects
1316  */
/*
 * Flow key used for redirect lookups: a regular flowi followed by the
 * gateway address that candidate routes are compared against.
 * "fl" has to remain the first member: ip6_route_redirect() passes a
 * pointer to this struct as a struct flowi pointer and
 * __ip6_route_redirect() casts it back.
 */
1317 struct ip6rd_flowi {
1318         struct flowi fl;
1319         struct in6_addr gateway;
1320 };
1321
1322 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1323                                              struct flowi *fl,
1324                                              int flags)
1325 {
1326         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1327         struct rt6_info *rt;
1328         struct fib6_node *fn;
1329
1330         /*
1331          * Get the "current" route for this destination and
1332          * check if the redirect has come from approriate router.
1333          *
1334          * RFC 2461 specifies that redirects should only be
1335          * accepted if they come from the nexthop to the target.
1336          * Due to the way the routes are chosen, this notion
1337          * is a bit fuzzy and one might need to check all possible
1338          * routes.
1339          */
1340
1341         read_lock_bh(&table->tb6_lock);
1342         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1343 restart:
1344         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1345                 /*
1346                  * Current route is on-link; redirect is always invalid.
1347                  *
1348                  * Seems, previous statement is not true. It could
1349                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1350                  * But then router serving it might decide, that we should
1351                  * know truth 8)8) --ANK (980726).
1352                  */
1353                 if (rt6_check_expired(rt))
1354                         continue;
1355                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1356                         continue;
1357                 if (fl->oif != rt->rt6i_dev->ifindex)
1358                         continue;
1359                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1360                         continue;
1361                 break;
1362         }
1363
1364         if (!rt)
1365                 rt = &ip6_null_entry;
1366         BACKTRACK(&fl->fl6_src);
1367 out:
1368         dst_hold(&rt->u.dst);
1369
1370         read_unlock_bh(&table->tb6_lock);
1371
1372         return rt;
1373 };
1374
1375 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1376                                            struct in6_addr *src,
1377                                            struct in6_addr *gateway,
1378                                            struct net_device *dev)
1379 {
1380         int flags = RT6_LOOKUP_F_HAS_SADDR;
1381         struct ip6rd_flowi rdfl = {
1382                 .fl = {
1383                         .oif = dev->ifindex,
1384                         .nl_u = {
1385                                 .ip6_u = {
1386                                         .daddr = *dest,
1387                                         .saddr = *src,
1388                                 },
1389                         },
1390                 },
1391                 .gateway = *gateway,
1392         };
1393
1394         if (rt6_need_strict(dest))
1395                 flags |= RT6_LOOKUP_F_IFACE;
1396
1397         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1398 }
1399
1400 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1401                   struct in6_addr *saddr,
1402                   struct neighbour *neigh, u8 *lladdr, int on_link)
1403 {
1404         struct rt6_info *rt, *nrt = NULL;
1405         struct netevent_redirect netevent;
1406
1407         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1408
1409         if (rt == &ip6_null_entry) {
1410                 if (net_ratelimit())
1411                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1412                                "for redirect target\n");
1413                 goto out;
1414         }
1415
1416         /*
1417          *      We have finally decided to accept it.
1418          */
1419
1420         neigh_update(neigh, lladdr, NUD_STALE,
1421                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1422                      NEIGH_UPDATE_F_OVERRIDE|
1423                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1424                                      NEIGH_UPDATE_F_ISROUTER))
1425                      );
1426
1427         /*
1428          * Redirect received -> path was valid.
1429          * Look, redirects are sent only in response to data packets,
1430          * so that this nexthop apparently is reachable. --ANK
1431          */
1432         dst_confirm(&rt->u.dst);
1433
1434         /* Duplicate redirect: silently ignore. */
1435         if (neigh == rt->u.dst.neighbour)
1436                 goto out;
1437
1438         nrt = ip6_rt_copy(rt);
1439         if (nrt == NULL)
1440                 goto out;
1441
1442         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1443         if (on_link)
1444                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1445
1446         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1447         nrt->rt6i_dst.plen = 128;
1448         nrt->u.dst.flags |= DST_HOST;
1449
1450         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1451         nrt->rt6i_nexthop = neigh_clone(neigh);
1452         /* Reset pmtu, it may be better */
1453         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1454         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1455
1456         if (ip6_ins_rt(nrt))
1457                 goto out;
1458
1459         netevent.old = &rt->u.dst;
1460         netevent.new = &nrt->u.dst;
1461         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1462
1463         if (rt->rt6i_flags&RTF_CACHE) {
1464                 ip6_del_rt(rt);
1465                 return;
1466         }
1467
1468 out:
1469         dst_release(&rt->u.dst);
1470         return;
1471 }
1472
1473 /*
1474  *      Handle ICMP "packet too big" messages
1475  *      i.e. Path MTU discovery
1476  */
1477
1478 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1479                         struct net_device *dev, u32 pmtu)
1480 {
1481         struct rt6_info *rt, *nrt;
1482         int allfrag = 0;
1483
1484         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1485         if (rt == NULL)
1486                 return;
1487
1488         if (pmtu >= dst_mtu(&rt->u.dst))
1489                 goto out;
1490
1491         if (pmtu < IPV6_MIN_MTU) {
1492                 /*
1493                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1494                  * MTU (1280) and a fragment header should always be included
1495                  * after a node receiving Too Big message reporting PMTU is
1496                  * less than the IPv6 Minimum Link MTU.
1497                  */
1498                 pmtu = IPV6_MIN_MTU;
1499                 allfrag = 1;
1500         }
1501
1502         /* New mtu received -> path was valid.
1503            They are sent only in response to data packets,
1504            so that this nexthop apparently is reachable. --ANK
1505          */
1506         dst_confirm(&rt->u.dst);
1507
1508         /* Host route. If it is static, it would be better
1509            not to override it, but add new one, so that
1510            when cache entry will expire old pmtu
1511            would return automatically.
1512          */
1513         if (rt->rt6i_flags & RTF_CACHE) {
1514                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1515                 if (allfrag)
1516                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1517                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1518                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1519                 goto out;
1520         }
1521
1522         /* Network route.
1523            Two cases are possible:
1524            1. It is connected route. Action: COW
1525            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1526          */
1527         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1528                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1529         else
1530                 nrt = rt6_alloc_clone(rt, daddr);
1531
1532         if (nrt) {
1533                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1534                 if (allfrag)
1535                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1536
1537                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1538                  * happened within 5 mins, the recommended timer is 10 mins.
1539                  * Here this route expiration time is set to ip6_rt_mtu_expires
1540                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1541                  * and detecting PMTU increase will be automatically happened.
1542                  */
1543                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1544                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1545
1546                 ip6_ins_rt(nrt);
1547         }
1548 out:
1549         dst_release(&rt->u.dst);
1550 }
1551
1552 /*
1553  *      Misc support functions
1554  */
1555
1556 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1557 {
1558         struct rt6_info *rt = ip6_dst_alloc();
1559
1560         if (rt) {
1561                 rt->u.dst.input = ort->u.dst.input;
1562                 rt->u.dst.output = ort->u.dst.output;
1563
1564                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1565                 rt->u.dst.error = ort->u.dst.error;
1566                 rt->u.dst.dev = ort->u.dst.dev;
1567                 if (rt->u.dst.dev)
1568                         dev_hold(rt->u.dst.dev);
1569                 rt->rt6i_idev = ort->rt6i_idev;
1570                 if (rt->rt6i_idev)
1571                         in6_dev_hold(rt->rt6i_idev);
1572                 rt->u.dst.lastuse = jiffies;
1573                 rt->rt6i_expires = 0;
1574
1575                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1576                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1577                 rt->rt6i_metric = 0;
1578
1579                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1580 #ifdef CONFIG_IPV6_SUBTREES
1581                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1582 #endif
1583                 rt->rt6i_table = ort->rt6i_table;
1584         }
1585         return rt;
1586 }
1587
1588 #ifdef CONFIG_IPV6_ROUTE_INFO
1589 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1590                                            struct in6_addr *gwaddr, int ifindex)
1591 {
1592         struct fib6_node *fn;
1593         struct rt6_info *rt = NULL;
1594         struct fib6_table *table;
1595
1596         table = fib6_get_table(RT6_TABLE_INFO);
1597         if (table == NULL)
1598                 return NULL;
1599
1600         write_lock_bh(&table->tb6_lock);
1601         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1602         if (!fn)
1603                 goto out;
1604
1605         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1606                 if (rt->rt6i_dev->ifindex != ifindex)
1607                         continue;
1608                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1609                         continue;
1610                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1611                         continue;
1612                 dst_hold(&rt->u.dst);
1613                 break;
1614         }
1615 out:
1616         write_unlock_bh(&table->tb6_lock);
1617         return rt;
1618 }
1619
1620 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1621                                            struct in6_addr *gwaddr, int ifindex,
1622                                            unsigned pref)
1623 {
1624         struct fib6_config cfg = {
1625                 .fc_table       = RT6_TABLE_INFO,
1626                 .fc_metric      = 1024,
1627                 .fc_ifindex     = ifindex,
1628                 .fc_dst_len     = prefixlen,
1629                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1630                                   RTF_UP | RTF_PREF(pref),
1631         };
1632
1633         ipv6_addr_copy(&cfg.fc_dst, prefix);
1634         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1635
1636         /* We should treat it as a default route if prefix length is 0. */
1637         if (!prefixlen)
1638                 cfg.fc_flags |= RTF_DEFAULT;
1639
1640         ip6_route_add(&cfg);
1641
1642         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1643 }
1644 #endif
1645
1646 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1647 {
1648         struct rt6_info *rt;
1649         struct fib6_table *table;
1650
1651         table = fib6_get_table(RT6_TABLE_DFLT);
1652         if (table == NULL)
1653                 return NULL;
1654
1655         write_lock_bh(&table->tb6_lock);
1656         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1657                 if (dev == rt->rt6i_dev &&
1658                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1659                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1660                         break;
1661         }
1662         if (rt)
1663                 dst_hold(&rt->u.dst);
1664         write_unlock_bh(&table->tb6_lock);
1665         return rt;
1666 }
1667
1668 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1669                                      struct net_device *dev,
1670                                      unsigned int pref)
1671 {
1672         struct fib6_config cfg = {
1673                 .fc_table       = RT6_TABLE_DFLT,
1674                 .fc_metric      = 1024,
1675                 .fc_ifindex     = dev->ifindex,
1676                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1677                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1678         };
1679
1680         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1681
1682         ip6_route_add(&cfg);
1683
1684         return rt6_get_dflt_router(gwaddr, dev);
1685 }
1686
1687 void rt6_purge_dflt_routers(void)
1688 {
1689         struct rt6_info *rt;
1690         struct fib6_table *table;
1691
1692         /* NOTE: Keep consistent with rt6_get_dflt_router */
1693         table = fib6_get_table(RT6_TABLE_DFLT);
1694         if (table == NULL)
1695                 return;
1696
1697 restart:
1698         read_lock_bh(&table->tb6_lock);
1699         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1700                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1701                         dst_hold(&rt->u.dst);
1702                         read_unlock_bh(&table->tb6_lock);
1703                         ip6_del_rt(rt);
1704                         goto restart;
1705                 }
1706         }
1707         read_unlock_bh(&table->tb6_lock);
1708 }
1709
1710 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1711                                  struct fib6_config *cfg)
1712 {
1713         memset(cfg, 0, sizeof(*cfg));
1714
1715         cfg->fc_table = RT6_TABLE_MAIN;
1716         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1717         cfg->fc_metric = rtmsg->rtmsg_metric;
1718         cfg->fc_expires = rtmsg->rtmsg_info;
1719         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1720         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1721         cfg->fc_flags = rtmsg->rtmsg_flags;
1722
1723         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1724         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1725         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1726 }
1727
1728 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1729 {
1730         struct fib6_config cfg;
1731         struct in6_rtmsg rtmsg;
1732         int err;
1733
1734         switch(cmd) {
1735         case SIOCADDRT:         /* Add a route */
1736         case SIOCDELRT:         /* Delete a route */
1737                 if (!capable(CAP_NET_ADMIN))
1738                         return -EPERM;
1739                 err = copy_from_user(&rtmsg, arg,
1740                                      sizeof(struct in6_rtmsg));
1741                 if (err)
1742                         return -EFAULT;
1743
1744                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1745
1746                 rtnl_lock();
1747                 switch (cmd) {
1748                 case SIOCADDRT:
1749                         err = ip6_route_add(&cfg);
1750                         break;
1751                 case SIOCDELRT:
1752                         err = ip6_route_del(&cfg);
1753                         break;
1754                 default:
1755                         err = -EINVAL;
1756                 }
1757                 rtnl_unlock();
1758
1759                 return err;
1760         }
1761
1762         return -EINVAL;
1763 }
1764
1765 /*
1766  *      Drop the packet on the floor
1767  */
1768
1769 static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1770                                int ipstats_mib_noroutes)
1771 {
1772         int type;
1773         switch (ipstats_mib_noroutes) {
1774         case IPSTATS_MIB_INNOROUTES:
1775                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1776                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1777                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1778                         break;
1779                 }
1780                 /* FALLTHROUGH */
1781         case IPSTATS_MIB_OUTNOROUTES:
1782                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1783                 break;
1784         }
1785         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1786         kfree_skb(skb);
1787         return 0;
1788 }
1789
1790 static int ip6_pkt_discard(struct sk_buff *skb)
1791 {
1792         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1793 }
1794
1795 static int ip6_pkt_discard_out(struct sk_buff *skb)
1796 {
1797         skb->dev = skb->dst->dev;
1798         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1799 }
1800
1801 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1802
1803 static int ip6_pkt_prohibit(struct sk_buff *skb)
1804 {
1805         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1806 }
1807
1808 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1809 {
1810         skb->dev = skb->dst->dev;
1811         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1812 }
1813
/*
 * Free the packet without sending any ICMPv6 error or touching the
 * statistics.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
1819
1820 #endif
1821
1822 /*
1823  *      Allocate a dst for local (unicast / anycast) address.
1824  */
1825
1826 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1827                                     const struct in6_addr *addr,
1828                                     int anycast)
1829 {
1830         struct rt6_info *rt = ip6_dst_alloc();
1831
1832         if (rt == NULL)
1833                 return ERR_PTR(-ENOMEM);
1834
1835         dev_hold(init_net.loopback_dev);
1836         in6_dev_hold(idev);
1837
1838         rt->u.dst.flags = DST_HOST;
1839         rt->u.dst.input = ip6_input;
1840         rt->u.dst.output = ip6_output;
1841         rt->rt6i_dev = init_net.loopback_dev;
1842         rt->rt6i_idev = idev;
1843         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1844         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1845         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1846         rt->u.dst.obsolete = -1;
1847
1848         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1849         if (anycast)
1850                 rt->rt6i_flags |= RTF_ANYCAST;
1851         else
1852                 rt->rt6i_flags |= RTF_LOCAL;
1853         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1854         if (rt->rt6i_nexthop == NULL) {
1855                 dst_free(&rt->u.dst);
1856                 return ERR_PTR(-ENOMEM);
1857         }
1858
1859         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1860         rt->rt6i_dst.plen = 128;
1861         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1862
1863         atomic_set(&rt->u.dst.__refcnt, 1);
1864
1865         return rt;
1866 }
1867
1868 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1869 {
1870         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1871             rt != &ip6_null_entry) {
1872                 RT6_TRACE("deleted by ifdown %p\n", rt);
1873                 return -1;
1874         }
1875         return 0;
1876 }
1877
1878 void rt6_ifdown(struct net_device *dev)
1879 {
1880         fib6_clean_all(fib6_ifdown, 0, dev);
1881 }
1882
/* Argument handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
1888
1889 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1890 {
1891         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1892         struct inet6_dev *idev;
1893
1894         /* In IPv6 pmtu discovery is not optional,
1895            so that RTAX_MTU lock cannot disable it.
1896            We still use this lock to block changes
1897            caused by addrconf/ndisc.
1898         */
1899
1900         idev = __in6_dev_get(arg->dev);
1901         if (idev == NULL)
1902                 return 0;
1903
1904         /* For administrative MTU increase, there is no way to discover
1905            IPv6 PMTU increase, so PMTU increase should be updated here.
1906            Since RFC 1981 doesn't include administrative MTU increase
1907            update PMTU increase is a MUST. (i.e. jumbo frame)
1908          */
1909         /*
1910            If new MTU is less than route PMTU, this new MTU will be the
1911            lowest MTU in the path, update the route PMTU to reflect PMTU
1912            decreases; if new MTU is greater than route PMTU, and the
1913            old MTU is the lowest MTU in the path, update the route PMTU
1914            to reflect the increase. In this case if the other nodes' MTU
1915            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1916            PMTU discouvery.
1917          */
1918         if (rt->rt6i_dev == arg->dev &&
1919             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1920             (dst_mtu(&rt->u.dst) > arg->mtu ||
1921              (dst_mtu(&rt->u.dst) < arg->mtu &&
1922               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1923                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1924                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1925         }
1926         return 0;
1927 }
1928
1929 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1930 {
1931         struct rt6_mtu_change_arg arg = {
1932                 .dev = dev,
1933                 .mtu = mtu,
1934         };
1935
1936         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1937 }
1938
1939 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1940         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1941         [RTA_OIF]               = { .type = NLA_U32 },
1942         [RTA_IIF]               = { .type = NLA_U32 },
1943         [RTA_PRIORITY]          = { .type = NLA_U32 },
1944         [RTA_METRICS]           = { .type = NLA_NESTED },
1945 };
1946
1947 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1948                               struct fib6_config *cfg)
1949 {
1950         struct rtmsg *rtm;
1951         struct nlattr *tb[RTA_MAX+1];
1952         int err;
1953
1954         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1955         if (err < 0)
1956                 goto errout;
1957
1958         err = -EINVAL;
1959         rtm = nlmsg_data(nlh);
1960         memset(cfg, 0, sizeof(*cfg));
1961
1962         cfg->fc_table = rtm->rtm_table;
1963         cfg->fc_dst_len = rtm->rtm_dst_len;
1964         cfg->fc_src_len = rtm->rtm_src_len;
1965         cfg->fc_flags = RTF_UP;
1966         cfg->fc_protocol = rtm->rtm_protocol;
1967
1968         if (rtm->rtm_type == RTN_UNREACHABLE)
1969                 cfg->fc_flags |= RTF_REJECT;
1970
1971         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1972         cfg->fc_nlinfo.nlh = nlh;
1973
1974         if (tb[RTA_GATEWAY]) {
1975                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1976                 cfg->fc_flags |= RTF_GATEWAY;
1977         }
1978
1979         if (tb[RTA_DST]) {
1980                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1981
1982                 if (nla_len(tb[RTA_DST]) < plen)
1983                         goto errout;
1984
1985                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1986         }
1987
1988         if (tb[RTA_SRC]) {
1989                 int plen = (rtm->rtm_src_len + 7) >> 3;
1990
1991                 if (nla_len(tb[RTA_SRC]) < plen)
1992                         goto errout;
1993
1994                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1995         }
1996
1997         if (tb[RTA_OIF])
1998                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1999
2000         if (tb[RTA_PRIORITY])
2001                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2002
2003         if (tb[RTA_METRICS]) {
2004                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2005                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2006         }
2007
2008         if (tb[RTA_TABLE])
2009                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2010
2011         err = 0;
2012 errout:
2013         return err;
2014 }
2015
2016 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2017 {
2018         struct fib6_config cfg;
2019         int err;
2020
2021         err = rtm_to_fib6_config(skb, nlh, &cfg);
2022         if (err < 0)
2023                 return err;
2024
2025         return ip6_route_del(&cfg);
2026 }
2027
2028 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2029 {
2030         struct fib6_config cfg;
2031         int err;
2032
2033         err = rtm_to_fib6_config(skb, nlh, &cfg);
2034         if (err < 0)
2035                 return err;
2036
2037         return ip6_route_add(&cfg);
2038 }
2039
2040 static inline size_t rt6_nlmsg_size(void)
2041 {
2042         return NLMSG_ALIGN(sizeof(struct rtmsg))
2043                + nla_total_size(16) /* RTA_SRC */
2044                + nla_total_size(16) /* RTA_DST */
2045                + nla_total_size(16) /* RTA_GATEWAY */
2046                + nla_total_size(16) /* RTA_PREFSRC */
2047                + nla_total_size(4) /* RTA_TABLE */
2048                + nla_total_size(4) /* RTA_IIF */
2049                + nla_total_size(4) /* RTA_OIF */
2050                + nla_total_size(4) /* RTA_PRIORITY */
2051                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2052                + nla_total_size(sizeof(struct rta_cacheinfo));
2053 }
2054
2055 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2056                          struct in6_addr *dst, struct in6_addr *src,
2057                          int iif, int type, u32 pid, u32 seq,
2058                          int prefix, unsigned int flags)
2059 {
2060         struct rtmsg *rtm;
2061         struct nlmsghdr *nlh;
2062         long expires;
2063         u32 table;
2064
2065         if (prefix) {   /* user wants prefix routes only */
2066                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2067                         /* success since this is not a prefix route */
2068                         return 1;
2069                 }
2070         }
2071
2072         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2073         if (nlh == NULL)
2074                 return -EMSGSIZE;
2075
2076         rtm = nlmsg_data(nlh);
2077         rtm->rtm_family = AF_INET6;
2078         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2079         rtm->rtm_src_len = rt->rt6i_src.plen;
2080         rtm->rtm_tos = 0;
2081         if (rt->rt6i_table)
2082                 table = rt->rt6i_table->tb6_id;
2083         else
2084                 table = RT6_TABLE_UNSPEC;
2085         rtm->rtm_table = table;
2086         NLA_PUT_U32(skb, RTA_TABLE, table);
2087         if (rt->rt6i_flags&RTF_REJECT)
2088                 rtm->rtm_type = RTN_UNREACHABLE;
2089         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2090                 rtm->rtm_type = RTN_LOCAL;
2091         else
2092                 rtm->rtm_type = RTN_UNICAST;
2093         rtm->rtm_flags = 0;
2094         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2095         rtm->rtm_protocol = rt->rt6i_protocol;
2096         if (rt->rt6i_flags&RTF_DYNAMIC)
2097                 rtm->rtm_protocol = RTPROT_REDIRECT;
2098         else if (rt->rt6i_flags & RTF_ADDRCONF)
2099                 rtm->rtm_protocol = RTPROT_KERNEL;
2100         else if (rt->rt6i_flags&RTF_DEFAULT)
2101                 rtm->rtm_protocol = RTPROT_RA;
2102
2103         if (rt->rt6i_flags&RTF_CACHE)
2104                 rtm->rtm_flags |= RTM_F_CLONED;
2105
2106         if (dst) {
2107                 NLA_PUT(skb, RTA_DST, 16, dst);
2108                 rtm->rtm_dst_len = 128;
2109         } else if (rtm->rtm_dst_len)
2110                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2111 #ifdef CONFIG_IPV6_SUBTREES
2112         if (src) {
2113                 NLA_PUT(skb, RTA_SRC, 16, src);
2114                 rtm->rtm_src_len = 128;
2115         } else if (rtm->rtm_src_len)
2116                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2117 #endif
2118         if (iif)
2119                 NLA_PUT_U32(skb, RTA_IIF, iif);
2120         else if (dst) {
2121                 struct in6_addr saddr_buf;
2122                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2123                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2124         }
2125
2126         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2127                 goto nla_put_failure;
2128
2129         if (rt->u.dst.neighbour)
2130                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2131
2132         if (rt->u.dst.dev)
2133                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2134
2135         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2136
2137         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2138         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2139                                expires, rt->u.dst.error) < 0)
2140                 goto nla_put_failure;
2141
2142         return nlmsg_end(skb, nlh);
2143
2144 nla_put_failure:
2145         nlmsg_cancel(skb, nlh);
2146         return -EMSGSIZE;
2147 }
2148
2149 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2150 {
2151         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2152         int prefix;
2153
2154         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2155                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2156                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2157         } else
2158                 prefix = 0;
2159
2160         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2161                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2162                      prefix, NLM_F_MULTI);
2163 }
2164
2165 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2166 {
2167         struct nlattr *tb[RTA_MAX+1];
2168         struct rt6_info *rt;
2169         struct sk_buff *skb;
2170         struct rtmsg *rtm;
2171         struct flowi fl;
2172         int err, iif = 0;
2173
2174         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2175         if (err < 0)
2176                 goto errout;
2177
2178         err = -EINVAL;
2179         memset(&fl, 0, sizeof(fl));
2180
2181         if (tb[RTA_SRC]) {
2182                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2183                         goto errout;
2184
2185                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2186         }
2187
2188         if (tb[RTA_DST]) {
2189                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2190                         goto errout;
2191
2192                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2193         }
2194
2195         if (tb[RTA_IIF])
2196                 iif = nla_get_u32(tb[RTA_IIF]);
2197
2198         if (tb[RTA_OIF])
2199                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2200
2201         if (iif) {
2202                 struct net_device *dev;
2203                 dev = __dev_get_by_index(&init_net, iif);
2204                 if (!dev) {
2205                         err = -ENODEV;
2206                         goto errout;
2207                 }
2208         }
2209
2210         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211         if (skb == NULL) {
2212                 err = -ENOBUFS;
2213                 goto errout;
2214         }
2215
2216         /* Reserve room for dummy headers, this skb can pass
2217            through good chunk of routing engine.
2218          */
2219         skb_reset_mac_header(skb);
2220         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2221
2222         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2223         skb->dst = &rt->u.dst;
2224
2225         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2226                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2227                             nlh->nlmsg_seq, 0, 0);
2228         if (err < 0) {
2229                 kfree_skb(skb);
2230                 goto errout;
2231         }
2232
2233         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2234 errout:
2235         return err;
2236 }
2237
2238 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2239 {
2240         struct sk_buff *skb;
2241         u32 pid = 0, seq = 0;
2242         struct nlmsghdr *nlh = NULL;
2243         int err = -ENOBUFS;
2244
2245         if (info) {
2246                 pid = info->pid;
2247                 nlh = info->nlh;
2248                 if (nlh)
2249                         seq = nlh->nlmsg_seq;
2250         }
2251
2252         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2253         if (skb == NULL)
2254                 goto errout;
2255
2256         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2257         if (err < 0) {
2258                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2259                 WARN_ON(err == -EMSGSIZE);
2260                 kfree_skb(skb);
2261                 goto errout;
2262         }
2263         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2264 errout:
2265         if (err < 0)
2266                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2267 }
2268
2269 /*
2270  *      /proc
2271  */
2272
2273 #ifdef CONFIG_PROC_FS
2274
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is
 * referenced by the code visible in this part of the file; they look
 * like leftovers from a pre-seq_file /proc implementation - confirm
 * before removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2285
2286 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2287 {
2288         struct seq_file *m = p_arg;
2289
2290         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2291                    rt->rt6i_dst.plen);
2292
2293 #ifdef CONFIG_IPV6_SUBTREES
2294         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2295                    rt->rt6i_src.plen);
2296 #else
2297         seq_puts(m, "00000000000000000000000000000000 00 ");
2298 #endif
2299
2300         if (rt->rt6i_nexthop) {
2301                 seq_printf(m, NIP6_SEQFMT,
2302                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2303         } else {
2304                 seq_puts(m, "00000000000000000000000000000000");
2305         }
2306         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2307                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2308                    rt->u.dst.__use, rt->rt6i_flags,
2309                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2310         return 0;
2311 }
2312
2313 static int ipv6_route_show(struct seq_file *m, void *v)
2314 {
2315         fib6_clean_all(rt6_info_route, 0, m);
2316         return 0;
2317 }
2318
2319 static int ipv6_route_open(struct inode *inode, struct file *file)
2320 {
2321         return single_open(file, ipv6_route_show, NULL);
2322 }
2323
2324 static const struct file_operations ipv6_route_proc_fops = {
2325         .owner          = THIS_MODULE,
2326         .open           = ipv6_route_open,
2327         .read           = seq_read,
2328         .llseek         = seq_lseek,
2329         .release        = single_release,
2330 };
2331
2332 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2333 {
2334         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2335                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2336                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2337                       rt6_stats.fib_rt_cache,
2338                       atomic_read(&ip6_dst_ops.entries),
2339                       rt6_stats.fib_discarded_routes);
2340
2341         return 0;
2342 }
2343
2344 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2345 {
2346         return single_open(file, rt6_stats_seq_show, NULL);
2347 }
2348
2349 static const struct file_operations rt6_stats_seq_fops = {
2350         .owner   = THIS_MODULE,
2351         .open    = rt6_stats_seq_open,
2352         .read    = seq_read,
2353         .llseek  = seq_lseek,
2354         .release = single_release,
2355 };
2356 #endif  /* CONFIG_PROC_FS */
2357
2358 #ifdef CONFIG_SYSCTL
2359
2360 static int flush_delay;
2361
2362 static
2363 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2364                               void __user *buffer, size_t *lenp, loff_t *ppos)
2365 {
2366         if (write) {
2367                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2368                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2369                 return 0;
2370         } else
2371                 return -EINVAL;
2372 }
2373
2374 ctl_table ipv6_route_table[] = {
2375         {
2376                 .procname       =       "flush",
2377                 .data           =       &flush_delay,
2378                 .maxlen         =       sizeof(int),
2379                 .mode           =       0200,
2380                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2381         },
2382         {
2383                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2384                 .procname       =       "gc_thresh",
2385                 .data           =       &ip6_dst_ops.gc_thresh,
2386                 .maxlen         =       sizeof(int),
2387                 .mode           =       0644,
2388                 .proc_handler   =       &proc_dointvec,
2389         },
2390         {
2391                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2392                 .procname       =       "max_size",
2393                 .data           =       &ip6_rt_max_size,
2394                 .maxlen         =       sizeof(int),
2395                 .mode           =       0644,
2396                 .proc_handler   =       &proc_dointvec,
2397         },
2398         {
2399                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2400                 .procname       =       "gc_min_interval",
2401                 .data           =       &ip6_rt_gc_min_interval,
2402                 .maxlen         =       sizeof(int),
2403                 .mode           =       0644,
2404                 .proc_handler   =       &proc_dointvec_jiffies,
2405                 .strategy       =       &sysctl_jiffies,
2406         },
2407         {
2408                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2409                 .procname       =       "gc_timeout",
2410                 .data           =       &ip6_rt_gc_timeout,
2411                 .maxlen         =       sizeof(int),
2412                 .mode           =       0644,
2413                 .proc_handler   =       &proc_dointvec_jiffies,
2414                 .strategy       =       &sysctl_jiffies,
2415         },
2416         {
2417                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2418                 .procname       =       "gc_interval",
2419                 .data           =       &ip6_rt_gc_interval,
2420                 .maxlen         =       sizeof(int),
2421                 .mode           =       0644,
2422                 .proc_handler   =       &proc_dointvec_jiffies,
2423                 .strategy       =       &sysctl_jiffies,
2424         },
2425         {
2426                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2427                 .procname       =       "gc_elasticity",
2428                 .data           =       &ip6_rt_gc_elasticity,
2429                 .maxlen         =       sizeof(int),
2430                 .mode           =       0644,
2431                 .proc_handler   =       &proc_dointvec_jiffies,
2432                 .strategy       =       &sysctl_jiffies,
2433         },
2434         {
2435                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2436                 .procname       =       "mtu_expires",
2437                 .data           =       &ip6_rt_mtu_expires,
2438                 .maxlen         =       sizeof(int),
2439                 .mode           =       0644,
2440                 .proc_handler   =       &proc_dointvec_jiffies,
2441                 .strategy       =       &sysctl_jiffies,
2442         },
2443         {
2444                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2445                 .procname       =       "min_adv_mss",
2446                 .data           =       &ip6_rt_min_advmss,
2447                 .maxlen         =       sizeof(int),
2448                 .mode           =       0644,
2449                 .proc_handler   =       &proc_dointvec_jiffies,
2450                 .strategy       =       &sysctl_jiffies,
2451         },
2452         {
2453                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2454                 .procname       =       "gc_min_interval_ms",
2455                 .data           =       &ip6_rt_gc_min_interval,
2456                 .maxlen         =       sizeof(int),
2457                 .mode           =       0644,
2458                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2459                 .strategy       =       &sysctl_ms_jiffies,
2460         },
2461         { .ctl_name = 0 }
2462 };
2463
2464 #endif
2465
2466 void __init ip6_route_init(void)
2467 {
2468         ip6_dst_ops.kmem_cachep =
2469                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2470                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2471         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2472
2473         fib6_init();
2474         proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops);
2475         proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2476 #ifdef CONFIG_XFRM
2477         xfrm6_init();
2478 #endif
2479 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2480         fib6_rules_init();
2481 #endif
2482
2483         __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2484         __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2485         __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2486 }
2487
2488 void ip6_route_cleanup(void)
2489 {
2490 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2491         fib6_rules_cleanup();
2492 #endif
2493 #ifdef CONFIG_PROC_FS
2494         proc_net_remove(&init_net, "ipv6_route");
2495         proc_net_remove(&init_net, "rt6_stats");
2496 #endif
2497 #ifdef CONFIG_XFRM
2498         xfrm6_fini();
2499 #endif
2500         rt6_ifdown(NULL);
2501         fib6_gc_cleanup();
2502         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2503 }