Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Debug verbosity for this file.  Set to 3 to get tracing: when
 * RT6_DEBUG >= 3 both RDBG() and RT6_TRACE() expand to printk() calls,
 * otherwise they expand to nothing / an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
/* RDBG() takes one argument: a complete, parenthesised printk argument list. */
#define RDBG(x) printk x
/* RT6_TRACE() takes a printk-style format and arguments; logged at KERN_DEBUG. */
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
76
/*
 * Compile-time switch used by the policy lookup code below.  When non-zero,
 * a matched route that does not take the copy-on-write path is duplicated
 * with rt6_alloc_clone(); when zero, the matched route itself is returned.
 */
#define CLONE_OFFLINK_ROUTE 0
78
/*
 * Tunables for the IPv6 routing cache.  Time values are expressed in
 * jiffies (multiples of HZ).
 *
 * ip6_rt_max_size and ip6_rt_gc_min_interval are read by ip6_dst_gc():
 * a collection pass is skipped while the previous pass is more recent than
 * ip6_rt_gc_min_interval and the entry count does not exceed
 * ip6_rt_max_size.  ip6_rt_min_advmss is the lower bound applied by
 * ipv6_advmss().
 *
 * NOTE(review): the remaining variables are not referenced in the part of
 * the file reviewed here; presumably they are consumed by the GC / sysctl
 * code further down -- confirm there.
 */
static int ip6_rt_max_size = 4096;
static int ip6_rt_gc_min_interval = HZ / 2;
static int ip6_rt_gc_timeout = 60*HZ;
/* Not static: visible to other translation units. */
int ip6_rt_gc_interval = 30*HZ;
static int ip6_rt_gc_elasticity = 9;
static int ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus 20 and 40 bytes of header overhead. */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
/*
 * Forward declarations.  These helpers are defined later in the file but
 * are needed earlier, mainly by the ip6_dst_ops table and the static
 * reject route entries below.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(void);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Route Information option helpers, used by rt6_route_rcv(). */
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
107
/*
 * dst operations table for IPv6 routes.  ip6_dst_alloc() hands this table to
 * dst_alloc(), so every entry is sized as a struct rt6_info (entry_size) and
 * is serviced by the ip6_* callbacks listed here.
 */
static struct dst_ops ip6_dst_ops = {
        .family                 =       AF_INET6,
        .protocol               =       __constant_htons(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .entry_size             =       sizeof(struct rt6_info),
};
121
/*
 * Statically allocated "no route" entry.  The lookup helpers in this file
 * (rt6_device_match(), rt6_select()) return it when nothing usable is found.
 * It is a reject route bound to the loopback device: its dst error is
 * -ENETUNREACH and its input/output handlers are the packet discard
 * routines.  .path points back at the entry itself.
 */
struct rt6_info ip6_null_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -ENETUNREACH,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        .path           = (struct dst_entry*)&ip6_null_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        /* Largest possible metric value. */
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
/*
 * Static reject route whose dst error is -EACCES.  Laid out exactly like
 * ip6_null_entry apart from the error code and the self-referencing .path.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * NOTE(review): presumably returned for "prohibit" policy rules -- the user
 * is outside this part of the file; confirm.
 */
struct rt6_info ip6_prohibit_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -EACCES,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        .path           = (struct dst_entry*)&ip6_prohibit_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
163
/*
 * Static reject route whose dst error is -EINVAL.  Laid out exactly like
 * ip6_null_entry apart from the error code and the self-referencing .path.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * NOTE(review): presumably returned for "blackhole" policy rules -- the user
 * is outside this part of the file; confirm.
 */
struct rt6_info ip6_blk_hole_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -EINVAL,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        .path           = (struct dst_entry*)&ip6_blk_hole_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
183
184 #endif
185
186 /* allocate dst with ip6_dst_ops */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
188 {
189         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
190 }
191
192 static void ip6_dst_destroy(struct dst_entry *dst)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195         struct inet6_dev *idev = rt->rt6i_idev;
196
197         if (idev != NULL) {
198                 rt->rt6i_idev = NULL;
199                 in6_dev_put(idev);
200         }       
201 }
202
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
204                            int how)
205 {
206         struct rt6_info *rt = (struct rt6_info *)dst;
207         struct inet6_dev *idev = rt->rt6i_idev;
208
209         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
235                                                     int oif,
236                                                     int strict)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (oif) {
242                 for (sprt = rt; sprt; sprt = sprt->u.next) {
243                         struct net_device *dev = sprt->rt6i_dev;
244                         if (dev->ifindex == oif)
245                                 return sprt;
246                         if (dev->flags & IFF_LOOPBACK) {
247                                 if (sprt->rt6i_idev == NULL ||
248                                     sprt->rt6i_idev->dev->ifindex != oif) {
249                                         if (strict && oif)
250                                                 continue;
251                                         if (local && (!oif || 
252                                                       local->rt6i_idev->dev->ifindex == oif))
253                                                 continue;
254                                 }
255                                 local = sprt;
256                         }
257                 }
258
259                 if (local)
260                         return local;
261
262                 if (strict)
263                         return &ip6_null_entry;
264         }
265         return rt;
266 }
267
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Send a Neighbour Solicitation towards the nexthop of @rt when that
 * neighbour is not in a NUD_VALID state and was last updated more than
 * rtr_probe_interval ago.  @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        /* Unlocked fast path: nothing to do without a neighbour or if it is valid. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;
        read_lock_bh(&neigh->lock);
        /* Re-check the state under the lock and apply the rate limit. */
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                /*
                 * NOTE(review): neigh->updated is written while only the
                 * read side of neigh->lock is held -- confirm whether
                 * concurrent probers are acceptable here.
                 */
                neigh->updated = jiffies;
                /* Drop the lock before transmitting. */
                read_unlock_bh(&neigh->lock);

                /* Solicit the neighbour's address via its solicited-node multicast group. */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                read_unlock_bh(&neigh->lock);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
303
304 /*
305  * Default Router Selection (RFC 2461 6.3.6)
306  */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
308 {
309         struct net_device *dev = rt->rt6i_dev;
310         if (!oif || dev->ifindex == oif)
311                 return 2;
312         if ((dev->flags & IFF_LOOPBACK) &&
313             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
314                 return 1;
315         return 0;
316 }
317
318 static int inline rt6_check_neigh(struct rt6_info *rt)
319 {
320         struct neighbour *neigh = rt->rt6i_nexthop;
321         int m = 0;
322         if (rt->rt6i_flags & RTF_NONEXTHOP ||
323             !(rt->rt6i_flags & RTF_GATEWAY))
324                 m = 1;
325         else if (neigh) {
326                 read_lock_bh(&neigh->lock);
327                 if (neigh->nud_state & NUD_VALID)
328                         m = 2;
329                 read_unlock_bh(&neigh->lock);
330         }
331         return m;
332 }
333
334 static int rt6_score_route(struct rt6_info *rt, int oif,
335                            int strict)
336 {
337         int m, n;
338                 
339         m = rt6_check_dev(rt, oif);
340         if (!m && (strict & RT6_LOOKUP_F_IFACE))
341                 return -1;
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
344 #endif
345         n = rt6_check_neigh(rt);
346         if (n > 1)
347                 m |= 16;
348         else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
349                 return -1;
350         return m;
351 }
352
/*
 * rt6_select - pick the best route among the equal-metric head of a leaf
 * @head:   address of the leaf pointer; may be rewritten (see below)
 * @oif:    interface index handed to rt6_score_route()
 * @strict: RT6_LOOKUP_F_* flags handed to rt6_score_route()
 *
 * Walks the routes at the front of the list that share the metric of the
 * first entry, skipping expired ones, and keeps the one with the highest
 * rt6_score_route() result.  Routes that lose the comparison are passed to
 * rt6_probe().
 *
 * If nothing matched, RT6_LOOKUP_F_REACHABLE was requested and more than one
 * candidate was seen, the first route is moved behind the last candidate so
 * that the next lookup starts with a different router.
 *
 * Returns the selected route, or &ip6_null_entry when none qualified.
 */
static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
                                   int strict)
{
        struct rt6_info *match = NULL, *last = NULL;
        struct rt6_info *rt, *rt0 = *head;
        u32 metric;
        int mpri = -1;

        RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
                  __FUNCTION__, head, head ? *head : NULL, oif);

        /*
         * Only the leading run of routes with the same metric as rt0 is
         * considered.  The "!last || rt != rt0" test stops the walk if it
         * comes back around to rt0.
         */
        for (rt = rt0, metric = rt0->rt6i_metric;
             rt && rt->rt6i_metric == metric && (!last || rt != rt0);
             rt = rt->u.next) {
                int m;

                if (rt6_check_expired(rt))
                        continue;

                /* Remember the last non-expired candidate for the rotation below. */
                last = rt;

                m = rt6_score_route(rt, oif, strict);
                if (m < 0)
                        continue;

                if (m > mpri) {
                        /* The previous best (possibly NULL) is demoted: probe it. */
                        rt6_probe(match);
                        match = rt;
                        mpri = m;
                } else {
                        rt6_probe(rt);
                }
        }

        if (!match &&
            (strict & RT6_LOOKUP_F_REACHABLE) &&
            last && last != rt0) {
                /* no entries matched; do round-robin */
                /* Function-local lock serialising the list rotation. */
                static DEFINE_SPINLOCK(lock);
                spin_lock(&lock);
                /* Unlink rt0 from the front and re-insert it after 'last'. */
                *head = rt0->u.next;
                rt0->u.next = last->u.next;
                last->u.next = rt0;
                spin_unlock(&lock);
        }

        RT6_TRACE("%s() => %p, score=%d\n",
                  __FUNCTION__, match, mpri);

        return (match ? match : &ip6_null_entry);
}
404
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received Route Information option
 * @dev:    device the option arrived on; its ifindex keys the route
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address handed to the route-info lookup/add helpers
 *
 * Validates the option, then adds, refreshes or deletes the matching
 * route-info route:
 *   - lifetime 0 deletes an existing route,
 *   - a non-zero lifetime creates the route if it is missing, otherwise
 *     updates its preference,
 *   - lifetime 0xffffffff means "never expires"; any other value sets
 *     RTF_EXPIRES with an expiry of now + lifetime seconds.
 *
 * Returns 0 on success or -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length: the option length must be
         * at most 3 and large enough to carry prefix_len bits of prefix.
         */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2)
                        return -EINVAL;
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1)
                        return -EINVAL;
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /* The lifetime arrives in network byte order: convert to host order. */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow in HZ * lifetime below */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3) {
                /* With length 3 the option's prefix field is used directly. */
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /* Drop the reference obtained from the get/add helper. */
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
482
/*
 * BACKTRACK(saddr) - climb the fib6 tree after a failed match.
 *
 * Must be expanded inside a function that has locals named 'rt' and 'fn' and
 * labels named 'out' and 'restart'.  Does nothing unless rt is
 * &ip6_null_entry.  Otherwise, starting from fn:
 *   - if the node has RTN_TL_ROOT set, jump to 'out';
 *   - step to the parent, or, when the parent has a subtree that is not the
 *     node we came from, look @saddr up in that subtree;
 *   - if the resulting node has RTN_RTINFO set, jump to 'restart'.
 *
 * NOTE(review): the subtree test uses FIB6_SUBTREE(pn) while the lookup
 * dereferences pn->subtree directly -- confirm both are meant to agree.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *pn; \
                while (fn) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(pn->subtree, NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
500
/*
 * ip6_pol_route_lookup - plain route lookup in one table
 * @table: routing table to search
 * @fl:    flow; fl6_dst/fl6_src select the node, oif selects the device
 * @flags: RT6_LOOKUP_F_* flags, forwarded to rt6_device_match()
 *
 * Used as the per-table callback of fib6_rule_lookup() by rt6_lookup().
 * Performs the lookup under the table's read lock, backtracking towards the
 * root while only ip6_null_entry is found.  The returned route has had its
 * reference count raised with dst_hold() and its usage statistics updated.
 */
static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
                                             struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
        /* 'restart' and 'out' are jump targets of BACKTRACK(). */
restart:
        rt = fn->leaf;
        rt = rt6_device_match(rt, fl->oif, flags);
        BACKTRACK(&fl->fl6_src);
out:
        /* Take the reference before the table lock is released. */
        dst_hold(&rt->u.dst);
        read_unlock_bh(&table->tb6_lock);

        rt->u.dst.lastuse = jiffies;
        rt->u.dst.__use++;

        return rt;

}
523
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
525                             int oif, int strict)
526 {
527         struct flowi fl = {
528                 .oif = oif,
529                 .nl_u = {
530                         .ip6_u = {
531                                 .daddr = *daddr,
532                         },
533                 },
534         };
535         struct dst_entry *dst;
536         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
537
538         if (saddr) {
539                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
540                 flags |= RT6_LOOKUP_F_HAS_SADDR;
541         }
542
543         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
544         if (dst->error == 0)
545                 return (struct rt6_info *) dst;
546
547         dst_release(dst);
548
549         return NULL;
550 }
551
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed.  In any case, if the caller does not hold a reference
   to the route, it may be destroyed.
 */
557
558 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
559 {
560         int err;
561         struct fib6_table *table;
562
563         table = rt->rt6i_table;
564         write_lock_bh(&table->tb6_lock);
565         err = fib6_add(&table->tb6_root, rt, info);
566         write_unlock_bh(&table->tb6_lock);
567
568         return err;
569 }
570
/*
 * Insert @rt into its routing table without netlink information: calls
 * __ip6_ins_rt() with a NULL nl_info and returns its result.
 */
int ip6_ins_rt(struct rt6_info *rt)
{
        return __ip6_ins_rt(rt, NULL);
}
575
576 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
577                                       struct in6_addr *saddr)
578 {
579         struct rt6_info *rt;
580
581         /*
582          *      Clone the route.
583          */
584
585         rt = ip6_rt_copy(ort);
586
587         if (rt) {
588                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
589                         if (rt->rt6i_dst.plen != 128 &&
590                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
591                                 rt->rt6i_flags |= RTF_ANYCAST;
592                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
593                 }
594
595                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
596                 rt->rt6i_dst.plen = 128;
597                 rt->rt6i_flags |= RTF_CACHE;
598                 rt->u.dst.flags |= DST_HOST;
599
600 #ifdef CONFIG_IPV6_SUBTREES
601                 if (rt->rt6i_src.plen && saddr) {
602                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
603                         rt->rt6i_src.plen = 128;
604                 }
605 #endif
606
607                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
608
609         }
610
611         return rt;
612 }
613
614 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
615 {
616         struct rt6_info *rt = ip6_rt_copy(ort);
617         if (rt) {
618                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
619                 rt->rt6i_dst.plen = 128;
620                 rt->rt6i_flags |= RTF_CACHE;
621                 if (rt->rt6i_flags & RTF_REJECT)
622                         rt->u.dst.error = ort->u.dst.error;
623                 rt->u.dst.flags |= DST_HOST;
624                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
625         }
626         return rt;
627 }
628
629 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
630                                             struct flowi *fl, int flags)
631 {
632         struct fib6_node *fn;
633         struct rt6_info *rt, *nrt;
634         int strict = 0;
635         int attempts = 3;
636         int err;
637         int reachable = RT6_LOOKUP_F_REACHABLE;
638
639         strict |= flags & RT6_LOOKUP_F_IFACE;
640
641 relookup:
642         read_lock_bh(&table->tb6_lock);
643
644 restart_2:
645         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
646
647 restart:
648         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
649         BACKTRACK(&fl->fl6_src);
650         if (rt == &ip6_null_entry ||
651             rt->rt6i_flags & RTF_CACHE)
652                 goto out;
653
654         dst_hold(&rt->u.dst);
655         read_unlock_bh(&table->tb6_lock);
656
657         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
658                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
659         else {
660 #if CLONE_OFFLINK_ROUTE
661                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
662 #else
663                 goto out2;
664 #endif
665         }
666
667         dst_release(&rt->u.dst);
668         rt = nrt ? : &ip6_null_entry;
669
670         dst_hold(&rt->u.dst);
671         if (nrt) {
672                 err = ip6_ins_rt(nrt);
673                 if (!err)
674                         goto out2;
675         }
676
677         if (--attempts <= 0)
678                 goto out2;
679
680         /*
681          * Race condition! In the gap, when table->tb6_lock was
682          * released someone could insert this route.  Relookup.
683          */
684         dst_release(&rt->u.dst);
685         goto relookup;
686
687 out:
688         if (reachable) {
689                 reachable = 0;
690                 goto restart_2;
691         }
692         dst_hold(&rt->u.dst);
693         read_unlock_bh(&table->tb6_lock);
694 out2:
695         rt->u.dst.lastuse = jiffies;
696         rt->u.dst.__use++;
697
698         return rt;
699 }
700
701 void ip6_route_input(struct sk_buff *skb)
702 {
703         struct ipv6hdr *iph = skb->nh.ipv6h;
704         int flags = RT6_LOOKUP_F_HAS_SADDR;
705         struct flowi fl = {
706                 .iif = skb->dev->ifindex,
707                 .nl_u = {
708                         .ip6_u = {
709                                 .daddr = iph->daddr,
710                                 .saddr = iph->saddr,
711 #ifdef CONFIG_IPV6_ROUTE_FWMARK
712                                 .fwmark = skb->nfmark,
713 #endif
714                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
715                         },
716                 },
717                 .proto = iph->nexthdr,
718         };
719
720         if (rt6_need_strict(&iph->daddr))
721                 flags |= RT6_LOOKUP_F_IFACE;
722
723         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
724 }
725
726 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
727                                              struct flowi *fl, int flags)
728 {
729         struct fib6_node *fn;
730         struct rt6_info *rt, *nrt;
731         int strict = 0;
732         int attempts = 3;
733         int err;
734         int reachable = RT6_LOOKUP_F_REACHABLE;
735
736         strict |= flags & RT6_LOOKUP_F_IFACE;
737
738 relookup:
739         read_lock_bh(&table->tb6_lock);
740
741 restart_2:
742         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
743
744 restart:
745         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
746         BACKTRACK(&fl->fl6_src);
747         if (rt == &ip6_null_entry ||
748             rt->rt6i_flags & RTF_CACHE)
749                 goto out;
750
751         dst_hold(&rt->u.dst);
752         read_unlock_bh(&table->tb6_lock);
753
754         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
755                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
756         else {
757 #if CLONE_OFFLINK_ROUTE
758                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
759 #else
760                 goto out2;
761 #endif
762         }
763
764         dst_release(&rt->u.dst);
765         rt = nrt ? : &ip6_null_entry;
766
767         dst_hold(&rt->u.dst);
768         if (nrt) {
769                 err = ip6_ins_rt(nrt);
770                 if (!err)
771                         goto out2;
772         }
773
774         if (--attempts <= 0)
775                 goto out2;
776
777         /*
778          * Race condition! In the gap, when table->tb6_lock was
779          * released someone could insert this route.  Relookup.
780          */
781         dst_release(&rt->u.dst);
782         goto relookup;
783
784 out:
785         if (reachable) {
786                 reachable = 0;
787                 goto restart_2;
788         }
789         dst_hold(&rt->u.dst);
790         read_unlock_bh(&table->tb6_lock);
791 out2:
792         rt->u.dst.lastuse = jiffies;
793         rt->u.dst.__use++;
794         return rt;
795 }
796
797 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
798 {
799         int flags = 0;
800
801         if (rt6_need_strict(&fl->fl6_dst))
802                 flags |= RT6_LOOKUP_F_IFACE;
803
804         if (!ipv6_addr_any(&fl->fl6_src))
805                 flags |= RT6_LOOKUP_F_HAS_SADDR;
806
807         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
808 }
809
810
811 /*
812  *      Destination cache support functions
813  */
814
815 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
816 {
817         struct rt6_info *rt;
818
819         rt = (struct rt6_info *) dst;
820
821         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
822                 return dst;
823
824         return NULL;
825 }
826
827 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
828 {
829         struct rt6_info *rt = (struct rt6_info *) dst;
830
831         if (rt) {
832                 if (rt->rt6i_flags & RTF_CACHE)
833                         ip6_del_rt(rt);
834                 else
835                         dst_release(dst);
836         }
837         return NULL;
838 }
839
840 static void ip6_link_failure(struct sk_buff *skb)
841 {
842         struct rt6_info *rt;
843
844         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
845
846         rt = (struct rt6_info *) skb->dst;
847         if (rt) {
848                 if (rt->rt6i_flags&RTF_CACHE) {
849                         dst_set_expires(&rt->u.dst, 0);
850                         rt->rt6i_flags |= RTF_EXPIRES;
851                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
852                         rt->rt6i_node->fn_sernum = -1;
853         }
854 }
855
856 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
857 {
858         struct rt6_info *rt6 = (struct rt6_info*)dst;
859
860         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
861                 rt6->rt6i_flags |= RTF_MODIFIED;
862                 if (mtu < IPV6_MIN_MTU) {
863                         mtu = IPV6_MIN_MTU;
864                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
865                 }
866                 dst->metrics[RTAX_MTU-1] = mtu;
867                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
868         }
869 }
870
871 static int ipv6_get_mtu(struct net_device *dev);
872
873 static inline unsigned int ipv6_advmss(unsigned int mtu)
874 {
875         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
876
877         if (mtu < ip6_rt_min_advmss)
878                 mtu = ip6_rt_min_advmss;
879
880         /*
881          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
882          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
883          * IPV6_MAXPLEN is also valid and means: "any MSS, 
884          * rely only on pmtu discovery"
885          */
886         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
887                 mtu = IPV6_MAXPLEN;
888         return mtu;
889 }
890
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc().  ndisc_dst_gc() walks it and frees entries whose
 * reference count has dropped to zero.  Both functions take ndisc_lock
 * around every access to the list.
 */
static struct dst_entry *ndisc_dst_gc_list;
static DEFINE_SPINLOCK(ndisc_lock);
893
894 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
895                                   struct neighbour *neigh,
896                                   struct in6_addr *addr,
897                                   int (*output)(struct sk_buff *))
898 {
899         struct rt6_info *rt;
900         struct inet6_dev *idev = in6_dev_get(dev);
901
902         if (unlikely(idev == NULL))
903                 return NULL;
904
905         rt = ip6_dst_alloc();
906         if (unlikely(rt == NULL)) {
907                 in6_dev_put(idev);
908                 goto out;
909         }
910
911         dev_hold(dev);
912         if (neigh)
913                 neigh_hold(neigh);
914         else
915                 neigh = ndisc_get_neigh(dev, addr);
916
917         rt->rt6i_dev      = dev;
918         rt->rt6i_idev     = idev;
919         rt->rt6i_nexthop  = neigh;
920         atomic_set(&rt->u.dst.__refcnt, 1);
921         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
922         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
923         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
924         rt->u.dst.output  = output;
925
926 #if 0   /* there's no chance to use these for ndisc */
927         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
928                                 ? DST_HOST 
929                                 : 0;
930         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
931         rt->rt6i_dst.plen = 128;
932 #endif
933
934         spin_lock_bh(&ndisc_lock);
935         rt->u.dst.next = ndisc_dst_gc_list;
936         ndisc_dst_gc_list = &rt->u.dst;
937         spin_unlock_bh(&ndisc_lock);
938
939         fib6_force_start_gc();
940
941 out:
942         return (struct dst_entry *)rt;
943 }
944
945 int ndisc_dst_gc(int *more)
946 {
947         struct dst_entry *dst, *next, **pprev;
948         int freed;
949
950         next = NULL;
951         freed = 0;
952
953         spin_lock_bh(&ndisc_lock);
954         pprev = &ndisc_dst_gc_list;
955
956         while ((dst = *pprev) != NULL) {
957                 if (!atomic_read(&dst->__refcnt)) {
958                         *pprev = dst->next;
959                         dst_free(dst);
960                         freed++;
961                 } else {
962                         pprev = &dst->next;
963                         (*more)++;
964                 }
965         }
966
967         spin_unlock_bh(&ndisc_lock);
968
969         return freed;
970 }
971
972 static int ip6_dst_gc(void)
973 {
974         static unsigned expire = 30*HZ;
975         static unsigned long last_gc;
976         unsigned long now = jiffies;
977
978         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
979             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
980                 goto out;
981
982         expire++;
983         fib6_run_gc(expire);
984         last_gc = now;
985         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
986                 expire = ip6_rt_gc_timeout>>1;
987
988 out:
989         expire -= expire>>ip6_rt_gc_elasticity;
990         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
991 }
992
993 /* Clean host part of a prefix. Not necessary in radix tree,
994    but results in cleaner routing tables.
995
996    Remove it only when all the things will work!
997  */
998
999 static int ipv6_get_mtu(struct net_device *dev)
1000 {
1001         int mtu = IPV6_MIN_MTU;
1002         struct inet6_dev *idev;
1003
1004         idev = in6_dev_get(dev);
1005         if (idev) {
1006                 mtu = idev->cnf.mtu6;
1007                 in6_dev_put(idev);
1008         }
1009         return mtu;
1010 }
1011
1012 int ipv6_get_hoplimit(struct net_device *dev)
1013 {
1014         int hoplimit = ipv6_devconf.hop_limit;
1015         struct inet6_dev *idev;
1016
1017         idev = in6_dev_get(dev);
1018         if (idev) {
1019                 hoplimit = idev->cnf.hop_limit;
1020                 in6_dev_put(idev);
1021         }
1022         return hoplimit;
1023 }
1024
1025 /*
1026  *
1027  */
1028
1029 int ip6_route_add(struct fib6_config *cfg)
1030 {
1031         int err;
1032         struct rt6_info *rt = NULL;
1033         struct net_device *dev = NULL;
1034         struct inet6_dev *idev = NULL;
1035         struct fib6_table *table;
1036         int addr_type;
1037
1038         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1039                 return -EINVAL;
1040 #ifndef CONFIG_IPV6_SUBTREES
1041         if (cfg->fc_src_len)
1042                 return -EINVAL;
1043 #endif
1044         if (cfg->fc_ifindex) {
1045                 err = -ENODEV;
1046                 dev = dev_get_by_index(cfg->fc_ifindex);
1047                 if (!dev)
1048                         goto out;
1049                 idev = in6_dev_get(dev);
1050                 if (!idev)
1051                         goto out;
1052         }
1053
1054         if (cfg->fc_metric == 0)
1055                 cfg->fc_metric = IP6_RT_PRIO_USER;
1056
1057         table = fib6_new_table(cfg->fc_table);
1058         if (table == NULL) {
1059                 err = -ENOBUFS;
1060                 goto out;
1061         }
1062
1063         rt = ip6_dst_alloc();
1064
1065         if (rt == NULL) {
1066                 err = -ENOMEM;
1067                 goto out;
1068         }
1069
1070         rt->u.dst.obsolete = -1;
1071         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1072
1073         if (cfg->fc_protocol == RTPROT_UNSPEC)
1074                 cfg->fc_protocol = RTPROT_BOOT;
1075         rt->rt6i_protocol = cfg->fc_protocol;
1076
1077         addr_type = ipv6_addr_type(&cfg->fc_dst);
1078
1079         if (addr_type & IPV6_ADDR_MULTICAST)
1080                 rt->u.dst.input = ip6_mc_input;
1081         else
1082                 rt->u.dst.input = ip6_forward;
1083
1084         rt->u.dst.output = ip6_output;
1085
1086         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1087         rt->rt6i_dst.plen = cfg->fc_dst_len;
1088         if (rt->rt6i_dst.plen == 128)
1089                rt->u.dst.flags = DST_HOST;
1090
1091 #ifdef CONFIG_IPV6_SUBTREES
1092         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1093         rt->rt6i_src.plen = cfg->fc_src_len;
1094 #endif
1095
1096         rt->rt6i_metric = cfg->fc_metric;
1097
1098         /* We cannot add true routes via loopback here,
1099            they would result in kernel looping; promote them to reject routes
1100          */
1101         if ((cfg->fc_flags & RTF_REJECT) ||
1102             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1103                 /* hold loopback dev/idev if we haven't done so. */
1104                 if (dev != &loopback_dev) {
1105                         if (dev) {
1106                                 dev_put(dev);
1107                                 in6_dev_put(idev);
1108                         }
1109                         dev = &loopback_dev;
1110                         dev_hold(dev);
1111                         idev = in6_dev_get(dev);
1112                         if (!idev) {
1113                                 err = -ENODEV;
1114                                 goto out;
1115                         }
1116                 }
1117                 rt->u.dst.output = ip6_pkt_discard_out;
1118                 rt->u.dst.input = ip6_pkt_discard;
1119                 rt->u.dst.error = -ENETUNREACH;
1120                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1121                 goto install_route;
1122         }
1123
1124         if (cfg->fc_flags & RTF_GATEWAY) {
1125                 struct in6_addr *gw_addr;
1126                 int gwa_type;
1127
1128                 gw_addr = &cfg->fc_gateway;
1129                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1130                 gwa_type = ipv6_addr_type(gw_addr);
1131
1132                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1133                         struct rt6_info *grt;
1134
1135                         /* IPv6 strictly inhibits using not link-local
1136                            addresses as nexthop address.
1137                            Otherwise, router will not able to send redirects.
1138                            It is very good, but in some (rare!) circumstances
1139                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1140                            some exceptions. --ANK
1141                          */
1142                         err = -EINVAL;
1143                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1144                                 goto out;
1145
1146                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1147
1148                         err = -EHOSTUNREACH;
1149                         if (grt == NULL)
1150                                 goto out;
1151                         if (dev) {
1152                                 if (dev != grt->rt6i_dev) {
1153                                         dst_release(&grt->u.dst);
1154                                         goto out;
1155                                 }
1156                         } else {
1157                                 dev = grt->rt6i_dev;
1158                                 idev = grt->rt6i_idev;
1159                                 dev_hold(dev);
1160                                 in6_dev_hold(grt->rt6i_idev);
1161                         }
1162                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1163                                 err = 0;
1164                         dst_release(&grt->u.dst);
1165
1166                         if (err)
1167                                 goto out;
1168                 }
1169                 err = -EINVAL;
1170                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1171                         goto out;
1172         }
1173
1174         err = -ENODEV;
1175         if (dev == NULL)
1176                 goto out;
1177
1178         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1179                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1180                 if (IS_ERR(rt->rt6i_nexthop)) {
1181                         err = PTR_ERR(rt->rt6i_nexthop);
1182                         rt->rt6i_nexthop = NULL;
1183                         goto out;
1184                 }
1185         }
1186
1187         rt->rt6i_flags = cfg->fc_flags;
1188
1189 install_route:
1190         if (cfg->fc_mx) {
1191                 struct nlattr *nla;
1192                 int remaining;
1193
1194                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1195                         int type = nla->nla_type;
1196
1197                         if (type) {
1198                                 if (type > RTAX_MAX) {
1199                                         err = -EINVAL;
1200                                         goto out;
1201                                 }
1202
1203                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1204                         }
1205                 }
1206         }
1207
1208         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1209                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1210         if (!rt->u.dst.metrics[RTAX_MTU-1])
1211                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1212         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1213                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1214         rt->u.dst.dev = dev;
1215         rt->rt6i_idev = idev;
1216         rt->rt6i_table = table;
1217         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1218
1219 out:
1220         if (dev)
1221                 dev_put(dev);
1222         if (idev)
1223                 in6_dev_put(idev);
1224         if (rt)
1225                 dst_free((struct dst_entry *) rt);
1226         return err;
1227 }
1228
1229 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1230 {
1231         int err;
1232         struct fib6_table *table;
1233
1234         if (rt == &ip6_null_entry)
1235                 return -ENOENT;
1236
1237         table = rt->rt6i_table;
1238         write_lock_bh(&table->tb6_lock);
1239
1240         err = fib6_del(rt, info);
1241         dst_release(&rt->u.dst);
1242
1243         write_unlock_bh(&table->tb6_lock);
1244
1245         return err;
1246 }
1247
1248 int ip6_del_rt(struct rt6_info *rt)
1249 {
1250         return __ip6_del_rt(rt, NULL);
1251 }
1252
1253 static int ip6_route_del(struct fib6_config *cfg)
1254 {
1255         struct fib6_table *table;
1256         struct fib6_node *fn;
1257         struct rt6_info *rt;
1258         int err = -ESRCH;
1259
1260         table = fib6_get_table(cfg->fc_table);
1261         if (table == NULL)
1262                 return err;
1263
1264         read_lock_bh(&table->tb6_lock);
1265
1266         fn = fib6_locate(&table->tb6_root,
1267                          &cfg->fc_dst, cfg->fc_dst_len,
1268                          &cfg->fc_src, cfg->fc_src_len);
1269         
1270         if (fn) {
1271                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1272                         if (cfg->fc_ifindex &&
1273                             (rt->rt6i_dev == NULL ||
1274                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1275                                 continue;
1276                         if (cfg->fc_flags & RTF_GATEWAY &&
1277                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1278                                 continue;
1279                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1280                                 continue;
1281                         dst_hold(&rt->u.dst);
1282                         read_unlock_bh(&table->tb6_lock);
1283
1284                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1285                 }
1286         }
1287         read_unlock_bh(&table->tb6_lock);
1288
1289         return err;
1290 }
1291
1292 /*
1293  *      Handle redirects
1294  */
1295 struct ip6rd_flowi { /* flow key extended with the redirecting router's address, for the redirect lookup */
1296         struct flowi fl; /* first member: ip6_route_redirect() and __ip6_route_redirect() cast between struct flowi * and struct ip6rd_flowi * */
1297         struct in6_addr gateway; /* compared against rt6i_gateway of each candidate route */
1298 };
1299
1300 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1301                                              struct flowi *fl,
1302                                              int flags)
1303 {
1304         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1305         struct rt6_info *rt;
1306         struct fib6_node *fn;
1307
1308         /*
1309          * Get the "current" route for this destination and
1310          * check if the redirect has come from approriate router.
1311          *
1312          * RFC 2461 specifies that redirects should only be
1313          * accepted if they come from the nexthop to the target.
1314          * Due to the way the routes are chosen, this notion
1315          * is a bit fuzzy and one might need to check all possible
1316          * routes.
1317          */
1318
1319         read_lock_bh(&table->tb6_lock);
1320         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1321 restart:
1322         for (rt = fn->leaf; rt; rt = rt->u.next) {
1323                 /*
1324                  * Current route is on-link; redirect is always invalid.
1325                  *
1326                  * Seems, previous statement is not true. It could
1327                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1328                  * But then router serving it might decide, that we should
1329                  * know truth 8)8) --ANK (980726).
1330                  */
1331                 if (rt6_check_expired(rt))
1332                         continue;
1333                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1334                         continue;
1335                 if (fl->oif != rt->rt6i_dev->ifindex)
1336                         continue;
1337                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1338                         continue;
1339                 break;
1340         }
1341
1342         if (!rt)
1343                 rt = &ip6_null_entry;
1344         BACKTRACK(&fl->fl6_src);
1345 out:
1346         dst_hold(&rt->u.dst);
1347
1348         read_unlock_bh(&table->tb6_lock);
1349
1350         return rt;
1351 };
1352
1353 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1354                                            struct in6_addr *src,
1355                                            struct in6_addr *gateway,
1356                                            struct net_device *dev)
1357 {
1358         int flags = RT6_LOOKUP_F_HAS_SADDR;
1359         struct ip6rd_flowi rdfl = {
1360                 .fl = {
1361                         .oif = dev->ifindex,
1362                         .nl_u = {
1363                                 .ip6_u = {
1364                                         .daddr = *dest,
1365                                         .saddr = *src,
1366                                 },
1367                         },
1368                 },
1369                 .gateway = *gateway,
1370         };
1371
1372         if (rt6_need_strict(dest))
1373                 flags |= RT6_LOOKUP_F_IFACE;
1374
1375         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1376 }
1377
1378 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1379                   struct in6_addr *saddr,
1380                   struct neighbour *neigh, u8 *lladdr, int on_link)
1381 {
1382         struct rt6_info *rt, *nrt = NULL;
1383         struct netevent_redirect netevent;
1384
1385         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1386
1387         if (rt == &ip6_null_entry) {
1388                 if (net_ratelimit())
1389                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1390                                "for redirect target\n");
1391                 goto out;
1392         }
1393
1394         /*
1395          *      We have finally decided to accept it.
1396          */
1397
1398         neigh_update(neigh, lladdr, NUD_STALE, 
1399                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1400                      NEIGH_UPDATE_F_OVERRIDE|
1401                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1402                                      NEIGH_UPDATE_F_ISROUTER))
1403                      );
1404
1405         /*
1406          * Redirect received -> path was valid.
1407          * Look, redirects are sent only in response to data packets,
1408          * so that this nexthop apparently is reachable. --ANK
1409          */
1410         dst_confirm(&rt->u.dst);
1411
1412         /* Duplicate redirect: silently ignore. */
1413         if (neigh == rt->u.dst.neighbour)
1414                 goto out;
1415
1416         nrt = ip6_rt_copy(rt);
1417         if (nrt == NULL)
1418                 goto out;
1419
1420         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1421         if (on_link)
1422                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1423
1424         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1425         nrt->rt6i_dst.plen = 128;
1426         nrt->u.dst.flags |= DST_HOST;
1427
1428         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1429         nrt->rt6i_nexthop = neigh_clone(neigh);
1430         /* Reset pmtu, it may be better */
1431         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1432         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1433
1434         if (ip6_ins_rt(nrt))
1435                 goto out;
1436
1437         netevent.old = &rt->u.dst;
1438         netevent.new = &nrt->u.dst;
1439         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1440
1441         if (rt->rt6i_flags&RTF_CACHE) {
1442                 ip6_del_rt(rt);
1443                 return;
1444         }
1445
1446 out:
1447         dst_release(&rt->u.dst);
1448         return;
1449 }
1450
1451 /*
1452  *      Handle ICMP "packet too big" messages
1453  *      i.e. Path MTU discovery
1454  */
1455
1456 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1457                         struct net_device *dev, u32 pmtu)
1458 {
1459         struct rt6_info *rt, *nrt;
1460         int allfrag = 0;
1461
1462         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1463         if (rt == NULL)
1464                 return;
1465
1466         if (pmtu >= dst_mtu(&rt->u.dst))
1467                 goto out;
1468
1469         if (pmtu < IPV6_MIN_MTU) {
1470                 /*
1471                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1472                  * MTU (1280) and a fragment header should always be included
1473                  * after a node receiving Too Big message reporting PMTU is
1474                  * less than the IPv6 Minimum Link MTU.
1475                  */
1476                 pmtu = IPV6_MIN_MTU;
1477                 allfrag = 1;
1478         }
1479
1480         /* New mtu received -> path was valid.
1481            They are sent only in response to data packets,
1482            so that this nexthop apparently is reachable. --ANK
1483          */
1484         dst_confirm(&rt->u.dst);
1485
1486         /* Host route. If it is static, it would be better
1487            not to override it, but add new one, so that
1488            when cache entry will expire old pmtu
1489            would return automatically.
1490          */
1491         if (rt->rt6i_flags & RTF_CACHE) {
1492                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1493                 if (allfrag)
1494                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1495                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1496                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1497                 goto out;
1498         }
1499
1500         /* Network route.
1501            Two cases are possible:
1502            1. It is connected route. Action: COW
1503            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1504          */
1505         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1506                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1507         else
1508                 nrt = rt6_alloc_clone(rt, daddr);
1509
1510         if (nrt) {
1511                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1512                 if (allfrag)
1513                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1514
1515                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1516                  * happened within 5 mins, the recommended timer is 10 mins.
1517                  * Here this route expiration time is set to ip6_rt_mtu_expires
1518                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1519                  * and detecting PMTU increase will be automatically happened.
1520                  */
1521                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1522                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1523
1524                 ip6_ins_rt(nrt);
1525         }
1526 out:
1527         dst_release(&rt->u.dst);
1528 }
1529
1530 /*
1531  *      Misc support functions
1532  */
1533
1534 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1535 {
1536         struct rt6_info *rt = ip6_dst_alloc();
1537
1538         if (rt) {
1539                 rt->u.dst.input = ort->u.dst.input;
1540                 rt->u.dst.output = ort->u.dst.output;
1541
1542                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1543                 rt->u.dst.dev = ort->u.dst.dev;
1544                 if (rt->u.dst.dev)
1545                         dev_hold(rt->u.dst.dev);
1546                 rt->rt6i_idev = ort->rt6i_idev;
1547                 if (rt->rt6i_idev)
1548                         in6_dev_hold(rt->rt6i_idev);
1549                 rt->u.dst.lastuse = jiffies;
1550                 rt->rt6i_expires = 0;
1551
1552                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1553                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1554                 rt->rt6i_metric = 0;
1555
1556                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1557 #ifdef CONFIG_IPV6_SUBTREES
1558                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1559 #endif
1560                 rt->rt6i_table = ort->rt6i_table;
1561         }
1562         return rt;
1563 }
1564
1565 #ifdef CONFIG_IPV6_ROUTE_INFO
1566 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1567                                            struct in6_addr *gwaddr, int ifindex)
1568 {
1569         struct fib6_node *fn;
1570         struct rt6_info *rt = NULL;
1571         struct fib6_table *table;
1572
1573         table = fib6_get_table(RT6_TABLE_INFO);
1574         if (table == NULL)
1575                 return NULL;
1576
1577         write_lock_bh(&table->tb6_lock);
1578         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1579         if (!fn)
1580                 goto out;
1581
1582         for (rt = fn->leaf; rt; rt = rt->u.next) {
1583                 if (rt->rt6i_dev->ifindex != ifindex)
1584                         continue;
1585                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1586                         continue;
1587                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1588                         continue;
1589                 dst_hold(&rt->u.dst);
1590                 break;
1591         }
1592 out:
1593         write_unlock_bh(&table->tb6_lock);
1594         return rt;
1595 }
1596
1597 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1598                                            struct in6_addr *gwaddr, int ifindex,
1599                                            unsigned pref)
1600 {
1601         struct fib6_config cfg = {
1602                 .fc_table       = RT6_TABLE_INFO,
1603                 .fc_metric      = 1024,
1604                 .fc_ifindex     = ifindex,
1605                 .fc_dst_len     = prefixlen,
1606                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1607                                   RTF_UP | RTF_PREF(pref),
1608         };
1609
1610         ipv6_addr_copy(&cfg.fc_dst, prefix);
1611         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1612
1613         /* We should treat it as a default route if prefix length is 0. */
1614         if (!prefixlen)
1615                 cfg.fc_flags |= RTF_DEFAULT;
1616
1617         ip6_route_add(&cfg);
1618
1619         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1620 }
1621 #endif
1622
1623 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1624 {       
1625         struct rt6_info *rt;
1626         struct fib6_table *table;
1627
1628         table = fib6_get_table(RT6_TABLE_DFLT);
1629         if (table == NULL)
1630                 return NULL;
1631
1632         write_lock_bh(&table->tb6_lock);
1633         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1634                 if (dev == rt->rt6i_dev &&
1635                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1636                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1637                         break;
1638         }
1639         if (rt)
1640                 dst_hold(&rt->u.dst);
1641         write_unlock_bh(&table->tb6_lock);
1642         return rt;
1643 }
1644
1645 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1646                                      struct net_device *dev,
1647                                      unsigned int pref)
1648 {
1649         struct fib6_config cfg = {
1650                 .fc_table       = RT6_TABLE_DFLT,
1651                 .fc_metric      = 1024,
1652                 .fc_ifindex     = dev->ifindex,
1653                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1654                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1655         };
1656
1657         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1658
1659         ip6_route_add(&cfg);
1660
1661         return rt6_get_dflt_router(gwaddr, dev);
1662 }
1663
1664 void rt6_purge_dflt_routers(void)
1665 {
1666         struct rt6_info *rt;
1667         struct fib6_table *table;
1668
1669         /* NOTE: Keep consistent with rt6_get_dflt_router */
1670         table = fib6_get_table(RT6_TABLE_DFLT);
1671         if (table == NULL)
1672                 return;
1673
1674 restart:
1675         read_lock_bh(&table->tb6_lock);
1676         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1677                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1678                         dst_hold(&rt->u.dst);
1679                         read_unlock_bh(&table->tb6_lock);
1680                         ip6_del_rt(rt);
1681                         goto restart;
1682                 }
1683         }
1684         read_unlock_bh(&table->tb6_lock);
1685 }
1686
1687 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1688                                  struct fib6_config *cfg)
1689 {
1690         memset(cfg, 0, sizeof(*cfg));
1691
1692         cfg->fc_table = RT6_TABLE_MAIN;
1693         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1694         cfg->fc_metric = rtmsg->rtmsg_metric;
1695         cfg->fc_expires = rtmsg->rtmsg_info;
1696         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1697         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1698         cfg->fc_flags = rtmsg->rtmsg_flags;
1699
1700         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1701         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1702         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1703 }
1704
1705 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1706 {
1707         struct fib6_config cfg;
1708         struct in6_rtmsg rtmsg;
1709         int err;
1710
1711         switch(cmd) {
1712         case SIOCADDRT:         /* Add a route */
1713         case SIOCDELRT:         /* Delete a route */
1714                 if (!capable(CAP_NET_ADMIN))
1715                         return -EPERM;
1716                 err = copy_from_user(&rtmsg, arg,
1717                                      sizeof(struct in6_rtmsg));
1718                 if (err)
1719                         return -EFAULT;
1720
1721                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1722
1723                 rtnl_lock();
1724                 switch (cmd) {
1725                 case SIOCADDRT:
1726                         err = ip6_route_add(&cfg);
1727                         break;
1728                 case SIOCDELRT:
1729                         err = ip6_route_del(&cfg);
1730                         break;
1731                 default:
1732                         err = -EINVAL;
1733                 }
1734                 rtnl_unlock();
1735
1736                 return err;
1737         };
1738
1739         return -EINVAL;
1740 }
1741
1742 /*
1743  *      Drop the packet on the floor
1744  */
1745
1746 static int ip6_pkt_discard(struct sk_buff *skb)
1747 {
1748         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1749         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1750                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1751
1752         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1753         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1754         kfree_skb(skb);
1755         return 0;
1756 }
1757
1758 static int ip6_pkt_discard_out(struct sk_buff *skb)
1759 {
1760         skb->dev = skb->dst->dev;
1761         return ip6_pkt_discard(skb);
1762 }
1763
1764 /*
1765  *      Allocate a dst for local (unicast / anycast) address.
1766  */
1767
1768 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1769                                     const struct in6_addr *addr,
1770                                     int anycast)
1771 {
1772         struct rt6_info *rt = ip6_dst_alloc();
1773
1774         if (rt == NULL)
1775                 return ERR_PTR(-ENOMEM);
1776
1777         dev_hold(&loopback_dev);
1778         in6_dev_hold(idev);
1779
1780         rt->u.dst.flags = DST_HOST;
1781         rt->u.dst.input = ip6_input;
1782         rt->u.dst.output = ip6_output;
1783         rt->rt6i_dev = &loopback_dev;
1784         rt->rt6i_idev = idev;
1785         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1786         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1787         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1788         rt->u.dst.obsolete = -1;
1789
1790         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1791         if (anycast)
1792                 rt->rt6i_flags |= RTF_ANYCAST;
1793         else
1794                 rt->rt6i_flags |= RTF_LOCAL;
1795         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1796         if (rt->rt6i_nexthop == NULL) {
1797                 dst_free((struct dst_entry *) rt);
1798                 return ERR_PTR(-ENOMEM);
1799         }
1800
1801         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1802         rt->rt6i_dst.plen = 128;
1803         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1804
1805         atomic_set(&rt->u.dst.__refcnt, 1);
1806
1807         return rt;
1808 }
1809
1810 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1811 {
1812         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1813             rt != &ip6_null_entry) {
1814                 RT6_TRACE("deleted by ifdown %p\n", rt);
1815                 return -1;
1816         }
1817         return 0;
1818 }
1819
/*
 * Hand fib6_ifdown() to fib6_clean_all() with @dev as its argument.
 * A NULL @dev makes fib6_ifdown() select every route except
 * ip6_null_entry.
 */
void rt6_ifdown(struct net_device *dev)
{
	fib6_clean_all(fib6_ifdown, 0, dev);
}
1824
/* Argument bundle passed through fib6_clean_all() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU value */
};
1830
1831 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1832 {
1833         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1834         struct inet6_dev *idev;
1835
1836         /* In IPv6 pmtu discovery is not optional,
1837            so that RTAX_MTU lock cannot disable it.
1838            We still use this lock to block changes
1839            caused by addrconf/ndisc.
1840         */
1841
1842         idev = __in6_dev_get(arg->dev);
1843         if (idev == NULL)
1844                 return 0;
1845
1846         /* For administrative MTU increase, there is no way to discover
1847            IPv6 PMTU increase, so PMTU increase should be updated here.
1848            Since RFC 1981 doesn't include administrative MTU increase
1849            update PMTU increase is a MUST. (i.e. jumbo frame)
1850          */
1851         /*
1852            If new MTU is less than route PMTU, this new MTU will be the
1853            lowest MTU in the path, update the route PMTU to reflect PMTU
1854            decreases; if new MTU is greater than route PMTU, and the
1855            old MTU is the lowest MTU in the path, update the route PMTU
1856            to reflect the increase. In this case if the other nodes' MTU
1857            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1858            PMTU discouvery.
1859          */
1860         if (rt->rt6i_dev == arg->dev &&
1861             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1862             (dst_mtu(&rt->u.dst) > arg->mtu ||
1863              (dst_mtu(&rt->u.dst) < arg->mtu &&
1864               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1865                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1866         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1867         return 0;
1868 }
1869
1870 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1871 {
1872         struct rt6_mtu_change_arg arg = {
1873                 .dev = dev,
1874                 .mtu = mtu,
1875         };
1876
1877         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1878 }
1879
1880 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1881         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1882         [RTA_OIF]               = { .type = NLA_U32 },
1883         [RTA_IIF]               = { .type = NLA_U32 },
1884         [RTA_PRIORITY]          = { .type = NLA_U32 },
1885         [RTA_METRICS]           = { .type = NLA_NESTED },
1886 };
1887
1888 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1889                               struct fib6_config *cfg)
1890 {
1891         struct rtmsg *rtm;
1892         struct nlattr *tb[RTA_MAX+1];
1893         int err;
1894
1895         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1896         if (err < 0)
1897                 goto errout;
1898
1899         err = -EINVAL;
1900         rtm = nlmsg_data(nlh);
1901         memset(cfg, 0, sizeof(*cfg));
1902
1903         cfg->fc_table = rtm->rtm_table;
1904         cfg->fc_dst_len = rtm->rtm_dst_len;
1905         cfg->fc_src_len = rtm->rtm_src_len;
1906         cfg->fc_flags = RTF_UP;
1907         cfg->fc_protocol = rtm->rtm_protocol;
1908
1909         if (rtm->rtm_type == RTN_UNREACHABLE)
1910                 cfg->fc_flags |= RTF_REJECT;
1911
1912         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1913         cfg->fc_nlinfo.nlh = nlh;
1914
1915         if (tb[RTA_GATEWAY]) {
1916                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1917                 cfg->fc_flags |= RTF_GATEWAY;
1918         }
1919
1920         if (tb[RTA_DST]) {
1921                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1922
1923                 if (nla_len(tb[RTA_DST]) < plen)
1924                         goto errout;
1925
1926                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1927         }
1928
1929         if (tb[RTA_SRC]) {
1930                 int plen = (rtm->rtm_src_len + 7) >> 3;
1931
1932                 if (nla_len(tb[RTA_SRC]) < plen)
1933                         goto errout;
1934
1935                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1936         }
1937
1938         if (tb[RTA_OIF])
1939                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1940
1941         if (tb[RTA_PRIORITY])
1942                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1943
1944         if (tb[RTA_METRICS]) {
1945                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1946                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1947         }
1948
1949         if (tb[RTA_TABLE])
1950                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1951
1952         err = 0;
1953 errout:
1954         return err;
1955 }
1956
1957 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1958 {
1959         struct fib6_config cfg;
1960         int err;
1961
1962         err = rtm_to_fib6_config(skb, nlh, &cfg);
1963         if (err < 0)
1964                 return err;
1965
1966         return ip6_route_del(&cfg);
1967 }
1968
1969 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1970 {
1971         struct fib6_config cfg;
1972         int err;
1973
1974         err = rtm_to_fib6_config(skb, nlh, &cfg);
1975         if (err < 0)
1976                 return err;
1977
1978         return ip6_route_add(&cfg);
1979 }
1980
1981 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1982                          struct in6_addr *dst, struct in6_addr *src,
1983                          int iif, int type, u32 pid, u32 seq,
1984                          int prefix, unsigned int flags)
1985 {
1986         struct rtmsg *rtm;
1987         struct nlmsghdr *nlh;
1988         struct rta_cacheinfo ci;
1989         u32 table;
1990
1991         if (prefix) {   /* user wants prefix routes only */
1992                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1993                         /* success since this is not a prefix route */
1994                         return 1;
1995                 }
1996         }
1997
1998         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1999         if (nlh == NULL)
2000                 return -ENOBUFS;
2001
2002         rtm = nlmsg_data(nlh);
2003         rtm->rtm_family = AF_INET6;
2004         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2005         rtm->rtm_src_len = rt->rt6i_src.plen;
2006         rtm->rtm_tos = 0;
2007         if (rt->rt6i_table)
2008                 table = rt->rt6i_table->tb6_id;
2009         else
2010                 table = RT6_TABLE_UNSPEC;
2011         rtm->rtm_table = table;
2012         NLA_PUT_U32(skb, RTA_TABLE, table);
2013         if (rt->rt6i_flags&RTF_REJECT)
2014                 rtm->rtm_type = RTN_UNREACHABLE;
2015         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2016                 rtm->rtm_type = RTN_LOCAL;
2017         else
2018                 rtm->rtm_type = RTN_UNICAST;
2019         rtm->rtm_flags = 0;
2020         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2021         rtm->rtm_protocol = rt->rt6i_protocol;
2022         if (rt->rt6i_flags&RTF_DYNAMIC)
2023                 rtm->rtm_protocol = RTPROT_REDIRECT;
2024         else if (rt->rt6i_flags & RTF_ADDRCONF)
2025                 rtm->rtm_protocol = RTPROT_KERNEL;
2026         else if (rt->rt6i_flags&RTF_DEFAULT)
2027                 rtm->rtm_protocol = RTPROT_RA;
2028
2029         if (rt->rt6i_flags&RTF_CACHE)
2030                 rtm->rtm_flags |= RTM_F_CLONED;
2031
2032         if (dst) {
2033                 NLA_PUT(skb, RTA_DST, 16, dst);
2034                 rtm->rtm_dst_len = 128;
2035         } else if (rtm->rtm_dst_len)
2036                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2037 #ifdef CONFIG_IPV6_SUBTREES
2038         if (src) {
2039                 NLA_PUT(skb, RTA_SRC, 16, src);
2040                 rtm->rtm_src_len = 128;
2041         } else if (rtm->rtm_src_len)
2042                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2043 #endif
2044         if (iif)
2045                 NLA_PUT_U32(skb, RTA_IIF, iif);
2046         else if (dst) {
2047                 struct in6_addr saddr_buf;
2048                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2049                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2050         }
2051
2052         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2053                 goto nla_put_failure;
2054
2055         if (rt->u.dst.neighbour)
2056                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2057
2058         if (rt->u.dst.dev)
2059                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2060
2061         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2062         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2063         if (rt->rt6i_expires)
2064                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2065         else
2066                 ci.rta_expires = 0;
2067         ci.rta_used = rt->u.dst.__use;
2068         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2069         ci.rta_error = rt->u.dst.error;
2070         ci.rta_id = 0;
2071         ci.rta_ts = 0;
2072         ci.rta_tsage = 0;
2073         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2074
2075         return nlmsg_end(skb, nlh);
2076
2077 nla_put_failure:
2078         return nlmsg_cancel(skb, nlh);
2079 }
2080
2081 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2082 {
2083         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2084         int prefix;
2085
2086         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2087                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2088                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2089         } else
2090                 prefix = 0;
2091
2092         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2093                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2094                      prefix, NLM_F_MULTI);
2095 }
2096
2097 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2098 {
2099         struct nlattr *tb[RTA_MAX+1];
2100         struct rt6_info *rt;
2101         struct sk_buff *skb;
2102         struct rtmsg *rtm;
2103         struct flowi fl;
2104         int err, iif = 0;
2105
2106         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2107         if (err < 0)
2108                 goto errout;
2109
2110         err = -EINVAL;
2111         memset(&fl, 0, sizeof(fl));
2112
2113         if (tb[RTA_SRC]) {
2114                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2115                         goto errout;
2116
2117                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2118         }
2119
2120         if (tb[RTA_DST]) {
2121                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2122                         goto errout;
2123
2124                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2125         }
2126
2127         if (tb[RTA_IIF])
2128                 iif = nla_get_u32(tb[RTA_IIF]);
2129
2130         if (tb[RTA_OIF])
2131                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2132
2133         if (iif) {
2134                 struct net_device *dev;
2135                 dev = __dev_get_by_index(iif);
2136                 if (!dev) {
2137                         err = -ENODEV;
2138                         goto errout;
2139                 }
2140         }
2141
2142         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2143         if (skb == NULL) {
2144                 err = -ENOBUFS;
2145                 goto errout;
2146         }
2147
2148         /* Reserve room for dummy headers, this skb can pass
2149            through good chunk of routing engine.
2150          */
2151         skb->mac.raw = skb->data;
2152         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2153
2154         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2155         skb->dst = &rt->u.dst;
2156
2157         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2158                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2159                             nlh->nlmsg_seq, 0, 0);
2160         if (err < 0) {
2161                 kfree_skb(skb);
2162                 goto errout;
2163         }
2164
2165         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2166 errout:
2167         return err;
2168 }
2169
2170 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2171 {
2172         struct sk_buff *skb;
2173         u32 pid = 0, seq = 0;
2174         struct nlmsghdr *nlh = NULL;
2175         int payload = sizeof(struct rtmsg) + 256;
2176         int err = -ENOBUFS;
2177
2178         if (info) {
2179                 pid = info->pid;
2180                 nlh = info->nlh;
2181                 if (nlh)
2182                         seq = nlh->nlmsg_seq;
2183         }
2184
2185         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2186         if (skb == NULL)
2187                 goto errout;
2188
2189         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2190         if (err < 0) {
2191                 kfree_skb(skb);
2192                 goto errout;
2193         }
2194
2195         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2196 errout:
2197         if (err < 0)
2198                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2199 }
2200
2201 /*
2202  *      /proc
2203  */
2204
2205 #ifdef CONFIG_PROC_FS
2206
/*
 * Nominal size in bytes of one record emitted by rt6_info_route();
 * used to turn a read offset into a number of records to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Walker state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer supplied by the reader */
	int offset;	/* requested read offset */
	int length;	/* requested read length */
	int skip;	/* records skipped so far to reach offset */
	int len;	/* bytes written into buffer so far */
};
2217
2218 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2219 {
2220         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2221         int i;
2222
2223         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2224                 arg->skip++;
2225                 return 0;
2226         }
2227
2228         if (arg->len >= arg->length)
2229                 return 0;
2230
2231         for (i=0; i<16; i++) {
2232                 sprintf(arg->buffer + arg->len, "%02x",
2233                         rt->rt6i_dst.addr.s6_addr[i]);
2234                 arg->len += 2;
2235         }
2236         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2237                             rt->rt6i_dst.plen);
2238
2239 #ifdef CONFIG_IPV6_SUBTREES
2240         for (i=0; i<16; i++) {
2241                 sprintf(arg->buffer + arg->len, "%02x",
2242                         rt->rt6i_src.addr.s6_addr[i]);
2243                 arg->len += 2;
2244         }
2245         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2246                             rt->rt6i_src.plen);
2247 #else
2248         sprintf(arg->buffer + arg->len,
2249                 "00000000000000000000000000000000 00 ");
2250         arg->len += 36;
2251 #endif
2252
2253         if (rt->rt6i_nexthop) {
2254                 for (i=0; i<16; i++) {
2255                         sprintf(arg->buffer + arg->len, "%02x",
2256                                 rt->rt6i_nexthop->primary_key[i]);
2257                         arg->len += 2;
2258                 }
2259         } else {
2260                 sprintf(arg->buffer + arg->len,
2261                         "00000000000000000000000000000000");
2262                 arg->len += 32;
2263         }
2264         arg->len += sprintf(arg->buffer + arg->len,
2265                             " %08x %08x %08x %08x %8s\n",
2266                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2267                             rt->u.dst.__use, rt->rt6i_flags, 
2268                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2269         return 0;
2270 }
2271
2272 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2273 {
2274         struct rt6_proc_arg arg = {
2275                 .buffer = buffer,
2276                 .offset = offset,
2277                 .length = length,
2278         };
2279
2280         fib6_clean_all(rt6_info_route, 0, &arg);
2281
2282         *start = buffer;
2283         if (offset)
2284                 *start += offset % RT6_INFO_LEN;
2285
2286         arg.len -= offset % RT6_INFO_LEN;
2287
2288         if (arg.len > length)
2289                 arg.len = length;
2290         if (arg.len < 0)
2291                 arg.len = 0;
2292
2293         return arg.len;
2294 }
2295
/*
 * seq_file show routine: print one line of seven hexadecimal counters,
 * six taken from rt6_stats plus the ip6_dst_ops entry count (sixth
 * column).  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
		      rt6_stats.fib_rt_cache,
		      atomic_read(&ip6_dst_ops.entries),
		      rt6_stats.fib_discarded_routes);

	return 0;
}
2307
/* Open routine: bind rt6_stats_seq_show() to the file via single_open(). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt6_stats_seq_show, NULL);
}
2312
/* File operations registered for the "rt6_stats" proc entry. */
static struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};
2320 #endif  /* CONFIG_PROC_FS */
2321
2322 #ifdef CONFIG_SYSCTL
2323
2324 static int flush_delay;
2325
2326 static
2327 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2328                               void __user *buffer, size_t *lenp, loff_t *ppos)
2329 {
2330         if (write) {
2331                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2332                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2333                 return 0;
2334         } else
2335                 return -EINVAL;
2336 }
2337
2338 ctl_table ipv6_route_table[] = {
2339         {
2340                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2341                 .procname       =       "flush",
2342                 .data           =       &flush_delay,
2343                 .maxlen         =       sizeof(int),
2344                 .mode           =       0200,
2345                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2346         },
2347         {
2348                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2349                 .procname       =       "gc_thresh",
2350                 .data           =       &ip6_dst_ops.gc_thresh,
2351                 .maxlen         =       sizeof(int),
2352                 .mode           =       0644,
2353                 .proc_handler   =       &proc_dointvec,
2354         },
2355         {
2356                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2357                 .procname       =       "max_size",
2358                 .data           =       &ip6_rt_max_size,
2359                 .maxlen         =       sizeof(int),
2360                 .mode           =       0644,
2361                 .proc_handler   =       &proc_dointvec,
2362         },
2363         {
2364                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2365                 .procname       =       "gc_min_interval",
2366                 .data           =       &ip6_rt_gc_min_interval,
2367                 .maxlen         =       sizeof(int),
2368                 .mode           =       0644,
2369                 .proc_handler   =       &proc_dointvec_jiffies,
2370                 .strategy       =       &sysctl_jiffies,
2371         },
2372         {
2373                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2374                 .procname       =       "gc_timeout",
2375                 .data           =       &ip6_rt_gc_timeout,
2376                 .maxlen         =       sizeof(int),
2377                 .mode           =       0644,
2378                 .proc_handler   =       &proc_dointvec_jiffies,
2379                 .strategy       =       &sysctl_jiffies,
2380         },
2381         {
2382                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2383                 .procname       =       "gc_interval",
2384                 .data           =       &ip6_rt_gc_interval,
2385                 .maxlen         =       sizeof(int),
2386                 .mode           =       0644,
2387                 .proc_handler   =       &proc_dointvec_jiffies,
2388                 .strategy       =       &sysctl_jiffies,
2389         },
2390         {
2391                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2392                 .procname       =       "gc_elasticity",
2393                 .data           =       &ip6_rt_gc_elasticity,
2394                 .maxlen         =       sizeof(int),
2395                 .mode           =       0644,
2396                 .proc_handler   =       &proc_dointvec_jiffies,
2397                 .strategy       =       &sysctl_jiffies,
2398         },
2399         {
2400                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2401                 .procname       =       "mtu_expires",
2402                 .data           =       &ip6_rt_mtu_expires,
2403                 .maxlen         =       sizeof(int),
2404                 .mode           =       0644,
2405                 .proc_handler   =       &proc_dointvec_jiffies,
2406                 .strategy       =       &sysctl_jiffies,
2407         },
2408         {
2409                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2410                 .procname       =       "min_adv_mss",
2411                 .data           =       &ip6_rt_min_advmss,
2412                 .maxlen         =       sizeof(int),
2413                 .mode           =       0644,
2414                 .proc_handler   =       &proc_dointvec_jiffies,
2415                 .strategy       =       &sysctl_jiffies,
2416         },
2417         {
2418                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2419                 .procname       =       "gc_min_interval_ms",
2420                 .data           =       &ip6_rt_gc_min_interval,
2421                 .maxlen         =       sizeof(int),
2422                 .mode           =       0644,
2423                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2424                 .strategy       =       &sysctl_ms_jiffies,
2425         },
2426         { .ctl_name = 0 }
2427 };
2428
2429 #endif
2430
2431 void __init ip6_route_init(void)
2432 {
2433         struct proc_dir_entry *p;
2434
2435         ip6_dst_ops.kmem_cachep =
2436                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2437                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2438         fib6_init();
2439 #ifdef  CONFIG_PROC_FS
2440         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2441         if (p)
2442                 p->owner = THIS_MODULE;
2443
2444         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2445 #endif
2446 #ifdef CONFIG_XFRM
2447         xfrm6_init();
2448 #endif
2449 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2450         fib6_rules_init();
2451 #endif
2452 }
2453
/*
 * Undo ip6_route_init(): remove the optional policy rules, the proc
 * entries and xfrm6, then hand every route to the ifdown walker,
 * stop the FIB garbage collector and destroy the rt6_info slab cache.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	fib6_rules_cleanup();
#endif
#ifdef CONFIG_PROC_FS
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
	xfrm6_fini();
#endif
	/* NULL device: fib6_ifdown() matches every route except ip6_null_entry. */
	rt6_ifdown(NULL);
	fib6_gc_cleanup();
	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}