2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
 
   3  *              operating system.  INET is implemented using the  BSD Socket
 
   4  *              interface as the means of communication with the user level.
 
   6  *              ROUTE - implementation of the IP router.
 
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 
  17  *              Alan Cox        :       Verify area fixes.
 
  18  *              Alan Cox        :       cli() protects routing changes
 
  19  *              Rui Oliveira    :       ICMP routing table updates
 
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
 
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
 
  22  *              Alan Cox        :       Added BSD route gw semantics
 
  23  *              Alan Cox        :       Super /proc >4K 
 
  24  *              Alan Cox        :       MTU in route table
 
  25  *              Alan Cox        :       MSS actually. Also added the window
 
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
 
  28  *              Alan Cox        :       Routing cache support.
 
  29  *              Alan Cox        :       Removed compatibility cruft.
 
  30  *              Alan Cox        :       RTF_REJECT support.
 
  31  *              Alan Cox        :       TCP irtt support.
 
  32  *              Jonathan Naylor :       Added Metric support.
 
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
 
  34  *      Miquel van Smoorenburg  :       Metrics.
 
  35  *              Alan Cox        :       Use __u32 properly
 
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
 
  37  *                                      our system is still very different.
 
  38  *              Alan Cox        :       Faster /proc handling
 
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 
  40  *                                      routing caches and better behaviour.
 
  42  *              Olaf Erb        :       irtt wasn't being copied right.
 
  43  *              Bjorn Ekwall    :       Kerneld route support.
 
  44  *              Alan Cox        :       Multicast fixed (I hope)
 
  45  *              Pavel Krauz     :       Limited broadcast fixed
 
  46  *              Mike McLagan    :       Routing by source
 
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 
  48  *                                      route.c and rewritten from scratch.
 
  49  *              Andi Kleen      :       Load-limit warning messages.
 
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 
  54  *              Marc Boucher    :       routing by fwmark
 
  55  *      Robert Olsson           :       Added rt_cache statistics
 
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 
  59  *              This program is free software; you can redistribute it and/or
 
  60  *              modify it under the terms of the GNU General Public License
 
  61  *              as published by the Free Software Foundation; either version
 
  62  *              2 of the License, or (at your option) any later version.
 
  65 #include <linux/config.h>
 
  66 #include <linux/module.h>
 
  67 #include <asm/uaccess.h>
 
  68 #include <asm/system.h>
 
  69 #include <linux/bitops.h>
 
  70 #include <linux/types.h>
 
  71 #include <linux/kernel.h>
 
  72 #include <linux/sched.h>
 
  74 #include <linux/bootmem.h>
 
  75 #include <linux/string.h>
 
  76 #include <linux/socket.h>
 
  77 #include <linux/sockios.h>
 
  78 #include <linux/errno.h>
 
  80 #include <linux/inet.h>
 
  81 #include <linux/netdevice.h>
 
  82 #include <linux/proc_fs.h>
 
  83 #include <linux/init.h>
 
  84 #include <linux/skbuff.h>
 
  85 #include <linux/rtnetlink.h>
 
  86 #include <linux/inetdevice.h>
 
  87 #include <linux/igmp.h>
 
  88 #include <linux/pkt_sched.h>
 
  89 #include <linux/mroute.h>
 
  90 #include <linux/netfilter_ipv4.h>
 
  91 #include <linux/random.h>
 
  92 #include <linux/jhash.h>
 
  93 #include <linux/rcupdate.h>
 
  94 #include <linux/times.h>
 
  95 #include <net/protocol.h>
 
  97 #include <net/route.h>
 
  98 #include <net/inetpeer.h>
 
 100 #include <net/ip_fib.h>
 
 103 #include <net/icmp.h>
 
 104 #include <net/xfrm.h>
 
 105 #include <net/ip_mp_alg.h>
 
 107 #include <linux/sysctl.h>
 
 /* RT_FL_TOS(oldflp): the fl4_tos field of the flow key pointed to by
  * oldflp, masked with (IPTOS_RT_MASK | RTO_ONLINK) and cast to u32.
  * The parameter is parenthesised so that any pointer-valued expression
  * (e.g. &fl or p + 1) expands correctly. */
 110 #define RT_FL_TOS(oldflp) \
 
 111     ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 
 113 #define IP_MAX_MTU      0xFFF0
 
 115 #define RT_GC_TIMEOUT (300*HZ)
 
 /* Route cache tunables.  Values written in terms of HZ are used as jiffies
  * intervals: further down they are added to jiffies-based timestamps. */
 
     /* rt_cache_flush() can replace the requested delay with this value. */
 117 static int ip_rt_min_delay              = 2 * HZ;
 
     /* rt_cache_flush() sets rt_deadline to now + ip_rt_max_delay when no
      * deadline is currently recorded. */
 118 static int ip_rt_max_delay              = 10 * HZ;
 
     /* rt_garbage_collect() compares the dst entry count against this limit.
      * No initializer here -- presumably assigned during init (TODO confirm). */
 119 static int ip_rt_max_size;
 
     /* Timeout handed to rt_may_expire() by rt_check_expire(); also the
      * ceiling for the adaptive 'expire' value in rt_garbage_collect(). */
 120 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 
     /* Period with which rt_check_expire() re-arms rt_periodic_timer. */
 121 static int ip_rt_gc_interval            = 60 * HZ;
 
     /* rt_garbage_collect() compares the time since its last run against
      * this interval, and uses it as the step added back to 'expire'. */
 122 static int ip_rt_gc_min_interval        = HZ / 2;
 
     /* The users of the redirect/error/MTU settings below are outside this
      * excerpt.  NOTE(review): the literal 9 in ip_rt_redirect_silence looks
      * tied to ip_rt_redirect_number -- confirm before changing either. */
 123 static int ip_rt_redirect_number        = 9;
 
 124 static int ip_rt_redirect_load          = HZ / 50;
 
 125 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 
 126 static int ip_rt_error_cost             = HZ;
 
 127 static int ip_rt_error_burst            = 5 * HZ;
 
     /* Chain-length threshold tested in rt_intern_hash(); also scales the
      * GC goal in rt_garbage_collect() (elasticity << rt_hash_log). */
 128 static int ip_rt_gc_elasticity          = 8;
 
 129 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 
 130 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 
 131 static int ip_rt_min_advmss             = 256;
 
     /* Period with which rt_secret_rebuild() re-arms rt_secret_timer. */
 132 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 
     /* Flush deadline maintained by rt_cache_flush(); that function treats
      * 0 as "no deadline recorded". */
 133 static unsigned long rt_deadline;
 
 135 #define RTprint(a...)   printk(KERN_DEBUG a)
 
 137 static struct timer_list rt_flush_timer;
 
 138 static struct timer_list rt_periodic_timer;
 
 139 static struct timer_list rt_secret_timer;
 
 142  *      Interface to generic destination cache.
 
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 
 146 static void              ipv4_dst_destroy(struct dst_entry *dst);
 
 147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 
 148                                          struct net_device *dev, int how);
 
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 
 152 static int rt_garbage_collect(void);
 
 /* dst_ops table for IPv4 routes: plugs the ipv4_* / ip_rt_* handlers and
  * rt_garbage_collect() declared above into the generic destination cache,
  * and reports sizeof(struct rtable) as the entry size.
  * NOTE(review): this listing does not show every line of the initializer. */
 155 static struct dst_ops ipv4_dst_ops = {
 
 157         .protocol =             __constant_htons(ETH_P_IP),
 
 158         .gc =                   rt_garbage_collect,
 
 159         .check =                ipv4_dst_check,
 
 160         .destroy =              ipv4_dst_destroy,
 
 161         .ifdown =               ipv4_dst_ifdown,
 
 162         .negative_advice =      ipv4_negative_advice,
 
 163         .link_failure =         ipv4_link_failure,
 
 164         .update_pmtu =          ip_rt_update_pmtu,
 
 165         .entry_size =           sizeof(struct rtable),
 
 168 #define ECN_OR_COST(class)      TC_PRIO_##class
 
 170 __u8 ip_tos2prio[16] = {
 
 174         ECN_OR_COST(BESTEFFORT),
 
 180         ECN_OR_COST(INTERACTIVE),
 
 182         ECN_OR_COST(INTERACTIVE),
 
 183         TC_PRIO_INTERACTIVE_BULK,
 
 184         ECN_OR_COST(INTERACTIVE_BULK),
 
 185         TC_PRIO_INTERACTIVE_BULK,
 
 186         ECN_OR_COST(INTERACTIVE_BULK)
 
 194 /* The locking scheme is rather straightforward:
 
 196  * 1) Read-Copy Update protects the buckets of the central route hash.
 
 197  * 2) Only writers remove entries, and they hold the lock
 
 198  *    as they look at rtable reference counts.
 
 199  * 3) Only readers acquire references to rtable entries,
 
 200  *    they do so with atomic increments and with the
 
 204 struct rt_hash_bucket {
 
 205         struct rtable   *chain;
 
 207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 
 210  * The size of this table is a power of two and depends on the number of CPUs.
 
 213 #define RT_HASH_LOCK_SZ 4096
 
 215 #define RT_HASH_LOCK_SZ 2048
 
 217 #define RT_HASH_LOCK_SZ 1024
 
 219 #define RT_HASH_LOCK_SZ 512
 
 221 #define RT_HASH_LOCK_SZ 256
 
 224 static spinlock_t       *rt_hash_locks;
 
 225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 
 226 # define rt_hash_lock_init()    { \
 
 228                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 
 229                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 
 230                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 
 231                         spin_lock_init(&rt_hash_locks[i]); \
 
 234 # define rt_hash_lock_addr(slot) NULL
 
 235 # define rt_hash_lock_init()
 
 238 static struct rt_hash_bucket    *rt_hash_table;
 
 239 static unsigned                 rt_hash_mask;
 
 240 static int                      rt_hash_log;
 
 241 static unsigned int             rt_hash_rnd;
 
 243 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 
 244 #define RT_CACHE_STAT_INC(field) \
 
 245         (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++)
 
 247 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 
 248                                 struct rtable **res);
 
 250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 
 252         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
 
 256 #ifdef CONFIG_PROC_FS
 
 257 struct rt_cache_iter_state {
 
 261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 
 263         struct rtable *r = NULL;
 
 264         struct rt_cache_iter_state *st = seq->private;
 
 266         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 
 268                 r = rt_hash_table[st->bucket].chain;
 
 271                 rcu_read_unlock_bh();
 
 276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 
 278         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 
 282                 rcu_read_unlock_bh();
 
 283                 if (--st->bucket < 0)
 
 286                 r = rt_hash_table[st->bucket].chain;
 
 291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 
 293         struct rtable *r = rt_cache_get_first(seq);
 
 296                 while (pos && (r = rt_cache_get_next(seq, r)))
 
 298         return pos ? NULL : r;
 
 301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 
 303         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 
 306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 308         struct rtable *r = NULL;
 
 310         if (v == SEQ_START_TOKEN)
 
 311                 r = rt_cache_get_first(seq);
 
 313                 r = rt_cache_get_next(seq, v);
 
 318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 
 320         if (v && v != SEQ_START_TOKEN)
 
 321                 rcu_read_unlock_bh();
 
 324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 
 326         if (v == SEQ_START_TOKEN)
 
 327                 seq_printf(seq, "%-127s\n",
 
 328                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 
 329                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 
 332                 struct rtable *r = v;
 
 335                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 
 336                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 
 337                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 
 338                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 
 339                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 
 340                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 
 341                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 
 342                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 
 343                         dst_metric(&r->u.dst, RTAX_WINDOW),
 
 344                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 
 345                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 
 347                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 
 348                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 
 351                 seq_printf(seq, "%-127s\n", temp);
 
 356 static struct seq_operations rt_cache_seq_ops = {
 
 357         .start  = rt_cache_seq_start,
 
 358         .next   = rt_cache_seq_next,
 
 359         .stop   = rt_cache_seq_stop,
 
 360         .show   = rt_cache_seq_show,
 
 363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 
 365         struct seq_file *seq;
 
 367         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 
 371         rc = seq_open(file, &rt_cache_seq_ops);
 
 374         seq          = file->private_data;
 
 376         memset(s, 0, sizeof(*s));
 
 384 static struct file_operations rt_cache_seq_fops = {
 
 385         .owner   = THIS_MODULE,
 
 386         .open    = rt_cache_seq_open,
 
 389         .release = seq_release_private,
 
 393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 
 398                 return SEQ_START_TOKEN;
 
 400         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 
 401                 if (!cpu_possible(cpu))
 
 404                 return &per_cpu(rt_cache_stat, cpu);
 
 409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 413         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 
 414                 if (!cpu_possible(cpu))
 
 417                 return &per_cpu(rt_cache_stat, cpu);
 
 423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 
 428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
 430         struct rt_cache_stat *st = v;
 
 432         if (v == SEQ_START_TOKEN) {
 
 433                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 
 437         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 
 438                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 
 439                    atomic_read(&ipv4_dst_ops.entries),
 
 462 static struct seq_operations rt_cpu_seq_ops = {
 
 463         .start  = rt_cpu_seq_start,
 
 464         .next   = rt_cpu_seq_next,
 
 465         .stop   = rt_cpu_seq_stop,
 
 466         .show   = rt_cpu_seq_show,
 
 470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 
 472         return seq_open(file, &rt_cpu_seq_ops);
 
 475 static struct file_operations rt_cpu_seq_fops = {
 
 476         .owner   = THIS_MODULE,
 
 477         .open    = rt_cpu_seq_open,
 
 480         .release = seq_release,
 
 483 #endif /* CONFIG_PROC_FS */
 
 /* Dispose of a route cache entry: call multipath_remove() on it, then hand
  * its dst rcu_head to call_rcu_bh() with dst_rcu_free as the callback. */
 485 static __inline__ void rt_free(struct rtable *rt)
 
 487         multipath_remove(rt);
 
 488         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 
 /* Like rt_free(): multipath_remove() followed by call_rcu_bh() with
  * dst_rcu_free as the callback.
  * NOTE(review): listing line 494, between the two calls, is not visible
  * here -- check the full source for what distinguishes this from rt_free(). */
 491 static __inline__ void rt_drop(struct rtable *rt)
 
 493         multipath_remove(rt);
 
 495         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 
 /* Returns non-zero when rth is a broadcast or multicast entry with a
  * non-zero input interface (fl.iif) and a successor in its hash chain
  * (u.rt_next); rt_may_expire() uses this to skip the tmo1 grace period. */
 498 static __inline__ int rt_fast_clean(struct rtable *rth)
 
 500         /* Kill broadcast/multicast entries very aggressively, if they
 
 501            collide in the hash table with more useful entries */
 
 502         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 
 503                 rth->fl.iif && rth->u.rt_next;
 
 506 static __inline__ int rt_valuable(struct rtable *rth)
 
 508         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 
 512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 
 517         if (atomic_read(&rth->u.dst.__refcnt))
 
 521         if (rth->u.dst.expires &&
 
 522             time_after_eq(jiffies, rth->u.dst.expires))
 
 525         age = jiffies - rth->u.dst.lastuse;
 
 527         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 
 528             (age <= tmo2 && rt_valuable(rth)))
 
 534 /* Bits of score are:
 
 536  * 30: not quite useless
 
 537  * 29..0: usage counter
 
 539 static inline u32 rt_score(struct rtable *rt)
 
 541         u32 score = jiffies - rt->u.dst.lastuse;
 
 543         score = ~score & ~(3<<30);
 
 549             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 
 555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 
 557         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 
 558                fl1->oif     == fl2->oif &&
 
 559                fl1->iif     == fl2->iif;
 
 562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
 563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 
 564                                                 struct rtable *expentry,
 
 567         int passedexpired = 0;
 
 568         struct rtable **nextstep = NULL;
 
 569         struct rtable **rthp = chain_head;
 
 575         while ((rth = *rthp) != NULL) {
 
 579                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 
 580                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 
 581                         if (*rthp == expentry) {
 
 582                                 *rthp = rth->u.rt_next;
 
 585                                 *rthp = rth->u.rt_next;
 
 591                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 
 592                             passedexpired && !nextstep)
 
 593                                 nextstep = &rth->u.rt_next;
 
 595                         rthp = &rth->u.rt_next;
 
 605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
 /* Periodic route cache scan.  Walks a number of hash buckets, starting just
  * after the position saved in the static 'rover', unlinking entries whose
  * explicit expiry has passed or which rt_may_expire() allows to go, and
  * finally re-arms rt_periodic_timer.  'dummy' is not used in the visible
  * lines.  NOTE(review): this listing omits some lines of the body (e.g. the
  * declaration of 'mult'), so read it together with the full source. */
 608 /* This runs via a timer and thus is always in BH context. */
 
 609 static void rt_check_expire(unsigned long dummy)
 
 611         static unsigned int rover;
 
 612         unsigned int i = rover, goal;
 
 613         struct rtable *rth, **rthp;
 
 614         unsigned long now = jiffies;
 
             /* Buckets to scan this run:
              * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout,
              * with the division skipped when the timeout is <= 1. */
 617         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 
 618         if (ip_rt_gc_timeout > 1)
 
 619                 do_div(mult, ip_rt_gc_timeout);
 
 620         goal = (unsigned int)mult;
 
             /* Never scan more than the whole table (rt_hash_mask + 1 buckets). */
 621         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 
 622         for (; goal > 0; goal--) {
 
 623                 unsigned long tmo = ip_rt_gc_timeout;
 
                     /* Advance to the next bucket, wrapping at the table size. */
 625                 i = (i + 1) & rt_hash_mask;
 
 626                 rthp = &rt_hash_table[i].chain;
 
                     /* Plain spin_lock(): see the BH-context note above. */
 630                 spin_lock(rt_hash_lock_addr(i));
 
 631                 while ((rth = *rthp) != NULL) {
 
 632                         if (rth->u.dst.expires) {
 
 633                                 /* Entry is expired even if it is in use */
 
                                     /* Explicit expiry not reached yet: keep it. */
 634                                 if (time_before_eq(now, rth->u.dst.expires)) {
 
 636                                         rthp = &rth->u.rt_next;
 
                             /* No explicit expiry: keep unless rt_may_expire() agrees. */
 639                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 
 641                                 rthp = &rth->u.rt_next;
 
 645                         /* Cleanup aged off entries. */
 
 646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
 647                         /* remove all related balanced entries if necessary */
 
 648                         if (rth->u.dst.flags & DST_BALANCED) {
 
 649                                 rthp = rt_remove_balanced_route(
 
 650                                         &rt_hash_table[i].chain,
 
 655                                 *rthp = rth->u.rt_next;
 
 658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
                             /* Unlink rth from the chain. */
 659                         *rthp = rth->u.rt_next;
 
 661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
 663                 spin_unlock(rt_hash_lock_addr(i));
 
 665                 /* Fallback loop breaker. */
 
 666                 if (time_after(jiffies, now))
 
             /* Schedule the next run ip_rt_gc_interval jiffies from now. */
 670         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 
 673 /* This can run from both BH and non-BH contexts, the latter
 
 674  * in the case of a forced flush event.
 
 676 static void rt_run_flush(unsigned long dummy)
 
 679         struct rtable *rth, *next;
 
 683         get_random_bytes(&rt_hash_rnd, 4);
 
 685         for (i = rt_hash_mask; i >= 0; i--) {
 
 686                 spin_lock_bh(rt_hash_lock_addr(i));
 
 687                 rth = rt_hash_table[i].chain;
 
 689                         rt_hash_table[i].chain = NULL;
 
 690                 spin_unlock_bh(rt_hash_lock_addr(i));
 
 692                 for (; rth; rth = next) {
 
 693                         next = rth->u.rt_next;
 
 699 static DEFINE_SPINLOCK(rt_flush_lock);
 
 701 void rt_cache_flush(int delay)
 
 703         unsigned long now = jiffies;
 
 704         int user_mode = !in_softirq();
 
 707                 delay = ip_rt_min_delay;
 
 709         /* flush existing multipath state*/
 
 712         spin_lock_bh(&rt_flush_lock);
 
 714         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 
 715                 long tmo = (long)(rt_deadline - now);
 
 717                 /* If flush timer is already running
 
 718                    and flush request is not immediate (delay > 0):
 
 720                    if the deadline has not been reached, extend the timer to "delay",
 
 721                    otherwise fire it at deadline time.
 
 724                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 
 732                 spin_unlock_bh(&rt_flush_lock);
 
 737         if (rt_deadline == 0)
 
 738                 rt_deadline = now + ip_rt_max_delay;
 
 740         mod_timer(&rt_flush_timer, now+delay);
 
 741         spin_unlock_bh(&rt_flush_lock);
 
 744 static void rt_secret_rebuild(unsigned long dummy)
 
 746         unsigned long now = jiffies;
 
 749         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 
 753    Short description of GC goals.
 
 755    We want to build an algorithm that keeps the routing cache
 
 756    at an equilibrium point, where the number of aged-off entries
 
 757    is kept approximately equal to the number of newly generated ones.
 
 759    The current expiration strength is the variable "expire".
 
 760    We try to adjust it dynamically, so that when networking
 
 761    is idle, expire is large enough to keep enough warm entries,
 
 762    and when load increases, it is reduced to limit the cache size.
 
 765 static int rt_garbage_collect(void)
 
 767         static unsigned long expire = RT_GC_TIMEOUT;
 
 768         static unsigned long last_gc;
 
 770         static int equilibrium;
 
 771         struct rtable *rth, **rthp;
 
 772         unsigned long now = jiffies;
 
 776          * Garbage collection is pretty expensive,
 
 777          * do not make it too frequently.
 
 780         RT_CACHE_STAT_INC(gc_total);
 
 782         if (now - last_gc < ip_rt_gc_min_interval &&
 
 783             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 
 784                 RT_CACHE_STAT_INC(gc_ignored);
 
 788         /* Calculate the number of entries that we want to expire now. */
 
 789         goal = atomic_read(&ipv4_dst_ops.entries) -
 
 790                 (ip_rt_gc_elasticity << rt_hash_log);
 
 792                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 
 793                         equilibrium = ipv4_dst_ops.gc_thresh;
 
 794                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 
 796                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 
 797                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 
 800                 /* We are in dangerous area. Try to reduce cache really
 
 803                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 
 804                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 
 807         if (now - last_gc >= ip_rt_gc_min_interval)
 
 818                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 
 819                         unsigned long tmo = expire;
 
 821                         k = (k + 1) & rt_hash_mask;
 
 822                         rthp = &rt_hash_table[k].chain;
 
 823                         spin_lock_bh(rt_hash_lock_addr(k));
 
 824                         while ((rth = *rthp) != NULL) {
 
 825                                 if (!rt_may_expire(rth, tmo, expire)) {
 
 827                                         rthp = &rth->u.rt_next;
 
 830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
 831                                 /* remove all related balanced entries
 
 834                                 if (rth->u.dst.flags & DST_BALANCED) {
 
 837                                         rthp = rt_remove_balanced_route(
 
 838                                                 &rt_hash_table[k].chain,
 
 845                                         *rthp = rth->u.rt_next;
 
 849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
 850                                 *rthp = rth->u.rt_next;
 
 853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
 855                         spin_unlock_bh(rt_hash_lock_addr(k));
 
 864                 /* Goal is not achieved. We stop the process if:
 
 866                    - expire is reduced to zero. Otherwise, expire is halved.
 
 867                    - the table is not full.
 
 868                    - we are called from an interrupt.
 
 869                    - the jiffies check is just a fallback/debug loop breaker.
 
 870                      We will not spin here for a long time in any case.
 
 873                 RT_CACHE_STAT_INC(gc_goal_miss);
 
 879 #if RT_CACHE_DEBUG >= 2
 
 880                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 
 881                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 
 884                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 
 886         } while (!in_softirq() && time_before_eq(jiffies, now));
 
 888         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 
 891                 printk(KERN_WARNING "dst cache overflow\n");
 
 892         RT_CACHE_STAT_INC(gc_dst_overflow);
 
 896         expire += ip_rt_gc_min_interval;
 
 897         if (expire > ip_rt_gc_timeout ||
 
 898             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 
 899                 expire = ip_rt_gc_timeout;
 
 900 #if RT_CACHE_DEBUG >= 2
 
 901         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 
 902                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 
 907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 
 909         struct rtable   *rth, **rthp;
 
 911         struct rtable *cand, **candp;
 
 914         int attempts = !in_softirq();
 
 923         rthp = &rt_hash_table[hash].chain;
 
 925         spin_lock_bh(rt_hash_lock_addr(hash));
 
 926         while ((rth = *rthp) != NULL) {
 
 927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
 928                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 
 929                     compare_keys(&rth->fl, &rt->fl)) {
 
 931                 if (compare_keys(&rth->fl, &rt->fl)) {
 
 934                         *rthp = rth->u.rt_next;
 
 936                          * Since lookup is lockfree, the deletion
 
 937                          * must be visible to another weakly ordered CPU before
 
 938                          * the insertion at the start of the hash chain.
 
 940                         rcu_assign_pointer(rth->u.rt_next,
 
 941                                            rt_hash_table[hash].chain);
 
 943                          * Since lookup is lockfree, the update writes
 
 944                          * must be ordered for consistency on SMP.
 
 946                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
 949                         dst_hold(&rth->u.dst);
 
 950                         rth->u.dst.lastuse = now;
 
 951                         spin_unlock_bh(rt_hash_lock_addr(hash));
 
 958                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 
 959                         u32 score = rt_score(rth);
 
 961                         if (score <= min_score) {
 
 970                 rthp = &rth->u.rt_next;
 
 974                 /* ip_rt_gc_elasticity used to be the average chain
 
 975                  * length; when it is exceeded, gc becomes really aggressive.
 
 977                  * The second limit is less certain. At the moment it allows
 
 978                  * only 2 entries per bucket. We will see.
 
 980                 if (chain_length > ip_rt_gc_elasticity) {
 
 981                         *candp = cand->u.rt_next;
 
 986         /* Try to bind the route to ARP only if it is an output
 
 987            route or a unicast forwarding path.
 
 989         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 
 990                 int err = arp_bind_neighbour(&rt->u.dst);
 
 992                         spin_unlock_bh(rt_hash_lock_addr(hash));
 
 994                         if (err != -ENOBUFS) {
 
 999                         /* Neighbour tables are full and nothing
 
1000                            can be released. Try to shrink the route cache;
 
1001                            it most likely holds some neighbour records.
 
1003                         if (attempts-- > 0) {
 
1004                                 int saved_elasticity = ip_rt_gc_elasticity;
 
1005                                 int saved_int = ip_rt_gc_min_interval;
 
1006                                 ip_rt_gc_elasticity     = 1;
 
1007                                 ip_rt_gc_min_interval   = 0;
 
1008                                 rt_garbage_collect();
 
1009                                 ip_rt_gc_min_interval   = saved_int;
 
1010                                 ip_rt_gc_elasticity     = saved_elasticity;
 
1014                         if (net_ratelimit())
 
1015                                 printk(KERN_WARNING "Neighbour table overflow.\n");
 
1021         rt->u.rt_next = rt_hash_table[hash].chain;
 
1022 #if RT_CACHE_DEBUG >= 2
 
1023         if (rt->u.rt_next) {
 
1025                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 
1026                        NIPQUAD(rt->rt_dst));
 
1027                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 
1028                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 
1032         rt_hash_table[hash].chain = rt;
 
1033         spin_unlock_bh(rt_hash_lock_addr(hash));
 
1038 void rt_bind_peer(struct rtable *rt, int create)
 
1040         static DEFINE_SPINLOCK(rt_peer_lock);
 
1041         struct inet_peer *peer;
 
1043         peer = inet_getpeer(rt->rt_dst, create);
 
1045         spin_lock_bh(&rt_peer_lock);
 
1046         if (rt->peer == NULL) {
 
1050         spin_unlock_bh(&rt_peer_lock);
 
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However,
 
1057  * we can still generate some output.
 
1058  * Random ID selection looks a bit dangerous because we have no chance of
 
1059  * selecting an ID that stays unique for a reasonable period of time.
 
1060  * But a broken packet identifier may be better than no packet at all.
 
/*
 * ip_select_fb_ident - fallback generator for the IP ID of @iph.
 *
 * Under ip_fb_id_lock, passes (ip_fallback_id ^ iph->daddr) to
 * secure_ip_id(), writes the low 16 bits of the result to iph->id via
 * htons(), and keeps the full result in the static ip_fallback_id so the
 * next call starts from it.
 */
1062 static void ip_select_fb_ident(struct iphdr *iph)
 
1064         static DEFINE_SPINLOCK(ip_fb_id_lock);
 
1065         static u32 ip_fallback_id;
 
1068         spin_lock_bh(&ip_fb_id_lock);
 
1069         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
 
1070         iph->id = htons(salt & 0xFFFF);
 
1071         ip_fallback_id = salt;
 
1072         spin_unlock_bh(&ip_fb_id_lock);
 
/*
 * __ip_select_ident - choose the IP identification field for @iph.
 * @iph:  header whose id field is written.
 * @dst:  route used by the packet; cast to struct rtable.
 * @more: forwarded unchanged to inet_getid().
 *
 * When the route has no peer, rt_bind_peer(rt, 1) is called first.  The ID
 * is then taken from inet_getid() on rt->peer; the statement at the end of
 * the function hands the header to ip_select_fb_ident() instead.
 * NOTE(review): the conditions guarding each step and any early returns are
 * not visible in this listing -- confirm the exact control flow.
 */
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
1077         struct rtable *rt = (struct rtable *) dst;
 
1080                 if (rt->peer == NULL)
 
1081                         rt_bind_peer(rt, 1);
 
1083                 /* If peer is attached to destination, it is never detached,
 
1084                    so that we need not to grab a lock to dereference it.
 
1087                         iph->id = htons(inet_getid(rt->peer, more));
 
1091                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 
 
1092                        __builtin_return_address(0));
 
1094         ip_select_fb_ident(iph);
 
/*
 * rt_del - unlink @rt from bucket @hash of the route cache.
 *
 * The bucket's spinlock (rt_hash_lock_addr(hash)) is held with
 * spin_lock_bh() while the chain is walked through a pointer-to-link
 * cursor, so the entry can be spliced out by storing rt->u.rt_next into
 * the link that referenced it.
 * NOTE(review): the test selecting the link to rewrite and whatever is
 * done with the removed entry are not visible in this listing.
 */
1097 static void rt_del(unsigned hash, struct rtable *rt)
 
1099         struct rtable **rthp;
 
1101         spin_lock_bh(rt_hash_lock_addr(hash));
 
1103         for (rthp = &rt_hash_table[hash].chain; *rthp;
 
1104              rthp = &(*rthp)->u.rt_next)
 
1106                         *rthp = rt->u.rt_next;
 
1110         spin_unlock_bh(rt_hash_lock_addr(hash));
 
/*
 * ip_rt_redirect - process a redirect that advises @new_gw in place of
 * @old_gw for traffic to @daddr.
 * @old_gw: gateway the matching cache entries must currently use.
 * @daddr:  destination the redirect is about.
 * @new_gw: advised gateway.
 * @saddr:  source address of the affected flow.
 * @tos:    type of service; masked with IPTOS_RT_MASK before use.
 * @dev:    device used for the in_device lookup and entry matching.
 *
 * Visible flow:
 *  - Jump to reject_redirect when new_gw equals old_gw, when
 *    IN_DEV_RX_REDIRECTS(in_dev) is false, or when new_gw is a multicast,
 *    bad-class or zero-net address.
 *  - When the medium is not shared, new_gw must pass inet_addr_onlink()
 *    against old_gw, and with IN_DEV_SEC_REDIRECTS a non-zero
 *    ip_fib_check_default(new_gw, dev) also rejects.  A further check
 *    requires inet_addr_type(new_gw) == RTN_UNICAST (NOTE(review): this is
 *    presumably the shared-media branch; the "else" is not visible).
 *  - Four cache keys are probed: source in {saddr, 0} crossed with output
 *    interface in {dev->ifindex, 0}.  Entries whose flow key matches and
 *    that route daddr/saddr via old_gw on dev are used as templates: a new
 *    dst is allocated, its reference/usage fields are reinitialised, it is
 *    flagged RTCF_REDIRECTED with rt_gateway = new_gw, a neighbour is bound,
 *    and it is inserted with rt_intern_hash() into the same bucket.
 *  - With CONFIG_IP_ROUTE_VERBOSE, ignored redirects are logged when
 *    IN_DEV_LOG_MARTIANS(in_dev) and net_ratelimit() allow it.
 * NOTE(review): several statements (copy of the template, error paths,
 * reference drops) are not visible in this listing.
 */
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 
1114                     u32 saddr, u8 tos, struct net_device *dev)
 
1117         struct in_device *in_dev = in_dev_get(dev);
 
1118         struct rtable *rth, **rthp;
 
1119         u32  skeys[2] = { saddr, 0 };
 
1120         int  ikeys[2] = { dev->ifindex, 0 };
 
1122         tos &= IPTOS_RT_MASK;
 
1127         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
 
1128             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
 
1129                 goto reject_redirect;
 
1131         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 
1132                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 
1133                         goto reject_redirect;
 
1134                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 
1135                         goto reject_redirect;
 
1137                 if (inet_addr_type(new_gw) != RTN_UNICAST)
 
1138                         goto reject_redirect;
 
1141         for (i = 0; i < 2; i++) {
 
1142                 for (k = 0; k < 2; k++) {
 
1143                         unsigned hash = rt_hash_code(daddr,
 
1144                                                      skeys[i] ^ (ikeys[k] << 5),
 
1147                         rthp=&rt_hash_table[hash].chain;
 
1150                         while ((rth = rcu_dereference(*rthp)) != NULL) {
 
1153                                 if (rth->fl.fl4_dst != daddr ||
 
1154                                     rth->fl.fl4_src != skeys[i] ||
 
1155                                     rth->fl.fl4_tos != tos ||
 
1156                                     rth->fl.oif != ikeys[k] ||
 
1158                                         rthp = &rth->u.rt_next;
 
1162                                 if (rth->rt_dst != daddr ||
 
1163                                     rth->rt_src != saddr ||
 
1165                                     rth->rt_gateway != old_gw ||
 
1166                                     rth->u.dst.dev != dev)
 
1169                                 dst_hold(&rth->u.dst);
 
1172                                 rt = dst_alloc(&ipv4_dst_ops);
 
1179                                 /* Copy all the information. */
 
1181                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
 
1182                                 rt->u.dst.__use         = 1;
 
1183                                 atomic_set(&rt->u.dst.__refcnt, 1);
 
1184                                 rt->u.dst.child         = NULL;
 
1186                                         dev_hold(rt->u.dst.dev);
 
1188                                         in_dev_hold(rt->idev);
 
1189                                 rt->u.dst.obsolete      = 0;
 
1190                                 rt->u.dst.lastuse       = jiffies;
 
1191                                 rt->u.dst.path          = &rt->u.dst;
 
1192                                 rt->u.dst.neighbour     = NULL;
 
1193                                 rt->u.dst.hh            = NULL;
 
1194                                 rt->u.dst.xfrm          = NULL;
 
1196                                 rt->rt_flags            |= RTCF_REDIRECTED;
 
1198                                 /* Gateway is different ... */
 
1199                                 rt->rt_gateway          = new_gw;
 
1201                                 /* Redirect received -> path was valid */
 
1202                                 dst_confirm(&rth->u.dst);
 
1205                                         atomic_inc(&rt->peer->refcnt);
 
1207                                 if (arp_bind_neighbour(&rt->u.dst) ||
 
1208                                     !(rt->u.dst.neighbour->nud_state &
 
1210                                         if (rt->u.dst.neighbour)
 
1211                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
 
1218                                 if (!rt_intern_hash(hash, rt, &rt))
 
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
 
1232         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
 
1233                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
 
1234                         "%u.%u.%u.%u ignored.\n"
 
1235                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
 
1237                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
 
1238                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
 
/*
 * ipv4_negative_advice - react to negative advice about an IPv4 route.
 * @dst: cached route entry; also the initial value of the returned pointer.
 *
 * Two cases are visible: the entry is marked obsolete, or it carries
 * RTCF_REDIRECTED / a non-zero expiry.  In the second case the entry's hash
 * bucket is computed from its flow key and, with RT_CACHE_DEBUG >= 1, a
 * "redirect to ... dropped" debug message is printed.
 * NOTE(review): what is done with the entry in either case, and the value
 * finally returned, are not visible in this listing.
 */
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
1245         struct rtable *rt = (struct rtable*)dst;
 
1246         struct dst_entry *ret = dst;
 
1249                 if (dst->obsolete) {
 
1252                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 
1253                            rt->u.dst.expires) {
 
1254                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
 
1258 #if RT_CACHE_DEBUG >= 1
 
1259                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
 
1260                                           "%u.%u.%u.%u/%02x dropped\n",
 
1261                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
 
1272  *      1. The first ip_rt_redirect_number redirects are sent
 
1273  *         with exponential backoff, then we stop sending them at all,
 
1274  *         assuming that the host ignores our redirects.
 
1275  *      2. If we did not see packets requiring redirects
 
1276  *         during ip_rt_redirect_silence, we assume that the host
 
1277  *         forgot redirected route and start to send redirects again.
 
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
 
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 
/*
 * ip_rt_send_redirect - send an ICMP host redirect for @skb, throttled per
 * cached route.
 *
 * The route attached to skb->dst carries the throttle state:
 * rate_tokens counts redirects already sent and rate_last is a jiffies
 * timestamp.
 *  - IN_DEV_TX_REDIRECTS(in_dev) is checked first.
 *  - rate_tokens is reset to 0 once more than ip_rt_redirect_silence
 *    jiffies have passed since rate_last.
 *  - When rate_tokens has reached ip_rt_redirect_number, only rate_last is
 *    refreshed.
 *  - Otherwise a redirect naming rt->rt_gateway is sent once jiffies is
 *    past rate_last + (ip_rt_redirect_load << rate_tokens), so the required
 *    gap doubles with every redirect sent; rate_last and rate_tokens are
 *    then updated.
 *  - With CONFIG_IP_ROUTE_VERBOSE a warning is logged when the counter
 *    reaches ip_rt_redirect_number and IN_DEV_LOG_MARTIANS is set.
 */
1286 void ip_rt_send_redirect(struct sk_buff *skb)
 
1288         struct rtable *rt = (struct rtable*)skb->dst;
 
1289         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
 
1294         if (!IN_DEV_TX_REDIRECTS(in_dev))
 
1297         /* No redirected packets during ip_rt_redirect_silence;
 
1298          * reset the algorithm.
 
1300         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
 
1301                 rt->u.dst.rate_tokens = 0;
 
1303         /* Too many ignored redirects; do not send anything
 
1304          * set u.dst.rate_last to the last seen redirected packet.
 
1306         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
 
1307                 rt->u.dst.rate_last = jiffies;
 
1311         /* Check for load limit; set rate_last to the latest sent
 
1314         if (time_after(jiffies,
 
1315                        (rt->u.dst.rate_last +
 
1316                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
 
1317                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 
1318                 rt->u.dst.rate_last = jiffies;
 
1319                 ++rt->u.dst.rate_tokens;
 
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
 
1321                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
 
1322                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
 
1324                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
 
1325                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
 
1326                                 NIPQUAD(rt->rt_src), rt->rt_iif,
 
1327                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
 
/*
 * ip_error - answer @skb with a rate-limited ICMP destination-unreachable.
 *
 * The ICMP code is selected by a switch on rt->u.dst.error; host
 * unreachable, net unreachable and packet filtered are the visible choices.
 * Throttling is a token bucket kept in the route: (now - rate_last) is
 * added to rate_tokens, the total is capped at ip_rt_error_burst, and an
 * ICMP is sent only if at least ip_rt_error_cost tokens are available, in
 * which case that many are consumed.  The skb is freed at the "out" label.
 */
1334 static int ip_error(struct sk_buff *skb)
 
1336         struct rtable *rt = (struct rtable*)skb->dst;
 
1340         switch (rt->u.dst.error) {
 
1345                         code = ICMP_HOST_UNREACH;
 
1348                         code = ICMP_NET_UNREACH;
 
1351                         code = ICMP_PKT_FILTERED;
 
1356         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
 
1357         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
 
1358                 rt->u.dst.rate_tokens = ip_rt_error_burst;
 
1359         rt->u.dst.rate_last = now;
 
1360         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
 
1361                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
 
1362                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 
1365 out:    kfree_skb(skb);
 
1370  *      The last two values are not from the RFC but
 
1371  *      are needed for AMPRnet AX.25 paths.
 
/*
 * Candidate path MTUs ("plateaus"), in strictly descending order.
 * guess_mtu() depends on that ordering: it returns the first entry that is
 * smaller than its argument.
 */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * guess_mtu - pick the next plateau strictly below @old_mtu.
 * @old_mtu: size that is known to be too large for the path.
 *
 * Returns the largest table entry that is strictly smaller than @old_mtu.
 * When @old_mtu is already at or below the smallest plateau, returns 68,
 * the smallest MTU this file accepts elsewhere (see the "< 68" / ">= 68"
 * checks in ip_rt_frag_needed() and ip_rt_update_pmtu()).
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
 
/*
 * ip_rt_frag_needed - apply a "fragmentation needed" MTU hint to matching
 * route cache entries.
 * @iph:     header of the packet that could not be forwarded; saddr, daddr,
 *           tos, tot_len and ihl are read.
 * @new_mtu: MTU reported for the path.
 *
 * Returns est_mtu when it is non-zero, otherwise @new_mtu.
 *
 * ipv4_config.no_pmtu_disc is tested first (its consequence is not visible
 * in this listing).  The cache is then probed with source keys
 * {iph->saddr, 0}; an entry qualifies when its flow key and addresses match
 * and its MTU metric is not locked.  For such an entry:
 *  - if new_mtu is below 68 or not smaller than the original packet length
 *    (old_mtu), a value is derived with guess_mtu(); the "BSD 4.2" branch
 *    first subtracts the IP header length from old_mtu;
 *  - if the resulting mtu is below the entry's current MTU metric, the
 *    entry is confirmed, mtu is raised to ip_rt_min_pmtu when smaller (the
 *    lock metric is also updated in that case), the MTU metric is stored and
 *    an expiry is set with dst_set_expires().
 * NOTE(review): where est_mtu is assigned is not visible in this listing.
 */
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 
1390         unsigned short old_mtu = ntohs(iph->tot_len);
 
1392         u32  skeys[2] = { iph->saddr, 0, };
 
1393         u32  daddr = iph->daddr;
 
1394         u8   tos = iph->tos & IPTOS_RT_MASK;
 
1395         unsigned short est_mtu = 0;
 
1397         if (ipv4_config.no_pmtu_disc)
 
1400         for (i = 0; i < 2; i++) {
 
1401                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
 
1404                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 
1405                      rth = rcu_dereference(rth->u.rt_next)) {
 
1406                         if (rth->fl.fl4_dst == daddr &&
 
1407                             rth->fl.fl4_src == skeys[i] &&
 
1408                             rth->rt_dst  == daddr &&
 
1409                             rth->rt_src  == iph->saddr &&
 
1410                             rth->fl.fl4_tos == tos &&
 
1412                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
 
1413                                 unsigned short mtu = new_mtu;
 
1415                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
 
1417                                         /* BSD 4.2 compatibility hack :-( */
 
1419                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
 
1420                                             old_mtu >= 68 + (iph->ihl << 2))
 
1421                                                 old_mtu -= iph->ihl << 2;
 
1423                                         mtu = guess_mtu(old_mtu);
 
1425                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
 
1426                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
 
1427                                                 dst_confirm(&rth->u.dst);
 
1428                                                 if (mtu < ip_rt_min_pmtu) {
 
1429                                                         mtu = ip_rt_min_pmtu;
 
1430                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
 
1433                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
 
1434                                                 dst_set_expires(&rth->u.dst,
 
1443         return est_mtu ? : new_mtu;
 
/*
 * ip_rt_update_pmtu - lower the MTU metric of @dst to @mtu.
 *
 * Acts only when @mtu is below the current RTAX_MTU metric, is at least
 * 68, and the MTU metric is not locked.  A value below ip_rt_min_pmtu is
 * raised to ip_rt_min_pmtu and the RTAX_MTU bit is set in the lock metric.
 * The metric is then stored and dst_set_expires() is called with
 * ip_rt_mtu_expires.
 */
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 
1448         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
 
1449             !(dst_metric_locked(dst, RTAX_MTU))) {
 
1450                 if (mtu < ip_rt_min_pmtu) {
 
1451                         mtu = ip_rt_min_pmtu;
 
1452                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
 
1454                 dst->metrics[RTAX_MTU-1] = mtu;
 
1455                 dst_set_expires(dst, ip_rt_mtu_expires);
 
/*
 * ipv4_dst_check - takes a dst entry and a cookie and returns a dst entry.
 * NOTE(review): the body is not visible in this listing; confirm its
 * behaviour against the full file before relying on it.
 */
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 
/*
 * ipv4_dst_destroy - destroy hook for an IPv4 route entry.
 *
 * Snapshots the route's peer and idev pointers into locals.
 * NOTE(review): the rest of the body is not visible in this listing;
 * presumably those two references are released -- confirm.
 */
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
 
1466         struct rtable *rt = (struct rtable *) dst;
 
1467         struct inet_peer *peer = rt->peer;
 
1468         struct in_device *idev = rt->idev;
 
/*
 * ipv4_dst_ifdown - detach a route's idev from a device.
 *
 * If @dev is not the loopback device and the route's idev belongs to @dev,
 * a reference to loopback_dev's in_device is taken and, when that succeeds,
 * stored in rt->idev in place of the old one.
 * NOTE(review): the remaining parameter(s) and the handling of the
 * previous idev reference are not visible in this listing.
 */
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
1484         struct rtable *rt = (struct rtable *) dst;
 
1485         struct in_device *idev = rt->idev;
 
1486         if (dev != &loopback_dev && idev && idev->dev == dev) {
 
1487                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
 
1488                 if (loopback_idev) {
 
1489                         rt->idev = loopback_idev;
 
/*
 * ipv4_link_failure - link-failure handler for @skb.
 *
 * Sends an ICMP destination-unreachable / host-unreachable for the packet,
 * then calls dst_set_expires(..., 0) on the route attached to skb->dst.
 * NOTE(review): the guard around the expiry call is not visible in this
 * listing.
 */
1495 static void ipv4_link_failure(struct sk_buff *skb)
 
1499         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
1501         rt = (struct rtable *) skb->dst;
 
1503                 dst_set_expires(&rt->u.dst, 0);
 
/*
 * ip_rt_bug - handler that only logs the packet it was given.
 *
 * Prints the packet's source and destination addresses and the name of
 * skb->dev ("?" when there is none) at KERN_DEBUG.  Elsewhere in this file
 * it is installed as the output handler of input-side cache entries
 * (rth->u.dst.output = ip_rt_bug).
 */
1506 static int ip_rt_bug(struct sk_buff *skb)
 
1508         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
 
1509                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
 
1510                 skb->dev ? skb->dev->name : "?");
 
1516    We do not cache source address of outgoing interface,
 
1517    because it is used only by IP RR, TS and SRR options,
 
1518    so that it is out of the fast path.
 
1520    BTW remember: "addr" is allowed to be unaligned
 
/*
 * ip_rt_get_source - write a 4-byte source address for @rt to @addr.
 * @addr: destination buffer; written with memcpy(), so it need not be
 *        aligned.
 * @rt:   cached route.
 *
 * Three cases are visible: rt->fl.iif == 0 (its assignment is not visible
 * in this listing); a successful fib_lookup() on the route's flow key, which
 * uses FIB_RES_PREFSRC(res); and otherwise inet_select_addr() on the
 * route's device and gateway.
 */
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
1527         struct fib_result res;
 
1529         if (rt->fl.iif == 0)
 
1531         else if (fib_lookup(&rt->fl, &res) == 0) {
 
1532                 src = FIB_RES_PREFSRC(res);
 
1535                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
 
1537         memcpy(addr, &src, 4);
 
1540 #ifdef CONFIG_NET_CLS_ROUTE
 
/*
 * set_class_tag - merge @tag into the route's tclassid without overwriting.
 *
 * The low 16 bits and the high 16 bits of rt->u.dst.tclassid are handled
 * independently: each half is taken from @tag only when that half of
 * tclassid is currently zero.
 */
1541 static void set_class_tag(struct rtable *rt, u32 tag)
 
1543         if (!(rt->u.dst.tclassid & 0xFFFF))
 
1544                 rt->u.dst.tclassid |= tag & 0xFFFF;
 
1545         if (!(rt->u.dst.tclassid & 0xFFFF0000))
 
1546                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
 
/*
 * rt_set_nexthop - fill the nexthop-derived fields of a new cache entry.
 * @rt:   entry being initialised; rt->u.dst.dev is read for its mtu.
 * @res:  FIB lookup result; res->fi may supply gateway, metrics, class id.
 * @itag: class tag merged with set_class_tag() under CONFIG_NET_CLS_ROUTE.
 *
 * In the branch that uses fi (its condition is not visible in this
 * listing): a nexthop gateway with link scope becomes rt_gateway, the
 * metrics array is copied from fi->fib_metrics, and when fi->fib_mtu is 0
 * the MTU metric falls back to the device MTU -- lowered to 576 if the MTU
 * metric is locked, rt_gateway differs from rt_dst and the device MTU is
 * above 576.  The other branch uses the device MTU directly.
 *
 * Defaults and caps applied afterwards: a zero hop limit becomes
 * sysctl_ip_default_ttl; MTU is capped at IP_MAX_MTU; a zero ADVMSS is set
 * from a max_t() whose first operand is (device MTU - 40) (second operand
 * not visible here) and ADVMSS is capped at 65535 - 40.  Finally rt_type is
 * copied from res->type.
 */
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 
1552         struct fib_info *fi = res->fi;
 
1555                 if (FIB_RES_GW(*res) &&
 
1556                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 
1557                         rt->rt_gateway = FIB_RES_GW(*res);
 
1558                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
 
1559                        sizeof(rt->u.dst.metrics));
 
1560                 if (fi->fib_mtu == 0) {
 
1561                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
 
1562                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
 
1563                             rt->rt_gateway != rt->rt_dst &&
 
1564                             rt->u.dst.dev->mtu > 576)
 
1565                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
 
1567 #ifdef CONFIG_NET_CLS_ROUTE
 
1568                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 
1571                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
 
1573         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
 
1574                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
 
1575         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
 
1576                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
 
1577         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
 
1578                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
 
1580         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
 
1581                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
 
1583 #ifdef CONFIG_NET_CLS_ROUTE
 
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
 
1585         set_class_tag(rt, fib_rules_tclass(res));
 
1587         set_class_tag(rt, itag);
 
1589         rt->rt_type = res->type;
 
/*
 * ip_route_input_mc - build and cache an input route for a multicast
 * destination.
 * @skb:   received packet; the cached route is stored in skb->dst.
 * @daddr: multicast destination address.
 * @saddr: source address.
 * @tos:   type of service used in the flow key and hash.
 * @dev:   receiving device.
 * @our:   NOTE(review): presumably selects local delivery (the
 *         ip_local_deliver / RTCF_LOCAL assignments); the test itself is
 *         not visible in this listing.
 *
 * Source checks: multicast, bad-class and loopback sources, and skbs whose
 * protocol is not ETH_P_IP, fail the first test.  A zero-net source is
 * checked against LOCAL_MCAST(daddr) and gets spec_dst from
 * inet_select_addr(dev, 0, RT_SCOPE_LINK); any other source goes through
 * fib_validate_source().
 *
 * The new entry is keyed on (daddr, saddr, tos, dev->ifindex), bound to
 * loopback_dev (device and in_device references taken), typed
 * RTN_MULTICAST with RTCF_MULTICAST, and has ip_rt_bug as output handler.
 * With CONFIG_IP_MROUTE, non-local groups on interfaces with
 * IN_DEV_MFORWARD use ip_mr_input as input handler.  in_slow_mc is
 * counted and the entry is inserted with rt_intern_hash(), whose result is
 * returned.
 */
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
 
1593                                 u8 tos, struct net_device *dev, int our)
 
1598         struct in_device *in_dev = in_dev_get(dev);
 
1601         /* Primary sanity checks. */
 
1606         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
 
1607             skb->protocol != htons(ETH_P_IP))
 
1610         if (ZERONET(saddr)) {
 
1611                 if (!LOCAL_MCAST(daddr))
 
1613                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 
1614         } else if (fib_validate_source(saddr, 0, tos, 0,
 
1615                                         dev, &spec_dst, &itag) < 0)
 
1618         rth = dst_alloc(&ipv4_dst_ops);
 
1622         rth->u.dst.output= ip_rt_bug;
 
1624         atomic_set(&rth->u.dst.__refcnt, 1);
 
1625         rth->u.dst.flags= DST_HOST;
 
1626         if (in_dev->cnf.no_policy)
 
1627                 rth->u.dst.flags |= DST_NOPOLICY;
 
1628         rth->fl.fl4_dst = daddr;
 
1629         rth->rt_dst     = daddr;
 
1630         rth->fl.fl4_tos = tos;
 
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
 
1632         rth->fl.fl4_fwmark= skb->nfmark;
 
1634         rth->fl.fl4_src = saddr;
 
1635         rth->rt_src     = saddr;
 
1636 #ifdef CONFIG_NET_CLS_ROUTE
 
1637         rth->u.dst.tclassid = itag;
 
1640         rth->fl.iif     = dev->ifindex;
 
1641         rth->u.dst.dev  = &loopback_dev;
 
1642         dev_hold(rth->u.dst.dev);
 
1643         rth->idev       = in_dev_get(rth->u.dst.dev);
 
1645         rth->rt_gateway = daddr;
 
1646         rth->rt_spec_dst= spec_dst;
 
1647         rth->rt_type    = RTN_MULTICAST;
 
1648         rth->rt_flags   = RTCF_MULTICAST;
 
1650                 rth->u.dst.input= ip_local_deliver;
 
1651                 rth->rt_flags |= RTCF_LOCAL;
 
1654 #ifdef CONFIG_IP_MROUTE
 
1655         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
 
1656                 rth->u.dst.input = ip_mr_input;
 
1658         RT_CACHE_STAT_INC(in_slow_mc);
 
1661         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 
1662         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
 
/*
 * ip_handle_martian_source - account for, and optionally log, a packet
 * with a martian source address.
 *
 * Always increments the in_martian_src statistic.  With
 * CONFIG_IP_ROUTE_VERBOSE, and when IN_DEV_LOG_MARTIANS(in_dev) and
 * net_ratelimit() allow, a warning is printed; if the device has a
 * link-layer header (hard_header_len != 0) and skb->mac.raw is set, an
 * "ll header: " line follows and the header is walked byte by byte for
 * hard_header_len bytes.
 *
 * NOTE(review): the message reads "martian source A from B" but the
 * arguments are passed as (daddr, saddr), so A is the destination address
 * and B the source -- confirm this ordering is intended.
 */
1674 static void ip_handle_martian_source(struct net_device *dev,
 
1675                                      struct in_device *in_dev,
 
1676                                      struct sk_buff *skb,
 
1680         RT_CACHE_STAT_INC(in_martian_src);
 
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
 
1682         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
 
1684                  *      RFC1812 recommendation, if source is martian,
 
1685                  *      the only hint is MAC header.
 
1687                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
 
1688                         "%u.%u.%u.%u, on dev %s\n",
 
1689                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
 
1690                 if (dev->hard_header_len && skb->mac.raw) {
 
1692                         unsigned char *p = skb->mac.raw;
 
1693                         printk(KERN_WARNING "ll header: ");
 
1694                         for (i = 0; i < dev->hard_header_len; i++, p++) {
 
1696                                 if (i < (dev->hard_header_len - 1))
 
/*
 * __mkroute_input - build a forwarding route cache entry for a received
 * packet (the callers in this file insert it into the hash table).
 * @skb:    received packet; protocol and nfmark are read.
 * @res:    FIB lookup result describing the nexthop.
 * @in_dev: in_device of the receiving interface.
 * @daddr:  destination address.
 * @saddr:  source address.
 * @tos:    type of service.
 * @result: NOTE(review): presumably receives the new entry; the store is
 *          not visible in this listing.
 *
 * Visible steps:
 *  - A working reference to the output device's in_device is taken from
 *    FIB_RES_DEV(*res) and dropped with in_dev_put() at the end; a NULL
 *    result is reported as a bug (rate limited).
 *  - The source is validated with fib_validate_source(); the failure path
 *    calls ip_handle_martian_source().
 *  - RTCF_DIRECTSRC may be set; RTCF_DOREDIRECT is set when the packet
 *    would leave through the interface it arrived on, err is non-zero, no
 *    NAT/MASQ flag is present, and either the medium is shared or saddr is
 *    on-link with the nexthop gateway.
 *  - Non-IP packets (e.g. ARP) get an extra test when out_dev == in_dev
 *    and RTCF_DNAT is not set.
 *  - The new dst starts with refcount 1 and DST_HOST, plus DST_BALANCED
 *    (multipath cached, several nexthops), DST_NOPOLICY and DST_NOXFRM as
 *    configured on in_dev.  Its keys are daddr/saddr/tos (and fwmark), the
 *    device is out_dev's device with device and in_device references
 *    taken, input is ip_forward and output is ip_output.  rt_set_nexthop()
 *    fills nexthop data and rt_flags receives the accumulated flags.
 */
1705 static inline int __mkroute_input(struct sk_buff *skb, 
 
1706                                   struct fib_result* res, 
 
1707                                   struct in_device *in_dev, 
 
1708                                   u32 daddr, u32 saddr, u32 tos, 
 
1709                                   struct rtable **result) 
 
1714         struct in_device *out_dev;
 
1718         /* get a working reference to the output device */
 
1719         out_dev = in_dev_get(FIB_RES_DEV(*res));
 
1720         if (out_dev == NULL) {
 
1721                 if (net_ratelimit())
 
1722                         printk(KERN_CRIT "Bug in ip_route_input" \
 
1723                                "_slow(). Please, report\n");
 
1728         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
 
1729                                   in_dev->dev, &spec_dst, &itag);
 
1731                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
 
1739                 flags |= RTCF_DIRECTSRC;
 
1741         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
 
1742             (IN_DEV_SHARED_MEDIA(out_dev) ||
 
1743              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
 
1744                 flags |= RTCF_DOREDIRECT;
 
1746         if (skb->protocol != htons(ETH_P_IP)) {
 
1747                 /* Not IP (i.e. ARP). Do not create route, if it is
 
1748                  * invalid for proxy arp. DNAT routes are always valid.
 
1750                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
 
1757         rth = dst_alloc(&ipv4_dst_ops);
 
1763         atomic_set(&rth->u.dst.__refcnt, 1);
 
1764         rth->u.dst.flags= DST_HOST;
 
1765 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
1766         if (res->fi->fib_nhs > 1)
 
1767                 rth->u.dst.flags |= DST_BALANCED;
 
1769         if (in_dev->cnf.no_policy)
 
1770                 rth->u.dst.flags |= DST_NOPOLICY;
 
1771         if (in_dev->cnf.no_xfrm)
 
1772                 rth->u.dst.flags |= DST_NOXFRM;
 
1773         rth->fl.fl4_dst = daddr;
 
1774         rth->rt_dst     = daddr;
 
1775         rth->fl.fl4_tos = tos;
 
1776 #ifdef CONFIG_IP_ROUTE_FWMARK
 
1777         rth->fl.fl4_fwmark= skb->nfmark;
 
1779         rth->fl.fl4_src = saddr;
 
1780         rth->rt_src     = saddr;
 
1781         rth->rt_gateway = daddr;
 
1783                 rth->fl.iif     = in_dev->dev->ifindex;
 
1784         rth->u.dst.dev  = (out_dev)->dev;
 
1785         dev_hold(rth->u.dst.dev);
 
1786         rth->idev       = in_dev_get(rth->u.dst.dev);
 
1788         rth->rt_spec_dst= spec_dst;
 
1790         rth->u.dst.input = ip_forward;
 
1791         rth->u.dst.output = ip_output;
 
1793         rt_set_nexthop(rth, res, itag);
 
1795         rth->rt_flags = flags;
 
1800         /* release the working reference to the output device */
 
1801         in_dev_put(out_dev);
 
/*
 * ip_mkroute_input_def - create and cache a single input route.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, fib_select_multipath() first picks one
 * nexthop when the result has more than one and fl->oif is 0.  The entry
 * is built by __mkroute_input() and inserted with rt_intern_hash() into
 * the bucket for (daddr, saddr ^ (fl->iif << 5), tos); the cached route is
 * stored in skb->dst and rt_intern_hash()'s return value is returned.
 * NOTE(review): the handling of an error from __mkroute_input() is not
 * visible in this listing.
 */
1805 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
 
1806                                        struct fib_result* res, 
 
1807                                        const struct flowi *fl,
 
1808                                        struct in_device *in_dev,
 
1809                                        u32 daddr, u32 saddr, u32 tos)
 
1811         struct rtable* rth = NULL;
 
1815 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
1816         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
 
1817                 fib_select_multipath(fl, res);
 
1820         /* create a routing cache entry */
 
1821         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
 
1825         /* put it into the cache */
 
1826         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
 
1827         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
 
/*
 * ip_mkroute_input - create input route cache entries for a FIB result.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is just
 * ip_mkroute_input_def().
 *
 * With it, the hop count is taken from res->fi->fib_nhs.  The single-path
 * case is delegated to ip_mkroute_input_def(); otherwise one entry per hop
 * is built with __mkroute_input(), inserted with rt_intern_hash() under the
 * same (daddr, saddr ^ (fl->iif << 5), tos) hash, and announced to the
 * multipath implementation through multipath_set_nhinfo().  skb->dst is
 * finally pointed at the last interned result (rtres).
 * NOTE(review): per-hop nexthop selection, error handling and reference
 * drops are not visible in this listing.
 */
1830 static inline int ip_mkroute_input(struct sk_buff *skb, 
 
1831                                    struct fib_result* res, 
 
1832                                    const struct flowi *fl,
 
1833                                    struct in_device *in_dev,
 
1834                                    u32 daddr, u32 saddr, u32 tos)
 
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
1837         struct rtable* rth = NULL, *rtres;
 
1838         unsigned char hop, hopcount;
 
1843                 hopcount = res->fi->fib_nhs;
 
1847         /* distinguish between multipath and singlepath */
 
1849                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
 
1852         /* add all alternatives to the routing cache */
 
1853         for (hop = 0; hop < hopcount; hop++) {
 
1856                 /* put reference to previous result */
 
1860                 /* create a routing cache entry */
 
1861                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
 
1866                 /* put it into the cache */
 
1867                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
 
1868                 err = rt_intern_hash(hash, rth, &rtres);
 
1872                 /* forward hop information to multipath impl. */
 
1873                 multipath_set_nhinfo(rth,
 
1874                                      FIB_RES_NETWORK(*res),
 
1875                                      FIB_RES_NETMASK(*res),
 
1879         skb->dst = &rtres->u.dst;
 
1881 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
 
1882         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
 
1883 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
 
1888  *      NOTE. We drop all the packets that have local source
 
1889  *      addresses, because every properly looped back packet
 
1890  *      must have correct destination already attached by output routine.
 
1892  *      Such approach solves two big problems:
 
1893  *      1. Non-simplex devices are handled properly.
 
1894  *      2. IP spoofing attempts are filtered with 100% of guarantee.
 
/*
 * ip_route_input_slow - resolve the input route for a received packet
 * through the FIB (ip_route_input() falls back to this function).
 * @skb:   received packet; the resulting route is stored in skb->dst.
 * @daddr: destination address.
 * @saddr: source address.
 * @tos:   type of service.
 * @dev:   receiving device.
 *
 * Visible sequence:
 *  - Multicast, bad-class and loopback sources go to martian_source.
 *  - daddr == 0xFFFFFFFF, or saddr and daddr both zero, is tested as one
 *    case (NOTE(review): presumably the broadcast path; target not visible).
 *  - Bad-class, zero-net and loopback destinations go to
 *    martian_destination.
 *  - fib_lookup() is performed; on failure IN_DEV_FORWARD(in_dev) is
 *    consulted.  in_slow_tot is counted on success.
 *  - RTN_BROADCAST and RTN_LOCAL results are handled here; for RTN_LOCAL
 *    the source is validated against loopback_dev.ifindex and
 *    RTCF_DIRECTSRC may be set.
 *  - Anything else requires forwarding to be enabled and an RTN_UNICAST
 *    result, and is passed to ip_mkroute_input().
 *  - The broadcast path requires an ETH_P_IP skb, validates the source,
 *    sets RTCF_BROADCAST and counts in_brd.
 *  - Locally terminated entries are bound to loopback_dev with
 *    ip_local_deliver as input, ip_rt_bug as output and RTCF_LOCAL set; for
 *    RTN_UNREACHABLE the input handler becomes ip_error, dst.error is -err
 *    and RTCF_LOCAL is cleared.  The entry is inserted with
 *    rt_intern_hash().
 *  - The no-route path counts in_no_route, selects spec_dst with universe
 *    scope and sets res.type = RTN_UNREACHABLE.
 *  - Martian destinations are counted, optionally logged
 *    (CONFIG_IP_ROUTE_VERBOSE) and yield -EHOSTUNREACH; martian sources
 *    are handed to ip_handle_martian_source().
 * NOTE(review): labels, gotos and cleanup between these steps are not
 * visible in this listing.
 */
1897 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
 
1898                                u8 tos, struct net_device *dev)
 
1900         struct fib_result res;
 
1901         struct in_device *in_dev = in_dev_get(dev);
 
1902         struct flowi fl = { .nl_u = { .ip4_u =
 
1906                                         .scope = RT_SCOPE_UNIVERSE,
 
1907 #ifdef CONFIG_IP_ROUTE_FWMARK
 
1908                                         .fwmark = skb->nfmark
 
1911                             .iif = dev->ifindex };
 
1914         struct rtable * rth;
 
1920         /* IP on this device is disabled. */
 
1925         /* Check for the most weird martians, which can be not detected
 
1929         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
 
1930                 goto martian_source;
 
1932         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
 
1935         /* Accept zero addresses only to limited broadcast;
 
1936          * I even do not know to fix it or not. Waiting for complains :-)
 
1939                 goto martian_source;
 
1941         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
 
1942                 goto martian_destination;
 
1945          *      Now we are ready to route packet.
 
1947         if ((err = fib_lookup(&fl, &res)) != 0) {
 
1948                 if (!IN_DEV_FORWARD(in_dev))
 
1954         RT_CACHE_STAT_INC(in_slow_tot);
 
1956         if (res.type == RTN_BROADCAST)
 
1959         if (res.type == RTN_LOCAL) {
 
1961                 result = fib_validate_source(saddr, daddr, tos,
 
1962                                              loopback_dev.ifindex,
 
1963                                              dev, &spec_dst, &itag);
 
1965                         goto martian_source;
 
1967                         flags |= RTCF_DIRECTSRC;
 
1972         if (!IN_DEV_FORWARD(in_dev))
 
1974         if (res.type != RTN_UNICAST)
 
1975                 goto martian_destination;
 
1977         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
 
1978         if (err == -ENOBUFS)
 
1990         if (skb->protocol != htons(ETH_P_IP))
 
1994                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 
1996                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 
1999                         goto martian_source;
 
2001                         flags |= RTCF_DIRECTSRC;
 
2003         flags |= RTCF_BROADCAST;
 
2004         res.type = RTN_BROADCAST;
 
2005         RT_CACHE_STAT_INC(in_brd);
 
2008         rth = dst_alloc(&ipv4_dst_ops);
 
2012         rth->u.dst.output= ip_rt_bug;
 
2014         atomic_set(&rth->u.dst.__refcnt, 1);
 
2015         rth->u.dst.flags= DST_HOST;
 
2016         if (in_dev->cnf.no_policy)
 
2017                 rth->u.dst.flags |= DST_NOPOLICY;
 
2018         rth->fl.fl4_dst = daddr;
 
2019         rth->rt_dst     = daddr;
 
2020         rth->fl.fl4_tos = tos;
 
2021 #ifdef CONFIG_IP_ROUTE_FWMARK
 
2022         rth->fl.fl4_fwmark= skb->nfmark;
 
2024         rth->fl.fl4_src = saddr;
 
2025         rth->rt_src     = saddr;
 
2026 #ifdef CONFIG_NET_CLS_ROUTE
 
2027         rth->u.dst.tclassid = itag;
 
2030         rth->fl.iif     = dev->ifindex;
 
2031         rth->u.dst.dev  = &loopback_dev;
 
2032         dev_hold(rth->u.dst.dev);
 
2033         rth->idev       = in_dev_get(rth->u.dst.dev);
 
2034         rth->rt_gateway = daddr;
 
2035         rth->rt_spec_dst= spec_dst;
 
2036         rth->u.dst.input= ip_local_deliver;
 
2037         rth->rt_flags   = flags|RTCF_LOCAL;
 
2038         if (res.type == RTN_UNREACHABLE) {
 
2039                 rth->u.dst.input= ip_error;
 
2040                 rth->u.dst.error= -err;
 
2041                 rth->rt_flags   &= ~RTCF_LOCAL;
 
2043         rth->rt_type    = res.type;
 
2044         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
 
2045         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 
2049         RT_CACHE_STAT_INC(in_no_route);
 
2050         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 
2051         res.type = RTN_UNREACHABLE;
 
2055          *      Do not cache martian addresses: they should be logged (RFC1812)
 
2057 martian_destination:
 
2058         RT_CACHE_STAT_INC(in_martian_dst);
 
2059 #ifdef CONFIG_IP_ROUTE_VERBOSE
 
2060         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
 
2061                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
 
2062                         "%u.%u.%u.%u, dev %s\n",
 
2063                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
 
2067         err = -EHOSTUNREACH;
 
2079         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
 
/*
 * ip_route_input - attach an input route to @skb, using the route cache
 * when possible.
 * @skb:   received packet; skb->dst is set on a cache hit.
 * @daddr: destination address.
 * @saddr: source address.
 * @tos:   type of service; masked with IPTOS_RT_MASK before use.
 * @dev:   receiving device; its ifindex is part of the lookup key.
 *
 * Cache lookup: the chain for hash (daddr, saddr ^ (iif << 5), tos) is
 * walked with rcu_dereference().  An entry matches on fl4_dst, fl4_src,
 * iif, tos (and fwmark with CONFIG_IP_ROUTE_FWMARK; one further condition
 * is not visible in this listing).  On a hit lastuse is refreshed, a
 * reference is taken, in_hit is counted and the entry is stored in
 * skb->dst; each non-matching entry counts in_hlist_search.
 *
 * On a miss, multicast destinations are checked with ip_check_mc() on the
 * device's in_device (obtained with __in_dev_get_rcu()) and, when accepted
 * -- or, with CONFIG_IP_MROUTE, when the group is non-local and the
 * interface forwards multicast -- are handed to ip_route_input_mc().
 * Everything else goes to ip_route_input_slow().
 */
2083 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
 
2084                    u8 tos, struct net_device *dev)
 
2086         struct rtable * rth;
 
2088         int iif = dev->ifindex;
 
2090         tos &= IPTOS_RT_MASK;
 
2091         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 
2094         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 
2095              rth = rcu_dereference(rth->u.rt_next)) {
 
2096                 if (rth->fl.fl4_dst == daddr &&
 
2097                     rth->fl.fl4_src == saddr &&
 
2098                     rth->fl.iif == iif &&
 
2100 #ifdef CONFIG_IP_ROUTE_FWMARK
 
2101                     rth->fl.fl4_fwmark == skb->nfmark &&
 
2103                     rth->fl.fl4_tos == tos) {
 
2104                         rth->u.dst.lastuse = jiffies;
 
2105                         dst_hold(&rth->u.dst);
 
2107                         RT_CACHE_STAT_INC(in_hit);
 
2109                         skb->dst = (struct dst_entry*)rth;
 
2112                 RT_CACHE_STAT_INC(in_hlist_search);
 
2116         /* Multicast recognition logic is moved from route cache to here.
 
2117            The problem was that too many Ethernet cards have broken/missing
 
2118            hardware multicast filters :-( As result the host on multicasting
 
2119            network acquires a lot of useless route cache entries, sort of
 
2120            SDR messages from all the world. Now we try to get rid of them.
 
2121            Really, provided software IP multicast filter is organized
 
2122            reasonably (at least, hashed), it does not result in a slowdown
 
2123            comparing with route cache reject entries.
 
2124            Note, that multicast routers are not affected, because
 
2125            route cache entry is created eventually.
 
2127         if (MULTICAST(daddr)) {
 
2128                 struct in_device *in_dev;
 
2131                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
 
2132                         int our = ip_check_mc(in_dev, daddr, saddr,
 
2133                                 skb->nh.iph->protocol);
 
2135 #ifdef CONFIG_IP_MROUTE
 
2136                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
 
2140                                 return ip_route_input_mc(skb, daddr, saddr,
 
2147         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 
/*
 * __mkroute_output - allocate and fill an output route cache entry.
 *
 * The destination in @fl decides the route class: 0xFFFFFFFF forces
 * RTN_BROADCAST and a multicast address forces RTN_MULTICAST in res->type.
 * A loopback @dev_out adds RTCF_LOCAL.  Broadcast and multicast routes get
 * RTCF_LOCAL as well; for multicast it is cleared again when ip_check_mc()
 * returns zero for the caller's destination/source.
 *
 * The entry comes from dst_alloc(&ipv4_dst_ops) with its refcount set to 1
 * and DST_HOST set.  The cache key (rth->fl) is copied from @oldflp, while
 * rt_dst, rt_src, rt_gateway and rt_spec_dst start from the resolved @fl.
 * Output defaults to ip_output; RTCF_LOCAL routes deliver through
 * ip_local_deliver, and local broadcast/multicast routes on a non-loopback
 * device send through ip_mc_output.  rt_set_nexthop() is then called on the
 * entry and the accumulated flags are stored in rt_flags.
 *
 * A working reference on the in_device of @dev_out is taken with
 * in_dev_get() and released before returning.
 * NOTE(review): the error exits and the store through @result are not in
 * the lines shown here - confirm against the full source.
 */
2150 static inline int __mkroute_output(struct rtable **result,
 
2151                                    struct fib_result* res, 
 
2152                                    const struct flowi *fl,
 
2153                                    const struct flowi *oldflp, 
 
2154                                    struct net_device *dev_out, 
 
2158         struct in_device *in_dev;
 
2159         u32 tos = RT_FL_TOS(oldflp);
 
2162         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
 
2165         if (fl->fl4_dst == 0xFFFFFFFF)
 
2166                 res->type = RTN_BROADCAST;
 
2167         else if (MULTICAST(fl->fl4_dst))
 
2168                 res->type = RTN_MULTICAST;
 
2169         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
 
2172         if (dev_out->flags & IFF_LOOPBACK)
 
2173                 flags |= RTCF_LOCAL;
 
2175         /* get work reference to inet device */
 
2176         in_dev = in_dev_get(dev_out);
 
2180         if (res->type == RTN_BROADCAST) {
 
2181                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
 
2183                         fib_info_put(res->fi);
 
2186         } else if (res->type == RTN_MULTICAST) {
 
2187                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
 
2188                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
 
2190                         flags &= ~RTCF_LOCAL;
 
2191                 /* If multicast route do not exist use
 
2192                    default one, but do not gateway in this case.
 
2195                 if (res->fi && res->prefixlen < 4) {
 
2196                         fib_info_put(res->fi);
 
2202         rth = dst_alloc(&ipv4_dst_ops);
 
2208         atomic_set(&rth->u.dst.__refcnt, 1);
 
2209         rth->u.dst.flags= DST_HOST;
 
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
2212                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
 
2213                 if (res->fi->fib_nhs > 1)
 
2214                         rth->u.dst.flags |= DST_BALANCED;
 
2217         if (in_dev->cnf.no_xfrm)
 
2218                 rth->u.dst.flags |= DST_NOXFRM;
 
2219         if (in_dev->cnf.no_policy)
 
2220                 rth->u.dst.flags |= DST_NOPOLICY;
 
2222         rth->fl.fl4_dst = oldflp->fl4_dst;
 
2223         rth->fl.fl4_tos = tos;
 
2224         rth->fl.fl4_src = oldflp->fl4_src;
 
2225         rth->fl.oif     = oldflp->oif;
 
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
 
2227         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
 
2229         rth->rt_dst     = fl->fl4_dst;
 
2230         rth->rt_src     = fl->fl4_src;
 
2231         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
 
2232         /* get references to the devices that are to be hold by the routing 
 
2234         rth->u.dst.dev  = dev_out;
 
2236         rth->idev       = in_dev_get(dev_out);
 
2237         rth->rt_gateway = fl->fl4_dst;
 
2238         rth->rt_spec_dst= fl->fl4_src;
 
2240         rth->u.dst.output=ip_output;
 
2242         RT_CACHE_STAT_INC(out_slow_tot);
 
2244         if (flags & RTCF_LOCAL) {
 
2245                 rth->u.dst.input = ip_local_deliver;
 
2246                 rth->rt_spec_dst = fl->fl4_dst;
 
2248         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 
2249                 rth->rt_spec_dst = fl->fl4_src;
 
2250                 if (flags & RTCF_LOCAL && 
 
2251                     !(dev_out->flags & IFF_LOOPBACK)) {
 
2252                         rth->u.dst.output = ip_mc_output;
 
2253                         RT_CACHE_STAT_INC(out_slow_mc);
 
2255 #ifdef CONFIG_IP_MROUTE
 
2256                 if (res->type == RTN_MULTICAST) {
 
2257                         if (IN_DEV_MFORWARD(in_dev) &&
 
2258                             !LOCAL_MCAST(oldflp->fl4_dst)) {
 
2259                                 rth->u.dst.input = ip_mr_input;
 
2260                                 rth->u.dst.output = ip_mc_output;
 
2266         rt_set_nexthop(rth, res, 0);
 
2268         rth->rt_flags = flags;
 
2272         /* release work reference to inet device */
 
/*
 * ip_mkroute_output_def - build one output route and insert it in the cache.
 *
 * Calls __mkroute_output() to build the entry.  The lines that follow hash
 * it under the caller's original key, rt_hash_code(dst, src ^ (oif << 5),
 * tos) with tos taken from RT_FL_TOS(oldflp), and pass it to
 * rt_intern_hash() together with @rp.
 * NOTE(review): the condition guarding the hashing step and the return
 * statement are not in the visible lines.
 */
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
 
2279                                         struct fib_result* res,
 
2280                                         const struct flowi *fl,
 
2281                                         const struct flowi *oldflp,
 
2282                                         struct net_device *dev_out,
 
2285         struct rtable *rth = NULL;
 
2286         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
 
2289                 u32 tos = RT_FL_TOS(oldflp);
 
2291                 hash = rt_hash_code(oldflp->fl4_dst, 
 
2292                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
 
2293                 err = rt_intern_hash(hash, rth, rp);
 
/*
 * ip_mkroute_output - create the cache entry (or entries) for an output route.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED and a FIB result that has more than
 * one next hop (res->fi->fib_nhs > 1), one entry is built per hop: the hop's
 * device is pinned with dev_hold(), __mkroute_output() builds the entry,
 * rt_intern_hash() inserts it, multipath_set_nhinfo() passes the hop's
 * network/netmask on to the multipath implementation, and dev_put() drops
 * the device reference again.  Otherwise, and always when the option is
 * off, the work is delegated to ip_mkroute_output_def().
 */
2299 static inline int ip_mkroute_output(struct rtable** rp,
 
2300                                     struct fib_result* res,
 
2301                                     const struct flowi *fl,
 
2302                                     const struct flowi *oldflp,
 
2303                                     struct net_device *dev_out,
 
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
2307         u32 tos = RT_FL_TOS(oldflp);
 
2311         struct rtable *rth = NULL;
 
2313         if (res->fi && res->fi->fib_nhs > 1) {
 
2314                 unsigned char hopcount = res->fi->fib_nhs;
 
2316                 for (hop = 0; hop < hopcount; hop++) {
 
2317                         struct net_device *dev2nexthop;
 
2321                         /* hold a work reference to the output device */
 
2322                         dev2nexthop = FIB_RES_DEV(*res);
 
2323                         dev_hold(dev2nexthop);
 
2325                         /* put reference to previous result */
 
2329                         err = __mkroute_output(&rth, res, fl, oldflp,
 
2330                                                dev2nexthop, flags);
 
2335                         hash = rt_hash_code(oldflp->fl4_dst, 
 
2337                                             (oldflp->oif << 5), tos);
 
2338                         err = rt_intern_hash(hash, rth, rp);
 
2340                         /* forward hop information to multipath impl. */
 
2341                         multipath_set_nhinfo(rth,
 
2342                                              FIB_RES_NETWORK(*res),
 
2343                                              FIB_RES_NETMASK(*res),
 
2347                         /* release work reference to output device */
 
2348                         dev_put(dev2nexthop);
 
2355                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
 
2358 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 
2359         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
 
2364  * Major route resolver routine.
 
/*
 * ip_route_output_slow - resolve an output route through the FIB.
 *
 * Works on a local copy (fl) of the caller's key: addresses and fwmark are
 * copied from @oldflp, tos is masked with IPTOS_RT_MASK, iif is the loopback
 * ifindex and oif is the requested one.
 *
 * Visible steps, in order:
 *  - a given source address is tested for multicast/bad-class/zeronet and
 *    looked up with ip_dev_find();
 *  - with no oif and a multicast or limited-broadcast destination, the
 *    device owning the source address becomes the output interface;
 *  - a given oif is resolved with dev_get_by_index() and, for the cases
 *    shown, a source address is picked with inet_select_addr();
 *  - in the branch that marks the route RTN_LOCAL before any FIB lookup,
 *    the destination is set to the source (or both to INADDR_LOOPBACK) and
 *    loopback_dev is used;
 *  - fib_lookup() is consulted; on failure the destination is treated as
 *    on-link (RTN_UNICAST), as explained in the --ANK comment;
 *  - RTN_LOCAL results are redirected to loopback_dev, multipath/default
 *    selection is applied, and ip_mkroute_output() builds the cache entry.
 * NOTE(review): the branch conditions, error codes and cleanup labels are
 * only partly present in this listing.
 */
2367 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 
2369         u32 tos = RT_FL_TOS(oldflp);
 
2370         struct flowi fl = { .nl_u = { .ip4_u =
 
2371                                       { .daddr = oldflp->fl4_dst,
 
2372                                         .saddr = oldflp->fl4_src,
 
2373                                         .tos = tos & IPTOS_RT_MASK,
 
2374                                         .scope = ((tos & RTO_ONLINK) ?
 
2377 #ifdef CONFIG_IP_ROUTE_FWMARK
 
2378                                         .fwmark = oldflp->fl4_fwmark
 
2381                             .iif = loopback_dev.ifindex,
 
2382                             .oif = oldflp->oif };
 
2383         struct fib_result res;
 
2385         struct net_device *dev_out = NULL;
 
2391 #ifdef CONFIG_IP_MULTIPLE_TABLES
 
2395         if (oldflp->fl4_src) {
 
2397                 if (MULTICAST(oldflp->fl4_src) ||
 
2398                     BADCLASS(oldflp->fl4_src) ||
 
2399                     ZERONET(oldflp->fl4_src))
 
2402                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 
2403                 dev_out = ip_dev_find(oldflp->fl4_src);
 
2404                 if (dev_out == NULL)
 
2407                 /* I removed check for oif == dev_out->oif here.
 
2408                    It was wrong for two reasons:
 
2409                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
 
2410                       assigned to multiple interfaces.
 
2411                    2. Moreover, we are allowed to send packets with saddr
 
2412                       of another iface. --ANK
 
2415                 if (oldflp->oif == 0
 
2416                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
 
2417                         /* Special hack: user can direct multicasts
 
2418                            and limited broadcast via necessary interface
 
2419                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 
2420                            This hack is not just for fun, it allows
 
2421                            vic,vat and friends to work.
 
2422                            They bind socket to loopback, set ttl to zero
 
2423                            and expect that it will work.
 
2424                            From the viewpoint of routing cache they are broken,
 
2425                            because we are not allowed to build multicast path
 
2426                            with loopback source addr (look, routing cache
 
2427                            cannot know, that ttl is zero, so that packet
 
2428                            will not leave this host and route is valid).
 
2429                            Luckily, this hack is good workaround.
 
2432                         fl.oif = dev_out->ifindex;
 
2442                 dev_out = dev_get_by_index(oldflp->oif);
 
2444                 if (dev_out == NULL)
 
2447                 /* RACE: Check return value of inet_select_addr instead. */
 
2448                 if (__in_dev_get_rtnl(dev_out) == NULL) {
 
2450                         goto out;       /* Wrong error code */
 
2453                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
 
2455                                 fl.fl4_src = inet_select_addr(dev_out, 0,
 
2460                         if (MULTICAST(oldflp->fl4_dst))
 
2461                                 fl.fl4_src = inet_select_addr(dev_out, 0,
 
2463                         else if (!oldflp->fl4_dst)
 
2464                                 fl.fl4_src = inet_select_addr(dev_out, 0,
 
2470                 fl.fl4_dst = fl.fl4_src;
 
2472                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
 
2475                 dev_out = &loopback_dev;
 
2477                 fl.oif = loopback_dev.ifindex;
 
2478                 res.type = RTN_LOCAL;
 
2479                 flags |= RTCF_LOCAL;
 
2483         if (fib_lookup(&fl, &res)) {
 
2486                         /* Apparently, routing tables are wrong. Assume,
 
2487                            that the destination is on link.
 
2490                            Because we are allowed to send to iface
 
2491                            even if it has NO routes and NO assigned
 
2492                            addresses. When oif is specified, routing
 
2493                            tables are looked up with only one purpose:
 
2494                            to catch if destination is gatewayed, rather than
 
2495                            direct. Moreover, if MSG_DONTROUTE is set,
 
2496                            we send packet, ignoring both routing tables
 
2497                            and ifaddr state. --ANK
 
2500                            We could make it even if oif is unknown,
 
2501                            likely IPv6, but we do not.
 
2504                         if (fl.fl4_src == 0)
 
2505                                 fl.fl4_src = inet_select_addr(dev_out, 0,
 
2507                         res.type = RTN_UNICAST;
 
2517         if (res.type == RTN_LOCAL) {
 
2519                         fl.fl4_src = fl.fl4_dst;
 
2522                 dev_out = &loopback_dev;
 
2524                 fl.oif = dev_out->ifindex;
 
2526                         fib_info_put(res.fi);
 
2528                 flags |= RTCF_LOCAL;
 
2532 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
2533         if (res.fi->fib_nhs > 1 && fl.oif == 0)
 
2534                 fib_select_multipath(&fl, &res);
 
2537         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
 
2538                 fib_select_default(&fl, &res);
 
2541                 fl.fl4_src = FIB_RES_PREFSRC(res);
 
2545         dev_out = FIB_RES_DEV(res);
 
2547         fl.oif = dev_out->ifindex;
 
2551         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 
/*
 * __ip_route_output_key - look up an output route, cache first.
 *
 * Hashes (dst, src ^ (oif << 5), tos) from @flp and scans the bucket's chain
 * with rcu_dereference().  An entry matches when destination, source, oif
 * and (with CONFIG_IP_ROUTE_FWMARK) fwmark are equal and the TOS values
 * agree in the IPTOS_RT_MASK | RTO_ONLINK bits.  If
 * multipath_select_route() returns non-zero, the route it left in *rp is
 * the one referenced with dst_hold(); otherwise the matching entry has its
 * lastuse refreshed and is referenced itself.  Both cases count an out_hit
 * and leave the RCU-bh read section.  out_hlist_search is bumped while the
 * chain is scanned.  When nothing matches, the result of
 * ip_route_output_slow() is returned.
 */
2561 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 
2566         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
 
2569         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 
2570                 rth = rcu_dereference(rth->u.rt_next)) {
 
2571                 if (rth->fl.fl4_dst == flp->fl4_dst &&
 
2572                     rth->fl.fl4_src == flp->fl4_src &&
 
2574                     rth->fl.oif == flp->oif &&
 
2575 #ifdef CONFIG_IP_ROUTE_FWMARK
 
2576                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
 
2578                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 
2579                             (IPTOS_RT_MASK | RTO_ONLINK))) {
 
2581                         /* check for multipath routes and choose one if
 
2584                         if (multipath_select_route(flp, rth, rp)) {
 
2585                                 dst_hold(&(*rp)->u.dst);
 
2586                                 RT_CACHE_STAT_INC(out_hit);
 
2587                                 rcu_read_unlock_bh();
 
2591                         rth->u.dst.lastuse = jiffies;
 
2592                         dst_hold(&rth->u.dst);
 
2594                         RT_CACHE_STAT_INC(out_hit);
 
2595                         rcu_read_unlock_bh();
 
2599                 RT_CACHE_STAT_INC(out_hlist_search);
 
2601         rcu_read_unlock_bh();
 
2603         return ip_route_output_slow(rp, flp);
 
2606 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
/*
 * ip_route_output_flow - output route lookup followed by an xfrm lookup.
 *
 * Resolves the route with __ip_route_output_key().  In the branch shown,
 * the flow's source and destination are filled in from the route's rt_src
 * and rt_dst, and the result of xfrm_lookup() on (@rp, @flp, @sk, @flags)
 * is returned.
 * NOTE(review): the conditions guarding those assignments and the other
 * return paths are not in the visible lines.
 */
2608 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
 
2612         if ((err = __ip_route_output_key(rp, flp)) != 0)
 
2617                         flp->fl4_src = (*rp)->rt_src;
 
2619                         flp->fl4_dst = (*rp)->rt_dst;
 
2620                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
 
2626 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
/* Convenience wrapper: ip_route_output_flow() with a NULL socket and flags 0. */
2628 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
 
2630         return ip_route_output_flow(rp, flp, NULL, 0);
 
/*
 * rt_fill_info - encode the route attached to @skb (skb->dst) as a netlink
 * route message appended to the same skb.
 *
 * The rtmsg header is AF_INET with a /32 destination, RT_TABLE_MAIN,
 * RT_SCOPE_UNIVERSE and RTPROT_UNSPEC; rtm_flags is rt_flags with its low
 * 16 bits cleared plus RTM_F_CLONED (and RTM_F_NOTIFY when RTCF_NOTIFY is
 * set).  Attributes visible here: RTA_DST, RTA_SRC, RTA_OIF, RTA_FLOW,
 * RTA_MP_ALGO, RTA_PREFSRC, RTA_GATEWAY, the metrics, RTA_CACHEINFO and
 * RTA_IIF.  RTA_CACHEINFO reports last use and expiry converted with
 * jiffies_to_clock_t(), the use and reference counts, the dst error and,
 * from rt->peer, the IP id counter and TCP timestamp data.
 *
 * nlmsg_len is set to skb->tail - b, the bytes appended since entry; the
 * skb_trim() near the end cuts the skb back to its length at entry.
 * NOTE(review): the labels and return statements are not in the visible
 * lines - presumably skb_trim() belongs to the failure path; confirm.
 */
2633 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 
2634                         int nowait, unsigned int flags)
 
2636         struct rtable *rt = (struct rtable*)skb->dst;
 
2638         struct nlmsghdr  *nlh;
 
2639         unsigned char    *b = skb->tail;
 
2640         struct rta_cacheinfo ci;
 
2641 #ifdef CONFIG_IP_MROUTE
 
2642         struct rtattr *eptr;
 
2644         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
 
2645         r = NLMSG_DATA(nlh);
 
2646         r->rtm_family    = AF_INET;
 
2647         r->rtm_dst_len  = 32;
 
2649         r->rtm_tos      = rt->fl.fl4_tos;
 
2650         r->rtm_table    = RT_TABLE_MAIN;
 
2651         r->rtm_type     = rt->rt_type;
 
2652         r->rtm_scope    = RT_SCOPE_UNIVERSE;
 
2653         r->rtm_protocol = RTPROT_UNSPEC;
 
2654         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
 
2655         if (rt->rt_flags & RTCF_NOTIFY)
 
2656                 r->rtm_flags |= RTM_F_NOTIFY;
 
2657         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
 
2658         if (rt->fl.fl4_src) {
 
2659                 r->rtm_src_len = 32;
 
2660                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
 
2663                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
 
2664 #ifdef CONFIG_NET_CLS_ROUTE
 
2665         if (rt->u.dst.tclassid)
 
2666                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
 
2668 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 
2669         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
 
2670                 __u32 alg = rt->rt_multipath_alg;
 
2672                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
 
2676                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
 
2677         else if (rt->rt_src != rt->fl.fl4_src)
 
2678                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
 
2679         if (rt->rt_dst != rt->rt_gateway)
 
2680                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
 
2681         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
 
2682                 goto rtattr_failure;
 
2683         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
 
2684         ci.rta_used     = rt->u.dst.__use;
 
2685         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
 
2686         if (rt->u.dst.expires)
 
2687                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
 
2690         ci.rta_error    = rt->u.dst.error;
 
2691         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
 
2693                 ci.rta_id = rt->peer->ip_id_count;
 
2694                 if (rt->peer->tcp_ts_stamp) {
 
2695                         ci.rta_ts = rt->peer->tcp_ts;
 
2696                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
 
2699 #ifdef CONFIG_IP_MROUTE
 
2700         eptr = (struct rtattr*)skb->tail;
 
2702         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
 
2704 #ifdef CONFIG_IP_MROUTE
 
2705                 u32 dst = rt->rt_dst;
 
2707                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
 
2708                     ipv4_devconf.mc_forwarding) {
 
2709                         int err = ipmr_get_route(skb, r, nowait);
 
2716                                         if (err == -EMSGSIZE)
 
2718                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
 
2723                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
 
2726         nlh->nlmsg_len = skb->tail - b;
 
2731         skb_trim(skb, b - skb->data);
 
/*
 * inet_rtm_getroute - answer a netlink route query by resolving the route
 * and unicasting an RTM_NEWROUTE description back to the requester.
 *
 * A scratch skb of NLMSG_GOODSIZE is allocated and headroom for
 * MAX_HEADER + sizeof(struct iphdr) is reserved because the skb is passed
 * through the routing code.  RTA_SRC, RTA_DST and RTA_IIF are copied out of
 * the request attributes when present.  The input branch calls
 * ip_route_input() on the device found by __dev_get_by_index(iif) and turns
 * a non-zero dst.error on the resulting route into the error code; the
 * output branch builds a flowi (optionally with RTA_OIF) and calls
 * ip_route_output_key().  The route is attached to the skb, RTCF_NOTIFY is
 * set on it when the request carries RTM_F_NOTIFY, rt_fill_info() encodes it
 * and netlink_unicast() sends it with MSG_DONTWAIT to the requesting pid.
 * NOTE(review): error handling and skb cleanup are not in the visible lines.
 */
2735 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 
2737         struct rtattr **rta = arg;
 
2738         struct rtmsg *rtm = NLMSG_DATA(nlh);
 
2739         struct rtable *rt = NULL;
 
2744         struct sk_buff *skb;
 
2746         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 
2750         /* Reserve room for dummy headers, this skb can pass
 
2751            through good chunk of routing engine.
 
2753         skb->mac.raw = skb->data;
 
2754         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
 
2756         if (rta[RTA_SRC - 1])
 
2757                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
 
2758         if (rta[RTA_DST - 1])
 
2759                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
 
2760         if (rta[RTA_IIF - 1])
 
2761                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
 
2764                 struct net_device *dev = __dev_get_by_index(iif);
 
2768                 skb->protocol   = htons(ETH_P_IP);
 
2771                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 
2773                 rt = (struct rtable*)skb->dst;
 
2774                 if (!err && rt->u.dst.error)
 
2775                         err = -rt->u.dst.error;
 
2777                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 
2779                                                          .tos = rtm->rtm_tos } } };
 
2781                 if (rta[RTA_OIF - 1])
 
2782                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
 
2784                 err = ip_route_output_key(&rt, &fl);
 
2789         skb->dst = &rt->u.dst;
 
2790         if (rtm->rtm_flags & RTM_F_NOTIFY)
 
2791                 rt->rt_flags |= RTCF_NOTIFY;
 
2793         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
 
2795         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 
2796                                 RTM_NEWROUTE, 0, 0);
 
2804         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
 
/*
 * ip_rt_dump - dump the route cache into @skb for a netlink dump request.
 *
 * Iterates every bucket 0..rt_hash_mask, skipping buckets below s_h, and
 * walks each chain with rcu_dereference().  For each entry shown being
 * emitted, the dst is temporarily attached to @skb with dst_clone(),
 * rt_fill_info() is called with nowait = 1 and NLM_F_MULTI, and the
 * reference is dropped again with dst_release(xchg(&skb->dst, NULL)).  When
 * rt_fill_info() returns <= 0 the reference is released and the RCU-bh read
 * section is left early.
 * NOTE(review): how s_h is initialised and how cb->args[] is saved for
 * resumption are not in the visible lines.
 */
2814 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 
2821         s_idx = idx = cb->args[1];
 
2822         for (h = 0; h <= rt_hash_mask; h++) {
 
2823                 if (h < s_h) continue;
 
2827                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
 
2828                      rt = rcu_dereference(rt->u.rt_next), idx++) {
 
2831                         skb->dst = dst_clone(&rt->u.dst);
 
2832                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
 
2833                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
 
2834                                          1, NLM_F_MULTI) <= 0) {
 
2835                                 dst_release(xchg(&skb->dst, NULL));
 
2836                                 rcu_read_unlock_bh();
 
2839                         dst_release(xchg(&skb->dst, NULL));
 
2841                 rcu_read_unlock_bh();
 
2850 void ip_rt_multicast_event(struct in_device *in_dev)
 
2855 #ifdef CONFIG_SYSCTL
 
/* Backing storage for the "flush" entry of ipv4_route_table (its .data). */
2856 static int flush_delay;
 
/*
 * proc handler for the "flush" entry: proc_dointvec() transfers the value
 * (the entry's .data points at flush_delay) and rt_cache_flush() is then
 * called with flush_delay as the delay.
 * NOTE(review): the result of proc_dointvec() is not used in the visible
 * lines, and the condition guarding this pair of calls is not shown.
 */
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
 
2859                                         struct file *filp, void __user *buffer,
 
2860                                         size_t *lenp, loff_t *ppos)
 
2863                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 
2864                 rt_cache_flush(flush_delay);
 
/*
 * Strategy routine for the "flush" entry: the new value is checked to be
 * exactly sizeof(int) long, fetched from user space with get_user() and
 * passed to rt_cache_flush() as the delay.
 * NOTE(review): the error returns for the two checks are not in the
 * visible lines.
 */
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
 
2874                                                 void __user *oldval,
 
2875                                                 size_t __user *oldlenp,
 
2876                                                 void __user *newval,
 
2881         if (newlen != sizeof(int))
 
2883         if (get_user(delay, (int __user *)newval))
 
2885         rt_cache_flush(delay); 
 
/*
 * sysctl table for the IPv4 route cache tunables.
 *
 * "flush" is the only entry with custom handlers
 * (ipv4_sysctl_rtcache_flush and its strategy routine).  Entries whose
 * variable is handled through proc_dointvec_jiffies/sysctl_jiffies are
 * min_delay, max_delay, gc_min_interval, gc_timeout, gc_interval,
 * mtu_expires and secret_interval.  gc_min_interval (marked deprecated) and
 * gc_min_interval_ms both point at ip_rt_gc_min_interval; the latter uses
 * the *_ms_jiffies handlers.  The remaining entries are plain integers
 * handled by proc_dointvec.
 */
2889 ctl_table ipv4_route_table[] = {
 
2891                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
 
2892                 .procname       = "flush",
 
2893                 .data           = &flush_delay,
 
2894                 .maxlen         = sizeof(int),
 
2896                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
 
2897                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
 
2900                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
 
2901                 .procname       = "min_delay",
 
2902                 .data           = &ip_rt_min_delay,
 
2903                 .maxlen         = sizeof(int),
 
2905                 .proc_handler   = &proc_dointvec_jiffies,
 
2906                 .strategy       = &sysctl_jiffies,
 
2909                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
 
2910                 .procname       = "max_delay",
 
2911                 .data           = &ip_rt_max_delay,
 
2912                 .maxlen         = sizeof(int),
 
2914                 .proc_handler   = &proc_dointvec_jiffies,
 
2915                 .strategy       = &sysctl_jiffies,
 
2918                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
 
2919                 .procname       = "gc_thresh",
 
2920                 .data           = &ipv4_dst_ops.gc_thresh,
 
2921                 .maxlen         = sizeof(int),
 
2923                 .proc_handler   = &proc_dointvec,
 
2926                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
 
2927                 .procname       = "max_size",
 
2928                 .data           = &ip_rt_max_size,
 
2929                 .maxlen         = sizeof(int),
 
2931                 .proc_handler   = &proc_dointvec,
 
2934                 /*  Deprecated. Use gc_min_interval_ms */
 
2936                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
 
2937                 .procname       = "gc_min_interval",
 
2938                 .data           = &ip_rt_gc_min_interval,
 
2939                 .maxlen         = sizeof(int),
 
2941                 .proc_handler   = &proc_dointvec_jiffies,
 
2942                 .strategy       = &sysctl_jiffies,
 
2945                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
 
2946                 .procname       = "gc_min_interval_ms",
 
2947                 .data           = &ip_rt_gc_min_interval,
 
2948                 .maxlen         = sizeof(int),
 
2950                 .proc_handler   = &proc_dointvec_ms_jiffies,
 
2951                 .strategy       = &sysctl_ms_jiffies,
 
2954                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
 
2955                 .procname       = "gc_timeout",
 
2956                 .data           = &ip_rt_gc_timeout,
 
2957                 .maxlen         = sizeof(int),
 
2959                 .proc_handler   = &proc_dointvec_jiffies,
 
2960                 .strategy       = &sysctl_jiffies,
 
2963                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
 
2964                 .procname       = "gc_interval",
 
2965                 .data           = &ip_rt_gc_interval,
 
2966                 .maxlen         = sizeof(int),
 
2968                 .proc_handler   = &proc_dointvec_jiffies,
 
2969                 .strategy       = &sysctl_jiffies,
 
2972                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
 
2973                 .procname       = "redirect_load",
 
2974                 .data           = &ip_rt_redirect_load,
 
2975                 .maxlen         = sizeof(int),
 
2977                 .proc_handler   = &proc_dointvec,
 
2980                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
 
2981                 .procname       = "redirect_number",
 
2982                 .data           = &ip_rt_redirect_number,
 
2983                 .maxlen         = sizeof(int),
 
2985                 .proc_handler   = &proc_dointvec,
 
2988                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
 
2989                 .procname       = "redirect_silence",
 
2990                 .data           = &ip_rt_redirect_silence,
 
2991                 .maxlen         = sizeof(int),
 
2993                 .proc_handler   = &proc_dointvec,
 
2996                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
 
2997                 .procname       = "error_cost",
 
2998                 .data           = &ip_rt_error_cost,
 
2999                 .maxlen         = sizeof(int),
 
3001                 .proc_handler   = &proc_dointvec,
 
3004                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
 
3005                 .procname       = "error_burst",
 
3006                 .data           = &ip_rt_error_burst,
 
3007                 .maxlen         = sizeof(int),
 
3009                 .proc_handler   = &proc_dointvec,
 
3012                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
 
3013                 .procname       = "gc_elasticity",
 
3014                 .data           = &ip_rt_gc_elasticity,
 
3015                 .maxlen         = sizeof(int),
 
3017                 .proc_handler   = &proc_dointvec,
 
3020                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
 
3021                 .procname       = "mtu_expires",
 
3022                 .data           = &ip_rt_mtu_expires,
 
3023                 .maxlen         = sizeof(int),
 
3025                 .proc_handler   = &proc_dointvec_jiffies,
 
3026                 .strategy       = &sysctl_jiffies,
 
3029                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
 
3030                 .procname       = "min_pmtu",
 
3031                 .data           = &ip_rt_min_pmtu,
 
3032                 .maxlen         = sizeof(int),
 
3034                 .proc_handler   = &proc_dointvec,
 
3037                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
 
3038                 .procname       = "min_adv_mss",
 
3039                 .data           = &ip_rt_min_advmss,
 
3040                 .maxlen         = sizeof(int),
 
3042                 .proc_handler   = &proc_dointvec,
 
3045                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
 
3046                 .procname       = "secret_interval",
 
3047                 .data           = &ip_rt_secret_interval,
 
3048                 .maxlen         = sizeof(int),
 
3050                 .proc_handler   = &proc_dointvec_jiffies,
 
3051                 .strategy       = &sysctl_jiffies,
 
3057 #ifdef CONFIG_NET_CLS_ROUTE
 
3058 struct ip_rt_acct *ip_rt_acct;
 
3060 /* This code sucks.  But you should have seen it before! --RR */
 
/*
 * IP route accounting ptr for this logical cpu number.
 * Each CPU owns a run of 256 entries starting at ip_rt_acct.  The argument
 * is parenthesized so that an expression such as "cpu + 1" selects a whole
 * 256-entry run instead of being split up by operator precedence.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
 
3065 #ifdef CONFIG_PROC_FS
 
/*
 * ip_rt_acct_read - read handler for the "rt_acct" proc entry.
 *
 * @offset and @length are checked to be multiples of 4.  Reads are bounded
 * by sizeof(struct ip_rt_acct) * 256, the size of one CPU's table, and
 * @length is shortened when the request would run past that.  The first
 * CPU's slice is copied into @buffer with memcpy(); the other CPUs' values
 * are then added in one u32 at a time, so the caller sees the per-CPU
 * counters summed.
 * NOTE(review): the return statements and the use of @start/@eof are not in
 * the visible lines.
 */
3066 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 
3067                            int length, int *eof, void *data)
 
3071         if ((offset & 3) || (length & 3))
 
3074         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 
3079         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 
3080                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 
3084         offset /= sizeof(u32);
 
3087                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
 
3088                 u32 *dst = (u32 *) buffer;
 
3090                 /* Copy first cpu. */
 
3092                 memcpy(dst, src, length);
 
3094                 /* Add the other cpus in, one int at a time */
 
3098                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
 
3100                         for (j = 0; j < length/4; j++)
 
3106 #endif /* CONFIG_PROC_FS */
 
3107 #endif /* CONFIG_NET_CLS_ROUTE */
 
/*
 * "rhash_entries=" boot parameter (registered with __setup() below): the
 * value is parsed with simple_strtoul() using base 0 and kept in
 * rhash_entries.
 */
3109 static __initdata unsigned long rhash_entries;
 
3110 static int __init set_rhash_entries(char *str)
 
3114         rhash_entries = simple_strtoul(str, &str, 0);
 
3117 __setup("rhash_entries=", set_rhash_entries);
 
/*
 * ip_rt_init - boot-time setup of the IPv4 routing cache.
 *
 * Visible steps: seed rt_hash_rnd from num_physpages and jiffies; with
 * CONFIG_NET_CLS_ROUTE allocate and zero the ip_rt_acct pages (sized for
 * 256 entries for each of NR_CPUS CPUs); create the "ip_dst_cache" slab for
 * struct rtable; allocate and zero the route hash table and initialise its
 * locks; set gc_thresh to the bucket count and ip_rt_max_size to 16 times
 * that; set up the flush, periodic and secret-rebuild timers, arming the
 * latter two at randomised offsets (see the comment on timer
 * synchronisation); and create the "rt_cache" and "rt_acct" proc entries.
 * Failure to allocate ip_rt_acct or the slab cache ends in panic().
 */
3119 int __init ip_rt_init(void)
 
3123         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 
3124                              (jiffies ^ (jiffies >> 7)));
 
3126 #ifdef CONFIG_NET_CLS_ROUTE
 
3130              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
 
3132         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
 
3134                 panic("IP: failed to allocate ip_rt_acct\n");
 
3135         memset(ip_rt_acct, 0, PAGE_SIZE << order);
 
3139         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
 
3140                                                      sizeof(struct rtable),
 
3141                                                      0, SLAB_HWCACHE_ALIGN,
 
3144         if (!ipv4_dst_ops.kmem_cachep)
 
3145                 panic("IP: failed to allocate ip_dst_cache\n");
 
3147         rt_hash_table = (struct rt_hash_bucket *)
 
3148                 alloc_large_system_hash("IP route cache",
 
3149                                         sizeof(struct rt_hash_bucket),
 
3151                                         (num_physpages >= 128 * 1024) ?
 
3157         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
 
3158         rt_hash_lock_init();
 
3160         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 
3161         ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
3166         init_timer(&rt_flush_timer);
 
3167         rt_flush_timer.function = rt_run_flush;
 
3168         init_timer(&rt_periodic_timer);
 
3169         rt_periodic_timer.function = rt_check_expire;
 
3170         init_timer(&rt_secret_timer);
 
3171         rt_secret_timer.function = rt_secret_rebuild;
 
3173         /* All the timers, started at system startup tend
 
3174            to synchronize. Perturb it a bit.
 
3176         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
 
3178         add_timer(&rt_periodic_timer);
 
3180         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
 
3181                 ip_rt_secret_interval;
 
3182         add_timer(&rt_secret_timer);
 
3184 #ifdef CONFIG_PROC_FS
 
3186         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
 
3187         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
 
3188             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
 
3192         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
 
3194 #ifdef CONFIG_NET_CLS_ROUTE
 
3195         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
 
3205 EXPORT_SYMBOL(__ip_select_ident);
 
3206 EXPORT_SYMBOL(ip_route_input);
 
3207 EXPORT_SYMBOL(ip_route_output_key);