git.oblomov.eu Git - linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/config.h>
  66 #include <linux/module.h>
  67 #include <asm/uaccess.h>
  68 #include <asm/system.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/sched.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/rtnetlink.h>
  86 #include <linux/inetdevice.h>
  87 #include <linux/igmp.h>
  88 #include <linux/pkt_sched.h>
  89 #include <linux/mroute.h>
  90 #include <linux/netfilter_ipv4.h>
  91 #include <linux/random.h>
  92 #include <linux/jhash.h>
  93 #include <linux/rcupdate.h>
  94 #include <linux/times.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/ip_mp_alg.h>
 106 #ifdef CONFIG_SYSCTL
 107 #include <linux/sysctl.h>
 108 #endif
 109
 /*
  * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
  * flag) from the flow key that @oldflp points to, as a u32.
  *
  * The argument is parenthesized so that any pointer-valued expression
  * (not just a plain identifier) expands correctly.
  */
 #define RT_FL_TOS(oldflp) \
     ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 112
 113 #define IP_MAX_MTU      0xFFF0
 114
 115 #define RT_GC_TIMEOUT (300*HZ)
 116
 117 static int ip_rt_min_delay              = 2 * HZ;
 118 static int ip_rt_max_delay              = 10 * HZ;
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval            = 60 * HZ;
 122 static int ip_rt_gc_min_interval        = HZ / 2;
 123 static int ip_rt_redirect_number        = 9;
 124 static int ip_rt_redirect_load          = HZ / 50;
 125 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost             = HZ;
 127 static int ip_rt_error_burst            = 5 * HZ;
 128 static int ip_rt_gc_elasticity          = 8;
 129 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 131 static int ip_rt_min_advmss             = 256;
 132 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 133 static unsigned long rt_deadline;
 134
 135 #define RTprint(a...)   printk(KERN_DEBUG a)
 136
 137 static struct timer_list rt_flush_timer;
 138 static struct timer_list rt_periodic_timer;
 139 static struct timer_list rt_secret_timer;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static void              ipv4_dst_destroy(struct dst_entry *dst);
 147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 148                                          struct net_device *dev, int how);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(void);
 153
 154
 155 static struct dst_ops ipv4_dst_ops = {
 156         .family =               AF_INET,
 157         .protocol =             __constant_htons(ETH_P_IP),
 158         .gc =                   rt_garbage_collect,
 159         .check =                ipv4_dst_check,
 160         .destroy =              ipv4_dst_destroy,
 161         .ifdown =               ipv4_dst_ifdown,
 162         .negative_advice =      ipv4_negative_advice,
 163         .link_failure =         ipv4_link_failure,
 164         .update_pmtu =          ip_rt_update_pmtu,
 165         .entry_size =           sizeof(struct rtable),
 166 };
 167
 168 #define ECN_OR_COST(class)      TC_PRIO_##class
 169
 170 __u8 ip_tos2prio[16] = {
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(FILLER),
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(BESTEFFORT),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK)
 187 };
 188
 189
 190 /*
 191  * Route cache.
 192  */
 193
 /* The locking scheme is rather straightforward:
  *
  * 1) The buckets of the central route hash are protected by
  *    Read-Copy Update.
  * 2) Entries are removed only by writers, and they hold the lock
  *    while they look at rtable reference counts.
  * 3) References to rtable entries are acquired only by readers,
  *    using atomic increments and with the lock held.
  */

 /* One hash bucket: head of a chain of rtable entries linked via u.rt_next. */
 struct rt_hash_bucket {
         struct rtable   *chain;
 };
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 /*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of
  * spinlocks.  The size of this table is a power of two and depends on the
  * number of CPUs.
  */
 #if NR_CPUS >= 32
 #define RT_HASH_LOCK_SZ 4096
 #elif NR_CPUS >= 16
 #define RT_HASH_LOCK_SZ 2048
 #elif NR_CPUS >= 8
 #define RT_HASH_LOCK_SZ 1024
 #elif NR_CPUS >= 4
 #define RT_HASH_LOCK_SZ 512
 #else
 #define RT_HASH_LOCK_SZ 256
 #endif

 static spinlock_t       *rt_hash_locks;

 /*
  * Address of the spinlock covering hash slot @slot.  The whole expansion
  * is parenthesized so it behaves as a single expression wherever it is
  * used.
  */
 # define rt_hash_lock_addr(slot) \
         (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

 /*
  * Allocate and initialise the lock table; panics if the allocation fails.
  * Wrapped in do { } while (0) so that "rt_hash_lock_init();" is exactly one
  * statement, also inside an unbraced if/else.
  */
 # define rt_hash_lock_init()    do { \
                 int i; \
                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                         spin_lock_init(&rt_hash_locks[i]); \
         } while (0)
 #else
 /* No lock table in this configuration. */
 # define rt_hash_lock_addr(slot) NULL
 # define rt_hash_lock_init()
 #endif
 237
 /* Route cache hash table; valid bucket indices are 0..rt_hash_mask. */
 static struct rt_hash_bucket    *rt_hash_table;
 static unsigned                 rt_hash_mask;
 /* Shift count used when scaling values by the table size. */
 static int                      rt_hash_log;
 /* Seed mixed into rt_hash_code(); regenerated by rt_run_flush(). */
 static unsigned int             rt_hash_rnd;

 /* Per-CPU statistics; RT_CACHE_STAT_INC bumps @field on the current CPU. */
 static struct rt_cache_stat *rt_cache_stat;
 #define RT_CACHE_STAT_INC(field)                                          \
                 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)

 static int rt_intern_hash(unsigned hash, struct rtable *rth,
                           struct rtable **res);

 249
 250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 251 {
 252         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
 253                 & rt_hash_mask);
 254 }
 255
 256 #ifdef CONFIG_PROC_FS
 /* Per-open iteration state for the route cache seq_file. */
 struct rt_cache_iter_state {
         int bucket;     /* hash bucket currently being walked */
 };
 260
 261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 262 {
 263         struct rtable *r = NULL;
 264         struct rt_cache_iter_state *st = seq->private;
 265
 266         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 267                 rcu_read_lock_bh();
 268                 r = rt_hash_table[st->bucket].chain;
 269                 if (r)
 270                         break;
 271                 rcu_read_unlock_bh();
 272         }
 273         return r;
 274 }
 275
 276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 277 {
 278         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 279
 280         r = r->u.rt_next;
 281         while (!r) {
 282                 rcu_read_unlock_bh();
 283                 if (--st->bucket < 0)
 284                         break;
 285                 rcu_read_lock_bh();
 286                 r = rt_hash_table[st->bucket].chain;
 287         }
 288         return r;
 289 }
 290
 291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 292 {
 293         struct rtable *r = rt_cache_get_first(seq);
 294
 295         if (r)
 296                 while (pos && (r = rt_cache_get_next(seq, r)))
 297                         --pos;
 298         return pos ? NULL : r;
 299 }
 300
 301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 302 {
 303         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 304 }
 305
 306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 307 {
 308         struct rtable *r = NULL;
 309
 310         if (v == SEQ_START_TOKEN)
 311                 r = rt_cache_get_first(seq);
 312         else
 313                 r = rt_cache_get_next(seq, v);
 314         ++*pos;
 315         return r;
 316 }
 317
 318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 319 {
 320         if (v && v != SEQ_START_TOKEN)
 321                 rcu_read_unlock_bh();
 322 }
 323
 324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 325 {
 326         if (v == SEQ_START_TOKEN)
 327                 seq_printf(seq, "%-127s\n",
 328                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 329                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 330                            "HHUptod\tSpecDst");
 331         else {
 332                 struct rtable *r = v;
 333                 char temp[256];
 334
 335                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 336                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 337                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 338                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 339                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 340                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 341                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 342                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 343                         dst_metric(&r->u.dst, RTAX_WINDOW),
 344                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 345                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 346                         r->fl.fl4_tos,
 347                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 348                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 349                                        dev_queue_xmit) : 0,
 350                         r->rt_spec_dst);
 351                 seq_printf(seq, "%-127s\n", temp);
 352         }
 353         return 0;
 354 }
 355
 356 static struct seq_operations rt_cache_seq_ops = {
 357         .start  = rt_cache_seq_start,
 358         .next   = rt_cache_seq_next,
 359         .stop   = rt_cache_seq_stop,
 360         .show   = rt_cache_seq_show,
 361 };
 362
 363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 364 {
 365         struct seq_file *seq;
 366         int rc = -ENOMEM;
 367         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 368
 369         if (!s)
 370                 goto out;
 371         rc = seq_open(file, &rt_cache_seq_ops);
 372         if (rc)
 373                 goto out_kfree;
 374         seq          = file->private_data;
 375         seq->private = s;
 376         memset(s, 0, sizeof(*s));
 377 out:
 378         return rc;
 379 out_kfree:
 380         kfree(s);
 381         goto out;
 382 }
 383
 384 static struct file_operations rt_cache_seq_fops = {
 385         .owner   = THIS_MODULE,
 386         .open    = rt_cache_seq_open,
 387         .read    = seq_read,
 388         .llseek  = seq_lseek,
 389         .release = seq_release_private,
 390 };
 391
 392
 393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 394 {
 395         int cpu;
 396
 397         if (*pos == 0)
 398                 return SEQ_START_TOKEN;
 399
 400         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 401                 if (!cpu_possible(cpu))
 402                         continue;
 403                 *pos = cpu+1;
 404                 return per_cpu_ptr(rt_cache_stat, cpu);
 405         }
 406         return NULL;
 407 }
 408
 409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 410 {
 411         int cpu;
 412
 413         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 414                 if (!cpu_possible(cpu))
 415                         continue;
 416                 *pos = cpu+1;
 417                 return per_cpu_ptr(rt_cache_stat, cpu);
 418         }
 419         return NULL;
 420
 421 }
 422
 /* seq_file ->stop() for the per-CPU statistics: nothing to release. */
 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 {
 }
 427
 428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 429 {
 430         struct rt_cache_stat *st = v;
 431
 432         if (v == SEQ_START_TOKEN) {
 433                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 434                 return 0;
 435         }
 436
 437         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 438                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 439                    atomic_read(&ipv4_dst_ops.entries),
 440                    st->in_hit,
 441                    st->in_slow_tot,
 442                    st->in_slow_mc,
 443                    st->in_no_route,
 444                    st->in_brd,
 445                    st->in_martian_dst,
 446                    st->in_martian_src,
 447
 448                    st->out_hit,
 449                    st->out_slow_tot,
 450                    st->out_slow_mc,
 451
 452                    st->gc_total,
 453                    st->gc_ignored,
 454                    st->gc_goal_miss,
 455                    st->gc_dst_overflow,
 456                    st->in_hlist_search,
 457                    st->out_hlist_search
 458                 );
 459         return 0;
 460 }
 461
 462 static struct seq_operations rt_cpu_seq_ops = {
 463         .start  = rt_cpu_seq_start,
 464         .next   = rt_cpu_seq_next,
 465         .stop   = rt_cpu_seq_stop,
 466         .show   = rt_cpu_seq_show,
 467 };
 468
 469
 470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 471 {
 472         return seq_open(file, &rt_cpu_seq_ops);
 473 }
 474
 475 static struct file_operations rt_cpu_seq_fops = {
 476         .owner   = THIS_MODULE,
 477         .open    = rt_cpu_seq_open,
 478         .read    = seq_read,
 479         .llseek  = seq_lseek,
 480         .release = seq_release,
 481 };
 482
 483 #endif /* CONFIG_PROC_FS */
 484
 485 static __inline__ void rt_free(struct rtable *rt)
 486 {
 487         multipath_remove(rt);
 488         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 489 }
 490
 491 static __inline__ void rt_drop(struct rtable *rt)
 492 {
 493         multipath_remove(rt);
 494         ip_rt_put(rt);
 495         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 496 }
 497
 498 static __inline__ int rt_fast_clean(struct rtable *rth)
 499 {
 500         /* Kill broadcast/multicast entries very aggresively, if they
 501            collide in hash table with more useful entries */
 502         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 503                 rth->fl.iif && rth->u.rt_next;
 504 }
 505
 506 static __inline__ int rt_valuable(struct rtable *rth)
 507 {
 508         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 509                 rth->u.dst.expires;
 510 }
 511
 512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 513 {
 514         unsigned long age;
 515         int ret = 0;
 516
 517         if (atomic_read(&rth->u.dst.__refcnt))
 518                 goto out;
 519
 520         ret = 1;
 521         if (rth->u.dst.expires &&
 522             time_after_eq(jiffies, rth->u.dst.expires))
 523                 goto out;
 524
 525         age = jiffies - rth->u.dst.lastuse;
 526         ret = 0;
 527         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 528             (age <= tmo2 && rt_valuable(rth)))
 529                 goto out;
 530         ret = 1;
 531 out:    return ret;
 532 }
 533
 534 /* Bits of score are:
 535  * 31: very valuable
 536  * 30: not quite useless
 537  * 29..0: usage counter
 538  */
 539 static inline u32 rt_score(struct rtable *rt)
 540 {
 541         u32 score = jiffies - rt->u.dst.lastuse;
 542
 543         score = ~score & ~(3<<30);
 544
 545         if (rt_valuable(rt))
 546                 score |= (1<<31);
 547
 548         if (!rt->fl.iif ||
 549             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 550                 score |= (1<<30);
 551
 552         return score;
 553 }
 554
 555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 556 {
 557         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 558                fl1->oif     == fl2->oif &&
 559                fl1->iif     == fl2->iif;
 560 }
 561
 562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 564                                                 struct rtable *expentry,
 565                                                 int *removed_count)
 566 {
 567         int passedexpired = 0;
 568         struct rtable **nextstep = NULL;
 569         struct rtable **rthp = chain_head;
 570         struct rtable *rth;
 571
 572         if (removed_count)
 573                 *removed_count = 0;
 574
 575         while ((rth = *rthp) != NULL) {
 576                 if (rth == expentry)
 577                         passedexpired = 1;
 578
 579                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 580                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 581                         if (*rthp == expentry) {
 582                                 *rthp = rth->u.rt_next;
 583                                 continue;
 584                         } else {
 585                                 *rthp = rth->u.rt_next;
 586                                 rt_free(rth);
 587                                 if (removed_count)
 588                                         ++(*removed_count);
 589                         }
 590                 } else {
 591                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 592                             passedexpired && !nextstep)
 593                                 nextstep = &rth->u.rt_next;
 594
 595                         rthp = &rth->u.rt_next;
 596                 }
 597         }
 598
 599         rt_free(expentry);
 600         if (removed_count)
 601                 ++(*removed_count);
 602
 603         return nextstep;
 604 }
 605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 606
 607
 608 /* This runs via a timer and thus is always in BH context. */
 609 static void rt_check_expire(unsigned long dummy)
 610 {
 611         static unsigned int rover;
 612         unsigned int i = rover, goal;
 613         struct rtable *rth, **rthp;
 614         unsigned long now = jiffies;
 615         u64 mult;
 616
 617         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 618         if (ip_rt_gc_timeout > 1)
 619                 do_div(mult, ip_rt_gc_timeout);
 620         goal = (unsigned int)mult;
 621         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 622         for (; goal > 0; goal--) {
 623                 unsigned long tmo = ip_rt_gc_timeout;
 624
 625                 i = (i + 1) & rt_hash_mask;
 626                 rthp = &rt_hash_table[i].chain;
 627
 628                 if (*rthp == 0)
 629                         continue;
 630                 spin_lock(rt_hash_lock_addr(i));
 631                 while ((rth = *rthp) != NULL) {
 632                         if (rth->u.dst.expires) {
 633                                 /* Entry is expired even if it is in use */
 634                                 if (time_before_eq(now, rth->u.dst.expires)) {
 635                                         tmo >>= 1;
 636                                         rthp = &rth->u.rt_next;
 637                                         continue;
 638                                 }
 639                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 640                                 tmo >>= 1;
 641                                 rthp = &rth->u.rt_next;
 642                                 continue;
 643                         }
 644
 645                         /* Cleanup aged off entries. */
 646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 647                         /* remove all related balanced entries if necessary */
 648                         if (rth->u.dst.flags & DST_BALANCED) {
 649                                 rthp = rt_remove_balanced_route(
 650                                         &rt_hash_table[i].chain,
 651                                         rth, NULL);
 652                                 if (!rthp)
 653                                         break;
 654                         } else {
 655                                 *rthp = rth->u.rt_next;
 656                                 rt_free(rth);
 657                         }
 658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 659                         *rthp = rth->u.rt_next;
 660                         rt_free(rth);
 661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 662                 }
 663                 spin_unlock(rt_hash_lock_addr(i));
 664
 665                 /* Fallback loop breaker. */
 666                 if (time_after(jiffies, now))
 667                         break;
 668         }
 669         rover = i;
 670         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 671 }
 672
 673 /* This can run from both BH and non-BH contexts, the latter
 674  * in the case of a forced flush event.
 675  */
 676 static void rt_run_flush(unsigned long dummy)
 677 {
 678         int i;
 679         struct rtable *rth, *next;
 680
 681         rt_deadline = 0;
 682
 683         get_random_bytes(&rt_hash_rnd, 4);
 684
 685         for (i = rt_hash_mask; i >= 0; i--) {
 686                 spin_lock_bh(rt_hash_lock_addr(i));
 687                 rth = rt_hash_table[i].chain;
 688                 if (rth)
 689                         rt_hash_table[i].chain = NULL;
 690                 spin_unlock_bh(rt_hash_lock_addr(i));
 691
 692                 for (; rth; rth = next) {
 693                         next = rth->u.rt_next;
 694                         rt_free(rth);
 695                 }
 696         }
 697 }
 698
 699 static DEFINE_SPINLOCK(rt_flush_lock);
 700
 701 void rt_cache_flush(int delay)
 702 {
 703         unsigned long now = jiffies;
 704         int user_mode = !in_softirq();
 705
 706         if (delay < 0)
 707                 delay = ip_rt_min_delay;
 708
 709         /* flush existing multipath state*/
 710         multipath_flush();
 711
 712         spin_lock_bh(&rt_flush_lock);
 713
 714         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 715                 long tmo = (long)(rt_deadline - now);
 716
 717                 /* If flush timer is already running
 718                    and flush request is not immediate (delay > 0):
 719
 720                    if deadline is not achieved, prolongate timer to "delay",
 721                    otherwise fire it at deadline time.
 722                  */
 723
 724                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 725                         tmo = 0;
 726
 727                 if (delay > tmo)
 728                         delay = tmo;
 729         }
 730
 731         if (delay <= 0) {
 732                 spin_unlock_bh(&rt_flush_lock);
 733                 rt_run_flush(0);
 734                 return;
 735         }
 736
 737         if (rt_deadline == 0)
 738                 rt_deadline = now + ip_rt_max_delay;
 739
 740         mod_timer(&rt_flush_timer, now+delay);
 741         spin_unlock_bh(&rt_flush_lock);
 742 }
 743
 744 static void rt_secret_rebuild(unsigned long dummy)
 745 {
 746         unsigned long now = jiffies;
 747
 748         rt_cache_flush(0);
 749         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 750 }
 751
 752 /*
 753    Short description of GC goals.
 754
 755    We want to build algorithm, which will keep routing cache
 756    at some equilibrium point, when number of aged off entries
 757    is kept approximately equal to newly generated ones.
 758
 759    Current expiration strength is variable "expire".
 760    We try to adjust it dynamically, so that if networking
 761    is idle expires is large enough to keep enough of warm entries,
 762    and when load increases it reduces to limit cache size.
 763  */
 764
 765 static int rt_garbage_collect(void)
 766 {
 767         static unsigned long expire = RT_GC_TIMEOUT;
 768         static unsigned long last_gc;
 769         static int rover;
 770         static int equilibrium;
 771         struct rtable *rth, **rthp;
 772         unsigned long now = jiffies;
 773         int goal;
 774
 775         /*
 776          * Garbage collection is pretty expensive,
 777          * do not make it too frequently.
 778          */
 779
 780         RT_CACHE_STAT_INC(gc_total);
 781
 782         if (now - last_gc < ip_rt_gc_min_interval &&
 783             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 784                 RT_CACHE_STAT_INC(gc_ignored);
 785                 goto out;
 786         }
 787
 788         /* Calculate number of entries, which we want to expire now. */
 789         goal = atomic_read(&ipv4_dst_ops.entries) -
 790                 (ip_rt_gc_elasticity << rt_hash_log);
 791         if (goal <= 0) {
 792                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 793                         equilibrium = ipv4_dst_ops.gc_thresh;
 794                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 795                 if (goal > 0) {
 796                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 797                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 798                 }
 799         } else {
 800                 /* We are in dangerous area. Try to reduce cache really
 801                  * aggressively.
 802                  */
 803                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 804                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 805         }
 806
 807         if (now - last_gc >= ip_rt_gc_min_interval)
 808                 last_gc = now;
 809
 810         if (goal <= 0) {
 811                 equilibrium += goal;
 812                 goto work_done;
 813         }
 814
 815         do {
 816                 int i, k;
 817
 818                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 819                         unsigned long tmo = expire;
 820
 821                         k = (k + 1) & rt_hash_mask;
 822                         rthp = &rt_hash_table[k].chain;
 823                         spin_lock_bh(rt_hash_lock_addr(k));
 824                         while ((rth = *rthp) != NULL) {
 825                                 if (!rt_may_expire(rth, tmo, expire)) {
 826                                         tmo >>= 1;
 827                                         rthp = &rth->u.rt_next;
 828                                         continue;
 829                                 }
 830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 831                                 /* remove all related balanced entries
 832                                  * if necessary
 833                                  */
 834                                 if (rth->u.dst.flags & DST_BALANCED) {
 835                                         int r;
 836
 837                                         rthp = rt_remove_balanced_route(
 838                                                 &rt_hash_table[i].chain,
 839                                                 rth,
 840                                                 &r);
 841                                         goal -= r;
 842                                         if (!rthp)
 843                                                 break;
 844                                 } else {
 845                                         *rthp = rth->u.rt_next;
 846                                         rt_free(rth);
 847                                         goal--;
 848                                 }
 849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 850                                 *rthp = rth->u.rt_next;
 851                                 rt_free(rth);
 852                                 goal--;
 853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 854                         }
 855                         spin_unlock_bh(rt_hash_lock_addr(k));
 856                         if (goal <= 0)
 857                                 break;
 858                 }
 859                 rover = k;
 860
 861                 if (goal <= 0)
 862                         goto work_done;
 863
 864                 /* Goal is not achieved. We stop process if:
 865
 866                    - if expire reduced to zero. Otherwise, expire is halfed.
 867                    - if table is not full.
 868                    - if we are called from interrupt.
 869                    - jiffies check is just fallback/debug loop breaker.
 870                      We will not spin here for long time in any case.
 871                  */
 872
 873                 RT_CACHE_STAT_INC(gc_goal_miss);
 874
 875                 if (expire == 0)
 876                         break;
 877
 878                 expire >>= 1;
 879 #if RT_CACHE_DEBUG >= 2
 880                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 881                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 882 #endif
 883
 884                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 885                         goto out;
 886         } while (!in_softirq() && time_before_eq(jiffies, now));
 887
 888         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 889                 goto out;
 890         if (net_ratelimit())
 891                 printk(KERN_WARNING "dst cache overflow\n");
 892         RT_CACHE_STAT_INC(gc_dst_overflow);
 893         return 1;
 894
 895 work_done:
 896         expire += ip_rt_gc_min_interval;
 897         if (expire > ip_rt_gc_timeout ||
 898             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 899                 expire = ip_rt_gc_timeout;
 900 #if RT_CACHE_DEBUG >= 2
 901         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 902                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 903 #endif
 904 out:    return 0;
 905 }
 906
 907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 908 {
 909         struct rtable   *rth, **rthp;
 910         unsigned long   now;
 911         struct rtable *cand, **candp;
 912         u32             min_score;
 913         int             chain_length;
 914         int attempts = !in_softirq();
 915
 916 restart:
 917         chain_length = 0;
 918         min_score = ~(u32)0;
 919         cand = NULL;
 920         candp = NULL;
 921         now = jiffies;
 922
 923         rthp = &rt_hash_table[hash].chain;
 924
 925         spin_lock_bh(rt_hash_lock_addr(hash));
 926         while ((rth = *rthp) != NULL) {
 927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 928                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 929                     compare_keys(&rth->fl, &rt->fl)) {
 930 #else
 931                 if (compare_keys(&rth->fl, &rt->fl)) {
 932 #endif
 933                         /* Put it first */
 934                         *rthp = rth->u.rt_next;
 935                         /*
 936                          * Since lookup is lockfree, the deletion
 937                          * must be visible to another weakly ordered CPU before
 938                          * the insertion at the start of the hash chain.
 939                          */
 940                         rcu_assign_pointer(rth->u.rt_next,
 941                                            rt_hash_table[hash].chain);
 942                         /*
 943                          * Since lookup is lockfree, the update writes
 944                          * must be ordered for consistency on SMP.
 945                          */
 946                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 947
 948                         rth->u.dst.__use++;
 949                         dst_hold(&rth->u.dst);
 950                         rth->u.dst.lastuse = now;
 951                         spin_unlock_bh(rt_hash_lock_addr(hash));
 952
 953                         rt_drop(rt);
 954                         *rp = rth;
 955                         return 0;
 956                 }
 957
 958                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 959                         u32 score = rt_score(rth);
 960
 961                         if (score <= min_score) {
 962                                 cand = rth;
 963                                 candp = rthp;
 964                                 min_score = score;
 965                         }
 966                 }
 967
 968                 chain_length++;
 969
 970                 rthp = &rth->u.rt_next;
 971         }
 972
 973         if (cand) {
 974                 /* ip_rt_gc_elasticity used to be average length of chain
 975                  * length, when exceeded gc becomes really aggressive.
 976                  *
 977                  * The second limit is less certain. At the moment it allows
 978                  * only 2 entries per bucket. We will see.
 979                  */
 980                 if (chain_length > ip_rt_gc_elasticity) {
 981                         *candp = cand->u.rt_next;
 982                         rt_free(cand);
 983                 }
 984         }
 985
 986         /* Try to bind route to arp only if it is output
 987            route or unicast forwarding path.
 988          */
 989         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 990                 int err = arp_bind_neighbour(&rt->u.dst);
 991                 if (err) {
 992                         spin_unlock_bh(rt_hash_lock_addr(hash));
 993
 994                         if (err != -ENOBUFS) {
 995                                 rt_drop(rt);
 996                                 return err;
 997                         }
 998
 999                         /* Neighbour tables are full and nothing
1000                            can be released. Try to shrink route cache,
1001                            it is most likely it holds some neighbour records.
1002                          */
1003                         if (attempts-- > 0) {
1004                                 int saved_elasticity = ip_rt_gc_elasticity;
1005                                 int saved_int = ip_rt_gc_min_interval;
1006                                 ip_rt_gc_elasticity     = 1;
1007                                 ip_rt_gc_min_interval   = 0;
1008                                 rt_garbage_collect();
1009                                 ip_rt_gc_min_interval   = saved_int;
1010                                 ip_rt_gc_elasticity     = saved_elasticity;
1011                                 goto restart;
1012                         }
1013
1014                         if (net_ratelimit())
1015                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1016                         rt_drop(rt);
1017                         return -ENOBUFS;
1018                 }
1019         }
1020
1021         rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023         if (rt->u.rt_next) {
1024                 struct rtable *trt;
1025                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026                        NIPQUAD(rt->rt_dst));
1027                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029                 printk("\n");
1030         }
1031 #endif
1032         rt_hash_table[hash].chain = rt;
1033         spin_unlock_bh(rt_hash_lock_addr(hash));
1034         *rp = rt;
1035         return 0;
1036 }
1037
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040         static DEFINE_SPINLOCK(rt_peer_lock);
1041         struct inet_peer *peer;
1042
1043         peer = inet_getpeer(rt->rt_dst, create);
1044
1045         spin_lock_bh(&rt_peer_lock);
1046         if (rt->peer == NULL) {
1047                 rt->peer = peer;
1048                 peer = NULL;
1049         }
1050         spin_unlock_bh(&rt_peer_lock);
1051         if (peer)
1052                 inet_putpeer(peer);
1053 }
1054
1055 /*
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However
1057  * we still can generate some output.
1058  * Random ID selection looks a bit dangerous because we have no chances to
1059  * select ID being unique in a reasonable period of time.
1060  * But broken packet identifier may be better than no packet at all.
1061  */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064         static DEFINE_SPINLOCK(ip_fb_id_lock);
1065         static u32 ip_fallback_id;
1066         u32 salt;
1067
1068         spin_lock_bh(&ip_fb_id_lock);
1069         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070         iph->id = htons(salt & 0xFFFF);
1071         ip_fallback_id = salt;
1072         spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077         struct rtable *rt = (struct rtable *) dst;
1078
1079         if (rt) {
1080                 if (rt->peer == NULL)
1081                         rt_bind_peer(rt, 1);
1082
1083                 /* If peer is attached to destination, it is never detached,
1084                    so that we need not to grab a lock to dereference it.
1085                  */
1086                 if (rt->peer) {
1087                         iph->id = htons(inet_getid(rt->peer, more));
1088                         return;
1089                 }
1090         } else
1091                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092                        __builtin_return_address(0));
1093
1094         ip_select_fb_ident(iph);
1095 }
1096
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099         struct rtable **rthp;
1100
1101         spin_lock_bh(rt_hash_lock_addr(hash));
1102         ip_rt_put(rt);
1103         for (rthp = &rt_hash_table[hash].chain; *rthp;
1104              rthp = &(*rthp)->u.rt_next)
1105                 if (*rthp == rt) {
1106                         *rthp = rt->u.rt_next;
1107                         rt_free(rt);
1108                         break;
1109                 }
1110         spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112
/*
 * ip_rt_redirect - apply a received redirect to the route cache.
 * @old_gw: gateway currently used (compared against the cached rt_gateway)
 * @daddr:  destination the redirect is about
 * @new_gw: gateway advised by the redirect
 * @saddr:  source address of the affected flow
 * @tos:    type of service; masked with IPTOS_RT_MASK before use
 * @dev:    device the redirect arrived on
 *
 * The redirect is rejected (and, with CONFIG_IP_ROUTE_VERBOSE, possibly
 * logged) when new_gw equals old_gw, when IN_DEV_RX_REDIRECTS() is false,
 * when new_gw is a multicast/bad-class/zero-net address, or when the
 * on-link / address-type checks below fail.
 *
 * Otherwise every combination of source key {saddr, 0} and output-interface
 * key {dev->ifindex, 0} is hashed and the corresponding chain is searched
 * for an output entry (fl.iif == 0) with matching keys.  A matching entry
 * is cloned with rt_gateway set to new_gw and RTCF_REDIRECTED set; if a
 * valid neighbour can be bound, the old entry is removed with rt_del() and
 * the clone is inserted with rt_intern_hash().
 *
 * Returns nothing; the in_device reference taken at entry is released on
 * every path after the initial NULL check.
 */
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114                     u32 saddr, u8 tos, struct net_device *dev)
1115 {
1116         int i, k;
1117         struct in_device *in_dev = in_dev_get(dev);
1118         struct rtable *rth, **rthp;
1119         u32  skeys[2] = { saddr, 0 };
1120         int  ikeys[2] = { dev->ifindex, 0 };
1121
1122         tos &= IPTOS_RT_MASK;
1123
1124         if (!in_dev)
1125                 return;
1126
1127         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129                 goto reject_redirect;
1130
1131         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133                         goto reject_redirect;
1134                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135                         goto reject_redirect;
1136         } else {
1137                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1138                         goto reject_redirect;
1139         }
1140
             /* Probe all four (source key, oif key) cache slots. */
1141         for (i = 0; i < 2; i++) {
1142                 for (k = 0; k < 2; k++) {
1143                         unsigned hash = rt_hash_code(daddr,
1144                                                      skeys[i] ^ (ikeys[k] << 5),
1145                                                      tos);
1146
1147                         rthp=&rt_hash_table[hash].chain;
1148
1149                         rcu_read_lock();
1150                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1151                                 struct rtable *rt;
1152
                                     /* Skip entries whose flow keys differ. */
1153                                 if (rth->fl.fl4_dst != daddr ||
1154                                     rth->fl.fl4_src != skeys[i] ||
1155                                     rth->fl.fl4_tos != tos ||
1156                                     rth->fl.oif != ikeys[k] ||
1157                                     rth->fl.iif != 0) {
1158                                         rthp = &rth->u.rt_next;
1159                                         continue;
1160                                 }
1161
                                     /* Keys match but the entry is not one this
                                      * redirect applies to: stop scanning this
                                      * chain (the RCU section is left after the
                                      * loop).
                                      */
1162                                 if (rth->rt_dst != daddr ||
1163                                     rth->rt_src != saddr ||
1164                                     rth->u.dst.error ||
1165                                     rth->rt_gateway != old_gw ||
1166                                     rth->u.dst.dev != dev)
1167                                         break;
1168
                                     /* Pin the entry before leaving the RCU section. */
1169                                 dst_hold(&rth->u.dst);
1170                                 rcu_read_unlock();
1171
1172                                 rt = dst_alloc(&ipv4_dst_ops);
1173                                 if (rt == NULL) {
1174                                         ip_rt_put(rth);
1175                                         in_dev_put(in_dev);
1176                                         return;
1177                                 }
1178
1179                                 /* Copy all the information. */
1180                                 *rt = *rth;
                                     /* The struct copy duplicated pointers and
                                      * counters; reset the per-entry state and
                                      * take the clone's own references.
                                      */
1181                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182                                 rt->u.dst.__use         = 1;
1183                                 atomic_set(&rt->u.dst.__refcnt, 1);
1184                                 rt->u.dst.child         = NULL;
1185                                 if (rt->u.dst.dev)
1186                                         dev_hold(rt->u.dst.dev);
1187                                 if (rt->idev)
1188                                         in_dev_hold(rt->idev);
1189                                 rt->u.dst.obsolete      = 0;
1190                                 rt->u.dst.lastuse       = jiffies;
1191                                 rt->u.dst.path          = &rt->u.dst;
1192                                 rt->u.dst.neighbour     = NULL;
1193                                 rt->u.dst.hh            = NULL;
1194                                 rt->u.dst.xfrm          = NULL;
1195
1196                                 rt->rt_flags            |= RTCF_REDIRECTED;
1197
1198                                 /* Gateway is different ... */
1199                                 rt->rt_gateway          = new_gw;
1200
1201                                 /* Redirect received -> path was valid */
1202                                 dst_confirm(&rth->u.dst);
1203
1204                                 if (rt->peer)
1205                                         atomic_inc(&rt->peer->refcnt);
1206
                                     /* Without a bound neighbour in a NUD_VALID
                                      * state the clone is discarded; an existing
                                      * neighbour is passed to neigh_event_send()
                                      * first.
                                      */
1207                                 if (arp_bind_neighbour(&rt->u.dst) ||
1208                                     !(rt->u.dst.neighbour->nud_state &
1209                                             NUD_VALID)) {
1210                                         if (rt->u.dst.neighbour)
1211                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1212                                         ip_rt_put(rth);
1213                                         rt_drop(rt);
1214                                         goto do_next;
1215                                 }
1216
                                     /* rt_del() also drops the reference taken by
                                      * dst_hold() above.
                                      */
1217                                 rt_del(hash, rth);
1218                                 if (!rt_intern_hash(hash, rt, &rt))
1219                                         ip_rt_put(rt);
1220                                 goto do_next;
1221                         }
1222                         rcu_read_unlock();
1223                 do_next:
1224                         ;
1225                 }
1226         }
1227         in_dev_put(in_dev);
1228         return;
1229
1230 reject_redirect:
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234                         "%u.%u.%u.%u ignored.\n"
1235                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1236                         "tos %02x\n",
1237                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1239 #endif
1240         in_dev_put(in_dev);
1241 }
1242
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244 {
1245         struct rtable *rt = (struct rtable*)dst;
1246         struct dst_entry *ret = dst;
1247
1248         if (rt) {
1249                 if (dst->obsolete) {
1250                         ip_rt_put(rt);
1251                         ret = NULL;
1252                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253                            rt->u.dst.expires) {
1254                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255                                                      rt->fl.fl4_src ^
1256                                                         (rt->fl.oif << 5),
1257                                                      rt->fl.fl4_tos);
1258 #if RT_CACHE_DEBUG >= 1
1259                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260                                           "%u.%u.%u.%u/%02x dropped\n",
1261                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262 #endif
1263                         rt_del(hash, rt);
1264                         ret = NULL;
1265                 }
1266         }
1267         return ret;
1268 }
1269
1270 /*
1271  * Algorithm:
1272  *      1. The first ip_rt_redirect_number redirects are sent
1273  *         with exponential backoff, then we stop sending them at all,
1274  *         assuming that the host ignores our redirects.
1275  *      2. If we did not see packets requiring redirects
1276  *         during ip_rt_redirect_silence, we assume that the host
1277  *         forgot redirected route and start to send redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
1285
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1287 {
1288         struct rtable *rt = (struct rtable*)skb->dst;
1289         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290
1291         if (!in_dev)
1292                 return;
1293
1294         if (!IN_DEV_TX_REDIRECTS(in_dev))
1295                 goto out;
1296
1297         /* No redirected packets during ip_rt_redirect_silence;
1298          * reset the algorithm.
1299          */
1300         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301                 rt->u.dst.rate_tokens = 0;
1302
1303         /* Too many ignored redirects; do not send anything
1304          * set u.dst.rate_last to the last seen redirected packet.
1305          */
1306         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307                 rt->u.dst.rate_last = jiffies;
1308                 goto out;
1309         }
1310
1311         /* Check for load limit; set rate_last to the latest sent
1312          * redirect.
1313          */
1314         if (time_after(jiffies,
1315                        (rt->u.dst.rate_last +
1316                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318                 rt->u.dst.rate_last = jiffies;
1319                 ++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323                     net_ratelimit())
1324                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1327                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328 #endif
1329         }
1330 out:
1331         in_dev_put(in_dev);
1332 }
1333
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336         struct rtable *rt = (struct rtable*)skb->dst;
1337         unsigned long now;
1338         int code;
1339
1340         switch (rt->u.dst.error) {
1341                 case EINVAL:
1342                 default:
1343                         goto out;
1344                 case EHOSTUNREACH:
1345                         code = ICMP_HOST_UNREACH;
1346                         break;
1347                 case ENETUNREACH:
1348                         code = ICMP_NET_UNREACH;
1349                         break;
1350                 case EACCES:
1351                         code = ICMP_PKT_FILTERED;
1352                         break;
1353         }
1354
1355         now = jiffies;
1356         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1359         rt->u.dst.rate_last = now;
1360         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363         }
1364
1365 out:    kfree_skb(skb);
1366         return 0;
1367 }
1368
/*
 *	MTU plateau table, in descending order.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 *
 *	The table is only ever read, so it is const.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1386
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 {
1389         int i;
1390         unsigned short old_mtu = ntohs(iph->tot_len);
1391         struct rtable *rth;
1392         u32  skeys[2] = { iph->saddr, 0, };
1393         u32  daddr = iph->daddr;
1394         u8   tos = iph->tos & IPTOS_RT_MASK;
1395         unsigned short est_mtu = 0;
1396
1397         if (ipv4_config.no_pmtu_disc)
1398                 return 0;
1399
1400         for (i = 0; i < 2; i++) {
1401                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402
1403                 rcu_read_lock();
1404                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405                      rth = rcu_dereference(rth->u.rt_next)) {
1406                         if (rth->fl.fl4_dst == daddr &&
1407                             rth->fl.fl4_src == skeys[i] &&
1408                             rth->rt_dst  == daddr &&
1409                             rth->rt_src  == iph->saddr &&
1410                             rth->fl.fl4_tos == tos &&
1411                             rth->fl.iif == 0 &&
1412                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413                                 unsigned short mtu = new_mtu;
1414
1415                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1416
1417                                         /* BSD 4.2 compatibility hack :-( */
1418                                         if (mtu == 0 &&
1419                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420                                             old_mtu >= 68 + (iph->ihl << 2))
1421                                                 old_mtu -= iph->ihl << 2;
1422
1423                                         mtu = guess_mtu(old_mtu);
1424                                 }
1425                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1427                                                 dst_confirm(&rth->u.dst);
1428                                                 if (mtu < ip_rt_min_pmtu) {
1429                                                         mtu = ip_rt_min_pmtu;
1430                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1431                                                                 (1 << RTAX_MTU);
1432                                                 }
1433                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434                                                 dst_set_expires(&rth->u.dst,
1435                                                         ip_rt_mtu_expires);
1436                                         }
1437                                         est_mtu = mtu;
1438                                 }
1439                         }
1440                 }
1441                 rcu_read_unlock();
1442         }
1443         return est_mtu ? : new_mtu;
1444 }
1445
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449             !(dst_metric_locked(dst, RTAX_MTU))) {
1450                 if (mtu < ip_rt_min_pmtu) {
1451                         mtu = ip_rt_min_pmtu;
1452                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453                 }
1454                 dst->metrics[RTAX_MTU-1] = mtu;
1455                 dst_set_expires(dst, ip_rt_mtu_expires);
1456         }
1457 }
1458
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 {
1461         return NULL;
1462 }
1463
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466         struct rtable *rt = (struct rtable *) dst;
1467         struct inet_peer *peer = rt->peer;
1468         struct in_device *idev = rt->idev;
1469
1470         if (peer) {
1471                 rt->peer = NULL;
1472                 inet_putpeer(peer);
1473         }
1474
1475         if (idev) {
1476                 rt->idev = NULL;
1477                 in_dev_put(idev);
1478         }
1479 }
1480
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482                             int how)
1483 {
1484         struct rtable *rt = (struct rtable *) dst;
1485         struct in_device *idev = rt->idev;
1486         if (dev != &loopback_dev && idev && idev->dev == dev) {
1487                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488                 if (loopback_idev) {
1489                         rt->idev = loopback_idev;
1490                         in_dev_put(idev);
1491                 }
1492         }
1493 }
1494
1495 static void ipv4_link_failure(struct sk_buff *skb)
1496 {
1497         struct rtable *rt;
1498
1499         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500
1501         rt = (struct rtable *) skb->dst;
1502         if (rt)
1503                 dst_set_expires(&rt->u.dst, 0);
1504 }
1505
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510                 skb->dev ? skb->dev->name : "?");
1511         kfree_skb(skb);
1512         return 0;
1513 }
1514
1515 /*
1516    We do not cache source address of outgoing interface,
1517    because it is used only by IP RR, TS and SRR options,
1518    so that it out of fast path.
1519
1520    BTW remember: "addr" is allowed to be not aligned
1521    in IP options!
1522  */
1523
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526         u32 src;
1527         struct fib_result res;
1528
1529         if (rt->fl.iif == 0)
1530                 src = rt->rt_src;
1531         else if (fib_lookup(&rt->fl, &res) == 0) {
1532                 src = FIB_RES_PREFSRC(res);
1533                 fib_res_put(&res);
1534         } else
1535                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536                                         RT_SCOPE_UNIVERSE);
1537         memcpy(addr, &src, 4);
1538 }
1539
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time:
 * a half of tclassid that is still zero takes the corresponding half of
 * @tag; a half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1549
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 {
1552         struct fib_info *fi = res->fi;
1553
1554         if (fi) {
1555                 if (FIB_RES_GW(*res) &&
1556                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557                         rt->rt_gateway = FIB_RES_GW(*res);
1558                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559                        sizeof(rt->u.dst.metrics));
1560                 if (fi->fib_mtu == 0) {
1561                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563                             rt->rt_gateway != rt->rt_dst &&
1564                             rt->u.dst.dev->mtu > 576)
1565                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566                 }
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 #endif
1570         } else
1571                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572
1573         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579                                        ip_rt_min_advmss);
1580         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585         set_class_tag(rt, fib_rules_tclass(res));
1586 #endif
1587         set_class_tag(rt, itag);
1588 #endif
1589         rt->rt_type = res->type;
1590 }
1591
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593                                 u8 tos, struct net_device *dev, int our)
1594 {
1595         unsigned hash;
1596         struct rtable *rth;
1597         u32 spec_dst;
1598         struct in_device *in_dev = in_dev_get(dev);
1599         u32 itag = 0;
1600
1601         /* Primary sanity checks. */
1602
1603         if (in_dev == NULL)
1604                 return -EINVAL;
1605
1606         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607             skb->protocol != htons(ETH_P_IP))
1608                 goto e_inval;
1609
1610         if (ZERONET(saddr)) {
1611                 if (!LOCAL_MCAST(daddr))
1612                         goto e_inval;
1613                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614         } else if (fib_validate_source(saddr, 0, tos, 0,
1615                                         dev, &spec_dst, &itag) < 0)
1616                 goto e_inval;
1617
1618         rth = dst_alloc(&ipv4_dst_ops);
1619         if (!rth)
1620                 goto e_nobufs;
1621
1622         rth->u.dst.output= ip_rt_bug;
1623
1624         atomic_set(&rth->u.dst.__refcnt, 1);
1625         rth->u.dst.flags= DST_HOST;
1626         if (in_dev->cnf.no_policy)
1627                 rth->u.dst.flags |= DST_NOPOLICY;
1628         rth->fl.fl4_dst = daddr;
1629         rth->rt_dst     = daddr;
1630         rth->fl.fl4_tos = tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632         rth->fl.fl4_fwmark= skb->nfmark;
1633 #endif
1634         rth->fl.fl4_src = saddr;
1635         rth->rt_src     = saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637         rth->u.dst.tclassid = itag;
1638 #endif
1639         rth->rt_iif     =
1640         rth->fl.iif     = dev->ifindex;
1641         rth->u.dst.dev  = &loopback_dev;
1642         dev_hold(rth->u.dst.dev);
1643         rth->idev       = in_dev_get(rth->u.dst.dev);
1644         rth->fl.oif     = 0;
1645         rth->rt_gateway = daddr;
1646         rth->rt_spec_dst= spec_dst;
1647         rth->rt_type    = RTN_MULTICAST;
1648         rth->rt_flags   = RTCF_MULTICAST;
1649         if (our) {
1650                 rth->u.dst.input= ip_local_deliver;
1651                 rth->rt_flags |= RTCF_LOCAL;
1652         }
1653
1654 #ifdef CONFIG_IP_MROUTE
1655         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656                 rth->u.dst.input = ip_mr_input;
1657 #endif
1658         RT_CACHE_STAT_INC(in_slow_mc);
1659
1660         in_dev_put(in_dev);
1661         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663
1664 e_nobufs:
1665         in_dev_put(in_dev);
1666         return -ENOBUFS;
1667
1668 e_inval:
1669         in_dev_put(in_dev);
1670         return -EINVAL;
1671 }
1672
1673
1674 static void ip_handle_martian_source(struct net_device *dev,
1675                                      struct in_device *in_dev,
1676                                      struct sk_buff *skb,
1677                                      u32 daddr,
1678                                      u32 saddr)
1679 {
1680         RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683                 /*
1684                  *      RFC1812 recommendation, if source is martian,
1685                  *      the only hint is MAC header.
1686                  */
1687                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688                         "%u.%u.%u.%u, on dev %s\n",
1689                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690                 if (dev->hard_header_len && skb->mac.raw) {
1691                         int i;
1692                         unsigned char *p = skb->mac.raw;
1693                         printk(KERN_WARNING "ll header: ");
1694                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1695                                 printk("%02x", *p);
1696                                 if (i < (dev->hard_header_len - 1))
1697                                         printk(":");
1698                         }
1699                         printk("\n");
1700                 }
1701         }
1702 #endif
1703 }
1704
1705 static inline int __mkroute_input(struct sk_buff *skb,
1706                                   struct fib_result* res,
1707                                   struct in_device *in_dev,
1708                                   u32 daddr, u32 saddr, u32 tos,
1709                                   struct rtable **result)
1710 {
1711
1712         struct rtable *rth;
1713         int err;
1714         struct in_device *out_dev;
1715         unsigned flags = 0;
1716         u32 spec_dst, itag;
1717
1718         /* get a working reference to the output device */
1719         out_dev = in_dev_get(FIB_RES_DEV(*res));
1720         if (out_dev == NULL) {
1721                 if (net_ratelimit())
1722                         printk(KERN_CRIT "Bug in ip_route_input" \
1723                                "_slow(). Please, report\n");
1724                 return -EINVAL;
1725         }
1726
1727
1728         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1729                                   in_dev->dev, &spec_dst, &itag);
1730         if (err < 0) {
1731                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1732                                          saddr);
1733
1734                 err = -EINVAL;
1735                 goto cleanup;
1736         }
1737
1738         if (err)
1739                 flags |= RTCF_DIRECTSRC;
1740
1741         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742             (IN_DEV_SHARED_MEDIA(out_dev) ||
1743              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744                 flags |= RTCF_DOREDIRECT;
1745
1746         if (skb->protocol != htons(ETH_P_IP)) {
1747                 /* Not IP (i.e. ARP). Do not create route, if it is
1748                  * invalid for proxy arp. DNAT routes are always valid.
1749                  */
1750                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1751                         err = -EINVAL;
1752                         goto cleanup;
1753                 }
1754         }
1755
1756
1757         rth = dst_alloc(&ipv4_dst_ops);
1758         if (!rth) {
1759                 err = -ENOBUFS;
1760                 goto cleanup;
1761         }
1762
1763         rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765         if (res->fi->fib_nhs > 1)
1766                 rth->u.dst.flags |= DST_BALANCED;
1767 #endif
1768         if (in_dev->cnf.no_policy)
1769                 rth->u.dst.flags |= DST_NOPOLICY;
1770         if (in_dev->cnf.no_xfrm)
1771                 rth->u.dst.flags |= DST_NOXFRM;
1772         rth->fl.fl4_dst = daddr;
1773         rth->rt_dst     = daddr;
1774         rth->fl.fl4_tos = tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776         rth->fl.fl4_fwmark= skb->nfmark;
1777 #endif
1778         rth->fl.fl4_src = saddr;
1779         rth->rt_src     = saddr;
1780         rth->rt_gateway = daddr;
1781         rth->rt_iif     =
1782                 rth->fl.iif     = in_dev->dev->ifindex;
1783         rth->u.dst.dev  = (out_dev)->dev;
1784         dev_hold(rth->u.dst.dev);
1785         rth->idev       = in_dev_get(rth->u.dst.dev);
1786         rth->fl.oif     = 0;
1787         rth->rt_spec_dst= spec_dst;
1788
1789         rth->u.dst.input = ip_forward;
1790         rth->u.dst.output = ip_output;
1791
1792         rt_set_nexthop(rth, res, itag);
1793
1794         rth->rt_flags = flags;
1795
1796         *result = rth;
1797         err = 0;
1798  cleanup:
1799         /* release the working reference to the output device */
1800         in_dev_put(out_dev);
1801         return err;
1802 }
1803
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805                                        struct fib_result* res,
1806                                        const struct flowi *fl,
1807                                        struct in_device *in_dev,
1808                                        u32 daddr, u32 saddr, u32 tos)
1809 {
1810         struct rtable* rth = NULL;
1811         int err;
1812         unsigned hash;
1813
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816                 fib_select_multipath(fl, res);
1817 #endif
1818
1819         /* create a routing cache entry */
1820         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821         if (err)
1822                 return err;
1823         atomic_set(&rth->u.dst.__refcnt, 1);
1824
1825         /* put it into the cache */
1826         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 }
1829
1830 static inline int ip_mkroute_input(struct sk_buff *skb,
1831                                    struct fib_result* res,
1832                                    const struct flowi *fl,
1833                                    struct in_device *in_dev,
1834                                    u32 daddr, u32 saddr, u32 tos)
1835 {
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837         struct rtable* rth = NULL;
1838         unsigned char hop, hopcount, lasthop;
1839         int err = -EINVAL;
1840         unsigned int hash;
1841
1842         if (res->fi)
1843                 hopcount = res->fi->fib_nhs;
1844         else
1845                 hopcount = 1;
1846
1847         lasthop = hopcount - 1;
1848
1849         /* distinguish between multipath and singlepath */
1850         if (hopcount < 2)
1851                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1852                                             saddr, tos);
1853
1854         /* add all alternatives to the routing cache */
1855         for (hop = 0; hop < hopcount; hop++) {
1856                 res->nh_sel = hop;
1857
1858                 /* create a routing cache entry */
1859                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1860                                       &rth);
1861                 if (err)
1862                         return err;
1863
1864                 /* put it into the cache */
1865                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1866                 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1867                 if (err)
1868                         return err;
1869
1870                 /* forward hop information to multipath impl. */
1871                 multipath_set_nhinfo(rth,
1872                                      FIB_RES_NETWORK(*res),
1873                                      FIB_RES_NETMASK(*res),
1874                                      res->prefixlen,
1875                                      &FIB_RES_NH(*res));
1876
1877                 /* only for the last hop the reference count is handled
1878                  * outside
1879                  */
1880                 if (hop == lasthop)
1881                         atomic_set(&(skb->dst->__refcnt), 1);
1882         }
1883         return err;
1884 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1885         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1886 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1887 }
1888
1889
1890 /*
1891  *      NOTE. We drop all the packets that has local source
1892  *      addresses, because every properly looped back packet
1893  *      must have correct destination already attached by output routine.
1894  *
1895  *      Such approach solves two big problems:
1896  *      1. Not simplex devices are handled properly.
1897  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1898  */
1899
1900 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1901                                u8 tos, struct net_device *dev)
1902 {
1903         struct fib_result res;
1904         struct in_device *in_dev = in_dev_get(dev);
1905         struct flowi fl = { .nl_u = { .ip4_u =
1906                                       { .daddr = daddr,
1907                                         .saddr = saddr,
1908                                         .tos = tos,
1909                                         .scope = RT_SCOPE_UNIVERSE,
1910 #ifdef CONFIG_IP_ROUTE_FWMARK
1911                                         .fwmark = skb->nfmark
1912 #endif
1913                                       } },
1914                             .iif = dev->ifindex };
1915         unsigned        flags = 0;
1916         u32             itag = 0;
1917         struct rtable * rth;
1918         unsigned        hash;
1919         u32             spec_dst;
1920         int             err = -EINVAL;
1921         int             free_res = 0;
1922
1923         /* IP on this device is disabled. */
1924
1925         if (!in_dev)
1926                 goto out;
1927
1928         /* Check for the most weird martians, which can be not detected
1929            by fib_lookup.
1930          */
1931
1932         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1933                 goto martian_source;
1934
1935         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1936                 goto brd_input;
1937
1938         /* Accept zero addresses only to limited broadcast;
1939          * I even do not know to fix it or not. Waiting for complains :-)
1940          */
1941         if (ZERONET(saddr))
1942                 goto martian_source;
1943
1944         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1945                 goto martian_destination;
1946
1947         /*
1948          *      Now we are ready to route packet.
1949          */
1950         if ((err = fib_lookup(&fl, &res)) != 0) {
1951                 if (!IN_DEV_FORWARD(in_dev))
1952                         goto e_hostunreach;
1953                 goto no_route;
1954         }
1955         free_res = 1;
1956
1957         RT_CACHE_STAT_INC(in_slow_tot);
1958
1959         if (res.type == RTN_BROADCAST)
1960                 goto brd_input;
1961
1962         if (res.type == RTN_LOCAL) {
1963                 int result;
1964                 result = fib_validate_source(saddr, daddr, tos,
1965                                              loopback_dev.ifindex,
1966                                              dev, &spec_dst, &itag);
1967                 if (result < 0)
1968                         goto martian_source;
1969                 if (result)
1970                         flags |= RTCF_DIRECTSRC;
1971                 spec_dst = daddr;
1972                 goto local_input;
1973         }
1974
1975         if (!IN_DEV_FORWARD(in_dev))
1976                 goto e_hostunreach;
1977         if (res.type != RTN_UNICAST)
1978                 goto martian_destination;
1979
1980         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1981         if (err == -ENOBUFS)
1982                 goto e_nobufs;
1983         if (err == -EINVAL)
1984                 goto e_inval;
1985
1986 done:
1987         in_dev_put(in_dev);
1988         if (free_res)
1989                 fib_res_put(&res);
1990 out:    return err;
1991
1992 brd_input:
1993         if (skb->protocol != htons(ETH_P_IP))
1994                 goto e_inval;
1995
1996         if (ZERONET(saddr))
1997                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1998         else {
1999                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2000                                           &itag);
2001                 if (err < 0)
2002                         goto martian_source;
2003                 if (err)
2004                         flags |= RTCF_DIRECTSRC;
2005         }
2006         flags |= RTCF_BROADCAST;
2007         res.type = RTN_BROADCAST;
2008         RT_CACHE_STAT_INC(in_brd);
2009
2010 local_input:
2011         rth = dst_alloc(&ipv4_dst_ops);
2012         if (!rth)
2013                 goto e_nobufs;
2014
2015         rth->u.dst.output= ip_rt_bug;
2016
2017         atomic_set(&rth->u.dst.__refcnt, 1);
2018         rth->u.dst.flags= DST_HOST;
2019         if (in_dev->cnf.no_policy)
2020                 rth->u.dst.flags |= DST_NOPOLICY;
2021         rth->fl.fl4_dst = daddr;
2022         rth->rt_dst     = daddr;
2023         rth->fl.fl4_tos = tos;
2024 #ifdef CONFIG_IP_ROUTE_FWMARK
2025         rth->fl.fl4_fwmark= skb->nfmark;
2026 #endif
2027         rth->fl.fl4_src = saddr;
2028         rth->rt_src     = saddr;
2029 #ifdef CONFIG_NET_CLS_ROUTE
2030         rth->u.dst.tclassid = itag;
2031 #endif
2032         rth->rt_iif     =
2033         rth->fl.iif     = dev->ifindex;
2034         rth->u.dst.dev  = &loopback_dev;
2035         dev_hold(rth->u.dst.dev);
2036         rth->idev       = in_dev_get(rth->u.dst.dev);
2037         rth->rt_gateway = daddr;
2038         rth->rt_spec_dst= spec_dst;
2039         rth->u.dst.input= ip_local_deliver;
2040         rth->rt_flags   = flags|RTCF_LOCAL;
2041         if (res.type == RTN_UNREACHABLE) {
2042                 rth->u.dst.input= ip_error;
2043                 rth->u.dst.error= -err;
2044                 rth->rt_flags   &= ~RTCF_LOCAL;
2045         }
2046         rth->rt_type    = res.type;
2047         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2048         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2049         goto done;
2050
2051 no_route:
2052         RT_CACHE_STAT_INC(in_no_route);
2053         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2054         res.type = RTN_UNREACHABLE;
2055         goto local_input;
2056
2057         /*
2058          *      Do not cache martian addresses: they should be logged (RFC1812)
2059          */
2060 martian_destination:
2061         RT_CACHE_STAT_INC(in_martian_dst);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2064                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2065                         "%u.%u.%u.%u, dev %s\n",
2066                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2067 #endif
2068
2069 e_hostunreach:
2070         err = -EHOSTUNREACH;
2071         goto done;
2072
2073 e_inval:
2074         err = -EINVAL;
2075         goto done;
2076
2077 e_nobufs:
2078         err = -ENOBUFS;
2079         goto done;
2080
2081 martian_source:
2082         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2083         goto e_inval;
2084 }
2085
2086 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2087                    u8 tos, struct net_device *dev)
2088 {
2089         struct rtable * rth;
2090         unsigned        hash;
2091         int iif = dev->ifindex;
2092
2093         tos &= IPTOS_RT_MASK;
2094         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2095
2096         rcu_read_lock();
2097         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2098              rth = rcu_dereference(rth->u.rt_next)) {
2099                 if (rth->fl.fl4_dst == daddr &&
2100                     rth->fl.fl4_src == saddr &&
2101                     rth->fl.iif == iif &&
2102                     rth->fl.oif == 0 &&
2103 #ifdef CONFIG_IP_ROUTE_FWMARK
2104                     rth->fl.fl4_fwmark == skb->nfmark &&
2105 #endif
2106                     rth->fl.fl4_tos == tos) {
2107                         rth->u.dst.lastuse = jiffies;
2108                         dst_hold(&rth->u.dst);
2109                         rth->u.dst.__use++;
2110                         RT_CACHE_STAT_INC(in_hit);
2111                         rcu_read_unlock();
2112                         skb->dst = (struct dst_entry*)rth;
2113                         return 0;
2114                 }
2115                 RT_CACHE_STAT_INC(in_hlist_search);
2116         }
2117         rcu_read_unlock();
2118
2119         /* Multicast recognition logic is moved from route cache to here.
2120            The problem was that too many Ethernet cards have broken/missing
2121            hardware multicast filters :-( As result the host on multicasting
2122            network acquires a lot of useless route cache entries, sort of
2123            SDR messages from all the world. Now we try to get rid of them.
2124            Really, provided software IP multicast filter is organized
2125            reasonably (at least, hashed), it does not result in a slowdown
2126            comparing with route cache reject entries.
2127            Note, that multicast routers are not affected, because
2128            route cache entry is created eventually.
2129          */
2130         if (MULTICAST(daddr)) {
2131                 struct in_device *in_dev;
2132
2133                 rcu_read_lock();
2134                 if ((in_dev = __in_dev_get(dev)) != NULL) {
2135                         int our = ip_check_mc(in_dev, daddr, saddr,
2136                                 skb->nh.iph->protocol);
2137                         if (our
2138 #ifdef CONFIG_IP_MROUTE
2139                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2140 #endif
2141                             ) {
2142                                 rcu_read_unlock();
2143                                 return ip_route_input_mc(skb, daddr, saddr,
2144                                                          tos, dev, our);
2145                         }
2146                 }
2147                 rcu_read_unlock();
2148                 return -EINVAL;
2149         }
2150         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2151 }
2152
2153 static inline int __mkroute_output(struct rtable **result,
2154                                    struct fib_result* res,
2155                                    const struct flowi *fl,
2156                                    const struct flowi *oldflp,
2157                                    struct net_device *dev_out,
2158                                    unsigned flags)
2159 {
2160         struct rtable *rth;
2161         struct in_device *in_dev;
2162         u32 tos = RT_FL_TOS(oldflp);
2163         int err = 0;
2164
2165         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2166                 return -EINVAL;
2167
2168         if (fl->fl4_dst == 0xFFFFFFFF)
2169                 res->type = RTN_BROADCAST;
2170         else if (MULTICAST(fl->fl4_dst))
2171                 res->type = RTN_MULTICAST;
2172         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2173                 return -EINVAL;
2174
2175         if (dev_out->flags & IFF_LOOPBACK)
2176                 flags |= RTCF_LOCAL;
2177
2178         /* get work reference to inet device */
2179         in_dev = in_dev_get(dev_out);
2180         if (!in_dev)
2181                 return -EINVAL;
2182
2183         if (res->type == RTN_BROADCAST) {
2184                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2185                 if (res->fi) {
2186                         fib_info_put(res->fi);
2187                         res->fi = NULL;
2188                 }
2189         } else if (res->type == RTN_MULTICAST) {
2190                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2191                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2192                                  oldflp->proto))
2193                         flags &= ~RTCF_LOCAL;
2194                 /* If multicast route do not exist use
2195                    default one, but do not gateway in this case.
2196                    Yes, it is hack.
2197                  */
2198                 if (res->fi && res->prefixlen < 4) {
2199                         fib_info_put(res->fi);
2200                         res->fi = NULL;
2201                 }
2202         }
2203
2204
2205         rth = dst_alloc(&ipv4_dst_ops);
2206         if (!rth) {
2207                 err = -ENOBUFS;
2208                 goto cleanup;
2209         }
2210
2211         rth->u.dst.flags= DST_HOST;
2212 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2213         if (res->fi) {
2214                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2215                 if (res->fi->fib_nhs > 1)
2216                         rth->u.dst.flags |= DST_BALANCED;
2217         }
2218 #endif
2219         if (in_dev->cnf.no_xfrm)
2220                 rth->u.dst.flags |= DST_NOXFRM;
2221         if (in_dev->cnf.no_policy)
2222                 rth->u.dst.flags |= DST_NOPOLICY;
2223
2224         rth->fl.fl4_dst = oldflp->fl4_dst;
2225         rth->fl.fl4_tos = tos;
2226         rth->fl.fl4_src = oldflp->fl4_src;
2227         rth->fl.oif     = oldflp->oif;
2228 #ifdef CONFIG_IP_ROUTE_FWMARK
2229         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2230 #endif
2231         rth->rt_dst     = fl->fl4_dst;
2232         rth->rt_src     = fl->fl4_src;
2233         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2234         /* get references to the devices that are to be hold by the routing
2235            cache entry */
2236         rth->u.dst.dev  = dev_out;
2237         dev_hold(dev_out);
2238         rth->idev       = in_dev_get(dev_out);
2239         rth->rt_gateway = fl->fl4_dst;
2240         rth->rt_spec_dst= fl->fl4_src;
2241
2242         rth->u.dst.output=ip_output;
2243
2244         RT_CACHE_STAT_INC(out_slow_tot);
2245
2246         if (flags & RTCF_LOCAL) {
2247                 rth->u.dst.input = ip_local_deliver;
2248                 rth->rt_spec_dst = fl->fl4_dst;
2249         }
2250         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251                 rth->rt_spec_dst = fl->fl4_src;
2252                 if (flags & RTCF_LOCAL &&
2253                     !(dev_out->flags & IFF_LOOPBACK)) {
2254                         rth->u.dst.output = ip_mc_output;
2255                         RT_CACHE_STAT_INC(out_slow_mc);
2256                 }
2257 #ifdef CONFIG_IP_MROUTE
2258                 if (res->type == RTN_MULTICAST) {
2259                         if (IN_DEV_MFORWARD(in_dev) &&
2260                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2261                                 rth->u.dst.input = ip_mr_input;
2262                                 rth->u.dst.output = ip_mc_output;
2263                         }
2264                 }
2265 #endif
2266         }
2267
2268         rt_set_nexthop(rth, res, 0);
2269
2270         rth->rt_flags = flags;
2271
2272         *result = rth;
2273  cleanup:
2274         /* release work reference to inet device */
2275         in_dev_put(in_dev);
2276
2277         return err;
2278 }
2279
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281                                         struct fib_result* res,
2282                                         const struct flowi *fl,
2283                                         const struct flowi *oldflp,
2284                                         struct net_device *dev_out,
2285                                         unsigned flags)
2286 {
2287         struct rtable *rth = NULL;
2288         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289         unsigned hash;
2290         if (err == 0) {
2291                 u32 tos = RT_FL_TOS(oldflp);
2292
2293                 atomic_set(&rth->u.dst.__refcnt, 1);
2294
2295                 hash = rt_hash_code(oldflp->fl4_dst,
2296                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2297                 err = rt_intern_hash(hash, rth, rp);
2298         }
2299
2300         return err;
2301 }
2302
/*
 * Create the output route cache entry (or entries) for a FIB result and
 * return the cached entry through @rp.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the result has at
 * most one next hop, this is ip_mkroute_output_def(). Otherwise one cache
 * entry is built and inserted per next hop, each bound to that hop's
 * device, and each is reported to the multipath implementation through
 * multipath_set_nhinfo().
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not touch rth after a failed insertion: the
			 * entry was not installed in the cache, so there is
			 * nothing to report to the multipath code. This
			 * matches ip_mkroute_input(), which returns before
			 * multipath_set_nhinfo() on the same error.
			 * NOTE(review): rt_intern_hash() appears to drop the
			 * entry on failure, which would make any later use of
			 * rth a use-after-free -- confirm.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		/* Only the entry finally left in *rp gets its count set here. */
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2363
2364 /*
2365  * Major route resolver routine.
2366  */
2367
2368 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2369 {
2370         u32 tos = RT_FL_TOS(oldflp);
2371         struct flowi fl = { .nl_u = { .ip4_u =
2372                                       { .daddr = oldflp->fl4_dst,
2373                                         .saddr = oldflp->fl4_src,
2374                                         .tos = tos & IPTOS_RT_MASK,
2375                                         .scope = ((tos & RTO_ONLINK) ?
2376                                                   RT_SCOPE_LINK :
2377                                                   RT_SCOPE_UNIVERSE),
2378 #ifdef CONFIG_IP_ROUTE_FWMARK
2379                                         .fwmark = oldflp->fl4_fwmark
2380 #endif
2381                                       } },
2382                             .iif = loopback_dev.ifindex,
2383                             .oif = oldflp->oif };
2384         struct fib_result res;
2385         unsigned flags = 0;
2386         struct net_device *dev_out = NULL;
2387         int free_res = 0;
2388         int err;
2389
2390
2391         res.fi          = NULL;
2392 #ifdef CONFIG_IP_MULTIPLE_TABLES
2393         res.r           = NULL;
2394 #endif
2395
2396         if (oldflp->fl4_src) {
2397                 err = -EINVAL;
2398                 if (MULTICAST(oldflp->fl4_src) ||
2399                     BADCLASS(oldflp->fl4_src) ||
2400                     ZERONET(oldflp->fl4_src))
2401                         goto out;
2402
2403                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2404                 dev_out = ip_dev_find(oldflp->fl4_src);
2405                 if (dev_out == NULL)
2406                         goto out;
2407
2408                 /* I removed check for oif == dev_out->oif here.
2409                    It was wrong for two reasons:
2410                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2411                       assigned to multiple interfaces.
2412                    2. Moreover, we are allowed to send packets with saddr
2413                       of another iface. --ANK
2414                  */
2415
2416                 if (oldflp->oif == 0
2417                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2418                         /* Special hack: user can direct multicasts
2419                            and limited broadcast via necessary interface
2420                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2421                            This hack is not just for fun, it allows
2422                            vic,vat and friends to work.
2423                            They bind socket to loopback, set ttl to zero
2424                            and expect that it will work.
2425                            From the viewpoint of routing cache they are broken,
2426                            because we are not allowed to build multicast path
2427                            with loopback source addr (look, routing cache
2428                            cannot know, that ttl is zero, so that packet
2429                            will not leave this host and route is valid).
2430                            Luckily, this hack is good workaround.
2431                          */
2432
2433                         fl.oif = dev_out->ifindex;
2434                         goto make_route;
2435                 }
2436                 if (dev_out)
2437                         dev_put(dev_out);
2438                 dev_out = NULL;
2439         }
2440
2441
2442         if (oldflp->oif) {
2443                 dev_out = dev_get_by_index(oldflp->oif);
2444                 err = -ENODEV;
2445                 if (dev_out == NULL)
2446                         goto out;
2447                 if (__in_dev_get(dev_out) == NULL) {
2448                         dev_put(dev_out);
2449                         goto out;       /* Wrong error code */
2450                 }
2451
2452                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2453                         if (!fl.fl4_src)
2454                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2455                                                               RT_SCOPE_LINK);
2456                         goto make_route;
2457                 }
2458                 if (!fl.fl4_src) {
2459                         if (MULTICAST(oldflp->fl4_dst))
2460                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2461                                                               fl.fl4_scope);
2462                         else if (!oldflp->fl4_dst)
2463                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2464                                                               RT_SCOPE_HOST);
2465                 }
2466         }
2467
2468         if (!fl.fl4_dst) {
2469                 fl.fl4_dst = fl.fl4_src;
2470                 if (!fl.fl4_dst)
2471                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2472                 if (dev_out)
2473                         dev_put(dev_out);
2474                 dev_out = &loopback_dev;
2475                 dev_hold(dev_out);
2476                 fl.oif = loopback_dev.ifindex;
2477                 res.type = RTN_LOCAL;
2478                 flags |= RTCF_LOCAL;
2479                 goto make_route;
2480         }
2481
2482         if (fib_lookup(&fl, &res)) {
2483                 res.fi = NULL;
2484                 if (oldflp->oif) {
2485                         /* Apparently, routing tables are wrong. Assume,
2486                            that the destination is on link.
2487
2488                            WHY? DW.
2489                            Because we are allowed to send to iface
2490                            even if it has NO routes and NO assigned
2491                            addresses. When oif is specified, routing
2492                            tables are looked up with only one purpose:
2493                            to catch if destination is gatewayed, rather than
2494                            direct. Moreover, if MSG_DONTROUTE is set,
2495                            we send packet, ignoring both routing tables
2496                            and ifaddr state. --ANK
2497
2498
2499                            We could make it even if oif is unknown,
2500                            likely IPv6, but we do not.
2501                          */
2502
2503                         if (fl.fl4_src == 0)
2504                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2505                                                               RT_SCOPE_LINK);
2506                         res.type = RTN_UNICAST;
2507                         goto make_route;
2508                 }
2509                 if (dev_out)
2510                         dev_put(dev_out);
2511                 err = -ENETUNREACH;
2512                 goto out;
2513         }
2514         free_res = 1;
2515
2516         if (res.type == RTN_LOCAL) {
2517                 if (!fl.fl4_src)
2518                         fl.fl4_src = fl.fl4_dst;
2519                 if (dev_out)
2520                         dev_put(dev_out);
2521                 dev_out = &loopback_dev;
2522                 dev_hold(dev_out);
2523                 fl.oif = dev_out->ifindex;
2524                 if (res.fi)
2525                         fib_info_put(res.fi);
2526                 res.fi = NULL;
2527                 flags |= RTCF_LOCAL;
2528                 goto make_route;
2529         }
2530
2531 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2532         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2533                 fib_select_multipath(&fl, &res);
2534         else
2535 #endif
2536         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2537                 fib_select_default(&fl, &res);
2538
2539         if (!fl.fl4_src)
2540                 fl.fl4_src = FIB_RES_PREFSRC(res);
2541
2542         if (dev_out)
2543                 dev_put(dev_out);
2544         dev_out = FIB_RES_DEV(res);
2545         dev_hold(dev_out);
2546         fl.oif = dev_out->ifindex;
2547
2548
2549 make_route:
2550         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2551
2552
2553         if (free_res)
2554                 fib_res_put(&res);
2555         if (dev_out)
2556                 dev_put(dev_out);
2557 out:    return err;
2558 }
2559
2560 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2561 {
2562         unsigned hash;
2563         struct rtable *rth;
2564
2565         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2566
2567         rcu_read_lock_bh();
2568         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2569                 rth = rcu_dereference(rth->u.rt_next)) {
2570                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2571                     rth->fl.fl4_src == flp->fl4_src &&
2572                     rth->fl.iif == 0 &&
2573                     rth->fl.oif == flp->oif &&
2574 #ifdef CONFIG_IP_ROUTE_FWMARK
2575                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2576 #endif
2577                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2578                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2579
2580                         /* check for multipath routes and choose one if
2581                          * necessary
2582                          */
2583                         if (multipath_select_route(flp, rth, rp)) {
2584                                 dst_hold(&(*rp)->u.dst);
2585                                 RT_CACHE_STAT_INC(out_hit);
2586                                 rcu_read_unlock_bh();
2587                                 return 0;
2588                         }
2589
2590                         rth->u.dst.lastuse = jiffies;
2591                         dst_hold(&rth->u.dst);
2592                         rth->u.dst.__use++;
2593                         RT_CACHE_STAT_INC(out_hit);
2594                         rcu_read_unlock_bh();
2595                         *rp = rth;
2596                         return 0;
2597                 }
2598                 RT_CACHE_STAT_INC(out_hlist_search);
2599         }
2600         rcu_read_unlock_bh();
2601
2602         return ip_route_output_slow(rp, flp);
2603 }
2604
2605 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2606
2607 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2608 {
2609         int err;
2610
2611         if ((err = __ip_route_output_key(rp, flp)) != 0)
2612                 return err;
2613
2614         if (flp->proto) {
2615                 if (!flp->fl4_src)
2616                         flp->fl4_src = (*rp)->rt_src;
2617                 if (!flp->fl4_dst)
2618                         flp->fl4_dst = (*rp)->rt_dst;
2619                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2620         }
2621
2622         return 0;
2623 }
2624
2625 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2626
2627 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2628 {
2629         return ip_route_output_flow(rp, flp, NULL, 0);
2630 }
2631
2632 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2633                         int nowait, unsigned int flags)
2634 {
2635         struct rtable *rt = (struct rtable*)skb->dst;
2636         struct rtmsg *r;
2637         struct nlmsghdr  *nlh;
2638         unsigned char    *b = skb->tail;
2639         struct rta_cacheinfo ci;
2640 #ifdef CONFIG_IP_MROUTE
2641         struct rtattr *eptr;
2642 #endif
2643         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2644         r = NLMSG_DATA(nlh);
2645         r->rtm_family    = AF_INET;
2646         r->rtm_dst_len  = 32;
2647         r->rtm_src_len  = 0;
2648         r->rtm_tos      = rt->fl.fl4_tos;
2649         r->rtm_table    = RT_TABLE_MAIN;
2650         r->rtm_type     = rt->rt_type;
2651         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2652         r->rtm_protocol = RTPROT_UNSPEC;
2653         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2654         if (rt->rt_flags & RTCF_NOTIFY)
2655                 r->rtm_flags |= RTM_F_NOTIFY;
2656         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2657         if (rt->fl.fl4_src) {
2658                 r->rtm_src_len = 32;
2659                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2660         }
2661         if (rt->u.dst.dev)
2662                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2663 #ifdef CONFIG_NET_CLS_ROUTE
2664         if (rt->u.dst.tclassid)
2665                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2666 #endif
2667 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2668         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2669                 __u32 alg = rt->rt_multipath_alg;
2670
2671                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2672         }
2673 #endif
2674         if (rt->fl.iif)
2675                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2676         else if (rt->rt_src != rt->fl.fl4_src)
2677                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2678         if (rt->rt_dst != rt->rt_gateway)
2679                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2680         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2681                 goto rtattr_failure;
2682         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2683         ci.rta_used     = rt->u.dst.__use;
2684         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2685         if (rt->u.dst.expires)
2686                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2687         else
2688                 ci.rta_expires = 0;
2689         ci.rta_error    = rt->u.dst.error;
2690         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2691         if (rt->peer) {
2692                 ci.rta_id = rt->peer->ip_id_count;
2693                 if (rt->peer->tcp_ts_stamp) {
2694                         ci.rta_ts = rt->peer->tcp_ts;
2695                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2696                 }
2697         }
2698 #ifdef CONFIG_IP_MROUTE
2699         eptr = (struct rtattr*)skb->tail;
2700 #endif
2701         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2702         if (rt->fl.iif) {
2703 #ifdef CONFIG_IP_MROUTE
2704                 u32 dst = rt->rt_dst;
2705
2706                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2707                     ipv4_devconf.mc_forwarding) {
2708                         int err = ipmr_get_route(skb, r, nowait);
2709                         if (err <= 0) {
2710                                 if (!nowait) {
2711                                         if (err == 0)
2712                                                 return 0;
2713                                         goto nlmsg_failure;
2714                                 } else {
2715                                         if (err == -EMSGSIZE)
2716                                                 goto nlmsg_failure;
2717                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2718                                 }
2719                         }
2720                 } else
2721 #endif
2722                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2723         }
2724
2725         nlh->nlmsg_len = skb->tail - b;
2726         return skb->len;
2727
2728 nlmsg_failure:
2729 rtattr_failure:
2730         skb_trim(skb, b - skb->data);
2731         return -1;
2732 }
2733
2734 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2735 {
2736         struct rtattr **rta = arg;
2737         struct rtmsg *rtm = NLMSG_DATA(nlh);
2738         struct rtable *rt = NULL;
2739         u32 dst = 0;
2740         u32 src = 0;
2741         int iif = 0;
2742         int err = -ENOBUFS;
2743         struct sk_buff *skb;
2744
2745         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2746         if (!skb)
2747                 goto out;
2748
2749         /* Reserve room for dummy headers, this skb can pass
2750            through good chunk of routing engine.
2751          */
2752         skb->mac.raw = skb->data;
2753         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754
2755         if (rta[RTA_SRC - 1])
2756                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757         if (rta[RTA_DST - 1])
2758                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759         if (rta[RTA_IIF - 1])
2760                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2761
2762         if (iif) {
2763                 struct net_device *dev = __dev_get_by_index(iif);
2764                 err = -ENODEV;
2765                 if (!dev)
2766                         goto out_free;
2767                 skb->protocol   = htons(ETH_P_IP);
2768                 skb->dev        = dev;
2769                 local_bh_disable();
2770                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771                 local_bh_enable();
2772                 rt = (struct rtable*)skb->dst;
2773                 if (!err && rt->u.dst.error)
2774                         err = -rt->u.dst.error;
2775         } else {
2776                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777                                                          .saddr = src,
2778                                                          .tos = rtm->rtm_tos } } };
2779                 int oif = 0;
2780                 if (rta[RTA_OIF - 1])
2781                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782                 fl.oif = oif;
2783                 err = ip_route_output_key(&rt, &fl);
2784         }
2785         if (err)
2786                 goto out_free;
2787
2788         skb->dst = &rt->u.dst;
2789         if (rtm->rtm_flags & RTM_F_NOTIFY)
2790                 rt->rt_flags |= RTCF_NOTIFY;
2791
2792         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2793
2794         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795                                 RTM_NEWROUTE, 0, 0);
2796         if (!err)
2797                 goto out_free;
2798         if (err < 0) {
2799                 err = -EMSGSIZE;
2800                 goto out_free;
2801         }
2802
2803         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804         if (err > 0)
2805                 err = 0;
2806 out:    return err;
2807
2808 out_free:
2809         kfree_skb(skb);
2810         goto out;
2811 }
2812
2813 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2814 {
2815         struct rtable *rt;
2816         int h, s_h;
2817         int idx, s_idx;
2818
2819         s_h = cb->args[0];
2820         s_idx = idx = cb->args[1];
2821         for (h = 0; h <= rt_hash_mask; h++) {
2822                 if (h < s_h) continue;
2823                 if (h > s_h)
2824                         s_idx = 0;
2825                 rcu_read_lock_bh();
2826                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2828                         if (idx < s_idx)
2829                                 continue;
2830                         skb->dst = dst_clone(&rt->u.dst);
2831                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833                                          1, NLM_F_MULTI) <= 0) {
2834                                 dst_release(xchg(&skb->dst, NULL));
2835                                 rcu_read_unlock_bh();
2836                                 goto done;
2837                         }
2838                         dst_release(xchg(&skb->dst, NULL));
2839                 }
2840                 rcu_read_unlock_bh();
2841         }
2842
2843 done:
2844         cb->args[0] = h;
2845         cb->args[1] = idx;
2846         return skb->len;
2847 }
2848
/*
 * Multicast event hook: flushes the route cache with a delay of 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2853
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2856
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858                                         struct file *filp, void __user *buffer,
2859                                         size_t *lenp, loff_t *ppos)
2860 {
2861         if (write) {
2862                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863                 rt_cache_flush(flush_delay);
2864                 return 0;
2865         }
2866
2867         return -EINVAL;
2868 }
2869
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871                                                 int __user *name,
2872                                                 int nlen,
2873                                                 void __user *oldval,
2874                                                 size_t __user *oldlenp,
2875                                                 void __user *newval,
2876                                                 size_t newlen,
2877                                                 void **context)
2878 {
2879         int delay;
2880         if (newlen != sizeof(int))
2881                 return -EINVAL;
2882         if (get_user(delay, (int __user *)newval))
2883                 return -EFAULT;
2884         rt_cache_flush(delay);
2885         return 0;
2886 }
2887
2888 ctl_table ipv4_route_table[] = {
2889         {
2890                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2891                 .procname       = "flush",
2892                 .data           = &flush_delay,
2893                 .maxlen         = sizeof(int),
2894                 .mode           = 0200,
2895                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2896                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2897         },
2898         {
2899                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2900                 .procname       = "min_delay",
2901                 .data           = &ip_rt_min_delay,
2902                 .maxlen         = sizeof(int),
2903                 .mode           = 0644,
2904                 .proc_handler   = &proc_dointvec_jiffies,
2905                 .strategy       = &sysctl_jiffies,
2906         },
2907         {
2908                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2909                 .procname       = "max_delay",
2910                 .data           = &ip_rt_max_delay,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = &proc_dointvec_jiffies,
2914                 .strategy       = &sysctl_jiffies,
2915         },
2916         {
2917                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2918                 .procname       = "gc_thresh",
2919                 .data           = &ipv4_dst_ops.gc_thresh,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = &proc_dointvec,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2926                 .procname       = "max_size",
2927                 .data           = &ip_rt_max_size,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 /*  Deprecated. Use gc_min_interval_ms */
2934
2935                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936                 .procname       = "gc_min_interval",
2937                 .data           = &ip_rt_gc_min_interval,
2938                 .maxlen         = sizeof(int),
2939                 .mode           = 0644,
2940                 .proc_handler   = &proc_dointvec_jiffies,
2941                 .strategy       = &sysctl_jiffies,
2942         },
2943         {
2944                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945                 .procname       = "gc_min_interval_ms",
2946                 .data           = &ip_rt_gc_min_interval,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = &proc_dointvec_ms_jiffies,
2950                 .strategy       = &sysctl_ms_jiffies,
2951         },
2952         {
2953                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2954                 .procname       = "gc_timeout",
2955                 .data           = &ip_rt_gc_timeout,
2956                 .maxlen         = sizeof(int),
2957                 .mode           = 0644,
2958                 .proc_handler   = &proc_dointvec_jiffies,
2959                 .strategy       = &sysctl_jiffies,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2963                 .procname       = "gc_interval",
2964                 .data           = &ip_rt_gc_interval,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec_jiffies,
2968                 .strategy       = &sysctl_jiffies,
2969         },
2970         {
2971                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972                 .procname       = "redirect_load",
2973                 .data           = &ip_rt_redirect_load,
2974                 .maxlen         = sizeof(int),
2975                 .mode           = 0644,
2976                 .proc_handler   = &proc_dointvec,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980                 .procname       = "redirect_number",
2981                 .data           = &ip_rt_redirect_number,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988                 .procname       = "redirect_silence",
2989                 .data           = &ip_rt_redirect_silence,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2996                 .procname       = "error_cost",
2997                 .data           = &ip_rt_error_cost,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec,
3001         },
3002         {
3003                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3004                 .procname       = "error_burst",
3005                 .data           = &ip_rt_error_burst,
3006                 .maxlen         = sizeof(int),
3007                 .mode           = 0644,
3008                 .proc_handler   = &proc_dointvec,
3009         },
3010         {
3011                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3012                 .procname       = "gc_elasticity",
3013                 .data           = &ip_rt_gc_elasticity,
3014                 .maxlen         = sizeof(int),
3015                 .mode           = 0644,
3016                 .proc_handler   = &proc_dointvec,
3017         },
3018         {
3019                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3020                 .procname       = "mtu_expires",
3021                 .data           = &ip_rt_mtu_expires,
3022                 .maxlen         = sizeof(int),
3023                 .mode           = 0644,
3024                 .proc_handler   = &proc_dointvec_jiffies,
3025                 .strategy       = &sysctl_jiffies,
3026         },
3027         {
3028                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3029                 .procname       = "min_pmtu",
3030                 .data           = &ip_rt_min_pmtu,
3031                 .maxlen         = sizeof(int),
3032                 .mode           = 0644,
3033                 .proc_handler   = &proc_dointvec,
3034         },
3035         {
3036                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3037                 .procname       = "min_adv_mss",
3038                 .data           = &ip_rt_min_advmss,
3039                 .maxlen         = sizeof(int),
3040                 .mode           = 0644,
3041                 .proc_handler   = &proc_dointvec,
3042         },
3043         {
3044                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045                 .procname       = "secret_interval",
3046                 .data           = &ip_rt_secret_interval,
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0644,
3049                 .proc_handler   = &proc_dointvec_jiffies,
3050                 .strategy       = &sysctl_jiffies,
3051         },
3052         { .ctl_name = 0 }
3053 };
3054 #endif
3055
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3058
3059 /* This code sucks.  But you should have seen it before! --RR */
3060
/*
 * IP route accounting ptr for logical cpu number @i: each cpu's slice
 * starts 256 entries after the previous one.  The argument is
 * parenthesized so that an expression argument such as
 * IP_RT_ACCT_CPU(a + b) selects cpu (a + b) instead of being torn apart
 * by operator precedence.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3063
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066                            int length, int *eof, void *data)
3067 {
3068         unsigned int i;
3069
3070         if ((offset & 3) || (length & 3))
3071                 return -EIO;
3072
3073         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074                 *eof = 1;
3075                 return 0;
3076         }
3077
3078         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080                 *eof = 1;
3081         }
3082
3083         offset /= sizeof(u32);
3084
3085         if (length > 0) {
3086                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087                 u32 *dst = (u32 *) buffer;
3088
3089                 /* Copy first cpu. */
3090                 *start = buffer;
3091                 memcpy(dst, src, length);
3092
3093                 /* Add the other cpus in, one int at a time */
3094                 for_each_cpu(i) {
3095                         unsigned int j;
3096
3097                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098
3099                         for (j = 0; j < length/4; j++)
3100                                 dst[j] += src[j];
3101                 }
3102         }
3103         return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3107
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111         if (!str)
3112                 return 0;
3113         rhash_entries = simple_strtoul(str, &str, 0);
3114         return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
3117
3118 int __init ip_rt_init(void)
3119 {
3120         int rc = 0;
3121
3122         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123                              (jiffies ^ (jiffies >> 7)));
3124
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126         {
3127         int order;
3128         for (order = 0;
3129              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130                 /* NOTHING */;
3131         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132         if (!ip_rt_acct)
3133                 panic("IP: failed to allocate ip_rt_acct\n");
3134         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135         }
3136 #endif
3137
3138         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139                                                      sizeof(struct rtable),
3140                                                      0, SLAB_HWCACHE_ALIGN,
3141                                                      NULL, NULL);
3142
3143         if (!ipv4_dst_ops.kmem_cachep)
3144                 panic("IP: failed to allocate ip_dst_cache\n");
3145
3146         rt_hash_table = (struct rt_hash_bucket *)
3147                 alloc_large_system_hash("IP route cache",
3148                                         sizeof(struct rt_hash_bucket),
3149                                         rhash_entries,
3150                                         (num_physpages >= 128 * 1024) ?
3151                                                 (27 - PAGE_SHIFT) :
3152                                                 (29 - PAGE_SHIFT),
3153                                         HASH_HIGHMEM,
3154                                         &rt_hash_log,
3155                                         &rt_hash_mask,
3156                                         0);
3157         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3158         rt_hash_lock_init();
3159
3160         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3161         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3162
3163         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3164         if (!rt_cache_stat)
3165                 return -ENOMEM;
3166
3167         devinet_init();
3168         ip_fib_init();
3169
3170         init_timer(&rt_flush_timer);
3171         rt_flush_timer.function = rt_run_flush;
3172         init_timer(&rt_periodic_timer);
3173         rt_periodic_timer.function = rt_check_expire;
3174         init_timer(&rt_secret_timer);
3175         rt_secret_timer.function = rt_secret_rebuild;
3176
3177         /* All the timers, started at system startup tend
3178            to synchronize. Perturb it a bit.
3179          */
3180         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3181                                         ip_rt_gc_interval;
3182         add_timer(&rt_periodic_timer);
3183
3184         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3185                 ip_rt_secret_interval;
3186         add_timer(&rt_secret_timer);
3187
3188 #ifdef CONFIG_PROC_FS
3189         {
3190         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3191         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3192             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3193                                              proc_net_stat))) {
3194                 free_percpu(rt_cache_stat);
3195                 return -ENOMEM;
3196         }
3197         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3198         }
3199 #ifdef CONFIG_NET_CLS_ROUTE
3200         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3201 #endif
3202 #endif
3203 #ifdef CONFIG_XFRM
3204         xfrm_init();
3205         xfrm4_init();
3206 #endif
3207         return rc;
3208 }
3209
/* Routing entry points exported for use by loadable modules. */
3210 EXPORT_SYMBOL(__ip_select_ident);
3211 EXPORT_SYMBOL(ip_route_input);
3212 EXPORT_SYMBOL(ip_route_output_key);