git.oblomov.eu Git - linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/mm.h>
  75 #include <linux/bootmem.h>
  76 #include <linux/string.h>
  77 #include <linux/socket.h>
  78 #include <linux/sockios.h>
  79 #include <linux/errno.h>
  80 #include <linux/in.h>
  81 #include <linux/inet.h>
  82 #include <linux/netdevice.h>
  83 #include <linux/proc_fs.h>
  84 #include <linux/init.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/rtnetlink.h>
  87 #include <linux/inetdevice.h>
  88 #include <linux/igmp.h>
  89 #include <linux/pkt_sched.h>
  90 #include <linux/mroute.h>
  91 #include <linux/netfilter_ipv4.h>
  92 #include <linux/random.h>
  93 #include <linux/jhash.h>
  94 #include <linux/rcupdate.h>
  95 #include <linux/times.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/ip_mp_alg.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_min_delay              = 2 * HZ;
 119 static int ip_rt_max_delay              = 10 * HZ;
 120 static int ip_rt_max_size;
 121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 122 static int ip_rt_gc_interval            = 60 * HZ;
 123 static int ip_rt_gc_min_interval        = HZ / 2;
 124 static int ip_rt_redirect_number        = 9;
 125 static int ip_rt_redirect_load          = HZ / 50;
 126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost             = HZ;
 128 static int ip_rt_error_burst            = 5 * HZ;
 129 static int ip_rt_gc_elasticity          = 8;
 130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 132 static int ip_rt_min_advmss             = 256;
 133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 134 static unsigned long rt_deadline;
 135
 136 #define RTprint(a...)   printk(KERN_DEBUG a)
 137
 138 static struct timer_list rt_flush_timer;
 139 static struct timer_list rt_periodic_timer;
 140 static struct timer_list rt_secret_timer;
 141
 142 /*
 143  *      Interface to generic destination cache.
 144  */
 145
 146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 147 static void              ipv4_dst_destroy(struct dst_entry *dst);
 148 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 149                                          struct net_device *dev, int how);
 150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 151 static void              ipv4_link_failure(struct sk_buff *skb);
 152 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 153 static int rt_garbage_collect(void);
 154
 155
 156 static struct dst_ops ipv4_dst_ops = {
 157         .family =               AF_INET,
 158         .protocol =             __constant_htons(ETH_P_IP),
 159         .gc =                   rt_garbage_collect,
 160         .check =                ipv4_dst_check,
 161         .destroy =              ipv4_dst_destroy,
 162         .ifdown =               ipv4_dst_ifdown,
 163         .negative_advice =      ipv4_negative_advice,
 164         .link_failure =         ipv4_link_failure,
 165         .update_pmtu =          ip_rt_update_pmtu,
 166         .entry_size =           sizeof(struct rtable),
 167 };
 168
 169 #define ECN_OR_COST(class)      TC_PRIO_##class
 170
 171 __u8 ip_tos2prio[16] = {
 172         TC_PRIO_BESTEFFORT,
 173         ECN_OR_COST(FILLER),
 174         TC_PRIO_BESTEFFORT,
 175         ECN_OR_COST(BESTEFFORT),
 176         TC_PRIO_BULK,
 177         ECN_OR_COST(BULK),
 178         TC_PRIO_BULK,
 179         ECN_OR_COST(BULK),
 180         TC_PRIO_INTERACTIVE,
 181         ECN_OR_COST(INTERACTIVE),
 182         TC_PRIO_INTERACTIVE,
 183         ECN_OR_COST(INTERACTIVE),
 184         TC_PRIO_INTERACTIVE_BULK,
 185         ECN_OR_COST(INTERACTIVE_BULK),
 186         TC_PRIO_INTERACTIVE_BULK,
 187         ECN_OR_COST(INTERACTIVE_BULK)
 188 };
 189
 190
 191 /*
 192  * Route cache.
 193  */
 194
 195 /* The locking scheme is rather straight forward:
 196  *
 197  * 1) Read-Copy Update protects the buckets of the central route hash.
 198  * 2) Only writers remove entries, and they hold the lock
 199  *    as they look at rtable reference counts.
 200  * 3) Only readers acquire references to rtable entries,
 201  *    they do so with atomic increments and with the
 202  *    lock held.
 203  */
 204
 205 struct rt_hash_bucket {
 206         struct rtable   *chain;
 207 };
 208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 209 /*
 210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 211  * The size of this table is a power of two and depends on the number of CPUS.
 212  */
 213 #if NR_CPUS >= 32
 214 #define RT_HASH_LOCK_SZ 4096
 215 #elif NR_CPUS >= 16
 216 #define RT_HASH_LOCK_SZ 2048
 217 #elif NR_CPUS >= 8
 218 #define RT_HASH_LOCK_SZ 1024
 219 #elif NR_CPUS >= 4
 220 #define RT_HASH_LOCK_SZ 512
 221 #else
 222 #define RT_HASH_LOCK_SZ 256
 223 #endif
 224
 225 static spinlock_t       *rt_hash_locks;
 226 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 227 # define rt_hash_lock_init()    { \
 228                 int i; \
 229                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 230                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 231                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 232                         spin_lock_init(&rt_hash_locks[i]); \
 233                 }
 234 #else
 235 # define rt_hash_lock_addr(slot) NULL
 236 # define rt_hash_lock_init()
 237 #endif
 238
 239 static struct rt_hash_bucket    *rt_hash_table;
 240 static unsigned                 rt_hash_mask;
 241 static int                      rt_hash_log;
 242 static unsigned int             rt_hash_rnd;
 243
 244 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 245 #define RT_CACHE_STAT_INC(field) \
 246         (__raw_get_cpu_var(rt_cache_stat).field++)
 247
 248 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 249                                 struct rtable **res);
 250
 251 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 252 {
 253         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 254                 & rt_hash_mask);
 255 }
 256
 257 #ifdef CONFIG_PROC_FS
 258 struct rt_cache_iter_state {
 259         int bucket;
 260 };
 261
 262 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 263 {
 264         struct rtable *r = NULL;
 265         struct rt_cache_iter_state *st = seq->private;
 266
 267         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 268                 rcu_read_lock_bh();
 269                 r = rt_hash_table[st->bucket].chain;
 270                 if (r)
 271                         break;
 272                 rcu_read_unlock_bh();
 273         }
 274         return r;
 275 }
 276
 277 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 278 {
 279         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 280
 281         r = r->u.rt_next;
 282         while (!r) {
 283                 rcu_read_unlock_bh();
 284                 if (--st->bucket < 0)
 285                         break;
 286                 rcu_read_lock_bh();
 287                 r = rt_hash_table[st->bucket].chain;
 288         }
 289         return r;
 290 }
 291
 292 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 293 {
 294         struct rtable *r = rt_cache_get_first(seq);
 295
 296         if (r)
 297                 while (pos && (r = rt_cache_get_next(seq, r)))
 298                         --pos;
 299         return pos ? NULL : r;
 300 }
 301
 302 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 303 {
 304         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 305 }
 306
 307 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 308 {
 309         struct rtable *r = NULL;
 310
 311         if (v == SEQ_START_TOKEN)
 312                 r = rt_cache_get_first(seq);
 313         else
 314                 r = rt_cache_get_next(seq, v);
 315         ++*pos;
 316         return r;
 317 }
 318
 319 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 320 {
 321         if (v && v != SEQ_START_TOKEN)
 322                 rcu_read_unlock_bh();
 323 }
 324
 325 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 326 {
 327         if (v == SEQ_START_TOKEN)
 328                 seq_printf(seq, "%-127s\n",
 329                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 330                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 331                            "HHUptod\tSpecDst");
 332         else {
 333                 struct rtable *r = v;
 334                 char temp[256];
 335
 336                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 337                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 338                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 339                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 340                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 341                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 342                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 343                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 344                         dst_metric(&r->u.dst, RTAX_WINDOW),
 345                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 346                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 347                         r->fl.fl4_tos,
 348                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 349                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 350                                        dev_queue_xmit) : 0,
 351                         r->rt_spec_dst);
 352                 seq_printf(seq, "%-127s\n", temp);
 353         }
 354         return 0;
 355 }
 356
 357 static struct seq_operations rt_cache_seq_ops = {
 358         .start  = rt_cache_seq_start,
 359         .next   = rt_cache_seq_next,
 360         .stop   = rt_cache_seq_stop,
 361         .show   = rt_cache_seq_show,
 362 };
 363
 364 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 365 {
 366         struct seq_file *seq;
 367         int rc = -ENOMEM;
 368         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 369
 370         if (!s)
 371                 goto out;
 372         rc = seq_open(file, &rt_cache_seq_ops);
 373         if (rc)
 374                 goto out_kfree;
 375         seq          = file->private_data;
 376         seq->private = s;
 377         memset(s, 0, sizeof(*s));
 378 out:
 379         return rc;
 380 out_kfree:
 381         kfree(s);
 382         goto out;
 383 }
 384
 385 static struct file_operations rt_cache_seq_fops = {
 386         .owner   = THIS_MODULE,
 387         .open    = rt_cache_seq_open,
 388         .read    = seq_read,
 389         .llseek  = seq_lseek,
 390         .release = seq_release_private,
 391 };
 392
 393
 394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 395 {
 396         int cpu;
 397
 398         if (*pos == 0)
 399                 return SEQ_START_TOKEN;
 400
 401         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 402                 if (!cpu_possible(cpu))
 403                         continue;
 404                 *pos = cpu+1;
 405                 return &per_cpu(rt_cache_stat, cpu);
 406         }
 407         return NULL;
 408 }
 409
 410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 411 {
 412         int cpu;
 413
 414         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 415                 if (!cpu_possible(cpu))
 416                         continue;
 417                 *pos = cpu+1;
 418                 return &per_cpu(rt_cache_stat, cpu);
 419         }
 420         return NULL;
 421
 422 }
 423
 424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 425 {
 426
 427 }
 428
 429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 430 {
 431         struct rt_cache_stat *st = v;
 432
 433         if (v == SEQ_START_TOKEN) {
 434                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 435                 return 0;
 436         }
 437
 438         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 439                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 440                    atomic_read(&ipv4_dst_ops.entries),
 441                    st->in_hit,
 442                    st->in_slow_tot,
 443                    st->in_slow_mc,
 444                    st->in_no_route,
 445                    st->in_brd,
 446                    st->in_martian_dst,
 447                    st->in_martian_src,
 448
 449                    st->out_hit,
 450                    st->out_slow_tot,
 451                    st->out_slow_mc,
 452
 453                    st->gc_total,
 454                    st->gc_ignored,
 455                    st->gc_goal_miss,
 456                    st->gc_dst_overflow,
 457                    st->in_hlist_search,
 458                    st->out_hlist_search
 459                 );
 460         return 0;
 461 }
 462
 463 static struct seq_operations rt_cpu_seq_ops = {
 464         .start  = rt_cpu_seq_start,
 465         .next   = rt_cpu_seq_next,
 466         .stop   = rt_cpu_seq_stop,
 467         .show   = rt_cpu_seq_show,
 468 };
 469
 470
 471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 472 {
 473         return seq_open(file, &rt_cpu_seq_ops);
 474 }
 475
 476 static struct file_operations rt_cpu_seq_fops = {
 477         .owner   = THIS_MODULE,
 478         .open    = rt_cpu_seq_open,
 479         .read    = seq_read,
 480         .llseek  = seq_lseek,
 481         .release = seq_release,
 482 };
 483
 484 #endif /* CONFIG_PROC_FS */
 485
 486 static __inline__ void rt_free(struct rtable *rt)
 487 {
 488         multipath_remove(rt);
 489         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 490 }
 491
 492 static __inline__ void rt_drop(struct rtable *rt)
 493 {
 494         multipath_remove(rt);
 495         ip_rt_put(rt);
 496         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 497 }
 498
 499 static __inline__ int rt_fast_clean(struct rtable *rth)
 500 {
 501         /* Kill broadcast/multicast entries very aggresively, if they
 502            collide in hash table with more useful entries */
 503         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 504                 rth->fl.iif && rth->u.rt_next;
 505 }
 506
 507 static __inline__ int rt_valuable(struct rtable *rth)
 508 {
 509         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 510                 rth->u.dst.expires;
 511 }
 512
 513 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 514 {
 515         unsigned long age;
 516         int ret = 0;
 517
 518         if (atomic_read(&rth->u.dst.__refcnt))
 519                 goto out;
 520
 521         ret = 1;
 522         if (rth->u.dst.expires &&
 523             time_after_eq(jiffies, rth->u.dst.expires))
 524                 goto out;
 525
 526         age = jiffies - rth->u.dst.lastuse;
 527         ret = 0;
 528         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 529             (age <= tmo2 && rt_valuable(rth)))
 530                 goto out;
 531         ret = 1;
 532 out:    return ret;
 533 }
 534
 535 /* Bits of score are:
 536  * 31: very valuable
 537  * 30: not quite useless
 538  * 29..0: usage counter
 539  */
 540 static inline u32 rt_score(struct rtable *rt)
 541 {
 542         u32 score = jiffies - rt->u.dst.lastuse;
 543
 544         score = ~score & ~(3<<30);
 545
 546         if (rt_valuable(rt))
 547                 score |= (1<<31);
 548
 549         if (!rt->fl.iif ||
 550             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 551                 score |= (1<<30);
 552
 553         return score;
 554 }
 555
 556 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 557 {
 558         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 559                fl1->oif     == fl2->oif &&
 560                fl1->iif     == fl2->iif;
 561 }
 562
 563 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 564 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 565                                                 struct rtable *expentry,
 566                                                 int *removed_count)
 567 {
 568         int passedexpired = 0;
 569         struct rtable **nextstep = NULL;
 570         struct rtable **rthp = chain_head;
 571         struct rtable *rth;
 572
 573         if (removed_count)
 574                 *removed_count = 0;
 575
 576         while ((rth = *rthp) != NULL) {
 577                 if (rth == expentry)
 578                         passedexpired = 1;
 579
 580                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 581                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 582                         if (*rthp == expentry) {
 583                                 *rthp = rth->u.rt_next;
 584                                 continue;
 585                         } else {
 586                                 *rthp = rth->u.rt_next;
 587                                 rt_free(rth);
 588                                 if (removed_count)
 589                                         ++(*removed_count);
 590                         }
 591                 } else {
 592                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 593                             passedexpired && !nextstep)
 594                                 nextstep = &rth->u.rt_next;
 595
 596                         rthp = &rth->u.rt_next;
 597                 }
 598         }
 599
 600         rt_free(expentry);
 601         if (removed_count)
 602                 ++(*removed_count);
 603
 604         return nextstep;
 605 }
 606 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 607
 608
 609 /* This runs via a timer and thus is always in BH context. */
 610 static void rt_check_expire(unsigned long dummy)
 611 {
 612         static unsigned int rover;
 613         unsigned int i = rover, goal;
 614         struct rtable *rth, **rthp;
 615         unsigned long now = jiffies;
 616         u64 mult;
 617
 618         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 619         if (ip_rt_gc_timeout > 1)
 620                 do_div(mult, ip_rt_gc_timeout);
 621         goal = (unsigned int)mult;
 622         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 623         for (; goal > 0; goal--) {
 624                 unsigned long tmo = ip_rt_gc_timeout;
 625
 626                 i = (i + 1) & rt_hash_mask;
 627                 rthp = &rt_hash_table[i].chain;
 628
 629                 if (*rthp == 0)
 630                         continue;
 631                 spin_lock(rt_hash_lock_addr(i));
 632                 while ((rth = *rthp) != NULL) {
 633                         if (rth->u.dst.expires) {
 634                                 /* Entry is expired even if it is in use */
 635                                 if (time_before_eq(now, rth->u.dst.expires)) {
 636                                         tmo >>= 1;
 637                                         rthp = &rth->u.rt_next;
 638                                         continue;
 639                                 }
 640                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 641                                 tmo >>= 1;
 642                                 rthp = &rth->u.rt_next;
 643                                 continue;
 644                         }
 645
 646                         /* Cleanup aged off entries. */
 647 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 648                         /* remove all related balanced entries if necessary */
 649                         if (rth->u.dst.flags & DST_BALANCED) {
 650                                 rthp = rt_remove_balanced_route(
 651                                         &rt_hash_table[i].chain,
 652                                         rth, NULL);
 653                                 if (!rthp)
 654                                         break;
 655                         } else {
 656                                 *rthp = rth->u.rt_next;
 657                                 rt_free(rth);
 658                         }
 659 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 660                         *rthp = rth->u.rt_next;
 661                         rt_free(rth);
 662 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 663                 }
 664                 spin_unlock(rt_hash_lock_addr(i));
 665
 666                 /* Fallback loop breaker. */
 667                 if (time_after(jiffies, now))
 668                         break;
 669         }
 670         rover = i;
 671         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 672 }
 673
 674 /* This can run from both BH and non-BH contexts, the latter
 675  * in the case of a forced flush event.
 676  */
 677 static void rt_run_flush(unsigned long dummy)
 678 {
 679         int i;
 680         struct rtable *rth, *next;
 681
 682         rt_deadline = 0;
 683
 684         get_random_bytes(&rt_hash_rnd, 4);
 685
 686         for (i = rt_hash_mask; i >= 0; i--) {
 687                 spin_lock_bh(rt_hash_lock_addr(i));
 688                 rth = rt_hash_table[i].chain;
 689                 if (rth)
 690                         rt_hash_table[i].chain = NULL;
 691                 spin_unlock_bh(rt_hash_lock_addr(i));
 692
 693                 for (; rth; rth = next) {
 694                         next = rth->u.rt_next;
 695                         rt_free(rth);
 696                 }
 697         }
 698 }
 699
 700 static DEFINE_SPINLOCK(rt_flush_lock);
 701
 702 void rt_cache_flush(int delay)
 703 {
 704         unsigned long now = jiffies;
 705         int user_mode = !in_softirq();
 706
 707         if (delay < 0)
 708                 delay = ip_rt_min_delay;
 709
 710         /* flush existing multipath state*/
 711         multipath_flush();
 712
 713         spin_lock_bh(&rt_flush_lock);
 714
 715         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 716                 long tmo = (long)(rt_deadline - now);
 717
 718                 /* If flush timer is already running
 719                    and flush request is not immediate (delay > 0):
 720
 721                    if deadline is not achieved, prolongate timer to "delay",
 722                    otherwise fire it at deadline time.
 723                  */
 724
 725                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 726                         tmo = 0;
 727
 728                 if (delay > tmo)
 729                         delay = tmo;
 730         }
 731
 732         if (delay <= 0) {
 733                 spin_unlock_bh(&rt_flush_lock);
 734                 rt_run_flush(0);
 735                 return;
 736         }
 737
 738         if (rt_deadline == 0)
 739                 rt_deadline = now + ip_rt_max_delay;
 740
 741         mod_timer(&rt_flush_timer, now+delay);
 742         spin_unlock_bh(&rt_flush_lock);
 743 }
 744
 745 static void rt_secret_rebuild(unsigned long dummy)
 746 {
 747         unsigned long now = jiffies;
 748
 749         rt_cache_flush(0);
 750         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 751 }
 752
 753 /*
 754    Short description of GC goals.
 755
 756    We want to build algorithm, which will keep routing cache
 757    at some equilibrium point, when number of aged off entries
 758    is kept approximately equal to newly generated ones.
 759
 760    Current expiration strength is variable "expire".
 761    We try to adjust it dynamically, so that if networking
 762    is idle expires is large enough to keep enough of warm entries,
 763    and when load increases it reduces to limit cache size.
 764  */
 765
 766 static int rt_garbage_collect(void)
 767 {
 768         static unsigned long expire = RT_GC_TIMEOUT;
 769         static unsigned long last_gc;
 770         static int rover;
 771         static int equilibrium;
 772         struct rtable *rth, **rthp;
 773         unsigned long now = jiffies;
 774         int goal;
 775
 776         /*
 777          * Garbage collection is pretty expensive,
 778          * do not make it too frequently.
 779          */
 780
 781         RT_CACHE_STAT_INC(gc_total);
 782
 783         if (now - last_gc < ip_rt_gc_min_interval &&
 784             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 785                 RT_CACHE_STAT_INC(gc_ignored);
 786                 goto out;
 787         }
 788
 789         /* Calculate number of entries, which we want to expire now. */
 790         goal = atomic_read(&ipv4_dst_ops.entries) -
 791                 (ip_rt_gc_elasticity << rt_hash_log);
 792         if (goal <= 0) {
 793                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 794                         equilibrium = ipv4_dst_ops.gc_thresh;
 795                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 796                 if (goal > 0) {
 797                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 798                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 799                 }
 800         } else {
 801                 /* We are in dangerous area. Try to reduce cache really
 802                  * aggressively.
 803                  */
 804                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 805                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 806         }
 807
 808         if (now - last_gc >= ip_rt_gc_min_interval)
 809                 last_gc = now;
 810
 811         if (goal <= 0) {
 812                 equilibrium += goal;
 813                 goto work_done;
 814         }
 815
 816         do {
 817                 int i, k;
 818
 819                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 820                         unsigned long tmo = expire;
 821
 822                         k = (k + 1) & rt_hash_mask;
 823                         rthp = &rt_hash_table[k].chain;
 824                         spin_lock_bh(rt_hash_lock_addr(k));
 825                         while ((rth = *rthp) != NULL) {
 826                                 if (!rt_may_expire(rth, tmo, expire)) {
 827                                         tmo >>= 1;
 828                                         rthp = &rth->u.rt_next;
 829                                         continue;
 830                                 }
 831 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 832                                 /* remove all related balanced entries
 833                                  * if necessary
 834                                  */
 835                                 if (rth->u.dst.flags & DST_BALANCED) {
 836                                         int r;
 837
 838                                         rthp = rt_remove_balanced_route(
 839                                                 &rt_hash_table[k].chain,
 840                                                 rth,
 841                                                 &r);
 842                                         goal -= r;
 843                                         if (!rthp)
 844                                                 break;
 845                                 } else {
 846                                         *rthp = rth->u.rt_next;
 847                                         rt_free(rth);
 848                                         goal--;
 849                                 }
 850 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 851                                 *rthp = rth->u.rt_next;
 852                                 rt_free(rth);
 853                                 goal--;
 854 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 855                         }
 856                         spin_unlock_bh(rt_hash_lock_addr(k));
 857                         if (goal <= 0)
 858                                 break;
 859                 }
 860                 rover = k;
 861
 862                 if (goal <= 0)
 863                         goto work_done;
 864
 865                 /* Goal is not achieved. We stop process if:
 866
 867                    - if expire reduced to zero. Otherwise, expire is halfed.
 868                    - if table is not full.
 869                    - if we are called from interrupt.
 870                    - jiffies check is just fallback/debug loop breaker.
 871                      We will not spin here for long time in any case.
 872                  */
 873
 874                 RT_CACHE_STAT_INC(gc_goal_miss);
 875
 876                 if (expire == 0)
 877                         break;
 878
 879                 expire >>= 1;
 880 #if RT_CACHE_DEBUG >= 2
 881                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 882                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 883 #endif
 884
 885                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 886                         goto out;
 887         } while (!in_softirq() && time_before_eq(jiffies, now));
 888
 889         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 890                 goto out;
 891         if (net_ratelimit())
 892                 printk(KERN_WARNING "dst cache overflow\n");
 893         RT_CACHE_STAT_INC(gc_dst_overflow);
 894         return 1;
 895
 896 work_done:
 897         expire += ip_rt_gc_min_interval;
 898         if (expire > ip_rt_gc_timeout ||
 899             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 900                 expire = ip_rt_gc_timeout;
 901 #if RT_CACHE_DEBUG >= 2
 902         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 903                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 904 #endif
 905 out:    return 0;
 906 }
 907
 908 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 909 {
 910         struct rtable   *rth, **rthp;
 911         unsigned long   now;
 912         struct rtable *cand, **candp;
 913         u32             min_score;
 914         int             chain_length;
 915         int attempts = !in_softirq();
 916
 917 restart:
 918         chain_length = 0;
 919         min_score = ~(u32)0;
 920         cand = NULL;
 921         candp = NULL;
 922         now = jiffies;
 923
 924         rthp = &rt_hash_table[hash].chain;
 925
 926         spin_lock_bh(rt_hash_lock_addr(hash));
 927         while ((rth = *rthp) != NULL) {
 928 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 929                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 930                     compare_keys(&rth->fl, &rt->fl)) {
 931 #else
 932                 if (compare_keys(&rth->fl, &rt->fl)) {
 933 #endif
 934                         /* Put it first */
 935                         *rthp = rth->u.rt_next;
 936                         /*
 937                          * Since lookup is lockfree, the deletion
 938                          * must be visible to another weakly ordered CPU before
 939                          * the insertion at the start of the hash chain.
 940                          */
 941                         rcu_assign_pointer(rth->u.rt_next,
 942                                            rt_hash_table[hash].chain);
 943                         /*
 944                          * Since lookup is lockfree, the update writes
 945                          * must be ordered for consistency on SMP.
 946                          */
 947                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 948
 949                         rth->u.dst.__use++;
 950                         dst_hold(&rth->u.dst);
 951                         rth->u.dst.lastuse = now;
 952                         spin_unlock_bh(rt_hash_lock_addr(hash));
 953
 954                         rt_drop(rt);
 955                         *rp = rth;
 956                         return 0;
 957                 }
 958
 959                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 960                         u32 score = rt_score(rth);
 961
 962                         if (score <= min_score) {
 963                                 cand = rth;
 964                                 candp = rthp;
 965                                 min_score = score;
 966                         }
 967                 }
 968
 969                 chain_length++;
 970
 971                 rthp = &rth->u.rt_next;
 972         }
 973
 974         if (cand) {
 975                 /* ip_rt_gc_elasticity used to be average length of chain
 976                  * length, when exceeded gc becomes really aggressive.
 977                  *
 978                  * The second limit is less certain. At the moment it allows
 979                  * only 2 entries per bucket. We will see.
 980                  */
 981                 if (chain_length > ip_rt_gc_elasticity) {
 982                         *candp = cand->u.rt_next;
 983                         rt_free(cand);
 984                 }
 985         }
 986
 987         /* Try to bind route to arp only if it is output
 988            route or unicast forwarding path.
 989          */
 990         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 991                 int err = arp_bind_neighbour(&rt->u.dst);
 992                 if (err) {
 993                         spin_unlock_bh(rt_hash_lock_addr(hash));
 994
 995                         if (err != -ENOBUFS) {
 996                                 rt_drop(rt);
 997                                 return err;
 998                         }
 999
1000                         /* Neighbour tables are full and nothing
1001                            can be released. Try to shrink route cache,
1002                            it is most likely it holds some neighbour records.
1003                          */
1004                         if (attempts-- > 0) {
1005                                 int saved_elasticity = ip_rt_gc_elasticity;
1006                                 int saved_int = ip_rt_gc_min_interval;
1007                                 ip_rt_gc_elasticity     = 1;
1008                                 ip_rt_gc_min_interval   = 0;
1009                                 rt_garbage_collect();
1010                                 ip_rt_gc_min_interval   = saved_int;
1011                                 ip_rt_gc_elasticity     = saved_elasticity;
1012                                 goto restart;
1013                         }
1014
1015                         if (net_ratelimit())
1016                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1017                         rt_drop(rt);
1018                         return -ENOBUFS;
1019                 }
1020         }
1021
1022         rt->u.rt_next = rt_hash_table[hash].chain;
1023 #if RT_CACHE_DEBUG >= 2
1024         if (rt->u.rt_next) {
1025                 struct rtable *trt;
1026                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1027                        NIPQUAD(rt->rt_dst));
1028                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1029                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1030                 printk("\n");
1031         }
1032 #endif
1033         rt_hash_table[hash].chain = rt;
1034         spin_unlock_bh(rt_hash_lock_addr(hash));
1035         *rp = rt;
1036         return 0;
1037 }
1038
1039 void rt_bind_peer(struct rtable *rt, int create)
1040 {
1041         static DEFINE_SPINLOCK(rt_peer_lock);
1042         struct inet_peer *peer;
1043
1044         peer = inet_getpeer(rt->rt_dst, create);
1045
1046         spin_lock_bh(&rt_peer_lock);
1047         if (rt->peer == NULL) {
1048                 rt->peer = peer;
1049                 peer = NULL;
1050         }
1051         spin_unlock_bh(&rt_peer_lock);
1052         if (peer)
1053                 inet_putpeer(peer);
1054 }
1055
1056 /*
1057  * Peer allocation may fail only in serious out-of-memory conditions.  However
1058  * we still can generate some output.
1059  * Random ID selection looks a bit dangerous because we have no chances to
1060  * select ID being unique in a reasonable period of time.
1061  * But broken packet identifier may be better than no packet at all.
1062  */
1063 static void ip_select_fb_ident(struct iphdr *iph)
1064 {
1065         static DEFINE_SPINLOCK(ip_fb_id_lock);
1066         static u32 ip_fallback_id;
1067         u32 salt;
1068
1069         spin_lock_bh(&ip_fb_id_lock);
1070         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1071         iph->id = htons(salt & 0xFFFF);
1072         ip_fallback_id = salt;
1073         spin_unlock_bh(&ip_fb_id_lock);
1074 }
1075
1076 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1077 {
1078         struct rtable *rt = (struct rtable *) dst;
1079
1080         if (rt) {
1081                 if (rt->peer == NULL)
1082                         rt_bind_peer(rt, 1);
1083
1084                 /* If peer is attached to destination, it is never detached,
1085                    so that we need not to grab a lock to dereference it.
1086                  */
1087                 if (rt->peer) {
1088                         iph->id = htons(inet_getid(rt->peer, more));
1089                         return;
1090                 }
1091         } else
1092                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1093                        __builtin_return_address(0));
1094
1095         ip_select_fb_ident(iph);
1096 }
1097
1098 static void rt_del(unsigned hash, struct rtable *rt)
1099 {
1100         struct rtable **rthp;
1101
1102         spin_lock_bh(rt_hash_lock_addr(hash));
1103         ip_rt_put(rt);
1104         for (rthp = &rt_hash_table[hash].chain; *rthp;
1105              rthp = &(*rthp)->u.rt_next)
1106                 if (*rthp == rt) {
1107                         *rthp = rt->u.rt_next;
1108                         rt_free(rt);
1109                         break;
1110                 }
1111         spin_unlock_bh(rt_hash_lock_addr(hash));
1112 }
1113
1114 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1115                     u32 saddr, struct net_device *dev)
1116 {
1117         int i, k;
1118         struct in_device *in_dev = in_dev_get(dev);
1119         struct rtable *rth, **rthp;
1120         u32  skeys[2] = { saddr, 0 };
1121         int  ikeys[2] = { dev->ifindex, 0 };
1122
1123         if (!in_dev)
1124                 return;
1125
1126         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1127             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1128                 goto reject_redirect;
1129
1130         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1131                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1132                         goto reject_redirect;
1133                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1134                         goto reject_redirect;
1135         } else {
1136                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1137                         goto reject_redirect;
1138         }
1139
1140         for (i = 0; i < 2; i++) {
1141                 for (k = 0; k < 2; k++) {
1142                         unsigned hash = rt_hash_code(daddr,
1143                                                      skeys[i] ^ (ikeys[k] << 5));
1144
1145                         rthp=&rt_hash_table[hash].chain;
1146
1147                         rcu_read_lock();
1148                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1149                                 struct rtable *rt;
1150
1151                                 if (rth->fl.fl4_dst != daddr ||
1152                                     rth->fl.fl4_src != skeys[i] ||
1153                                     rth->fl.oif != ikeys[k] ||
1154                                     rth->fl.iif != 0) {
1155                                         rthp = &rth->u.rt_next;
1156                                         continue;
1157                                 }
1158
1159                                 if (rth->rt_dst != daddr ||
1160                                     rth->rt_src != saddr ||
1161                                     rth->u.dst.error ||
1162                                     rth->rt_gateway != old_gw ||
1163                                     rth->u.dst.dev != dev)
1164                                         break;
1165
1166                                 dst_hold(&rth->u.dst);
1167                                 rcu_read_unlock();
1168
1169                                 rt = dst_alloc(&ipv4_dst_ops);
1170                                 if (rt == NULL) {
1171                                         ip_rt_put(rth);
1172                                         in_dev_put(in_dev);
1173                                         return;
1174                                 }
1175
1176                                 /* Copy all the information. */
1177                                 *rt = *rth;
1178                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1179                                 rt->u.dst.__use         = 1;
1180                                 atomic_set(&rt->u.dst.__refcnt, 1);
1181                                 rt->u.dst.child         = NULL;
1182                                 if (rt->u.dst.dev)
1183                                         dev_hold(rt->u.dst.dev);
1184                                 if (rt->idev)
1185                                         in_dev_hold(rt->idev);
1186                                 rt->u.dst.obsolete      = 0;
1187                                 rt->u.dst.lastuse       = jiffies;
1188                                 rt->u.dst.path          = &rt->u.dst;
1189                                 rt->u.dst.neighbour     = NULL;
1190                                 rt->u.dst.hh            = NULL;
1191                                 rt->u.dst.xfrm          = NULL;
1192
1193                                 rt->rt_flags            |= RTCF_REDIRECTED;
1194
1195                                 /* Gateway is different ... */
1196                                 rt->rt_gateway          = new_gw;
1197
1198                                 /* Redirect received -> path was valid */
1199                                 dst_confirm(&rth->u.dst);
1200
1201                                 if (rt->peer)
1202                                         atomic_inc(&rt->peer->refcnt);
1203
1204                                 if (arp_bind_neighbour(&rt->u.dst) ||
1205                                     !(rt->u.dst.neighbour->nud_state &
1206                                             NUD_VALID)) {
1207                                         if (rt->u.dst.neighbour)
1208                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1209                                         ip_rt_put(rth);
1210                                         rt_drop(rt);
1211                                         goto do_next;
1212                                 }
1213
1214                                 rt_del(hash, rth);
1215                                 if (!rt_intern_hash(hash, rt, &rt))
1216                                         ip_rt_put(rt);
1217                                 goto do_next;
1218                         }
1219                         rcu_read_unlock();
1220                 do_next:
1221                         ;
1222                 }
1223         }
1224         in_dev_put(in_dev);
1225         return;
1226
1227 reject_redirect:
1228 #ifdef CONFIG_IP_ROUTE_VERBOSE
1229         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1230                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1231                         "%u.%u.%u.%u ignored.\n"
1232                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1233                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1234                        NIPQUAD(saddr), NIPQUAD(daddr));
1235 #endif
1236         in_dev_put(in_dev);
1237 }
1238
1239 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1240 {
1241         struct rtable *rt = (struct rtable*)dst;
1242         struct dst_entry *ret = dst;
1243
1244         if (rt) {
1245                 if (dst->obsolete) {
1246                         ip_rt_put(rt);
1247                         ret = NULL;
1248                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1249                            rt->u.dst.expires) {
1250                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1251                                                      rt->fl.fl4_src ^
1252                                                         (rt->fl.oif << 5));
1253 #if RT_CACHE_DEBUG >= 1
1254                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1255                                           "%u.%u.%u.%u/%02x dropped\n",
1256                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1257 #endif
1258                         rt_del(hash, rt);
1259                         ret = NULL;
1260                 }
1261         }
1262         return ret;
1263 }
1264
1265 /*
1266  * Algorithm:
1267  *      1. The first ip_rt_redirect_number redirects are sent
1268  *         with exponential backoff, then we stop sending them at all,
1269  *         assuming that the host ignores our redirects.
1270  *      2. If we did not see packets requiring redirects
1271  *         during ip_rt_redirect_silence, we assume that the host
1272  *         forgot redirected route and start to send redirects again.
1273  *
1274  * This algorithm is much cheaper and more intelligent than dumb load limiting
1275  * in icmp.c.
1276  *
1277  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1278  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1279  */
1280
1281 void ip_rt_send_redirect(struct sk_buff *skb)
1282 {
1283         struct rtable *rt = (struct rtable*)skb->dst;
1284         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1285
1286         if (!in_dev)
1287                 return;
1288
1289         if (!IN_DEV_TX_REDIRECTS(in_dev))
1290                 goto out;
1291
1292         /* No redirected packets during ip_rt_redirect_silence;
1293          * reset the algorithm.
1294          */
1295         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1296                 rt->u.dst.rate_tokens = 0;
1297
1298         /* Too many ignored redirects; do not send anything
1299          * set u.dst.rate_last to the last seen redirected packet.
1300          */
1301         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1302                 rt->u.dst.rate_last = jiffies;
1303                 goto out;
1304         }
1305
1306         /* Check for load limit; set rate_last to the latest sent
1307          * redirect.
1308          */
1309         if (time_after(jiffies,
1310                        (rt->u.dst.rate_last +
1311                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1312                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1313                 rt->u.dst.rate_last = jiffies;
1314                 ++rt->u.dst.rate_tokens;
1315 #ifdef CONFIG_IP_ROUTE_VERBOSE
1316                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1317                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1318                     net_ratelimit())
1319                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1320                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1321                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1322                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1323 #endif
1324         }
1325 out:
1326         in_dev_put(in_dev);
1327 }
1328
1329 static int ip_error(struct sk_buff *skb)
1330 {
1331         struct rtable *rt = (struct rtable*)skb->dst;
1332         unsigned long now;
1333         int code;
1334
1335         switch (rt->u.dst.error) {
1336                 case EINVAL:
1337                 default:
1338                         goto out;
1339                 case EHOSTUNREACH:
1340                         code = ICMP_HOST_UNREACH;
1341                         break;
1342                 case ENETUNREACH:
1343                         code = ICMP_NET_UNREACH;
1344                         break;
1345                 case EACCES:
1346                         code = ICMP_PKT_FILTERED;
1347                         break;
1348         }
1349
1350         now = jiffies;
1351         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1352         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1353                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1354         rt->u.dst.rate_last = now;
1355         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1356                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1357                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1358         }
1359
1360 out:    kfree_skb(skb);
1361         return 0;
1362 }
1363
1364 /*
1365  *      The last two values are not from the RFC but
1366  *      are needed for AMPRnet AX.25 paths.
1367  */
1368
1369 static const unsigned short mtu_plateau[] =
1370 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1371
1372 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1373 {
1374         int i;
1375
1376         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1377                 if (old_mtu > mtu_plateau[i])
1378                         return mtu_plateau[i];
1379         return 68;
1380 }
1381
1382 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1383 {
1384         int i;
1385         unsigned short old_mtu = ntohs(iph->tot_len);
1386         struct rtable *rth;
1387         u32  skeys[2] = { iph->saddr, 0, };
1388         u32  daddr = iph->daddr;
1389         unsigned short est_mtu = 0;
1390
1391         if (ipv4_config.no_pmtu_disc)
1392                 return 0;
1393
1394         for (i = 0; i < 2; i++) {
1395                 unsigned hash = rt_hash_code(daddr, skeys[i]);
1396
1397                 rcu_read_lock();
1398                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1399                      rth = rcu_dereference(rth->u.rt_next)) {
1400                         if (rth->fl.fl4_dst == daddr &&
1401                             rth->fl.fl4_src == skeys[i] &&
1402                             rth->rt_dst  == daddr &&
1403                             rth->rt_src  == iph->saddr &&
1404                             rth->fl.iif == 0 &&
1405                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1406                                 unsigned short mtu = new_mtu;
1407
1408                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1409
1410                                         /* BSD 4.2 compatibility hack :-( */
1411                                         if (mtu == 0 &&
1412                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1413                                             old_mtu >= 68 + (iph->ihl << 2))
1414                                                 old_mtu -= iph->ihl << 2;
1415
1416                                         mtu = guess_mtu(old_mtu);
1417                                 }
1418                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1419                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1420                                                 dst_confirm(&rth->u.dst);
1421                                                 if (mtu < ip_rt_min_pmtu) {
1422                                                         mtu = ip_rt_min_pmtu;
1423                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1424                                                                 (1 << RTAX_MTU);
1425                                                 }
1426                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1427                                                 dst_set_expires(&rth->u.dst,
1428                                                         ip_rt_mtu_expires);
1429                                         }
1430                                         est_mtu = mtu;
1431                                 }
1432                         }
1433                 }
1434                 rcu_read_unlock();
1435         }
1436         return est_mtu ? : new_mtu;
1437 }
1438
1439 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1440 {
1441         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1442             !(dst_metric_locked(dst, RTAX_MTU))) {
1443                 if (mtu < ip_rt_min_pmtu) {
1444                         mtu = ip_rt_min_pmtu;
1445                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1446                 }
1447                 dst->metrics[RTAX_MTU-1] = mtu;
1448                 dst_set_expires(dst, ip_rt_mtu_expires);
1449         }
1450 }
1451
1452 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1453 {
1454         return NULL;
1455 }
1456
1457 static void ipv4_dst_destroy(struct dst_entry *dst)
1458 {
1459         struct rtable *rt = (struct rtable *) dst;
1460         struct inet_peer *peer = rt->peer;
1461         struct in_device *idev = rt->idev;
1462
1463         if (peer) {
1464                 rt->peer = NULL;
1465                 inet_putpeer(peer);
1466         }
1467
1468         if (idev) {
1469                 rt->idev = NULL;
1470                 in_dev_put(idev);
1471         }
1472 }
1473
1474 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1475                             int how)
1476 {
1477         struct rtable *rt = (struct rtable *) dst;
1478         struct in_device *idev = rt->idev;
1479         if (dev != &loopback_dev && idev && idev->dev == dev) {
1480                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1481                 if (loopback_idev) {
1482                         rt->idev = loopback_idev;
1483                         in_dev_put(idev);
1484                 }
1485         }
1486 }
1487
1488 static void ipv4_link_failure(struct sk_buff *skb)
1489 {
1490         struct rtable *rt;
1491
1492         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1493
1494         rt = (struct rtable *) skb->dst;
1495         if (rt)
1496                 dst_set_expires(&rt->u.dst, 0);
1497 }
1498
1499 static int ip_rt_bug(struct sk_buff *skb)
1500 {
1501         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1502                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1503                 skb->dev ? skb->dev->name : "?");
1504         kfree_skb(skb);
1505         return 0;
1506 }
1507
1508 /*
1509    We do not cache source address of outgoing interface,
1510    because it is used only by IP RR, TS and SRR options,
1511    so that it out of fast path.
1512
1513    BTW remember: "addr" is allowed to be not aligned
1514    in IP options!
1515  */
1516
1517 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1518 {
1519         u32 src;
1520         struct fib_result res;
1521
1522         if (rt->fl.iif == 0)
1523                 src = rt->rt_src;
1524         else if (fib_lookup(&rt->fl, &res) == 0) {
1525                 src = FIB_RES_PREFSRC(res);
1526                 fib_res_put(&res);
1527         } else
1528                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1529                                         RT_SCOPE_UNIVERSE);
1530         memcpy(addr, &src, 4);
1531 }
1532
1533 #ifdef CONFIG_NET_CLS_ROUTE
1534 static void set_class_tag(struct rtable *rt, u32 tag)
1535 {
1536         if (!(rt->u.dst.tclassid & 0xFFFF))
1537                 rt->u.dst.tclassid |= tag & 0xFFFF;
1538         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1539                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1540 }
1541 #endif
1542
1543 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1544 {
1545         struct fib_info *fi = res->fi;
1546
1547         if (fi) {
1548                 if (FIB_RES_GW(*res) &&
1549                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1550                         rt->rt_gateway = FIB_RES_GW(*res);
1551                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1552                        sizeof(rt->u.dst.metrics));
1553                 if (fi->fib_mtu == 0) {
1554                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1555                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1556                             rt->rt_gateway != rt->rt_dst &&
1557                             rt->u.dst.dev->mtu > 576)
1558                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1559                 }
1560 #ifdef CONFIG_NET_CLS_ROUTE
1561                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1562 #endif
1563         } else
1564                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1565
1566         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1567                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1568         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1569                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1570         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1571                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1572                                        ip_rt_min_advmss);
1573         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1574                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1575
1576 #ifdef CONFIG_NET_CLS_ROUTE
1577 #ifdef CONFIG_IP_MULTIPLE_TABLES
1578         set_class_tag(rt, fib_rules_tclass(res));
1579 #endif
1580         set_class_tag(rt, itag);
1581 #endif
1582         rt->rt_type = res->type;
1583 }
1584
1585 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1586                                 u8 tos, struct net_device *dev, int our)
1587 {
1588         unsigned hash;
1589         struct rtable *rth;
1590         u32 spec_dst;
1591         struct in_device *in_dev = in_dev_get(dev);
1592         u32 itag = 0;
1593
1594         /* Primary sanity checks. */
1595
1596         if (in_dev == NULL)
1597                 return -EINVAL;
1598
1599         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1600             skb->protocol != htons(ETH_P_IP))
1601                 goto e_inval;
1602
1603         if (ZERONET(saddr)) {
1604                 if (!LOCAL_MCAST(daddr))
1605                         goto e_inval;
1606                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1607         } else if (fib_validate_source(saddr, 0, tos, 0,
1608                                         dev, &spec_dst, &itag) < 0)
1609                 goto e_inval;
1610
1611         rth = dst_alloc(&ipv4_dst_ops);
1612         if (!rth)
1613                 goto e_nobufs;
1614
1615         rth->u.dst.output= ip_rt_bug;
1616
1617         atomic_set(&rth->u.dst.__refcnt, 1);
1618         rth->u.dst.flags= DST_HOST;
1619         if (in_dev->cnf.no_policy)
1620                 rth->u.dst.flags |= DST_NOPOLICY;
1621         rth->fl.fl4_dst = daddr;
1622         rth->rt_dst     = daddr;
1623         rth->fl.fl4_tos = tos;
1624 #ifdef CONFIG_IP_ROUTE_FWMARK
1625         rth->fl.fl4_fwmark= skb->nfmark;
1626 #endif
1627         rth->fl.fl4_src = saddr;
1628         rth->rt_src     = saddr;
1629 #ifdef CONFIG_NET_CLS_ROUTE
1630         rth->u.dst.tclassid = itag;
1631 #endif
1632         rth->rt_iif     =
1633         rth->fl.iif     = dev->ifindex;
1634         rth->u.dst.dev  = &loopback_dev;
1635         dev_hold(rth->u.dst.dev);
1636         rth->idev       = in_dev_get(rth->u.dst.dev);
1637         rth->fl.oif     = 0;
1638         rth->rt_gateway = daddr;
1639         rth->rt_spec_dst= spec_dst;
1640         rth->rt_type    = RTN_MULTICAST;
1641         rth->rt_flags   = RTCF_MULTICAST;
1642         if (our) {
1643                 rth->u.dst.input= ip_local_deliver;
1644                 rth->rt_flags |= RTCF_LOCAL;
1645         }
1646
1647 #ifdef CONFIG_IP_MROUTE
1648         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1649                 rth->u.dst.input = ip_mr_input;
1650 #endif
1651         RT_CACHE_STAT_INC(in_slow_mc);
1652
1653         in_dev_put(in_dev);
1654         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1655         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1656
1657 e_nobufs:
1658         in_dev_put(in_dev);
1659         return -ENOBUFS;
1660
1661 e_inval:
1662         in_dev_put(in_dev);
1663         return -EINVAL;
1664 }
1665
1666
1667 static void ip_handle_martian_source(struct net_device *dev,
1668                                      struct in_device *in_dev,
1669                                      struct sk_buff *skb,
1670                                      u32 daddr,
1671                                      u32 saddr)
1672 {
1673         RT_CACHE_STAT_INC(in_martian_src);
1674 #ifdef CONFIG_IP_ROUTE_VERBOSE
1675         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1676                 /*
1677                  *      RFC1812 recommendation, if source is martian,
1678                  *      the only hint is MAC header.
1679                  */
1680                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1681                         "%u.%u.%u.%u, on dev %s\n",
1682                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1683                 if (dev->hard_header_len && skb->mac.raw) {
1684                         int i;
1685                         unsigned char *p = skb->mac.raw;
1686                         printk(KERN_WARNING "ll header: ");
1687                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1688                                 printk("%02x", *p);
1689                                 if (i < (dev->hard_header_len - 1))
1690                                         printk(":");
1691                         }
1692                         printk("\n");
1693                 }
1694         }
1695 #endif
1696 }
1697
1698 static inline int __mkroute_input(struct sk_buff *skb,
1699                                   struct fib_result* res,
1700                                   struct in_device *in_dev,
1701                                   u32 daddr, u32 saddr, u32 tos,
1702                                   struct rtable **result)
1703 {
1704
1705         struct rtable *rth;
1706         int err;
1707         struct in_device *out_dev;
1708         unsigned flags = 0;
1709         u32 spec_dst, itag;
1710
1711         /* get a working reference to the output device */
1712         out_dev = in_dev_get(FIB_RES_DEV(*res));
1713         if (out_dev == NULL) {
1714                 if (net_ratelimit())
1715                         printk(KERN_CRIT "Bug in ip_route_input" \
1716                                "_slow(). Please, report\n");
1717                 return -EINVAL;
1718         }
1719
1720
1721         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1722                                   in_dev->dev, &spec_dst, &itag);
1723         if (err < 0) {
1724                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1725                                          saddr);
1726
1727                 err = -EINVAL;
1728                 goto cleanup;
1729         }
1730
1731         if (err)
1732                 flags |= RTCF_DIRECTSRC;
1733
1734         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1735             (IN_DEV_SHARED_MEDIA(out_dev) ||
1736              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1737                 flags |= RTCF_DOREDIRECT;
1738
1739         if (skb->protocol != htons(ETH_P_IP)) {
1740                 /* Not IP (i.e. ARP). Do not create route, if it is
1741                  * invalid for proxy arp. DNAT routes are always valid.
1742                  */
1743                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1744                         err = -EINVAL;
1745                         goto cleanup;
1746                 }
1747         }
1748
1749
1750         rth = dst_alloc(&ipv4_dst_ops);
1751         if (!rth) {
1752                 err = -ENOBUFS;
1753                 goto cleanup;
1754         }
1755
1756         atomic_set(&rth->u.dst.__refcnt, 1);
1757         rth->u.dst.flags= DST_HOST;
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1759         if (res->fi->fib_nhs > 1)
1760                 rth->u.dst.flags |= DST_BALANCED;
1761 #endif
1762         if (in_dev->cnf.no_policy)
1763                 rth->u.dst.flags |= DST_NOPOLICY;
1764         if (in_dev->cnf.no_xfrm)
1765                 rth->u.dst.flags |= DST_NOXFRM;
1766         rth->fl.fl4_dst = daddr;
1767         rth->rt_dst     = daddr;
1768         rth->fl.fl4_tos = tos;
1769 #ifdef CONFIG_IP_ROUTE_FWMARK
1770         rth->fl.fl4_fwmark= skb->nfmark;
1771 #endif
1772         rth->fl.fl4_src = saddr;
1773         rth->rt_src     = saddr;
1774         rth->rt_gateway = daddr;
1775         rth->rt_iif     =
1776                 rth->fl.iif     = in_dev->dev->ifindex;
1777         rth->u.dst.dev  = (out_dev)->dev;
1778         dev_hold(rth->u.dst.dev);
1779         rth->idev       = in_dev_get(rth->u.dst.dev);
1780         rth->fl.oif     = 0;
1781         rth->rt_spec_dst= spec_dst;
1782
1783         rth->u.dst.input = ip_forward;
1784         rth->u.dst.output = ip_output;
1785
1786         rt_set_nexthop(rth, res, itag);
1787
1788         rth->rt_flags = flags;
1789
1790         *result = rth;
1791         err = 0;
1792  cleanup:
1793         /* release the working reference to the output device */
1794         in_dev_put(out_dev);
1795         return err;
1796 }
1797
1798 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1799                                        struct fib_result* res,
1800                                        const struct flowi *fl,
1801                                        struct in_device *in_dev,
1802                                        u32 daddr, u32 saddr, u32 tos)
1803 {
1804         struct rtable* rth = NULL;
1805         int err;
1806         unsigned hash;
1807
1808 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1809         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1810                 fib_select_multipath(fl, res);
1811 #endif
1812
1813         /* create a routing cache entry */
1814         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1815         if (err)
1816                 return err;
1817
1818         /* put it into the cache */
1819         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1820         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1821 }
1822
1823 static inline int ip_mkroute_input(struct sk_buff *skb,
1824                                    struct fib_result* res,
1825                                    const struct flowi *fl,
1826                                    struct in_device *in_dev,
1827                                    u32 daddr, u32 saddr, u32 tos)
1828 {
1829 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1830         struct rtable* rth = NULL, *rtres;
1831         unsigned char hop, hopcount;
1832         int err = -EINVAL;
1833         unsigned int hash;
1834
1835         if (res->fi)
1836                 hopcount = res->fi->fib_nhs;
1837         else
1838                 hopcount = 1;
1839
1840         /* distinguish between multipath and singlepath */
1841         if (hopcount < 2)
1842                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1843                                             saddr, tos);
1844
1845         /* add all alternatives to the routing cache */
1846         for (hop = 0; hop < hopcount; hop++) {
1847                 res->nh_sel = hop;
1848
1849                 /* put reference to previous result */
1850                 if (hop)
1851                         ip_rt_put(rtres);
1852
1853                 /* create a routing cache entry */
1854                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1855                                       &rth);
1856                 if (err)
1857                         return err;
1858
1859                 /* put it into the cache */
1860                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1861                 err = rt_intern_hash(hash, rth, &rtres);
1862                 if (err)
1863                         return err;
1864
1865                 /* forward hop information to multipath impl. */
1866                 multipath_set_nhinfo(rth,
1867                                      FIB_RES_NETWORK(*res),
1868                                      FIB_RES_NETMASK(*res),
1869                                      res->prefixlen,
1870                                      &FIB_RES_NH(*res));
1871         }
1872         skb->dst = &rtres->u.dst;
1873         return err;
1874 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1875         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1876 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1877 }
1878
1879
1880 /*
1881  *      NOTE. We drop all the packets that has local source
1882  *      addresses, because every properly looped back packet
1883  *      must have correct destination already attached by output routine.
1884  *
1885  *      Such approach solves two big problems:
1886  *      1. Not simplex devices are handled properly.
1887  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1888  */
1889
1890 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1891                                u8 tos, struct net_device *dev)
1892 {
1893         struct fib_result res;
1894         struct in_device *in_dev = in_dev_get(dev);
1895         struct flowi fl = { .nl_u = { .ip4_u =
1896                                       { .daddr = daddr,
1897                                         .saddr = saddr,
1898                                         .tos = tos,
1899                                         .scope = RT_SCOPE_UNIVERSE,
1900 #ifdef CONFIG_IP_ROUTE_FWMARK
1901                                         .fwmark = skb->nfmark
1902 #endif
1903                                       } },
1904                             .iif = dev->ifindex };
1905         unsigned        flags = 0;
1906         u32             itag = 0;
1907         struct rtable * rth;
1908         unsigned        hash;
1909         u32             spec_dst;
1910         int             err = -EINVAL;
1911         int             free_res = 0;
1912
1913         /* IP on this device is disabled. */
1914
1915         if (!in_dev)
1916                 goto out;
1917
1918         /* Check for the most weird martians, which can be not detected
1919            by fib_lookup.
1920          */
1921
1922         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1923                 goto martian_source;
1924
1925         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1926                 goto brd_input;
1927
1928         /* Accept zero addresses only to limited broadcast;
1929          * I even do not know to fix it or not. Waiting for complains :-)
1930          */
1931         if (ZERONET(saddr))
1932                 goto martian_source;
1933
1934         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1935                 goto martian_destination;
1936
1937         /*
1938          *      Now we are ready to route packet.
1939          */
1940         if ((err = fib_lookup(&fl, &res)) != 0) {
1941                 if (!IN_DEV_FORWARD(in_dev))
1942                         goto e_hostunreach;
1943                 goto no_route;
1944         }
1945         free_res = 1;
1946
1947         RT_CACHE_STAT_INC(in_slow_tot);
1948
1949         if (res.type == RTN_BROADCAST)
1950                 goto brd_input;
1951
1952         if (res.type == RTN_LOCAL) {
1953                 int result;
1954                 result = fib_validate_source(saddr, daddr, tos,
1955                                              loopback_dev.ifindex,
1956                                              dev, &spec_dst, &itag);
1957                 if (result < 0)
1958                         goto martian_source;
1959                 if (result)
1960                         flags |= RTCF_DIRECTSRC;
1961                 spec_dst = daddr;
1962                 goto local_input;
1963         }
1964
1965         if (!IN_DEV_FORWARD(in_dev))
1966                 goto e_hostunreach;
1967         if (res.type != RTN_UNICAST)
1968                 goto martian_destination;
1969
1970         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971         if (err == -ENOBUFS)
1972                 goto e_nobufs;
1973         if (err == -EINVAL)
1974                 goto e_inval;
1975
1976 done:
1977         in_dev_put(in_dev);
1978         if (free_res)
1979                 fib_res_put(&res);
1980 out:    return err;
1981
1982 brd_input:
1983         if (skb->protocol != htons(ETH_P_IP))
1984                 goto e_inval;
1985
1986         if (ZERONET(saddr))
1987                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1988         else {
1989                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1990                                           &itag);
1991                 if (err < 0)
1992                         goto martian_source;
1993                 if (err)
1994                         flags |= RTCF_DIRECTSRC;
1995         }
1996         flags |= RTCF_BROADCAST;
1997         res.type = RTN_BROADCAST;
1998         RT_CACHE_STAT_INC(in_brd);
1999
2000 local_input:
2001         rth = dst_alloc(&ipv4_dst_ops);
2002         if (!rth)
2003                 goto e_nobufs;
2004
2005         rth->u.dst.output= ip_rt_bug;
2006
2007         atomic_set(&rth->u.dst.__refcnt, 1);
2008         rth->u.dst.flags= DST_HOST;
2009         if (in_dev->cnf.no_policy)
2010                 rth->u.dst.flags |= DST_NOPOLICY;
2011         rth->fl.fl4_dst = daddr;
2012         rth->rt_dst     = daddr;
2013         rth->fl.fl4_tos = tos;
2014 #ifdef CONFIG_IP_ROUTE_FWMARK
2015         rth->fl.fl4_fwmark= skb->nfmark;
2016 #endif
2017         rth->fl.fl4_src = saddr;
2018         rth->rt_src     = saddr;
2019 #ifdef CONFIG_NET_CLS_ROUTE
2020         rth->u.dst.tclassid = itag;
2021 #endif
2022         rth->rt_iif     =
2023         rth->fl.iif     = dev->ifindex;
2024         rth->u.dst.dev  = &loopback_dev;
2025         dev_hold(rth->u.dst.dev);
2026         rth->idev       = in_dev_get(rth->u.dst.dev);
2027         rth->rt_gateway = daddr;
2028         rth->rt_spec_dst= spec_dst;
2029         rth->u.dst.input= ip_local_deliver;
2030         rth->rt_flags   = flags|RTCF_LOCAL;
2031         if (res.type == RTN_UNREACHABLE) {
2032                 rth->u.dst.input= ip_error;
2033                 rth->u.dst.error= -err;
2034                 rth->rt_flags   &= ~RTCF_LOCAL;
2035         }
2036         rth->rt_type    = res.type;
2037         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2038         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2039         goto done;
2040
2041 no_route:
2042         RT_CACHE_STAT_INC(in_no_route);
2043         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2044         res.type = RTN_UNREACHABLE;
2045         goto local_input;
2046
2047         /*
2048          *      Do not cache martian addresses: they should be logged (RFC1812)
2049          */
2050 martian_destination:
2051         RT_CACHE_STAT_INC(in_martian_dst);
2052 #ifdef CONFIG_IP_ROUTE_VERBOSE
2053         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2054                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2055                         "%u.%u.%u.%u, dev %s\n",
2056                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2057 #endif
2058
2059 e_hostunreach:
2060         err = -EHOSTUNREACH;
2061         goto done;
2062
2063 e_inval:
2064         err = -EINVAL;
2065         goto done;
2066
2067 e_nobufs:
2068         err = -ENOBUFS;
2069         goto done;
2070
2071 martian_source:
2072         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2073         goto e_inval;
2074 }
2075
2076 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2077                    u8 tos, struct net_device *dev)
2078 {
2079         struct rtable * rth;
2080         unsigned        hash;
2081         int iif = dev->ifindex;
2082
2083         tos &= IPTOS_RT_MASK;
2084         hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2085
2086         rcu_read_lock();
2087         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088              rth = rcu_dereference(rth->u.rt_next)) {
2089                 if (rth->fl.fl4_dst == daddr &&
2090                     rth->fl.fl4_src == saddr &&
2091                     rth->fl.iif == iif &&
2092                     rth->fl.oif == 0 &&
2093 #ifdef CONFIG_IP_ROUTE_FWMARK
2094                     rth->fl.fl4_fwmark == skb->nfmark &&
2095 #endif
2096                     rth->fl.fl4_tos == tos) {
2097                         rth->u.dst.lastuse = jiffies;
2098                         dst_hold(&rth->u.dst);
2099                         rth->u.dst.__use++;
2100                         RT_CACHE_STAT_INC(in_hit);
2101                         rcu_read_unlock();
2102                         skb->dst = (struct dst_entry*)rth;
2103                         return 0;
2104                 }
2105                 RT_CACHE_STAT_INC(in_hlist_search);
2106         }
2107         rcu_read_unlock();
2108
2109         /* Multicast recognition logic is moved from route cache to here.
2110            The problem was that too many Ethernet cards have broken/missing
2111            hardware multicast filters :-( As result the host on multicasting
2112            network acquires a lot of useless route cache entries, sort of
2113            SDR messages from all the world. Now we try to get rid of them.
2114            Really, provided software IP multicast filter is organized
2115            reasonably (at least, hashed), it does not result in a slowdown
2116            comparing with route cache reject entries.
2117            Note, that multicast routers are not affected, because
2118            route cache entry is created eventually.
2119          */
2120         if (MULTICAST(daddr)) {
2121                 struct in_device *in_dev;
2122
2123                 rcu_read_lock();
2124                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2125                         int our = ip_check_mc(in_dev, daddr, saddr,
2126                                 skb->nh.iph->protocol);
2127                         if (our
2128 #ifdef CONFIG_IP_MROUTE
2129                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2130 #endif
2131                             ) {
2132                                 rcu_read_unlock();
2133                                 return ip_route_input_mc(skb, daddr, saddr,
2134                                                          tos, dev, our);
2135                         }
2136                 }
2137                 rcu_read_unlock();
2138                 return -EINVAL;
2139         }
2140         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2141 }
2142
2143 static inline int __mkroute_output(struct rtable **result,
2144                                    struct fib_result* res,
2145                                    const struct flowi *fl,
2146                                    const struct flowi *oldflp,
2147                                    struct net_device *dev_out,
2148                                    unsigned flags)
2149 {
2150         struct rtable *rth;
2151         struct in_device *in_dev;
2152         u32 tos = RT_FL_TOS(oldflp);
2153         int err = 0;
2154
2155         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2156                 return -EINVAL;
2157
2158         if (fl->fl4_dst == 0xFFFFFFFF)
2159                 res->type = RTN_BROADCAST;
2160         else if (MULTICAST(fl->fl4_dst))
2161                 res->type = RTN_MULTICAST;
2162         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2163                 return -EINVAL;
2164
2165         if (dev_out->flags & IFF_LOOPBACK)
2166                 flags |= RTCF_LOCAL;
2167
2168         /* get work reference to inet device */
2169         in_dev = in_dev_get(dev_out);
2170         if (!in_dev)
2171                 return -EINVAL;
2172
2173         if (res->type == RTN_BROADCAST) {
2174                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2175                 if (res->fi) {
2176                         fib_info_put(res->fi);
2177                         res->fi = NULL;
2178                 }
2179         } else if (res->type == RTN_MULTICAST) {
2180                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2181                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2182                                  oldflp->proto))
2183                         flags &= ~RTCF_LOCAL;
2184                 /* If multicast route do not exist use
2185                    default one, but do not gateway in this case.
2186                    Yes, it is hack.
2187                  */
2188                 if (res->fi && res->prefixlen < 4) {
2189                         fib_info_put(res->fi);
2190                         res->fi = NULL;
2191                 }
2192         }
2193
2194
2195         rth = dst_alloc(&ipv4_dst_ops);
2196         if (!rth) {
2197                 err = -ENOBUFS;
2198                 goto cleanup;
2199         }
2200
2201         atomic_set(&rth->u.dst.__refcnt, 1);
2202         rth->u.dst.flags= DST_HOST;
2203 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2204         if (res->fi) {
2205                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2206                 if (res->fi->fib_nhs > 1)
2207                         rth->u.dst.flags |= DST_BALANCED;
2208         }
2209 #endif
2210         if (in_dev->cnf.no_xfrm)
2211                 rth->u.dst.flags |= DST_NOXFRM;
2212         if (in_dev->cnf.no_policy)
2213                 rth->u.dst.flags |= DST_NOPOLICY;
2214
2215         rth->fl.fl4_dst = oldflp->fl4_dst;
2216         rth->fl.fl4_tos = tos;
2217         rth->fl.fl4_src = oldflp->fl4_src;
2218         rth->fl.oif     = oldflp->oif;
2219 #ifdef CONFIG_IP_ROUTE_FWMARK
2220         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2221 #endif
2222         rth->rt_dst     = fl->fl4_dst;
2223         rth->rt_src     = fl->fl4_src;
2224         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2225         /* get references to the devices that are to be hold by the routing
2226            cache entry */
2227         rth->u.dst.dev  = dev_out;
2228         dev_hold(dev_out);
2229         rth->idev       = in_dev_get(dev_out);
2230         rth->rt_gateway = fl->fl4_dst;
2231         rth->rt_spec_dst= fl->fl4_src;
2232
2233         rth->u.dst.output=ip_output;
2234
2235         RT_CACHE_STAT_INC(out_slow_tot);
2236
2237         if (flags & RTCF_LOCAL) {
2238                 rth->u.dst.input = ip_local_deliver;
2239                 rth->rt_spec_dst = fl->fl4_dst;
2240         }
2241         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2242                 rth->rt_spec_dst = fl->fl4_src;
2243                 if (flags & RTCF_LOCAL &&
2244                     !(dev_out->flags & IFF_LOOPBACK)) {
2245                         rth->u.dst.output = ip_mc_output;
2246                         RT_CACHE_STAT_INC(out_slow_mc);
2247                 }
2248 #ifdef CONFIG_IP_MROUTE
2249                 if (res->type == RTN_MULTICAST) {
2250                         if (IN_DEV_MFORWARD(in_dev) &&
2251                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2252                                 rth->u.dst.input = ip_mr_input;
2253                                 rth->u.dst.output = ip_mc_output;
2254                         }
2255                 }
2256 #endif
2257         }
2258
2259         rt_set_nexthop(rth, res, 0);
2260
2261         rth->rt_flags = flags;
2262
2263         *result = rth;
2264  cleanup:
2265         /* release work reference to inet device */
2266         in_dev_put(in_dev);
2267
2268         return err;
2269 }
2270
2271 static inline int ip_mkroute_output_def(struct rtable **rp,
2272                                         struct fib_result* res,
2273                                         const struct flowi *fl,
2274                                         const struct flowi *oldflp,
2275                                         struct net_device *dev_out,
2276                                         unsigned flags)
2277 {
2278         struct rtable *rth = NULL;
2279         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2280         unsigned hash;
2281         if (err == 0) {
2282                 hash = rt_hash_code(oldflp->fl4_dst,
2283                                     oldflp->fl4_src ^ (oldflp->oif << 5));
2284                 err = rt_intern_hash(hash, rth, rp);
2285         }
2286
2287         return err;
2288 }
2289
2290 static inline int ip_mkroute_output(struct rtable** rp,
2291                                     struct fib_result* res,
2292                                     const struct flowi *fl,
2293                                     const struct flowi *oldflp,
2294                                     struct net_device *dev_out,
2295                                     unsigned flags)
2296 {
2297 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2298         unsigned char hop;
2299         unsigned hash;
2300         int err = -EINVAL;
2301         struct rtable *rth = NULL;
2302
2303         if (res->fi && res->fi->fib_nhs > 1) {
2304                 unsigned char hopcount = res->fi->fib_nhs;
2305
2306                 for (hop = 0; hop < hopcount; hop++) {
2307                         struct net_device *dev2nexthop;
2308
2309                         res->nh_sel = hop;
2310
2311                         /* hold a work reference to the output device */
2312                         dev2nexthop = FIB_RES_DEV(*res);
2313                         dev_hold(dev2nexthop);
2314
2315                         /* put reference to previous result */
2316                         if (hop)
2317                                 ip_rt_put(*rp);
2318
2319                         err = __mkroute_output(&rth, res, fl, oldflp,
2320                                                dev2nexthop, flags);
2321
2322                         if (err != 0)
2323                                 goto cleanup;
2324
2325                         hash = rt_hash_code(oldflp->fl4_dst,
2326                                             oldflp->fl4_src ^
2327                                             (oldflp->oif << 5));
2328                         err = rt_intern_hash(hash, rth, rp);
2329
2330                         /* forward hop information to multipath impl. */
2331                         multipath_set_nhinfo(rth,
2332                                              FIB_RES_NETWORK(*res),
2333                                              FIB_RES_NETMASK(*res),
2334                                              res->prefixlen,
2335                                              &FIB_RES_NH(*res));
2336                 cleanup:
2337                         /* release work reference to output device */
2338                         dev_put(dev2nexthop);
2339
2340                         if (err != 0)
2341                                 return err;
2342                 }
2343                 return err;
2344         } else {
2345                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2346                                              flags);
2347         }
2348 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2349         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2350 #endif
2351 }
2352
2353 /*
2354  * Major route resolver routine.
2355  */
2356
2357 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2358 {
2359         u32 tos = RT_FL_TOS(oldflp);
2360         struct flowi fl = { .nl_u = { .ip4_u =
2361                                       { .daddr = oldflp->fl4_dst,
2362                                         .saddr = oldflp->fl4_src,
2363                                         .tos = tos & IPTOS_RT_MASK,
2364                                         .scope = ((tos & RTO_ONLINK) ?
2365                                                   RT_SCOPE_LINK :
2366                                                   RT_SCOPE_UNIVERSE),
2367 #ifdef CONFIG_IP_ROUTE_FWMARK
2368                                         .fwmark = oldflp->fl4_fwmark
2369 #endif
2370                                       } },
2371                             .iif = loopback_dev.ifindex,
2372                             .oif = oldflp->oif };
2373         struct fib_result res;
2374         unsigned flags = 0;
2375         struct net_device *dev_out = NULL;
2376         int free_res = 0;
2377         int err;
2378
2379
2380         res.fi          = NULL;
2381 #ifdef CONFIG_IP_MULTIPLE_TABLES
2382         res.r           = NULL;
2383 #endif
2384
2385         if (oldflp->fl4_src) {
2386                 err = -EINVAL;
2387                 if (MULTICAST(oldflp->fl4_src) ||
2388                     BADCLASS(oldflp->fl4_src) ||
2389                     ZERONET(oldflp->fl4_src))
2390                         goto out;
2391
2392                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2393                 dev_out = ip_dev_find(oldflp->fl4_src);
2394                 if (dev_out == NULL)
2395                         goto out;
2396
2397                 /* I removed check for oif == dev_out->oif here.
2398                    It was wrong for two reasons:
2399                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2400                       assigned to multiple interfaces.
2401                    2. Moreover, we are allowed to send packets with saddr
2402                       of another iface. --ANK
2403                  */
2404
2405                 if (oldflp->oif == 0
2406                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2407                         /* Special hack: user can direct multicasts
2408                            and limited broadcast via necessary interface
2409                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2410                            This hack is not just for fun, it allows
2411                            vic,vat and friends to work.
2412                            They bind socket to loopback, set ttl to zero
2413                            and expect that it will work.
2414                            From the viewpoint of routing cache they are broken,
2415                            because we are not allowed to build multicast path
2416                            with loopback source addr (look, routing cache
2417                            cannot know, that ttl is zero, so that packet
2418                            will not leave this host and route is valid).
2419                            Luckily, this hack is good workaround.
2420                          */
2421
2422                         fl.oif = dev_out->ifindex;
2423                         goto make_route;
2424                 }
2425                 if (dev_out)
2426                         dev_put(dev_out);
2427                 dev_out = NULL;
2428         }
2429
2430
2431         if (oldflp->oif) {
2432                 dev_out = dev_get_by_index(oldflp->oif);
2433                 err = -ENODEV;
2434                 if (dev_out == NULL)
2435                         goto out;
2436
2437                 /* RACE: Check return value of inet_select_addr instead. */
2438                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2439                         dev_put(dev_out);
2440                         goto out;       /* Wrong error code */
2441                 }
2442
2443                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2444                         if (!fl.fl4_src)
2445                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2446                                                               RT_SCOPE_LINK);
2447                         goto make_route;
2448                 }
2449                 if (!fl.fl4_src) {
2450                         if (MULTICAST(oldflp->fl4_dst))
2451                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2452                                                               fl.fl4_scope);
2453                         else if (!oldflp->fl4_dst)
2454                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2455                                                               RT_SCOPE_HOST);
2456                 }
2457         }
2458
2459         if (!fl.fl4_dst) {
2460                 fl.fl4_dst = fl.fl4_src;
2461                 if (!fl.fl4_dst)
2462                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2463                 if (dev_out)
2464                         dev_put(dev_out);
2465                 dev_out = &loopback_dev;
2466                 dev_hold(dev_out);
2467                 fl.oif = loopback_dev.ifindex;
2468                 res.type = RTN_LOCAL;
2469                 flags |= RTCF_LOCAL;
2470                 goto make_route;
2471         }
2472
2473         if (fib_lookup(&fl, &res)) {
2474                 res.fi = NULL;
2475                 if (oldflp->oif) {
2476                         /* Apparently, routing tables are wrong. Assume,
2477                            that the destination is on link.
2478
2479                            WHY? DW.
2480                            Because we are allowed to send to iface
2481                            even if it has NO routes and NO assigned
2482                            addresses. When oif is specified, routing
2483                            tables are looked up with only one purpose:
2484                            to catch if destination is gatewayed, rather than
2485                            direct. Moreover, if MSG_DONTROUTE is set,
2486                            we send packet, ignoring both routing tables
2487                            and ifaddr state. --ANK
2488
2489
2490                            We could make it even if oif is unknown,
2491                            likely IPv6, but we do not.
2492                          */
2493
2494                         if (fl.fl4_src == 0)
2495                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2496                                                               RT_SCOPE_LINK);
2497                         res.type = RTN_UNICAST;
2498                         goto make_route;
2499                 }
2500                 if (dev_out)
2501                         dev_put(dev_out);
2502                 err = -ENETUNREACH;
2503                 goto out;
2504         }
2505         free_res = 1;
2506
2507         if (res.type == RTN_LOCAL) {
2508                 if (!fl.fl4_src)
2509                         fl.fl4_src = fl.fl4_dst;
2510                 if (dev_out)
2511                         dev_put(dev_out);
2512                 dev_out = &loopback_dev;
2513                 dev_hold(dev_out);
2514                 fl.oif = dev_out->ifindex;
2515                 if (res.fi)
2516                         fib_info_put(res.fi);
2517                 res.fi = NULL;
2518                 flags |= RTCF_LOCAL;
2519                 goto make_route;
2520         }
2521
2522 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2523         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2524                 fib_select_multipath(&fl, &res);
2525         else
2526 #endif
2527         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2528                 fib_select_default(&fl, &res);
2529
2530         if (!fl.fl4_src)
2531                 fl.fl4_src = FIB_RES_PREFSRC(res);
2532
2533         if (dev_out)
2534                 dev_put(dev_out);
2535         dev_out = FIB_RES_DEV(res);
2536         dev_hold(dev_out);
2537         fl.oif = dev_out->ifindex;
2538
2539
2540 make_route:
2541         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2542
2543
2544         if (free_res)
2545                 fib_res_put(&res);
2546         if (dev_out)
2547                 dev_put(dev_out);
2548 out:    return err;
2549 }
2550
2551 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2552 {
2553         unsigned hash;
2554         struct rtable *rth;
2555
2556         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2557
2558         rcu_read_lock_bh();
2559         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2560                 rth = rcu_dereference(rth->u.rt_next)) {
2561                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2562                     rth->fl.fl4_src == flp->fl4_src &&
2563                     rth->fl.iif == 0 &&
2564                     rth->fl.oif == flp->oif &&
2565 #ifdef CONFIG_IP_ROUTE_FWMARK
2566                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2567 #endif
2568                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2569                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2570
2571                         /* check for multipath routes and choose one if
2572                          * necessary
2573                          */
2574                         if (multipath_select_route(flp, rth, rp)) {
2575                                 dst_hold(&(*rp)->u.dst);
2576                                 RT_CACHE_STAT_INC(out_hit);
2577                                 rcu_read_unlock_bh();
2578                                 return 0;
2579                         }
2580
2581                         rth->u.dst.lastuse = jiffies;
2582                         dst_hold(&rth->u.dst);
2583                         rth->u.dst.__use++;
2584                         RT_CACHE_STAT_INC(out_hit);
2585                         rcu_read_unlock_bh();
2586                         *rp = rth;
2587                         return 0;
2588                 }
2589                 RT_CACHE_STAT_INC(out_hlist_search);
2590         }
2591         rcu_read_unlock_bh();
2592
2593         return ip_route_output_slow(rp, flp);
2594 }
2595
2596 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2597
2598 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2599 {
2600         int err;
2601
2602         if ((err = __ip_route_output_key(rp, flp)) != 0)
2603                 return err;
2604
2605         if (flp->proto) {
2606                 if (!flp->fl4_src)
2607                         flp->fl4_src = (*rp)->rt_src;
2608                 if (!flp->fl4_dst)
2609                         flp->fl4_dst = (*rp)->rt_dst;
2610                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2611         }
2612
2613         return 0;
2614 }
2615
2616 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2617
2618 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2619 {
2620         return ip_route_output_flow(rp, flp, NULL, 0);
2621 }
2622
2623 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2624                         int nowait, unsigned int flags)
2625 {
2626         struct rtable *rt = (struct rtable*)skb->dst;
2627         struct rtmsg *r;
2628         struct nlmsghdr  *nlh;
2629         unsigned char    *b = skb->tail;
2630         struct rta_cacheinfo ci;
2631 #ifdef CONFIG_IP_MROUTE
2632         struct rtattr *eptr;
2633 #endif
2634         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2635         r = NLMSG_DATA(nlh);
2636         r->rtm_family    = AF_INET;
2637         r->rtm_dst_len  = 32;
2638         r->rtm_src_len  = 0;
2639         r->rtm_tos      = rt->fl.fl4_tos;
2640         r->rtm_table    = RT_TABLE_MAIN;
2641         r->rtm_type     = rt->rt_type;
2642         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2643         r->rtm_protocol = RTPROT_UNSPEC;
2644         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2645         if (rt->rt_flags & RTCF_NOTIFY)
2646                 r->rtm_flags |= RTM_F_NOTIFY;
2647         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2648         if (rt->fl.fl4_src) {
2649                 r->rtm_src_len = 32;
2650                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2651         }
2652         if (rt->u.dst.dev)
2653                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2654 #ifdef CONFIG_NET_CLS_ROUTE
2655         if (rt->u.dst.tclassid)
2656                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2657 #endif
2658 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2659         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2660                 __u32 alg = rt->rt_multipath_alg;
2661
2662                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2663         }
2664 #endif
2665         if (rt->fl.iif)
2666                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2667         else if (rt->rt_src != rt->fl.fl4_src)
2668                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2669         if (rt->rt_dst != rt->rt_gateway)
2670                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2671         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2672                 goto rtattr_failure;
2673         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2674         ci.rta_used     = rt->u.dst.__use;
2675         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2676         if (rt->u.dst.expires)
2677                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2678         else
2679                 ci.rta_expires = 0;
2680         ci.rta_error    = rt->u.dst.error;
2681         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2682         if (rt->peer) {
2683                 ci.rta_id = rt->peer->ip_id_count;
2684                 if (rt->peer->tcp_ts_stamp) {
2685                         ci.rta_ts = rt->peer->tcp_ts;
2686                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2687                 }
2688         }
2689 #ifdef CONFIG_IP_MROUTE
2690         eptr = (struct rtattr*)skb->tail;
2691 #endif
2692         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2693         if (rt->fl.iif) {
2694 #ifdef CONFIG_IP_MROUTE
2695                 u32 dst = rt->rt_dst;
2696
2697                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2698                     ipv4_devconf.mc_forwarding) {
2699                         int err = ipmr_get_route(skb, r, nowait);
2700                         if (err <= 0) {
2701                                 if (!nowait) {
2702                                         if (err == 0)
2703                                                 return 0;
2704                                         goto nlmsg_failure;
2705                                 } else {
2706                                         if (err == -EMSGSIZE)
2707                                                 goto nlmsg_failure;
2708                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2709                                 }
2710                         }
2711                 } else
2712 #endif
2713                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2714         }
2715
2716         nlh->nlmsg_len = skb->tail - b;
2717         return skb->len;
2718
2719 nlmsg_failure:
2720 rtattr_failure:
2721         skb_trim(skb, b - skb->data);
2722         return -1;
2723 }
2724
2725 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2726 {
2727         struct rtattr **rta = arg;
2728         struct rtmsg *rtm = NLMSG_DATA(nlh);
2729         struct rtable *rt = NULL;
2730         u32 dst = 0;
2731         u32 src = 0;
2732         int iif = 0;
2733         int err = -ENOBUFS;
2734         struct sk_buff *skb;
2735
2736         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2737         if (!skb)
2738                 goto out;
2739
2740         /* Reserve room for dummy headers, this skb can pass
2741            through good chunk of routing engine.
2742          */
2743         skb->mac.raw = skb->nh.raw = skb->data;
2744
2745         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2746         skb->nh.iph->protocol = IPPROTO_ICMP;
2747         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2748
2749         if (rta[RTA_SRC - 1])
2750                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751         if (rta[RTA_DST - 1])
2752                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753         if (rta[RTA_IIF - 1])
2754                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2755
2756         if (iif) {
2757                 struct net_device *dev = __dev_get_by_index(iif);
2758                 err = -ENODEV;
2759                 if (!dev)
2760                         goto out_free;
2761                 skb->protocol   = htons(ETH_P_IP);
2762                 skb->dev        = dev;
2763                 local_bh_disable();
2764                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765                 local_bh_enable();
2766                 rt = (struct rtable*)skb->dst;
2767                 if (!err && rt->u.dst.error)
2768                         err = -rt->u.dst.error;
2769         } else {
2770                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771                                                          .saddr = src,
2772                                                          .tos = rtm->rtm_tos } } };
2773                 int oif = 0;
2774                 if (rta[RTA_OIF - 1])
2775                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776                 fl.oif = oif;
2777                 err = ip_route_output_key(&rt, &fl);
2778         }
2779         if (err)
2780                 goto out_free;
2781
2782         skb->dst = &rt->u.dst;
2783         if (rtm->rtm_flags & RTM_F_NOTIFY)
2784                 rt->rt_flags |= RTCF_NOTIFY;
2785
2786         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2787
2788         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789                                 RTM_NEWROUTE, 0, 0);
2790         if (!err)
2791                 goto out_free;
2792         if (err < 0) {
2793                 err = -EMSGSIZE;
2794                 goto out_free;
2795         }
2796
2797         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798         if (err > 0)
2799                 err = 0;
2800 out:    return err;
2801
2802 out_free:
2803         kfree_skb(skb);
2804         goto out;
2805 }
2806
2807 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2808 {
2809         struct rtable *rt;
2810         int h, s_h;
2811         int idx, s_idx;
2812
2813         s_h = cb->args[0];
2814         s_idx = idx = cb->args[1];
2815         for (h = 0; h <= rt_hash_mask; h++) {
2816                 if (h < s_h) continue;
2817                 if (h > s_h)
2818                         s_idx = 0;
2819                 rcu_read_lock_bh();
2820                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2822                         if (idx < s_idx)
2823                                 continue;
2824                         skb->dst = dst_clone(&rt->u.dst);
2825                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827                                          1, NLM_F_MULTI) <= 0) {
2828                                 dst_release(xchg(&skb->dst, NULL));
2829                                 rcu_read_unlock_bh();
2830                                 goto done;
2831                         }
2832                         dst_release(xchg(&skb->dst, NULL));
2833                 }
2834                 rcu_read_unlock_bh();
2835         }
2836
2837 done:
2838         cb->args[0] = h;
2839         cb->args[1] = idx;
2840         return skb->len;
2841 }
2842
2843 void ip_rt_multicast_event(struct in_device *in_dev)
2844 {
2845         rt_cache_flush(0);
2846 }
2847
2848 #ifdef CONFIG_SYSCTL
2849 static int flush_delay;
2850
2851 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852                                         struct file *filp, void __user *buffer,
2853                                         size_t *lenp, loff_t *ppos)
2854 {
2855         if (write) {
2856                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857                 rt_cache_flush(flush_delay);
2858                 return 0;
2859         }
2860
2861         return -EINVAL;
2862 }
2863
2864 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2865                                                 int __user *name,
2866                                                 int nlen,
2867                                                 void __user *oldval,
2868                                                 size_t __user *oldlenp,
2869                                                 void __user *newval,
2870                                                 size_t newlen,
2871                                                 void **context)
2872 {
2873         int delay;
2874         if (newlen != sizeof(int))
2875                 return -EINVAL;
2876         if (get_user(delay, (int __user *)newval))
2877                 return -EFAULT;
2878         rt_cache_flush(delay);
2879         return 0;
2880 }
2881
2882 ctl_table ipv4_route_table[] = {
2883         {
2884                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2885                 .procname       = "flush",
2886                 .data           = &flush_delay,
2887                 .maxlen         = sizeof(int),
2888                 .mode           = 0200,
2889                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2890                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2891         },
2892         {
2893                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2894                 .procname       = "min_delay",
2895                 .data           = &ip_rt_min_delay,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0644,
2898                 .proc_handler   = &proc_dointvec_jiffies,
2899                 .strategy       = &sysctl_jiffies,
2900         },
2901         {
2902                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2903                 .procname       = "max_delay",
2904                 .data           = &ip_rt_max_delay,
2905                 .maxlen         = sizeof(int),
2906                 .mode           = 0644,
2907                 .proc_handler   = &proc_dointvec_jiffies,
2908                 .strategy       = &sysctl_jiffies,
2909         },
2910         {
2911                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2912                 .procname       = "gc_thresh",
2913                 .data           = &ipv4_dst_ops.gc_thresh,
2914                 .maxlen         = sizeof(int),
2915                 .mode           = 0644,
2916                 .proc_handler   = &proc_dointvec,
2917         },
2918         {
2919                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2920                 .procname       = "max_size",
2921                 .data           = &ip_rt_max_size,
2922                 .maxlen         = sizeof(int),
2923                 .mode           = 0644,
2924                 .proc_handler   = &proc_dointvec,
2925         },
2926         {
2927                 /*  Deprecated. Use gc_min_interval_ms */
2928
2929                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930                 .procname       = "gc_min_interval",
2931                 .data           = &ip_rt_gc_min_interval,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = &proc_dointvec_jiffies,
2935                 .strategy       = &sysctl_jiffies,
2936         },
2937         {
2938                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939                 .procname       = "gc_min_interval_ms",
2940                 .data           = &ip_rt_gc_min_interval,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec_ms_jiffies,
2944                 .strategy       = &sysctl_ms_jiffies,
2945         },
2946         {
2947                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2948                 .procname       = "gc_timeout",
2949                 .data           = &ip_rt_gc_timeout,
2950                 .maxlen         = sizeof(int),
2951                 .mode           = 0644,
2952                 .proc_handler   = &proc_dointvec_jiffies,
2953                 .strategy       = &sysctl_jiffies,
2954         },
2955         {
2956                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2957                 .procname       = "gc_interval",
2958                 .data           = &ip_rt_gc_interval,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = &proc_dointvec_jiffies,
2962                 .strategy       = &sysctl_jiffies,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966                 .procname       = "redirect_load",
2967                 .data           = &ip_rt_redirect_load,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec,
2971         },
2972         {
2973                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974                 .procname       = "redirect_number",
2975                 .data           = &ip_rt_redirect_number,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = &proc_dointvec,
2979         },
2980         {
2981                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982                 .procname       = "redirect_silence",
2983                 .data           = &ip_rt_redirect_silence,
2984                 .maxlen         = sizeof(int),
2985                 .mode           = 0644,
2986                 .proc_handler   = &proc_dointvec,
2987         },
2988         {
2989                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2990                 .procname       = "error_cost",
2991                 .data           = &ip_rt_error_cost,
2992                 .maxlen         = sizeof(int),
2993                 .mode           = 0644,
2994                 .proc_handler   = &proc_dointvec,
2995         },
2996         {
2997                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2998                 .procname       = "error_burst",
2999                 .data           = &ip_rt_error_burst,
3000                 .maxlen         = sizeof(int),
3001                 .mode           = 0644,
3002                 .proc_handler   = &proc_dointvec,
3003         },
3004         {
3005                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3006                 .procname       = "gc_elasticity",
3007                 .data           = &ip_rt_gc_elasticity,
3008                 .maxlen         = sizeof(int),
3009                 .mode           = 0644,
3010                 .proc_handler   = &proc_dointvec,
3011         },
3012         {
3013                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3014                 .procname       = "mtu_expires",
3015                 .data           = &ip_rt_mtu_expires,
3016                 .maxlen         = sizeof(int),
3017                 .mode           = 0644,
3018                 .proc_handler   = &proc_dointvec_jiffies,
3019                 .strategy       = &sysctl_jiffies,
3020         },
3021         {
3022                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3023                 .procname       = "min_pmtu",
3024                 .data           = &ip_rt_min_pmtu,
3025                 .maxlen         = sizeof(int),
3026                 .mode           = 0644,
3027                 .proc_handler   = &proc_dointvec,
3028         },
3029         {
3030                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3031                 .procname       = "min_adv_mss",
3032                 .data           = &ip_rt_min_advmss,
3033                 .maxlen         = sizeof(int),
3034                 .mode           = 0644,
3035                 .proc_handler   = &proc_dointvec,
3036         },
3037         {
3038                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039                 .procname       = "secret_interval",
3040                 .data           = &ip_rt_secret_interval,
3041                 .maxlen         = sizeof(int),
3042                 .mode           = 0644,
3043                 .proc_handler   = &proc_dointvec_jiffies,
3044                 .strategy       = &sysctl_jiffies,
3045         },
3046         { .ctl_name = 0 }
3047 };
3048 #endif
3049
3050 #ifdef CONFIG_NET_CLS_ROUTE
3051 struct ip_rt_acct *ip_rt_acct;
3052
3053 /* This code sucks.  But you should have seen it before! --RR */
3054
3055 /* IP route accounting ptr for this logical cpu number. */
3056 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3057
3058 #ifdef CONFIG_PROC_FS
3059 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060                            int length, int *eof, void *data)
3061 {
3062         unsigned int i;
3063
3064         if ((offset & 3) || (length & 3))
3065                 return -EIO;
3066
3067         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3068                 *eof = 1;
3069                 return 0;
3070         }
3071
3072         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3074                 *eof = 1;
3075         }
3076
3077         offset /= sizeof(u32);
3078
3079         if (length > 0) {
3080                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081                 u32 *dst = (u32 *) buffer;
3082
3083                 /* Copy first cpu. */
3084                 *start = buffer;
3085                 memcpy(dst, src, length);
3086
3087                 /* Add the other cpus in, one int at a time */
3088                 for_each_possible_cpu(i) {
3089                         unsigned int j;
3090
3091                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3092
3093                         for (j = 0; j < length/4; j++)
3094                                 dst[j] += src[j];
3095                 }
3096         }
3097         return length;
3098 }
3099 #endif /* CONFIG_PROC_FS */
3100 #endif /* CONFIG_NET_CLS_ROUTE */
3101
3102 static __initdata unsigned long rhash_entries;
3103 static int __init set_rhash_entries(char *str)
3104 {
3105         if (!str)
3106                 return 0;
3107         rhash_entries = simple_strtoul(str, &str, 0);
3108         return 1;
3109 }
3110 __setup("rhash_entries=", set_rhash_entries);
3111
3112 int __init ip_rt_init(void)
3113 {
3114         int rc = 0;
3115
3116         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117                              (jiffies ^ (jiffies >> 7)));
3118
3119 #ifdef CONFIG_NET_CLS_ROUTE
3120         {
3121         int order;
3122         for (order = 0;
3123              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3124                 /* NOTHING */;
3125         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3126         if (!ip_rt_acct)
3127                 panic("IP: failed to allocate ip_rt_acct\n");
3128         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129         }
3130 #endif
3131
3132         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133                                                      sizeof(struct rtable),
3134                                                      0, SLAB_HWCACHE_ALIGN,
3135                                                      NULL, NULL);
3136
3137         if (!ipv4_dst_ops.kmem_cachep)
3138                 panic("IP: failed to allocate ip_dst_cache\n");
3139
3140         rt_hash_table = (struct rt_hash_bucket *)
3141                 alloc_large_system_hash("IP route cache",
3142                                         sizeof(struct rt_hash_bucket),
3143                                         rhash_entries,
3144                                         (num_physpages >= 128 * 1024) ?
3145                                         15 : 17,
3146                                         HASH_HIGHMEM,
3147                                         &rt_hash_log,
3148                                         &rt_hash_mask,
3149                                         0);
3150         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3151         rt_hash_lock_init();
3152
3153         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3154         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3155
3156         devinet_init();
3157         ip_fib_init();
3158
3159         init_timer(&rt_flush_timer);
3160         rt_flush_timer.function = rt_run_flush;
3161         init_timer(&rt_periodic_timer);
3162         rt_periodic_timer.function = rt_check_expire;
3163         init_timer(&rt_secret_timer);
3164         rt_secret_timer.function = rt_secret_rebuild;
3165
3166         /* All the timers, started at system startup tend
3167            to synchronize. Perturb it a bit.
3168          */
3169         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3170                                         ip_rt_gc_interval;
3171         add_timer(&rt_periodic_timer);
3172
3173         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3174                 ip_rt_secret_interval;
3175         add_timer(&rt_secret_timer);
3176
3177 #ifdef CONFIG_PROC_FS
3178         {
3179         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3180         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3181             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3182                                              proc_net_stat))) {
3183                 return -ENOMEM;
3184         }
3185         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3186         }
3187 #ifdef CONFIG_NET_CLS_ROUTE
3188         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3189 #endif
3190 #endif
3191 #ifdef CONFIG_XFRM
3192         xfrm_init();
3193         xfrm4_init();
3194 #endif
3195         return rc;
3196 }
3197
3198 EXPORT_SYMBOL(__ip_select_ident);
3199 EXPORT_SYMBOL(ip_route_input);
3200 EXPORT_SYMBOL(ip_route_output_key);