git.oblomov.eu Git - linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *
  58  *              This program is free software; you can redistribute it and/or
  59  *              modify it under the terms of the GNU General Public License
  60  *              as published by the Free Software Foundation; either version
  61  *              2 of the License, or (at your option) any later version.
  62  */
  63
  64 #include <linux/config.h>
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/sched.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/rtnetlink.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/protocol.h>
  94 #include <net/ip.h>
  95 #include <net/route.h>
  96 #include <net/inetpeer.h>
  97 #include <net/sock.h>
  98 #include <net/ip_fib.h>
  99 #include <net/arp.h>
 100 #include <net/tcp.h>
 101 #include <net/icmp.h>
 102 #include <net/xfrm.h>
 103 #include <net/ip_mp_alg.h>
 104 #ifdef CONFIG_SYSCTL
 105 #include <linux/sysctl.h>
 106 #endif
 107
     /*
      * RT_FL_TOS - routing-relevant TOS bits of a flow key, as a u32.
      * @oldflp: pointer expression whose target has an fl4_tos member
      *
      * Keeps only the bits selected by (IPTOS_RT_MASK | RTO_ONLINK).  The
      * argument is parenthesized so that any pointer-valued expression, not
      * just a plain identifier, expands with the intended precedence.
      */
 108 #define RT_FL_TOS(oldflp) \
 109     ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 110
     /* NOTE(review): presumably the upper bound applied to route MTUs; it is
      * not referenced in the visible part of the file -- confirm at its users.
      */
 111 #define IP_MAX_MTU      0xFFF0
 112
     /* Default route cache ageing timeout: 300 * HZ jiffies. */
 113 #define RT_GC_TIMEOUT (300*HZ)
 114
     /*
      * Route cache tunables.  Those initialised from HZ are jiffies counts.
      */
     /* Delay substituted by rt_cache_flush() when it is passed a negative one. */
 115 static int ip_rt_min_delay              = 2 * HZ;
     /* Added to "now" to form rt_deadline when a delayed flush is scheduled. */
 116 static int ip_rt_max_delay              = 10 * HZ;
     /* Entry count at/above which GC reports overflow; not initialised here. */
 117 static int ip_rt_max_size;
     /* Age allowance used by rt_check_expire() and ceiling of GC's "expire". */
 118 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
     /* Re-arm period of rt_periodic_timer; also scales the per-run budget. */
 119 static int ip_rt_gc_interval            = 60 * HZ;
     /* Minimum spacing of GC runs, and the step by which "expire" recovers. */
 120 static int ip_rt_gc_min_interval        = HZ / 2;
     /* The redirect/error/MTU/MSS knobs below are not referenced in the
      * visible part of the file.
      */
 121 static int ip_rt_redirect_number        = 9;
 122 static int ip_rt_redirect_load          = HZ / 50;
 123 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 124 static int ip_rt_error_cost             = HZ;
 125 static int ip_rt_error_burst            = 5 * HZ;
     /* Shifted left by rt_hash_log: entry count above which GC turns aggressive. */
 126 static int ip_rt_gc_elasticity          = 8;
 127 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 128 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 129 static int ip_rt_min_advmss             = 256;
     /* Re-arm period of rt_secret_timer (see rt_secret_rebuild()). */
 130 static int ip_rt_secret_interval        = 10 * 60 * HZ;
     /* Jiffies deadline of a pending delayed flush; 0 when none is pending. */
 131 static unsigned long rt_deadline;
 132
     /* printk() wrapper that prefixes KERN_DEBUG. */
 133 #define RTprint(a...)   printk(KERN_DEBUG a)
 134
     /* Armed by rt_cache_flush() for delayed flushes. */
 135 static struct timer_list rt_flush_timer;
     /* Re-armed at the end of every rt_check_expire() run. */
 136 static struct timer_list rt_periodic_timer;
     /* Re-armed by rt_secret_rebuild(). */
 137 static struct timer_list rt_secret_timer;
 138
 139 /*
 140  *      Interface to generic destination cache.
 141  */
 142
 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144 static void              ipv4_dst_destroy(struct dst_entry *dst);
 145 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 146                                          struct net_device *dev, int how);
 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148 static void              ipv4_link_failure(struct sk_buff *skb);
 149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150 static int rt_garbage_collect(void);
 151
 152
     /*
      * IPv4 operations table for the generic destination cache.  Every
      * callback is one of the functions declared just above; .entry_size
      * makes each cache entry the size of a struct rtable, and .gc points at
      * rt_garbage_collect().
      */
 153 static struct dst_ops ipv4_dst_ops = {
 154         .family =               AF_INET,
 155         .protocol =             __constant_htons(ETH_P_IP),
 156         .gc =                   rt_garbage_collect,
 157         .check =                ipv4_dst_check,
 158         .destroy =              ipv4_dst_destroy,
 159         .ifdown =               ipv4_dst_ifdown,
 160         .negative_advice =      ipv4_negative_advice,
 161         .link_failure =         ipv4_link_failure,
 162         .update_pmtu =          ip_rt_update_pmtu,
 163         .entry_size =           sizeof(struct rtable),
 164 };
 165
     /* ECN_OR_COST(class) pastes its argument onto TC_PRIO_, i.e. it names
      * the same constant as writing TC_PRIO_<class> directly.
      */
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
     /*
      * Sixteen-entry table of TC_PRIO_* values.  Even slots name the
      * priority directly; odd slots go through ECN_OR_COST().
      * NOTE(review): presumably indexed by the 4-bit IP TOS value -- confirm
      * against the users of ip_tos2prio, none of which are visible here.
      */
 168 __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straight forward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
             /* Head of this bucket's list of cached routes, linked through
              * rtable.u.rt_next.
              */
 203         struct rtable   *chain;
             /* Held by the code in this file whenever it unlinks entries
              * from the chain.
              */
 204         spinlock_t      lock;
 205 } __attribute__((__aligned__(8)));
 206
     /* Array of hash buckets, indexed by values in [0, rt_hash_mask]. */
 207 static struct rt_hash_bucket    *rt_hash_table;
     /* ANDed onto the hash by rt_hash_code(); also the highest bucket index. */
 208 static unsigned                 rt_hash_mask;
     /* Shift used to scale ip_rt_gc_interval and ip_rt_gc_elasticity with the
      * table.  NOTE(review): presumably log2 of the bucket count -- confirm
      * where the table is allocated.
      */
 209 static int                      rt_hash_log;
     /* Seed fed to jhash_3words(); redrawn on every rt_run_flush(). */
 210 static unsigned int             rt_hash_rnd;
 211
     /* Per-CPU counters; the /proc code reads them through per_cpu_ptr(). */
 212 struct rt_cache_stat *rt_cache_stat;
 213
     /* Forward declaration; the definition appears later in this file. */
 214 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 215                                 struct rtable **res);
 216
     /*
      * Map a (destination, source, TOS) triple to a bucket index.
      *
      * The three values are mixed by jhash_3words() seeded with rt_hash_rnd
      * and the result is reduced to the table range with rt_hash_mask.
      */
 217 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 218 {
             const u32 mixed = jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd);

             return mixed & rt_hash_mask;
 221 }
 222
 223 #ifdef CONFIG_PROC_FS
 224 struct rt_cache_iter_state {
             /* Hash bucket the seq_file walk is currently positioned in; the
              * walk counts down from rt_hash_mask and ends below zero.
              */
 225         int bucket;
 226 };
 227
     /*
      * rt_cache_get_first - position the walk on the first cached route
      * @seq: seq_file whose ->private is a struct rt_cache_iter_state
      *
      * Scans buckets from rt_hash_mask down to 0 and stops at the first
      * non-empty chain, recording the bucket in the iterator state.
      *
      * Returns the head of that chain with rcu_read_lock_bh() still held, or
      * NULL with the read lock released (st->bucket is then -1) when every
      * bucket is empty.
      *
      * The head is fetched with rcu_dereference(): the entry is going to be
      * dereferenced by the caller while writers may concurrently publish a
      * new head, so the load needs the dependency ordering RCU readers
      * require.
      */
 228 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 229 {
 230         struct rtable *r = NULL;
 231         struct rt_cache_iter_state *st = seq->private;
 232
 233         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 234                 rcu_read_lock_bh();
 235                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 236                 if (r)
 237                         break;
 238                 rcu_read_unlock_bh();
 239         }
 240         return r;
 241 }
 242
     /*
      * rt_cache_get_next - advance the walk by one cached route
      * @seq: seq_file whose ->private is a struct rt_cache_iter_state
      * @r:   entry returned by the previous step; rcu_read_lock_bh() is held
      *
      * Follows ->u.rt_next; when the current chain is exhausted the read lock
      * is dropped and re-taken around each move to the next lower bucket.
      *
      * Returns the next entry with the read lock held, or NULL with the lock
      * released once the bucket index has gone below zero.
      *
      * seq->private is ordinary per-open state, so it is read directly; the
      * RCU-published pointers (->u.rt_next and the chain heads) are the ones
      * loaded through rcu_dereference().
      */
 243 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 244 {
 245         struct rt_cache_iter_state *st = seq->private;
 246
 247         r = rcu_dereference(r->u.rt_next);
 248         while (!r) {
 249                 rcu_read_unlock_bh();
 250                 if (--st->bucket < 0)
 251                         break;
 252                 rcu_read_lock_bh();
 253                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 254         }
 255         return r;
 256 }
 257
     /*
      * Return the cached route at zero-based position @pos of the walk, or
      * NULL when the cache holds fewer than @pos + 1 entries.
      *
      * The walk is restarted from the first entry every time.  Locking is
      * whatever rt_cache_get_first()/rt_cache_get_next() leave behind for the
      * value that is returned.
      */
 258 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 259 {
 260         struct rtable *entry = rt_cache_get_first(seq);
 261
             /* One step per remaining position; stop early if entries run out. */
             while (entry && pos) {
                     entry = rt_cache_get_next(seq, entry);
                     if (entry)
                             --pos;
             }
 265         return pos ? NULL : entry;
 266 }
 267
     /*
      * seq_file ->start for the route cache listing.
      *
      * Position 0 yields SEQ_START_TOKEN (the header row); position p > 0
      * yields the entry at index p - 1 of the walk, or NULL past the end.
      */
 268 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 269 {
             if (*pos == 0)
                     return SEQ_START_TOKEN;

             return rt_cache_get_idx(seq, *pos - 1);
 271 }
 272
     /*
      * seq_file ->next for the route cache listing.
      *
      * After the header token the walk starts at the first entry; otherwise
      * it moves on from @v.  *pos is incremented in either case, including
      * when the end of the cache is reached and NULL is returned.
      */
 273 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 274 {
             struct rtable *following;

             following = (v == SEQ_START_TOKEN) ? rt_cache_get_first(seq)
                                                : rt_cache_get_next(seq, v);
             ++*pos;
             return following;
 283 }
 284
     /*
      * seq_file ->stop for the route cache listing.
      *
      * The walk helpers return a real entry with rcu_read_lock_bh() held, so
      * the lock is released here only when @v is such an entry; NULL and the
      * header token carry no lock.
      */
 285 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 286 {
             if (!v || v == SEQ_START_TOKEN)
                     return;

             rcu_read_unlock_bh();
 289 }
 290
     /*
      * seq_file ->show for the route cache listing.
      * @seq: output sequence
      * @v:   SEQ_START_TOKEN for the header row, otherwise a struct rtable *
      *
      * Every row is emitted through "%-127s\n", i.e. left-justified and padded
      * to 127 characters.  Columns of a data row, in order: device name ("*"
      * if none), destination, gateway, flags, reference count, use count, a
      * literal 0 in the Metric column, source, advertised MSS + 40 (0 when the
      * metric is unset) in the MTU column, window, (RTT >> 3) + RTTVAR, TOS,
      * hardware-header reference count (-1 without a header), whether the
      * header's output hook is dev_queue_xmit, and the specific destination.
      *
      * Always returns 0.
      */
 291 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 292 {
 293         if (v == SEQ_START_TOKEN)
 294                 seq_printf(seq, "%-127s\n",
 295                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 296                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 297                            "HHUptod\tSpecDst");
 298         else {
 299                 struct rtable *r = v;
 300                 char temp[256];
 301
                     /* Bounded formatting: the row is built in a fixed-size
                      * stack buffer, so the write is limited to its size
                      * rather than trusting the field widths to add up.
                      */
 302                 snprintf(temp, sizeof(temp),
                              "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 303                          "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 304                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 305                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 306                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 307                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 308                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 309                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 310                         dst_metric(&r->u.dst, RTAX_WINDOW),
 311                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 312                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 313                         r->fl.fl4_tos,
 314                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 315                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 316                                        dev_queue_xmit) : 0,
 317                         r->rt_spec_dst);
 318                 seq_printf(seq, "%-127s\n", temp);
 319         }
 320         return 0;
 321 }
 322
     /* Iterator callbacks for the route cache listing; handed to seq_open()
      * by rt_cache_seq_open().
      */
 323 static struct seq_operations rt_cache_seq_ops = {
 324         .start  = rt_cache_seq_start,
 325         .next   = rt_cache_seq_next,
 326         .stop   = rt_cache_seq_stop,
 327         .show   = rt_cache_seq_show,
 328 };
 329
     /*
      * open() handler for the route cache listing.
      *
      * Allocates the per-open iterator state, opens the seq_file with
      * rt_cache_seq_ops, attaches the state to seq->private and zeroes it.
      *
      * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
      * error from seq_open() (the state is freed again in that case).
      */
 330 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 331 {
             struct rt_cache_iter_state *state;
             struct seq_file *seq;
             int err;

             state = kmalloc(sizeof(*state), GFP_KERNEL);
             if (!state)
                     return -ENOMEM;

             err = seq_open(file, &rt_cache_seq_ops);
             if (err) {
                     kfree(state);
                     return err;
             }

             seq          = file->private_data;
             seq->private = state;
             memset(state, 0, sizeof(*state));
             return 0;
 349 }
 350
     /* File operations for the route cache listing.  Release goes through
      * seq_release_private; rt_cache_seq_open() attaches a kmalloc()ed
      * iterator state to seq->private.
      */
 351 static struct file_operations rt_cache_seq_fops = {
 352         .owner   = THIS_MODULE,
 353         .open    = rt_cache_seq_open,
 354         .read    = seq_read,
 355         .llseek  = seq_lseek,
 356         .release = seq_release_private,
 357 };
 358
 359
     /*
      * seq_file ->start for the per-CPU statistics file.
      *
      * Position 0 is the header token.  Position p > 0 denotes CPU p - 1; the
      * search moves forward to the next possible CPU, rewrites *pos to match
      * the CPU actually chosen and returns that CPU's rt_cache_stat block.
      * NULL is returned once no possible CPU remains below NR_CPUS.
      */
 360 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 361 {
 362         int cpu;
 363
 364         if (*pos == 0)
 365                 return SEQ_START_TOKEN;
 366
             cpu = *pos - 1;
             while (cpu < NR_CPUS && !cpu_possible(cpu))
                     ++cpu;
             if (cpu >= NR_CPUS)
                     return NULL;

             *pos = cpu + 1;
             return per_cpu_ptr(rt_cache_stat, cpu);
 374 }
 375
     /*
      * seq_file ->next for the per-CPU statistics file.
      *
      * On entry *pos is the index of the first CPU not yet shown.  The next
      * possible CPU at or after that index is selected, *pos is set to one
      * past it and its rt_cache_stat block is returned.  When none is left
      * NULL is returned and *pos is left untouched.
      */
 376 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 377 {
 378         int cpu;
 379
             cpu = *pos;
             while (cpu < NR_CPUS && !cpu_possible(cpu))
                     ++cpu;
             if (cpu >= NR_CPUS)
                     return NULL;

             *pos = cpu + 1;
             return per_cpu_ptr(rt_cache_stat, cpu);
 388 }
 389
     /* seq_file ->stop for the per-CPU statistics file.  Intentionally empty:
      * the start/next callbacks of this file take no lock and allocate
      * nothing, so there is nothing to undo.
      */
 390 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 391 {
 392
 393 }
 394
     /*
      * seq_file ->show for the per-CPU statistics file.
      * @v: SEQ_START_TOKEN for the header row, otherwise one CPU's
      *     struct rt_cache_stat
      *
      * A data row starts with the global entry count taken from
      * ipv4_dst_ops.entries (the same value on every CPU's row), followed by
      * that CPU's counters, all as 8-digit hexadecimal.  Always returns 0.
      */
 395 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 396 {
             struct rt_cache_stat *stats;

             if (v == SEQ_START_TOKEN) {
 400                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 401                 return 0;
 402         }
 403
             stats = v;
 404         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 405                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 406                    atomic_read(&ipv4_dst_ops.entries),
 407                    stats->in_hit,
 408                    stats->in_slow_tot,
 409                    stats->in_slow_mc,
 410                    stats->in_no_route,
 411                    stats->in_brd,
 412                    stats->in_martian_dst,
 413                    stats->in_martian_src,
 414
 415                    stats->out_hit,
 416                    stats->out_slow_tot,
 417                    stats->out_slow_mc,
 418
 419                    stats->gc_total,
 420                    stats->gc_ignored,
 421                    stats->gc_goal_miss,
 422                    stats->gc_dst_overflow,
 423                    stats->in_hlist_search,
 424                    stats->out_hlist_search
 425                 );
 426         return 0;
 427 }
 428
     /* Iterator callbacks for the per-CPU statistics file; handed to
      * seq_open() by rt_cpu_seq_open().
      */
 429 static struct seq_operations rt_cpu_seq_ops = {
 430         .start  = rt_cpu_seq_start,
 431         .next   = rt_cpu_seq_next,
 432         .stop   = rt_cpu_seq_stop,
 433         .show   = rt_cpu_seq_show,
 434 };
 435
 436
     /*
      * open() handler for the per-CPU statistics file: opens a seq_file bound
      * to rt_cpu_seq_ops and passes seq_open()'s result straight through.
      */
 437 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 438 {
             int err = seq_open(file, &rt_cpu_seq_ops);

             return err;
 440 }
 441
     /* File operations for the per-CPU statistics file.  Plain seq_release is
      * used; rt_cpu_seq_open() attaches no private state.
      */
 442 static struct file_operations rt_cpu_seq_fops = {
 443         .owner   = THIS_MODULE,
 444         .open    = rt_cpu_seq_open,
 445         .read    = seq_read,
 446         .llseek  = seq_lseek,
 447         .release = seq_release,
 448 };
 449
 450 #endif /* CONFIG_PROC_FS */
 451
     /*
      * Dispose of a cache entry that has already been unlinked from its
      * chain: multipath_remove() is called on it first, then dst_rcu_free()
      * is queued through call_rcu_bh() so the actual free is deferred.
      */
 452 static __inline__ void rt_free(struct rtable *rt)
 453 {
 454         multipath_remove(rt);
 455         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 456 }
 457
     /*
      * Same sequence as rt_free(), with an ip_rt_put() on the entry between
      * the multipath_remove() call and the deferred free.
      * NOTE(review): presumably for callers that still own a reference to
      * @rt -- confirm at the call sites, none of which are visible here.
      */
 458 static __inline__ void rt_drop(struct rtable *rt)
 459 {
 460         multipath_remove(rt);
 461         ip_rt_put(rt);
 462         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 463 }
 464
     /*
      * Returns 1 when @rth should be aged out ahead of its neighbours, else 0.
      * That is the case when all three hold: the entry is flagged broadcast
      * or multicast, its flow key has a non-zero iif, and another entry
      * follows it in the same hash chain.
      */
 465 static __inline__ int rt_fast_clean(struct rtable *rth)
 466 {
             /* Kill broadcast/multicast entries very aggressively, if they
                collide in hash table with more useful entries */
             if (!(rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
                     return 0;
             if (!rth->fl.iif)
                     return 0;
             return rth->u.rt_next != NULL;
 471 }
 472
     /*
      * Returns 1 when @rth deserves to be kept longer than ordinary entries:
      * it is flagged RTCF_REDIRECTED or RTCF_NOTIFY, or its ->expires field
      * is non-zero.  Returns 0 otherwise.
      */
 473 static __inline__ int rt_valuable(struct rtable *rth)
 474 {
             if (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY))
                     return 1;

             return rth->u.dst.expires != 0;
 477 }
 478
     /*
      * Decide whether a cache entry may be evicted right now.
      * @rth:  entry under consideration
      * @tmo1: age limit applied to entries that are not rt_fast_clean()
      * @tmo2: age limit applied to entries rt_valuable() vouches for
      *
      * Returns 1 when the entry may go, 0 when it must stay:
      *   - any outstanding reference keeps it;
      *   - otherwise a set and elapsed ->expires releases it at once;
      *   - otherwise it stays while its age is within @tmo1 and it is not
      *     rt_fast_clean(), or within @tmo2 and it is rt_valuable().
      */
 479 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 480 {
             unsigned long idle;

             if (atomic_read(&rth->u.dst.__refcnt))
                     return 0;

             if (rth->u.dst.expires &&
                 time_after_eq(jiffies, rth->u.dst.expires))
                     return 1;

             idle = jiffies - rth->u.dst.lastuse;
             if (idle <= tmo1 && !rt_fast_clean(rth))
                     return 0;
             if (idle <= tmo2 && rt_valuable(rth))
                     return 0;

             return 1;
 499 }
 500
     /*
      * Compute a retention score for a cache entry; larger means more worth
      * keeping.
      *
      * Layout of the result:
      *   bit 31     - set when rt_valuable() holds
      *   bit 30     - set unless the entry has a non-zero iif together with
      *                one of the broadcast/multicast/local flags
      *   bits 29..0 - bitwise complement of the jiffies elapsed since
      *                ->lastuse, so recently used entries rank higher
      */
 506 static inline u32 rt_score(struct rtable *rt)
 507 {
             const u32 idle = jiffies - rt->u.dst.lastuse;
             u32 score = ~idle & ~(3<<30);

             if (rt_valuable(rt))
                     score |= (1<<31);

             if (!(rt->fl.iif &&
                   (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))))
                     score |= (1<<30);

 519         return score;
 520 }
 521
     /*
      * Returns 1 when two flow keys select the same cache entry, else 0.
      * The IPv4 part of the key is compared bytewise, then the output and
      * input interface fields are compared.
      */
 522 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 523 {
             if (memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)))
                     return 0;
             if (fl1->oif != fl2->oif)
                     return 0;
             return fl1->iif == fl2->iif;
 527 }
 528
 529 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
     /*
      * rt_remove_balanced_route - evict an expired DST_BALANCED entry together
      * with every other DST_BALANCED entry of the chain that has the same key
      * @chain_head:    address of the head pointer of the chain to walk
      * @expentry:      the expired entry that triggered the removal
      * @removed_count: optional; if non-NULL receives the number of entries
      *                 handed to rt_free(), @expentry included
      *
      * Matching is done with compare_keys() against @expentry's flow key.
      * Matches other than @expentry are unlinked and freed as they are met;
      * @expentry is only unlinked inside the loop and freed once at the end.
      *
      * Returns the address of the ->u.rt_next link of the first surviving
      * non-DST_BALANCED entry met after @expentry has been seen, so the
      * caller can continue its own walk from there, or NULL if there is none.
      *
      * The visible callers invoke this with a bucket spinlock held.
      */
 530 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 531                                                 struct rtable *expentry,
 532                                                 int *removed_count)
 533 {
 534         int passedexpired = 0;
 535         struct rtable **nextstep = NULL;
 536         struct rtable **rthp = chain_head;
 537         struct rtable *rth;
 538
 539         if (removed_count)
 540                 *removed_count = 0;
 541
 542         while ((rth = *rthp) != NULL) {
 543                 if (rth == expentry)
 544                         passedexpired = 1;
 545
 546                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 547                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 548                         if (*rthp == expentry) {
                                     /* Unlink only; freed after the loop so
                                      * its key stays valid for the remaining
                                      * comparisons.
                                      */
 549                                 *rthp = rth->u.rt_next;
 550                                 continue;
 551                         } else {
 552                                 *rthp = rth->u.rt_next;
 553                                 rt_free(rth);
 554                                 if (removed_count)
 555                                         ++(*removed_count);
 556                         }
 557                 } else {
                             /* Remember where the caller should resume: the
                              * link after the first kept, non-balanced entry
                              * that follows the expired one.
                              */
 558                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 559                             passedexpired && !nextstep)
 560                                 nextstep = &rth->u.rt_next;
 561
 562                         rthp = &rth->u.rt_next;
 563                 }
 564         }
 565
 566         rt_free(expentry);
 567         if (removed_count)
 568                 ++(*removed_count);
 569
 570         return nextstep;
 571 }
 572 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 573
 574
 575 /* This runs via a timer and thus is always in BH context.
      *
      * Periodic ageing scan over part of the hash table.
      * @dummy: unused timer argument
      *
      * The scan resumes one bucket after the static "rover" left off.  A
      * budget t starts at ip_rt_gc_interval << rt_hash_log and shrinks by
      * ip_rt_gc_timeout per bucket, which bounds the number of buckets
      * visited in one run; the run also stops as soon as jiffies has moved
      * past the value sampled on entry.  Before returning, rt_periodic_timer
      * is re-armed for ip_rt_gc_interval after that sampled time.
      */
 576 static void rt_check_expire(unsigned long dummy)
 577 {
 578         static int rover;
 579         int i = rover, t;
 580         struct rtable *rth, **rthp;
 581         unsigned long now = jiffies;
 582
 583         for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
 584              t -= ip_rt_gc_timeout) {
                     /* Age allowance for this chain; halved for every entry
                      * that is kept, so entries deeper in a long chain must
                      * be younger to survive.
                      */
 585                 unsigned long tmo = ip_rt_gc_timeout;
 586
 587                 i = (i + 1) & rt_hash_mask;
 588                 rthp = &rt_hash_table[i].chain;
 589
 590                 spin_lock(&rt_hash_table[i].lock);
 591                 while ((rth = *rthp) != NULL) {
 592                         if (rth->u.dst.expires) {
 593                                 /* Entry is expired even if it is in use */
 594                                 if (time_before_eq(now, rth->u.dst.expires)) {
 595                                         tmo >>= 1;
 596                                         rthp = &rth->u.rt_next;
 597                                         continue;
 598                                 }
 599                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 600                                 tmo >>= 1;
 601                                 rthp = &rth->u.rt_next;
 602                                 continue;
 603                         }
 604
 605                         /* Cleanup aged off entries. */
 606 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 607                         /* remove all related balanced entries if necessary */
 608                         if (rth->u.dst.flags & DST_BALANCED) {
 609                                 rthp = rt_remove_balanced_route(
 610                                         &rt_hash_table[i].chain,
 611                                         rth, NULL);
                                     /* NULL: nothing left to resume from in
                                      * this chain.
                                      */
 612                                 if (!rthp)
 613                                         break;
 614                         } else {
 615                                 *rthp = rth->u.rt_next;
 616                                 rt_free(rth);
 617                         }
 618 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 619                         *rthp = rth->u.rt_next;
 620                         rt_free(rth);
 621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 622                 }
 623                 spin_unlock(&rt_hash_table[i].lock);
 624
 625                 /* Fallback loop breaker. */
 626                 if (time_after(jiffies, now))
 627                         break;
 628         }
             /* Remember where to resume on the next timer tick. */
 629         rover = i;
 630         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
 631 }
 632
 633 /* Empty the whole route cache.
      *
      * This can run from both BH and non-BH contexts, the latter in the
      * case of a forced flush event, hence the _bh lock variants.
      *
      * Clears rt_deadline, draws a fresh rt_hash_rnd, then walks the buckets
      * from rt_hash_mask down to 0: each chain is detached under its bucket
      * lock and its entries are handed to rt_free() after the lock has been
      * dropped.  @dummy is unused.
      */
 636 static void rt_run_flush(unsigned long dummy)
 637 {
             int bucket;

 641         rt_deadline = 0;
 642
 643         get_random_bytes(&rt_hash_rnd, 4);
 644
             for (bucket = rt_hash_mask; bucket >= 0; bucket--) {
                     struct rtable *doomed, *rest;

                     spin_lock_bh(&rt_hash_table[bucket].lock);
                     doomed = rt_hash_table[bucket].chain;
                     if (doomed)
                             rt_hash_table[bucket].chain = NULL;
                     spin_unlock_bh(&rt_hash_table[bucket].lock);

                     while (doomed) {
                             rest = doomed->u.rt_next;
                             rt_free(doomed);
                             doomed = rest;
                     }
             }
 657 }
 658
     /* Serialises the flush timer / rt_deadline bookkeeping in rt_cache_flush(). */
 659 static DEFINE_SPINLOCK(rt_flush_lock);
 660
     /*
      * rt_cache_flush - request a flush of the route cache
      * @delay: jiffies to wait before flushing; 0 flushes immediately, a
      *         negative value selects ip_rt_min_delay
      *
      * multipath_flush() is called first in every case.  If a flush timer was
      * already pending and a deadline is recorded, a non-immediate request is
      * clamped so that it never fires later than that deadline; outside
      * softirq context the remaining time is treated as zero once it has
      * dropped below ip_rt_max_delay - ip_rt_min_delay.  A resulting delay of
      * zero or less runs rt_run_flush() synchronously after the lock has been
      * released; otherwise the deadline is initialised if needed and
      * rt_flush_timer is armed.
      */
 661 void rt_cache_flush(int delay)
 662 {
 663         unsigned long now = jiffies;
 664         int user_mode = !in_softirq();
 665
 666         if (delay < 0)
 667                 delay = ip_rt_min_delay;
 668
 669         /* flush existing multipath state*/
 670         multipath_flush();
 671
 672         spin_lock_bh(&rt_flush_lock);
 673
 674         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 675                 long tmo = (long)(rt_deadline - now);
 676
 677                 /* If flush timer is already running
 678                    and flush request is not immediate (delay > 0):
 679
 680                    if deadline is not achieved, extend timer to "delay",
 681                    otherwise fire it at deadline time.
 682                  */
 683
 684                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 685                         tmo = 0;
 686
 687                 if (delay > tmo)
 688                         delay = tmo;
 689         }
 690
 691         if (delay <= 0) {
                     /* Drop the lock before the (potentially long) flush. */
 692                 spin_unlock_bh(&rt_flush_lock);
 693                 rt_run_flush(0);
 694                 return;
 695         }
 696
             /* First delayed request since the last flush: start the clock. */
 697         if (rt_deadline == 0)
 698                 rt_deadline = now + ip_rt_max_delay;
 699
 700         mod_timer(&rt_flush_timer, now+delay);
 701         spin_unlock_bh(&rt_flush_lock);
 702 }
 703
     /*
      * Periodic re-keying of the route cache hash.
      * NOTE(review): presumably installed as the rt_secret_timer handler; the
      * timer setup is not visible here.
      *
      * Requests an immediate cache flush -- rt_run_flush() draws a fresh
      * rt_hash_rnd as part of that -- and re-arms rt_secret_timer for
      * ip_rt_secret_interval jiffies after the time of entry.  @dummy is
      * unused.
      */
 704 static void rt_secret_rebuild(unsigned long dummy)
 705 {
             const unsigned long entered = jiffies;

 708         rt_cache_flush(0);
             mod_timer(&rt_secret_timer, entered + ip_rt_secret_interval);
 710 }
 711
 712 /*
 713    Short description of GC goals.
 714
 715    We want an algorithm that keeps the routing cache at an
 716    equilibrium point, where the number of entries aged off
 717    is approximately equal to the number newly created.
 718
 719    The current expiration strength is the variable "expire".
 720    It is adjusted dynamically: while networking is idle,
 721    "expire" stays large enough to retain warm entries, and
 722    as load increases it shrinks to limit the cache size.
 723  */
 724
 725 static int rt_garbage_collect(void)
 726 {
 727         static unsigned long expire = RT_GC_TIMEOUT;
 728         static unsigned long last_gc;
 729         static int rover;
 730         static int equilibrium;
 731         struct rtable *rth, **rthp;
 732         unsigned long now = jiffies;
 733         int goal;
 734
 735         /*
 736          * Garbage collection is pretty expensive,
 737          * do not make it too frequently.
 738          */
 739
 740         RT_CACHE_STAT_INC(gc_total);
 741
 742         if (now - last_gc < ip_rt_gc_min_interval &&
 743             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 744                 RT_CACHE_STAT_INC(gc_ignored);
 745                 goto out;
 746         }
 747
 748         /* Calculate number of entries, which we want to expire now. */
 749         goal = atomic_read(&ipv4_dst_ops.entries) -
 750                 (ip_rt_gc_elasticity << rt_hash_log);
 751         if (goal <= 0) {
 752                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 753                         equilibrium = ipv4_dst_ops.gc_thresh;
 754                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 755                 if (goal > 0) {
 756                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 757                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 758                 }
 759         } else {
 760                 /* We are in dangerous area. Try to reduce cache really
 761                  * aggressively.
 762                  */
 763                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 764                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 765         }
 766
 767         if (now - last_gc >= ip_rt_gc_min_interval)
 768                 last_gc = now;
 769
 770         if (goal <= 0) {
 771                 equilibrium += goal;
 772                 goto work_done;
 773         }
 774
 775         do {
 776                 int i, k;
 777
 778                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 779                         unsigned long tmo = expire;
 780
 781                         k = (k + 1) & rt_hash_mask;
 782                         rthp = &rt_hash_table[k].chain;
 783                         spin_lock_bh(&rt_hash_table[k].lock);
 784                         while ((rth = *rthp) != NULL) {
 785                                 if (!rt_may_expire(rth, tmo, expire)) {
 786                                         tmo >>= 1;
 787                                         rthp = &rth->u.rt_next;
 788                                         continue;
 789                                 }
 790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 791                                 /* remove all related balanced entries
 792                                  * if necessary
 793                                  */
 794                                 if (rth->u.dst.flags & DST_BALANCED) {
 795                                         int r;
 796
 797                                         rthp = rt_remove_balanced_route(
 798                                                 &rt_hash_table[i].chain,
 799                                                 rth,
 800                                                 &r);
 801                                         goal -= r;
 802                                         if (!rthp)
 803                                                 break;
 804                                 } else {
 805                                         *rthp = rth->u.rt_next;
 806                                         rt_free(rth);
 807                                         goal--;
 808                                 }
 809 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 810                                 *rthp = rth->u.rt_next;
 811                                 rt_free(rth);
 812                                 goal--;
 813 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 814                         }
 815                         spin_unlock_bh(&rt_hash_table[k].lock);
 816                         if (goal <= 0)
 817                                 break;
 818                 }
 819                 rover = k;
 820
 821                 if (goal <= 0)
 822                         goto work_done;
 823
 824                 /* Goal is not achieved. We stop process if:
 825
 826                    - if expire reduced to zero. Otherwise, expire is halfed.
 827                    - if table is not full.
 828                    - if we are called from interrupt.
 829                    - jiffies check is just fallback/debug loop breaker.
 830                      We will not spin here for long time in any case.
 831                  */
 832
 833                 RT_CACHE_STAT_INC(gc_goal_miss);
 834
 835                 if (expire == 0)
 836                         break;
 837
 838                 expire >>= 1;
 839 #if RT_CACHE_DEBUG >= 2
 840                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 841                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 842 #endif
 843
 844                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 845                         goto out;
 846         } while (!in_softirq() && time_before_eq(jiffies, now));
 847
 848         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 849                 goto out;
 850         if (net_ratelimit())
 851                 printk(KERN_WARNING "dst cache overflow\n");
 852         RT_CACHE_STAT_INC(gc_dst_overflow);
 853         return 1;
 854
 855 work_done:
 856         expire += ip_rt_gc_min_interval;
 857         if (expire > ip_rt_gc_timeout ||
 858             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 859                 expire = ip_rt_gc_timeout;
 860 #if RT_CACHE_DEBUG >= 2
 861         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 862                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 863 #endif
 864 out:    return 0;
 865 }
 866
 867 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 868 {
 869         struct rtable   *rth, **rthp;
 870         unsigned long   now;
 871         struct rtable *cand, **candp;
 872         u32             min_score;
 873         int             chain_length;
 874         int attempts = !in_softirq();
 875
 876 restart:
 877         chain_length = 0;
 878         min_score = ~(u32)0;
 879         cand = NULL;
 880         candp = NULL;
 881         now = jiffies;
 882
 883         rthp = &rt_hash_table[hash].chain;
 884
 885         spin_lock_bh(&rt_hash_table[hash].lock);
 886         while ((rth = *rthp) != NULL) {
 887 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 888                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 889                     compare_keys(&rth->fl, &rt->fl)) {
 890 #else
 891                 if (compare_keys(&rth->fl, &rt->fl)) {
 892 #endif
 893                         /* Put it first */
 894                         *rthp = rth->u.rt_next;
 895                         /*
 896                          * Since lookup is lockfree, the deletion
 897                          * must be visible to another weakly ordered CPU before
 898                          * the insertion at the start of the hash chain.
 899                          */
 900                         rcu_assign_pointer(rth->u.rt_next,
 901                                            rt_hash_table[hash].chain);
 902                         /*
 903                          * Since lookup is lockfree, the update writes
 904                          * must be ordered for consistency on SMP.
 905                          */
 906                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 907
 908                         rth->u.dst.__use++;
 909                         dst_hold(&rth->u.dst);
 910                         rth->u.dst.lastuse = now;
 911                         spin_unlock_bh(&rt_hash_table[hash].lock);
 912
 913                         rt_drop(rt);
 914                         *rp = rth;
 915                         return 0;
 916                 }
 917
 918                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 919                         u32 score = rt_score(rth);
 920
 921                         if (score <= min_score) {
 922                                 cand = rth;
 923                                 candp = rthp;
 924                                 min_score = score;
 925                         }
 926                 }
 927
 928                 chain_length++;
 929
 930                 rthp = &rth->u.rt_next;
 931         }
 932
 933         if (cand) {
 934                 /* ip_rt_gc_elasticity used to be average length of chain
 935                  * length, when exceeded gc becomes really aggressive.
 936                  *
 937                  * The second limit is less certain. At the moment it allows
 938                  * only 2 entries per bucket. We will see.
 939                  */
 940                 if (chain_length > ip_rt_gc_elasticity) {
 941                         *candp = cand->u.rt_next;
 942                         rt_free(cand);
 943                 }
 944         }
 945
 946         /* Try to bind route to arp only if it is output
 947            route or unicast forwarding path.
 948          */
 949         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 950                 int err = arp_bind_neighbour(&rt->u.dst);
 951                 if (err) {
 952                         spin_unlock_bh(&rt_hash_table[hash].lock);
 953
 954                         if (err != -ENOBUFS) {
 955                                 rt_drop(rt);
 956                                 return err;
 957                         }
 958
 959                         /* Neighbour tables are full and nothing
 960                            can be released. Try to shrink route cache,
 961                            it is most likely it holds some neighbour records.
 962                          */
 963                         if (attempts-- > 0) {
 964                                 int saved_elasticity = ip_rt_gc_elasticity;
 965                                 int saved_int = ip_rt_gc_min_interval;
 966                                 ip_rt_gc_elasticity     = 1;
 967                                 ip_rt_gc_min_interval   = 0;
 968                                 rt_garbage_collect();
 969                                 ip_rt_gc_min_interval   = saved_int;
 970                                 ip_rt_gc_elasticity     = saved_elasticity;
 971                                 goto restart;
 972                         }
 973
 974                         if (net_ratelimit())
 975                                 printk(KERN_WARNING "Neighbour table overflow.\n");
 976                         rt_drop(rt);
 977                         return -ENOBUFS;
 978                 }
 979         }
 980
 981         rt->u.rt_next = rt_hash_table[hash].chain;
 982 #if RT_CACHE_DEBUG >= 2
 983         if (rt->u.rt_next) {
 984                 struct rtable *trt;
 985                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 986                        NIPQUAD(rt->rt_dst));
 987                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 988                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 989                 printk("\n");
 990         }
 991 #endif
 992         rt_hash_table[hash].chain = rt;
 993         spin_unlock_bh(&rt_hash_table[hash].lock);
 994         *rp = rt;
 995         return 0;
 996 }
 997
 998 void rt_bind_peer(struct rtable *rt, int create)
 999 {
1000         static DEFINE_SPINLOCK(rt_peer_lock);
1001         struct inet_peer *peer;
1002
1003         peer = inet_getpeer(rt->rt_dst, create);
1004
1005         spin_lock_bh(&rt_peer_lock);
1006         if (rt->peer == NULL) {
1007                 rt->peer = peer;
1008                 peer = NULL;
1009         }
1010         spin_unlock_bh(&rt_peer_lock);
1011         if (peer)
1012                 inet_putpeer(peer);
1013 }
1014
1015 /*
1016  * Peer allocation may fail only in serious out-of-memory conditions.  However
1017  * we still can generate some output.
1018  * Random ID selection looks a bit dangerous because we have no chances to
1019  * select ID being unique in a reasonable period of time.
1020  * But broken packet identifier may be better than no packet at all.
1021  */
1022 static void ip_select_fb_ident(struct iphdr *iph)
1023 {
1024         static DEFINE_SPINLOCK(ip_fb_id_lock);
1025         static u32 ip_fallback_id;
1026         u32 salt;
1027
1028         spin_lock_bh(&ip_fb_id_lock);
1029         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030         iph->id = htons(salt & 0xFFFF);
1031         ip_fallback_id = salt;
1032         spin_unlock_bh(&ip_fb_id_lock);
1033 }
1034
1035 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1036 {
1037         struct rtable *rt = (struct rtable *) dst;
1038
1039         if (rt) {
1040                 if (rt->peer == NULL)
1041                         rt_bind_peer(rt, 1);
1042
1043                 /* If peer is attached to destination, it is never detached,
1044                    so that we need not to grab a lock to dereference it.
1045                  */
1046                 if (rt->peer) {
1047                         iph->id = htons(inet_getid(rt->peer, more));
1048                         return;
1049                 }
1050         } else
1051                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1052                        __builtin_return_address(0));
1053
1054         ip_select_fb_ident(iph);
1055 }
1056
1057 static void rt_del(unsigned hash, struct rtable *rt)
1058 {
1059         struct rtable **rthp;
1060
1061         spin_lock_bh(&rt_hash_table[hash].lock);
1062         ip_rt_put(rt);
1063         for (rthp = &rt_hash_table[hash].chain; *rthp;
1064              rthp = &(*rthp)->u.rt_next)
1065                 if (*rthp == rt) {
1066                         *rthp = rt->u.rt_next;
1067                         rt_free(rt);
1068                         break;
1069                 }
1070         spin_unlock_bh(&rt_hash_table[hash].lock);
1071 }
1072
1073 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1074                     u32 saddr, u8 tos, struct net_device *dev)
1075 {
1076         int i, k;
1077         struct in_device *in_dev = in_dev_get(dev);
1078         struct rtable *rth, **rthp;
1079         u32  skeys[2] = { saddr, 0 };
1080         int  ikeys[2] = { dev->ifindex, 0 };
1081
1082         tos &= IPTOS_RT_MASK;
1083
1084         if (!in_dev)
1085                 return;
1086
1087         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1088             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1089                 goto reject_redirect;
1090
1091         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1092                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1093                         goto reject_redirect;
1094                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1095                         goto reject_redirect;
1096         } else {
1097                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1098                         goto reject_redirect;
1099         }
1100
1101         for (i = 0; i < 2; i++) {
1102                 for (k = 0; k < 2; k++) {
1103                         unsigned hash = rt_hash_code(daddr,
1104                                                      skeys[i] ^ (ikeys[k] << 5),
1105                                                      tos);
1106
1107                         rthp=&rt_hash_table[hash].chain;
1108
1109                         rcu_read_lock();
1110                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1111                                 struct rtable *rt;
1112
1113                                 if (rth->fl.fl4_dst != daddr ||
1114                                     rth->fl.fl4_src != skeys[i] ||
1115                                     rth->fl.fl4_tos != tos ||
1116                                     rth->fl.oif != ikeys[k] ||
1117                                     rth->fl.iif != 0) {
1118                                         rthp = &rth->u.rt_next;
1119                                         continue;
1120                                 }
1121
1122                                 if (rth->rt_dst != daddr ||
1123                                     rth->rt_src != saddr ||
1124                                     rth->u.dst.error ||
1125                                     rth->rt_gateway != old_gw ||
1126                                     rth->u.dst.dev != dev)
1127                                         break;
1128
1129                                 dst_hold(&rth->u.dst);
1130                                 rcu_read_unlock();
1131
1132                                 rt = dst_alloc(&ipv4_dst_ops);
1133                                 if (rt == NULL) {
1134                                         ip_rt_put(rth);
1135                                         in_dev_put(in_dev);
1136                                         return;
1137                                 }
1138
1139                                 /* Copy all the information. */
1140                                 *rt = *rth;
1141                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1142                                 rt->u.dst.__use         = 1;
1143                                 atomic_set(&rt->u.dst.__refcnt, 1);
1144                                 rt->u.dst.child         = NULL;
1145                                 if (rt->u.dst.dev)
1146                                         dev_hold(rt->u.dst.dev);
1147                                 if (rt->idev)
1148                                         in_dev_hold(rt->idev);
1149                                 rt->u.dst.obsolete      = 0;
1150                                 rt->u.dst.lastuse       = jiffies;
1151                                 rt->u.dst.path          = &rt->u.dst;
1152                                 rt->u.dst.neighbour     = NULL;
1153                                 rt->u.dst.hh            = NULL;
1154                                 rt->u.dst.xfrm          = NULL;
1155
1156                                 rt->rt_flags            |= RTCF_REDIRECTED;
1157
1158                                 /* Gateway is different ... */
1159                                 rt->rt_gateway          = new_gw;
1160
1161                                 /* Redirect received -> path was valid */
1162                                 dst_confirm(&rth->u.dst);
1163
1164                                 if (rt->peer)
1165                                         atomic_inc(&rt->peer->refcnt);
1166
1167                                 if (arp_bind_neighbour(&rt->u.dst) ||
1168                                     !(rt->u.dst.neighbour->nud_state &
1169                                             NUD_VALID)) {
1170                                         if (rt->u.dst.neighbour)
1171                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1172                                         ip_rt_put(rth);
1173                                         rt_drop(rt);
1174                                         goto do_next;
1175                                 }
1176
1177                                 rt_del(hash, rth);
1178                                 if (!rt_intern_hash(hash, rt, &rt))
1179                                         ip_rt_put(rt);
1180                                 goto do_next;
1181                         }
1182                         rcu_read_unlock();
1183                 do_next:
1184                         ;
1185                 }
1186         }
1187         in_dev_put(in_dev);
1188         return;
1189
1190 reject_redirect:
1191 #ifdef CONFIG_IP_ROUTE_VERBOSE
1192         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1193                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1194                         "%u.%u.%u.%u ignored.\n"
1195                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1196                         "tos %02x\n",
1197                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1198                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1199 #endif
1200         in_dev_put(in_dev);
1201 }
1202
1203 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1204 {
1205         struct rtable *rt = (struct rtable*)dst;
1206         struct dst_entry *ret = dst;
1207
1208         if (rt) {
1209                 if (dst->obsolete) {
1210                         ip_rt_put(rt);
1211                         ret = NULL;
1212                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1213                            rt->u.dst.expires) {
1214                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1215                                                      rt->fl.fl4_src ^
1216                                                         (rt->fl.oif << 5),
1217                                                      rt->fl.fl4_tos);
1218 #if RT_CACHE_DEBUG >= 1
1219                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1220                                           "%u.%u.%u.%u/%02x dropped\n",
1221                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1222 #endif
1223                         rt_del(hash, rt);
1224                         ret = NULL;
1225                 }
1226         }
1227         return ret;
1228 }
1229
1230 /*
1231  * Algorithm:
1232  *      1. The first ip_rt_redirect_number redirects are sent
1233  *         with exponential backoff, then we stop sending them at all,
1234  *         assuming that the host ignores our redirects.
1235  *      2. If we did not see packets requiring redirects
1236  *         during ip_rt_redirect_silence, we assume that the host
1237  *         forgot redirected route and start to send redirects again.
1238  *
1239  * This algorithm is much cheaper and more intelligent than dumb load limiting
1240  * in icmp.c.
1241  *
1242  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1243  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1244  */
1245
1246 void ip_rt_send_redirect(struct sk_buff *skb)
1247 {
1248         struct rtable *rt = (struct rtable*)skb->dst;
1249         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1250
1251         if (!in_dev)
1252                 return;
1253
1254         if (!IN_DEV_TX_REDIRECTS(in_dev))
1255                 goto out;
1256
1257         /* No redirected packets during ip_rt_redirect_silence;
1258          * reset the algorithm.
1259          */
1260         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1261                 rt->u.dst.rate_tokens = 0;
1262
1263         /* Too many ignored redirects; do not send anything
1264          * set u.dst.rate_last to the last seen redirected packet.
1265          */
1266         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1267                 rt->u.dst.rate_last = jiffies;
1268                 goto out;
1269         }
1270
1271         /* Check for load limit; set rate_last to the latest sent
1272          * redirect.
1273          */
1274         if (time_after(jiffies,
1275                        (rt->u.dst.rate_last +
1276                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1277                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1278                 rt->u.dst.rate_last = jiffies;
1279                 ++rt->u.dst.rate_tokens;
1280 #ifdef CONFIG_IP_ROUTE_VERBOSE
1281                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1282                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1283                     net_ratelimit())
1284                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1285                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1286                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1287                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1288 #endif
1289         }
1290 out:
1291         in_dev_put(in_dev);
1292 }
1293
1294 static int ip_error(struct sk_buff *skb)
1295 {
1296         struct rtable *rt = (struct rtable*)skb->dst;
1297         unsigned long now;
1298         int code;
1299
1300         switch (rt->u.dst.error) {
1301                 case EINVAL:
1302                 default:
1303                         goto out;
1304                 case EHOSTUNREACH:
1305                         code = ICMP_HOST_UNREACH;
1306                         break;
1307                 case ENETUNREACH:
1308                         code = ICMP_NET_UNREACH;
1309                         break;
1310                 case EACCES:
1311                         code = ICMP_PKT_FILTERED;
1312                         break;
1313         }
1314
1315         now = jiffies;
1316         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1317         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1318                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1319         rt->u.dst.rate_last = now;
1320         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1321                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1322                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1323         }
1324
1325 out:    kfree_skb(skb);
1326         return 0;
1327 }
1328
/*
 *	MTU plateau table, in descending order.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1346
1347 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1348 {
1349         int i;
1350         unsigned short old_mtu = ntohs(iph->tot_len);
1351         struct rtable *rth;
1352         u32  skeys[2] = { iph->saddr, 0, };
1353         u32  daddr = iph->daddr;
1354         u8   tos = iph->tos & IPTOS_RT_MASK;
1355         unsigned short est_mtu = 0;
1356
1357         if (ipv4_config.no_pmtu_disc)
1358                 return 0;
1359
1360         for (i = 0; i < 2; i++) {
1361                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1362
1363                 rcu_read_lock();
1364                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1365                      rth = rcu_dereference(rth->u.rt_next)) {
1366                         if (rth->fl.fl4_dst == daddr &&
1367                             rth->fl.fl4_src == skeys[i] &&
1368                             rth->rt_dst  == daddr &&
1369                             rth->rt_src  == iph->saddr &&
1370                             rth->fl.fl4_tos == tos &&
1371                             rth->fl.iif == 0 &&
1372                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1373                                 unsigned short mtu = new_mtu;
1374
1375                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1376
1377                                         /* BSD 4.2 compatibility hack :-( */
1378                                         if (mtu == 0 &&
1379                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1380                                             old_mtu >= 68 + (iph->ihl << 2))
1381                                                 old_mtu -= iph->ihl << 2;
1382
1383                                         mtu = guess_mtu(old_mtu);
1384                                 }
1385                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1386                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1387                                                 dst_confirm(&rth->u.dst);
1388                                                 if (mtu < ip_rt_min_pmtu) {
1389                                                         mtu = ip_rt_min_pmtu;
1390                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1391                                                                 (1 << RTAX_MTU);
1392                                                 }
1393                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1394                                                 dst_set_expires(&rth->u.dst,
1395                                                         ip_rt_mtu_expires);
1396                                         }
1397                                         est_mtu = mtu;
1398                                 }
1399                         }
1400                 }
1401                 rcu_read_unlock();
1402         }
1403         return est_mtu ? : new_mtu;
1404 }
1405
1406 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1407 {
1408         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1409             !(dst_metric_locked(dst, RTAX_MTU))) {
1410                 if (mtu < ip_rt_min_pmtu) {
1411                         mtu = ip_rt_min_pmtu;
1412                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1413                 }
1414                 dst->metrics[RTAX_MTU-1] = mtu;
1415                 dst_set_expires(dst, ip_rt_mtu_expires);
1416         }
1417 }
1418
1419 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1420 {
1421         return NULL;
1422 }
1423
1424 static void ipv4_dst_destroy(struct dst_entry *dst)
1425 {
1426         struct rtable *rt = (struct rtable *) dst;
1427         struct inet_peer *peer = rt->peer;
1428         struct in_device *idev = rt->idev;
1429
1430         if (peer) {
1431                 rt->peer = NULL;
1432                 inet_putpeer(peer);
1433         }
1434
1435         if (idev) {
1436                 rt->idev = NULL;
1437                 in_dev_put(idev);
1438         }
1439 }
1440
1441 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1442                             int how)
1443 {
1444         struct rtable *rt = (struct rtable *) dst;
1445         struct in_device *idev = rt->idev;
1446         if (dev != &loopback_dev && idev && idev->dev == dev) {
1447                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1448                 if (loopback_idev) {
1449                         rt->idev = loopback_idev;
1450                         in_dev_put(idev);
1451                 }
1452         }
1453 }
1454
1455 static void ipv4_link_failure(struct sk_buff *skb)
1456 {
1457         struct rtable *rt;
1458
1459         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1460
1461         rt = (struct rtable *) skb->dst;
1462         if (rt)
1463                 dst_set_expires(&rt->u.dst, 0);
1464 }
1465
1466 static int ip_rt_bug(struct sk_buff *skb)
1467 {
1468         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1469                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1470                 skb->dev ? skb->dev->name : "?");
1471         kfree_skb(skb);
1472         return 0;
1473 }
1474
1475 /*
1476    We do not cache source address of outgoing interface,
1477    because it is used only by IP RR, TS and SRR options,
1478    so that it out of fast path.
1479
1480    BTW remember: "addr" is allowed to be not aligned
1481    in IP options!
1482  */
1483
1484 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1485 {
1486         u32 src;
1487         struct fib_result res;
1488
1489         if (rt->fl.iif == 0)
1490                 src = rt->rt_src;
1491         else if (fib_lookup(&rt->fl, &res) == 0) {
1492                 src = FIB_RES_PREFSRC(res);
1493                 fib_res_put(&res);
1494         } else
1495                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1496                                         RT_SCOPE_UNIVERSE);
1497         memcpy(addr, &src, 4);
1498 }
1499
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently, and a half is only filled in from @tag while it is
 * still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1509
1510 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1511 {
1512         struct fib_info *fi = res->fi;
1513
1514         if (fi) {
1515                 if (FIB_RES_GW(*res) &&
1516                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1517                         rt->rt_gateway = FIB_RES_GW(*res);
1518                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1519                        sizeof(rt->u.dst.metrics));
1520                 if (fi->fib_mtu == 0) {
1521                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1522                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1523                             rt->rt_gateway != rt->rt_dst &&
1524                             rt->u.dst.dev->mtu > 576)
1525                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1526                 }
1527 #ifdef CONFIG_NET_CLS_ROUTE
1528                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1529 #endif
1530         } else
1531                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1532
1533         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1534                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1535         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1536                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1537         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1538                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1539                                        ip_rt_min_advmss);
1540         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1541                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1542
1543 #ifdef CONFIG_NET_CLS_ROUTE
1544 #ifdef CONFIG_IP_MULTIPLE_TABLES
1545         set_class_tag(rt, fib_rules_tclass(res));
1546 #endif
1547         set_class_tag(rt, itag);
1548 #endif
1549         rt->rt_type = res->type;
1550 }
1551
1552 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1553                                 u8 tos, struct net_device *dev, int our)
1554 {
1555         unsigned hash;
1556         struct rtable *rth;
1557         u32 spec_dst;
1558         struct in_device *in_dev = in_dev_get(dev);
1559         u32 itag = 0;
1560
1561         /* Primary sanity checks. */
1562
1563         if (in_dev == NULL)
1564                 return -EINVAL;
1565
1566         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1567             skb->protocol != htons(ETH_P_IP))
1568                 goto e_inval;
1569
1570         if (ZERONET(saddr)) {
1571                 if (!LOCAL_MCAST(daddr))
1572                         goto e_inval;
1573                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1574         } else if (fib_validate_source(saddr, 0, tos, 0,
1575                                         dev, &spec_dst, &itag) < 0)
1576                 goto e_inval;
1577
1578         rth = dst_alloc(&ipv4_dst_ops);
1579         if (!rth)
1580                 goto e_nobufs;
1581
1582         rth->u.dst.output= ip_rt_bug;
1583
1584         atomic_set(&rth->u.dst.__refcnt, 1);
1585         rth->u.dst.flags= DST_HOST;
1586         if (in_dev->cnf.no_policy)
1587                 rth->u.dst.flags |= DST_NOPOLICY;
1588         rth->fl.fl4_dst = daddr;
1589         rth->rt_dst     = daddr;
1590         rth->fl.fl4_tos = tos;
1591 #ifdef CONFIG_IP_ROUTE_FWMARK
1592         rth->fl.fl4_fwmark= skb->nfmark;
1593 #endif
1594         rth->fl.fl4_src = saddr;
1595         rth->rt_src     = saddr;
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597         rth->u.dst.tclassid = itag;
1598 #endif
1599         rth->rt_iif     =
1600         rth->fl.iif     = dev->ifindex;
1601         rth->u.dst.dev  = &loopback_dev;
1602         dev_hold(rth->u.dst.dev);
1603         rth->idev       = in_dev_get(rth->u.dst.dev);
1604         rth->fl.oif     = 0;
1605         rth->rt_gateway = daddr;
1606         rth->rt_spec_dst= spec_dst;
1607         rth->rt_type    = RTN_MULTICAST;
1608         rth->rt_flags   = RTCF_MULTICAST;
1609         if (our) {
1610                 rth->u.dst.input= ip_local_deliver;
1611                 rth->rt_flags |= RTCF_LOCAL;
1612         }
1613
1614 #ifdef CONFIG_IP_MROUTE
1615         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1616                 rth->u.dst.input = ip_mr_input;
1617 #endif
1618         RT_CACHE_STAT_INC(in_slow_mc);
1619
1620         in_dev_put(in_dev);
1621         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1622         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1623
1624 e_nobufs:
1625         in_dev_put(in_dev);
1626         return -ENOBUFS;
1627
1628 e_inval:
1629         in_dev_put(in_dev);
1630         return -EINVAL;
1631 }
1632
1633
1634 static void ip_handle_martian_source(struct net_device *dev,
1635                                      struct in_device *in_dev,
1636                                      struct sk_buff *skb,
1637                                      u32 daddr,
1638                                      u32 saddr)
1639 {
1640         RT_CACHE_STAT_INC(in_martian_src);
1641 #ifdef CONFIG_IP_ROUTE_VERBOSE
1642         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1643                 /*
1644                  *      RFC1812 recommendation, if source is martian,
1645                  *      the only hint is MAC header.
1646                  */
1647                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648                         "%u.%u.%u.%u, on dev %s\n",
1649                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650                 if (dev->hard_header_len) {
1651                         int i;
1652                         unsigned char *p = skb->mac.raw;
1653                         printk(KERN_WARNING "ll header: ");
1654                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1655                                 printk("%02x", *p);
1656                                 if (i < (dev->hard_header_len - 1))
1657                                         printk(":");
1658                         }
1659                         printk("\n");
1660                 }
1661         }
1662 #endif
1663 }
1664
1665 static inline int __mkroute_input(struct sk_buff *skb,
1666                                   struct fib_result* res,
1667                                   struct in_device *in_dev,
1668                                   u32 daddr, u32 saddr, u32 tos,
1669                                   struct rtable **result)
1670 {
1671
1672         struct rtable *rth;
1673         int err;
1674         struct in_device *out_dev;
1675         unsigned flags = 0;
1676         u32 spec_dst, itag;
1677
1678         /* get a working reference to the output device */
1679         out_dev = in_dev_get(FIB_RES_DEV(*res));
1680         if (out_dev == NULL) {
1681                 if (net_ratelimit())
1682                         printk(KERN_CRIT "Bug in ip_route_input" \
1683                                "_slow(). Please, report\n");
1684                 return -EINVAL;
1685         }
1686
1687
1688         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1689                                   in_dev->dev, &spec_dst, &itag);
1690         if (err < 0) {
1691                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1692                                          saddr);
1693
1694                 err = -EINVAL;
1695                 goto cleanup;
1696         }
1697
1698         if (err)
1699                 flags |= RTCF_DIRECTSRC;
1700
1701         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1702             (IN_DEV_SHARED_MEDIA(out_dev) ||
1703              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1704                 flags |= RTCF_DOREDIRECT;
1705
1706         if (skb->protocol != htons(ETH_P_IP)) {
1707                 /* Not IP (i.e. ARP). Do not create route, if it is
1708                  * invalid for proxy arp. DNAT routes are always valid.
1709                  */
1710                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1711                         err = -EINVAL;
1712                         goto cleanup;
1713                 }
1714         }
1715
1716
1717         rth = dst_alloc(&ipv4_dst_ops);
1718         if (!rth) {
1719                 err = -ENOBUFS;
1720                 goto cleanup;
1721         }
1722
1723         rth->u.dst.flags= DST_HOST;
1724 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1725         if (res->fi->fib_nhs > 1)
1726                 rth->u.dst.flags |= DST_BALANCED;
1727 #endif
1728         if (in_dev->cnf.no_policy)
1729                 rth->u.dst.flags |= DST_NOPOLICY;
1730         if (in_dev->cnf.no_xfrm)
1731                 rth->u.dst.flags |= DST_NOXFRM;
1732         rth->fl.fl4_dst = daddr;
1733         rth->rt_dst     = daddr;
1734         rth->fl.fl4_tos = tos;
1735 #ifdef CONFIG_IP_ROUTE_FWMARK
1736         rth->fl.fl4_fwmark= skb->nfmark;
1737 #endif
1738         rth->fl.fl4_src = saddr;
1739         rth->rt_src     = saddr;
1740         rth->rt_gateway = daddr;
1741         rth->rt_iif     =
1742                 rth->fl.iif     = in_dev->dev->ifindex;
1743         rth->u.dst.dev  = (out_dev)->dev;
1744         dev_hold(rth->u.dst.dev);
1745         rth->idev       = in_dev_get(rth->u.dst.dev);
1746         rth->fl.oif     = 0;
1747         rth->rt_spec_dst= spec_dst;
1748
1749         rth->u.dst.input = ip_forward;
1750         rth->u.dst.output = ip_output;
1751
1752         rt_set_nexthop(rth, res, itag);
1753
1754         rth->rt_flags = flags;
1755
1756         *result = rth;
1757         err = 0;
1758  cleanup:
1759         /* release the working reference to the output device */
1760         in_dev_put(out_dev);
1761         return err;
1762 }
1763
1764 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1765                                        struct fib_result* res,
1766                                        const struct flowi *fl,
1767                                        struct in_device *in_dev,
1768                                        u32 daddr, u32 saddr, u32 tos)
1769 {
1770         struct rtable* rth = NULL;
1771         int err;
1772         unsigned hash;
1773
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1775         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1776                 fib_select_multipath(fl, res);
1777 #endif
1778
1779         /* create a routing cache entry */
1780         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1781         if (err)
1782                 return err;
1783         atomic_set(&rth->u.dst.__refcnt, 1);
1784
1785         /* put it into the cache */
1786         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1787         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1788 }
1789
1790 static inline int ip_mkroute_input(struct sk_buff *skb,
1791                                    struct fib_result* res,
1792                                    const struct flowi *fl,
1793                                    struct in_device *in_dev,
1794                                    u32 daddr, u32 saddr, u32 tos)
1795 {
1796 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797         struct rtable* rth = NULL;
1798         unsigned char hop, hopcount, lasthop;
1799         int err = -EINVAL;
1800         unsigned int hash;
1801
1802         if (res->fi)
1803                 hopcount = res->fi->fib_nhs;
1804         else
1805                 hopcount = 1;
1806
1807         lasthop = hopcount - 1;
1808
1809         /* distinguish between multipath and singlepath */
1810         if (hopcount < 2)
1811                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1812                                             saddr, tos);
1813
1814         /* add all alternatives to the routing cache */
1815         for (hop = 0; hop < hopcount; hop++) {
1816                 res->nh_sel = hop;
1817
1818                 /* create a routing cache entry */
1819                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1820                                       &rth);
1821                 if (err)
1822                         return err;
1823
1824                 /* put it into the cache */
1825                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1826                 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1827                 if (err)
1828                         return err;
1829
1830                 /* forward hop information to multipath impl. */
1831                 multipath_set_nhinfo(rth,
1832                                      FIB_RES_NETWORK(*res),
1833                                      FIB_RES_NETMASK(*res),
1834                                      res->prefixlen,
1835                                      &FIB_RES_NH(*res));
1836
1837                 /* only for the last hop the reference count is handled
1838                  * outside
1839                  */
1840                 if (hop == lasthop)
1841                         atomic_set(&(skb->dst->__refcnt), 1);
1842         }
1843         return err;
1844 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1845         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1846 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1847 }
1848
1849
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must already have
 *	the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1859
1860 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1861                                u8 tos, struct net_device *dev)
1862 {
1863         struct fib_result res;
1864         struct in_device *in_dev = in_dev_get(dev);
1865         struct flowi fl = { .nl_u = { .ip4_u =
1866                                       { .daddr = daddr,
1867                                         .saddr = saddr,
1868                                         .tos = tos,
1869                                         .scope = RT_SCOPE_UNIVERSE,
1870 #ifdef CONFIG_IP_ROUTE_FWMARK
1871                                         .fwmark = skb->nfmark
1872 #endif
1873                                       } },
1874                             .iif = dev->ifindex };
1875         unsigned        flags = 0;
1876         u32             itag = 0;
1877         struct rtable * rth;
1878         unsigned        hash;
1879         u32             spec_dst;
1880         int             err = -EINVAL;
1881         int             free_res = 0;
1882
1883         /* IP on this device is disabled. */
1884
1885         if (!in_dev)
1886                 goto out;
1887
1888         /* Check for the most weird martians, which can be not detected
1889            by fib_lookup.
1890          */
1891
1892         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1893                 goto martian_source;
1894
1895         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1896                 goto brd_input;
1897
1898         /* Accept zero addresses only to limited broadcast;
1899          * I even do not know to fix it or not. Waiting for complains :-)
1900          */
1901         if (ZERONET(saddr))
1902                 goto martian_source;
1903
1904         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1905                 goto martian_destination;
1906
1907         /*
1908          *      Now we are ready to route packet.
1909          */
1910         if ((err = fib_lookup(&fl, &res)) != 0) {
1911                 if (!IN_DEV_FORWARD(in_dev))
1912                         goto e_inval;
1913                 goto no_route;
1914         }
1915         free_res = 1;
1916
1917         RT_CACHE_STAT_INC(in_slow_tot);
1918
1919         if (res.type == RTN_BROADCAST)
1920                 goto brd_input;
1921
1922         if (res.type == RTN_LOCAL) {
1923                 int result;
1924                 result = fib_validate_source(saddr, daddr, tos,
1925                                              loopback_dev.ifindex,
1926                                              dev, &spec_dst, &itag);
1927                 if (result < 0)
1928                         goto martian_source;
1929                 if (result)
1930                         flags |= RTCF_DIRECTSRC;
1931                 spec_dst = daddr;
1932                 goto local_input;
1933         }
1934
1935         if (!IN_DEV_FORWARD(in_dev))
1936                 goto e_inval;
1937         if (res.type != RTN_UNICAST)
1938                 goto martian_destination;
1939
1940         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1941         if (err == -ENOBUFS)
1942                 goto e_nobufs;
1943         if (err == -EINVAL)
1944                 goto e_inval;
1945
1946 done:
1947         in_dev_put(in_dev);
1948         if (free_res)
1949                 fib_res_put(&res);
1950 out:    return err;
1951
1952 brd_input:
1953         if (skb->protocol != htons(ETH_P_IP))
1954                 goto e_inval;
1955
1956         if (ZERONET(saddr))
1957                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958         else {
1959                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1960                                           &itag);
1961                 if (err < 0)
1962                         goto martian_source;
1963                 if (err)
1964                         flags |= RTCF_DIRECTSRC;
1965         }
1966         flags |= RTCF_BROADCAST;
1967         res.type = RTN_BROADCAST;
1968         RT_CACHE_STAT_INC(in_brd);
1969
1970 local_input:
1971         rth = dst_alloc(&ipv4_dst_ops);
1972         if (!rth)
1973                 goto e_nobufs;
1974
1975         rth->u.dst.output= ip_rt_bug;
1976
1977         atomic_set(&rth->u.dst.__refcnt, 1);
1978         rth->u.dst.flags= DST_HOST;
1979         if (in_dev->cnf.no_policy)
1980                 rth->u.dst.flags |= DST_NOPOLICY;
1981         rth->fl.fl4_dst = daddr;
1982         rth->rt_dst     = daddr;
1983         rth->fl.fl4_tos = tos;
1984 #ifdef CONFIG_IP_ROUTE_FWMARK
1985         rth->fl.fl4_fwmark= skb->nfmark;
1986 #endif
1987         rth->fl.fl4_src = saddr;
1988         rth->rt_src     = saddr;
1989 #ifdef CONFIG_NET_CLS_ROUTE
1990         rth->u.dst.tclassid = itag;
1991 #endif
1992         rth->rt_iif     =
1993         rth->fl.iif     = dev->ifindex;
1994         rth->u.dst.dev  = &loopback_dev;
1995         dev_hold(rth->u.dst.dev);
1996         rth->idev       = in_dev_get(rth->u.dst.dev);
1997         rth->rt_gateway = daddr;
1998         rth->rt_spec_dst= spec_dst;
1999         rth->u.dst.input= ip_local_deliver;
2000         rth->rt_flags   = flags|RTCF_LOCAL;
2001         if (res.type == RTN_UNREACHABLE) {
2002                 rth->u.dst.input= ip_error;
2003                 rth->u.dst.error= -err;
2004                 rth->rt_flags   &= ~RTCF_LOCAL;
2005         }
2006         rth->rt_type    = res.type;
2007         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2008         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2009         goto done;
2010
2011 no_route:
2012         RT_CACHE_STAT_INC(in_no_route);
2013         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2014         res.type = RTN_UNREACHABLE;
2015         goto local_input;
2016
2017         /*
2018          *      Do not cache martian addresses: they should be logged (RFC1812)
2019          */
2020 martian_destination:
2021         RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025                         "%u.%u.%u.%u, dev %s\n",
2026                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027 #endif
2028 e_inval:
2029         err = -EINVAL;
2030         goto done;
2031
2032 e_nobufs:
2033         err = -ENOBUFS;
2034         goto done;
2035
2036 martian_source:
2037         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2038         goto e_inval;
2039 }
2040
2041 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2042                    u8 tos, struct net_device *dev)
2043 {
2044         struct rtable * rth;
2045         unsigned        hash;
2046         int iif = dev->ifindex;
2047
2048         tos &= IPTOS_RT_MASK;
2049         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2050
2051         rcu_read_lock();
2052         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2053              rth = rcu_dereference(rth->u.rt_next)) {
2054                 if (rth->fl.fl4_dst == daddr &&
2055                     rth->fl.fl4_src == saddr &&
2056                     rth->fl.iif == iif &&
2057                     rth->fl.oif == 0 &&
2058 #ifdef CONFIG_IP_ROUTE_FWMARK
2059                     rth->fl.fl4_fwmark == skb->nfmark &&
2060 #endif
2061                     rth->fl.fl4_tos == tos) {
2062                         rth->u.dst.lastuse = jiffies;
2063                         dst_hold(&rth->u.dst);
2064                         rth->u.dst.__use++;
2065                         RT_CACHE_STAT_INC(in_hit);
2066                         rcu_read_unlock();
2067                         skb->dst = (struct dst_entry*)rth;
2068                         return 0;
2069                 }
2070                 RT_CACHE_STAT_INC(in_hlist_search);
2071         }
2072         rcu_read_unlock();
2073
2074         /* Multicast recognition logic is moved from route cache to here.
2075            The problem was that too many Ethernet cards have broken/missing
2076            hardware multicast filters :-( As result the host on multicasting
2077            network acquires a lot of useless route cache entries, sort of
2078            SDR messages from all the world. Now we try to get rid of them.
2079            Really, provided software IP multicast filter is organized
2080            reasonably (at least, hashed), it does not result in a slowdown
2081            comparing with route cache reject entries.
2082            Note, that multicast routers are not affected, because
2083            route cache entry is created eventually.
2084          */
2085         if (MULTICAST(daddr)) {
2086                 struct in_device *in_dev;
2087
2088                 rcu_read_lock();
2089                 if ((in_dev = __in_dev_get(dev)) != NULL) {
2090                         int our = ip_check_mc(in_dev, daddr, saddr,
2091                                 skb->nh.iph->protocol);
2092                         if (our
2093 #ifdef CONFIG_IP_MROUTE
2094                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2095 #endif
2096                             ) {
2097                                 rcu_read_unlock();
2098                                 return ip_route_input_mc(skb, daddr, saddr,
2099                                                          tos, dev, our);
2100                         }
2101                 }
2102                 rcu_read_unlock();
2103                 return -EINVAL;
2104         }
2105         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2106 }
2107
2108 static inline int __mkroute_output(struct rtable **result,
2109                                    struct fib_result* res,
2110                                    const struct flowi *fl,
2111                                    const struct flowi *oldflp,
2112                                    struct net_device *dev_out,
2113                                    unsigned flags)
2114 {
2115         struct rtable *rth;
2116         struct in_device *in_dev;
2117         u32 tos = RT_FL_TOS(oldflp);
2118         int err = 0;
2119
2120         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2121                 return -EINVAL;
2122
2123         if (fl->fl4_dst == 0xFFFFFFFF)
2124                 res->type = RTN_BROADCAST;
2125         else if (MULTICAST(fl->fl4_dst))
2126                 res->type = RTN_MULTICAST;
2127         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2128                 return -EINVAL;
2129
2130         if (dev_out->flags & IFF_LOOPBACK)
2131                 flags |= RTCF_LOCAL;
2132
2133         /* get work reference to inet device */
2134         in_dev = in_dev_get(dev_out);
2135         if (!in_dev)
2136                 return -EINVAL;
2137
2138         if (res->type == RTN_BROADCAST) {
2139                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2140                 if (res->fi) {
2141                         fib_info_put(res->fi);
2142                         res->fi = NULL;
2143                 }
2144         } else if (res->type == RTN_MULTICAST) {
2145                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2146                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2147                                  oldflp->proto))
2148                         flags &= ~RTCF_LOCAL;
2149                 /* If multicast route do not exist use
2150                    default one, but do not gateway in this case.
2151                    Yes, it is hack.
2152                  */
2153                 if (res->fi && res->prefixlen < 4) {
2154                         fib_info_put(res->fi);
2155                         res->fi = NULL;
2156                 }
2157         }
2158
2159
2160         rth = dst_alloc(&ipv4_dst_ops);
2161         if (!rth) {
2162                 err = -ENOBUFS;
2163                 goto cleanup;
2164         }
2165
2166         rth->u.dst.flags= DST_HOST;
2167 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2168         if (res->fi) {
2169                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2170                 if (res->fi->fib_nhs > 1)
2171                         rth->u.dst.flags |= DST_BALANCED;
2172         }
2173 #endif
2174         if (in_dev->cnf.no_xfrm)
2175                 rth->u.dst.flags |= DST_NOXFRM;
2176         if (in_dev->cnf.no_policy)
2177                 rth->u.dst.flags |= DST_NOPOLICY;
2178
2179         rth->fl.fl4_dst = oldflp->fl4_dst;
2180         rth->fl.fl4_tos = tos;
2181         rth->fl.fl4_src = oldflp->fl4_src;
2182         rth->fl.oif     = oldflp->oif;
2183 #ifdef CONFIG_IP_ROUTE_FWMARK
2184         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2185 #endif
2186         rth->rt_dst     = fl->fl4_dst;
2187         rth->rt_src     = fl->fl4_src;
2188         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2189         /* get references to the devices that are to be hold by the routing
2190            cache entry */
2191         rth->u.dst.dev  = dev_out;
2192         dev_hold(dev_out);
2193         rth->idev       = in_dev_get(dev_out);
2194         rth->rt_gateway = fl->fl4_dst;
2195         rth->rt_spec_dst= fl->fl4_src;
2196
2197         rth->u.dst.output=ip_output;
2198
2199         RT_CACHE_STAT_INC(out_slow_tot);
2200
2201         if (flags & RTCF_LOCAL) {
2202                 rth->u.dst.input = ip_local_deliver;
2203                 rth->rt_spec_dst = fl->fl4_dst;
2204         }
2205         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2206                 rth->rt_spec_dst = fl->fl4_src;
2207                 if (flags & RTCF_LOCAL &&
2208                     !(dev_out->flags & IFF_LOOPBACK)) {
2209                         rth->u.dst.output = ip_mc_output;
2210                         RT_CACHE_STAT_INC(out_slow_mc);
2211                 }
2212 #ifdef CONFIG_IP_MROUTE
2213                 if (res->type == RTN_MULTICAST) {
2214                         if (IN_DEV_MFORWARD(in_dev) &&
2215                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2216                                 rth->u.dst.input = ip_mr_input;
2217                                 rth->u.dst.output = ip_mc_output;
2218                         }
2219                 }
2220 #endif
2221         }
2222
2223         rt_set_nexthop(rth, res, 0);
2224
2225         rth->rt_flags = flags;
2226
2227         *result = rth;
2228  cleanup:
2229         /* release work reference to inet device */
2230         in_dev_put(in_dev);
2231
2232         return err;
2233 }
2234
2235 static inline int ip_mkroute_output_def(struct rtable **rp,
2236                                         struct fib_result* res,
2237                                         const struct flowi *fl,
2238                                         const struct flowi *oldflp,
2239                                         struct net_device *dev_out,
2240                                         unsigned flags)
2241 {
2242         struct rtable *rth = NULL;
2243         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244         unsigned hash;
2245         if (err == 0) {
2246                 u32 tos = RT_FL_TOS(oldflp);
2247
2248                 atomic_set(&rth->u.dst.__refcnt, 1);
2249
2250                 hash = rt_hash_code(oldflp->fl4_dst,
2251                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2252                 err = rt_intern_hash(hash, rth, rp);
2253         }
2254
2255         return err;
2256 }
2257
/*
 * Create the output route cache entry (or entries) for a resolved flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has at
 * most one next hop, this defers to ip_mkroute_output_def().  Otherwise
 * one cache entry is created and hashed per next hop, each is announced
 * to the multipath implementation via multipath_set_nhinfo(), and the
 * entry left in *@rp after the last hop gets its reference count set.
 *
 * Returns 0 on success or the first error reported by __mkroute_output()
 * or rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* Do not touch rth once hashing has failed, exactly
			 * as ip_mkroute_input() bails out at this point.
			 * NOTE(review): rt_intern_hash() is assumed to
			 * release the entry it was given on failure, which
			 * would make the call below a use-after-free --
			 * confirm against its definition.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		/* Only the entry returned to the caller is referenced here. */
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2318
2319 /*
2320  * Major route resolver routine.
2321  */
2322
2323 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2324 {
2325         u32 tos = RT_FL_TOS(oldflp);
2326         struct flowi fl = { .nl_u = { .ip4_u =
2327                                       { .daddr = oldflp->fl4_dst,
2328                                         .saddr = oldflp->fl4_src,
2329                                         .tos = tos & IPTOS_RT_MASK,
2330                                         .scope = ((tos & RTO_ONLINK) ?
2331                                                   RT_SCOPE_LINK :
2332                                                   RT_SCOPE_UNIVERSE),
2333 #ifdef CONFIG_IP_ROUTE_FWMARK
2334                                         .fwmark = oldflp->fl4_fwmark
2335 #endif
2336                                       } },
2337                             .iif = loopback_dev.ifindex,
2338                             .oif = oldflp->oif };
2339         struct fib_result res;
2340         unsigned flags = 0;
2341         struct net_device *dev_out = NULL;
2342         int free_res = 0;
2343         int err;
2344
2345
2346         res.fi          = NULL;
2347 #ifdef CONFIG_IP_MULTIPLE_TABLES
2348         res.r           = NULL;
2349 #endif
2350
2351         if (oldflp->fl4_src) {
2352                 err = -EINVAL;
2353                 if (MULTICAST(oldflp->fl4_src) ||
2354                     BADCLASS(oldflp->fl4_src) ||
2355                     ZERONET(oldflp->fl4_src))
2356                         goto out;
2357
2358                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2359                 dev_out = ip_dev_find(oldflp->fl4_src);
2360                 if (dev_out == NULL)
2361                         goto out;
2362
2363                 /* I removed check for oif == dev_out->oif here.
2364                    It was wrong for two reasons:
2365                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2366                       assigned to multiple interfaces.
2367                    2. Moreover, we are allowed to send packets with saddr
2368                       of another iface. --ANK
2369                  */
2370
2371                 if (oldflp->oif == 0
2372                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2373                         /* Special hack: user can direct multicasts
2374                            and limited broadcast via necessary interface
2375                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2376                            This hack is not just for fun, it allows
2377                            vic,vat and friends to work.
2378                            They bind socket to loopback, set ttl to zero
2379                            and expect that it will work.
2380                            From the viewpoint of routing cache they are broken,
2381                            because we are not allowed to build multicast path
2382                            with loopback source addr (look, routing cache
2383                            cannot know, that ttl is zero, so that packet
2384                            will not leave this host and route is valid).
2385                            Luckily, this hack is good workaround.
2386                          */
2387
2388                         fl.oif = dev_out->ifindex;
2389                         goto make_route;
2390                 }
2391                 if (dev_out)
2392                         dev_put(dev_out);
2393                 dev_out = NULL;
2394         }
2395
2396
2397         if (oldflp->oif) {
2398                 dev_out = dev_get_by_index(oldflp->oif);
2399                 err = -ENODEV;
2400                 if (dev_out == NULL)
2401                         goto out;
2402                 if (__in_dev_get(dev_out) == NULL) {
2403                         dev_put(dev_out);
2404                         goto out;       /* Wrong error code */
2405                 }
2406
2407                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2408                         if (!fl.fl4_src)
2409                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2410                                                               RT_SCOPE_LINK);
2411                         goto make_route;
2412                 }
2413                 if (!fl.fl4_src) {
2414                         if (MULTICAST(oldflp->fl4_dst))
2415                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2416                                                               fl.fl4_scope);
2417                         else if (!oldflp->fl4_dst)
2418                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2419                                                               RT_SCOPE_HOST);
2420                 }
2421         }
2422
2423         if (!fl.fl4_dst) {
2424                 fl.fl4_dst = fl.fl4_src;
2425                 if (!fl.fl4_dst)
2426                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2427                 if (dev_out)
2428                         dev_put(dev_out);
2429                 dev_out = &loopback_dev;
2430                 dev_hold(dev_out);
2431                 fl.oif = loopback_dev.ifindex;
2432                 res.type = RTN_LOCAL;
2433                 flags |= RTCF_LOCAL;
2434                 goto make_route;
2435         }
2436
2437         if (fib_lookup(&fl, &res)) {
2438                 res.fi = NULL;
2439                 if (oldflp->oif) {
2440                         /* Apparently, routing tables are wrong. Assume,
2441                            that the destination is on link.
2442
2443                            WHY? DW.
2444                            Because we are allowed to send to iface
2445                            even if it has NO routes and NO assigned
2446                            addresses. When oif is specified, routing
2447                            tables are looked up with only one purpose:
2448                            to catch if destination is gatewayed, rather than
2449                            direct. Moreover, if MSG_DONTROUTE is set,
2450                            we send packet, ignoring both routing tables
2451                            and ifaddr state. --ANK
2452
2453
2454                            We could make it even if oif is unknown,
2455                            likely IPv6, but we do not.
2456                          */
2457
2458                         if (fl.fl4_src == 0)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_LINK);
2461                         res.type = RTN_UNICAST;
2462                         goto make_route;
2463                 }
2464                 if (dev_out)
2465                         dev_put(dev_out);
2466                 err = -ENETUNREACH;
2467                 goto out;
2468         }
2469         free_res = 1;
2470
2471         if (res.type == RTN_LOCAL) {
2472                 if (!fl.fl4_src)
2473                         fl.fl4_src = fl.fl4_dst;
2474                 if (dev_out)
2475                         dev_put(dev_out);
2476                 dev_out = &loopback_dev;
2477                 dev_hold(dev_out);
2478                 fl.oif = dev_out->ifindex;
2479                 if (res.fi)
2480                         fib_info_put(res.fi);
2481                 res.fi = NULL;
2482                 flags |= RTCF_LOCAL;
2483                 goto make_route;
2484         }
2485
2486 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2487         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2488                 fib_select_multipath(&fl, &res);
2489         else
2490 #endif
2491         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2492                 fib_select_default(&fl, &res);
2493
2494         if (!fl.fl4_src)
2495                 fl.fl4_src = FIB_RES_PREFSRC(res);
2496
2497         if (dev_out)
2498                 dev_put(dev_out);
2499         dev_out = FIB_RES_DEV(res);
2500         dev_hold(dev_out);
2501         fl.oif = dev_out->ifindex;
2502
2503
2504 make_route:
2505         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2506
2507
2508         if (free_res)
2509                 fib_res_put(&res);
2510         if (dev_out)
2511                 dev_put(dev_out);
2512 out:    return err;
2513 }
2514
2515 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2516 {
2517         unsigned hash;
2518         struct rtable *rth;
2519
2520         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2521
2522         rcu_read_lock_bh();
2523         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2524                 rth = rcu_dereference(rth->u.rt_next)) {
2525                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2526                     rth->fl.fl4_src == flp->fl4_src &&
2527                     rth->fl.iif == 0 &&
2528                     rth->fl.oif == flp->oif &&
2529 #ifdef CONFIG_IP_ROUTE_FWMARK
2530                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2531 #endif
2532                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2533                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2534
2535                         /* check for multipath routes and choose one if
2536                          * necessary
2537                          */
2538                         if (multipath_select_route(flp, rth, rp)) {
2539                                 dst_hold(&(*rp)->u.dst);
2540                                 RT_CACHE_STAT_INC(out_hit);
2541                                 rcu_read_unlock_bh();
2542                                 return 0;
2543                         }
2544
2545                         rth->u.dst.lastuse = jiffies;
2546                         dst_hold(&rth->u.dst);
2547                         rth->u.dst.__use++;
2548                         RT_CACHE_STAT_INC(out_hit);
2549                         rcu_read_unlock_bh();
2550                         *rp = rth;
2551                         return 0;
2552                 }
2553                 RT_CACHE_STAT_INC(out_hlist_search);
2554         }
2555         rcu_read_unlock_bh();
2556
2557         return ip_route_output_slow(rp, flp);
2558 }
2559
2560 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2561 {
2562         int err;
2563
2564         if ((err = __ip_route_output_key(rp, flp)) != 0)
2565                 return err;
2566
2567         if (flp->proto) {
2568                 if (!flp->fl4_src)
2569                         flp->fl4_src = (*rp)->rt_src;
2570                 if (!flp->fl4_dst)
2571                         flp->fl4_dst = (*rp)->rt_dst;
2572                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2573         }
2574
2575         return 0;
2576 }
2577
2578 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2579 {
2580         return ip_route_output_flow(rp, flp, NULL, 0);
2581 }
2582
2583 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2584                         int nowait, unsigned int flags)
2585 {
2586         struct rtable *rt = (struct rtable*)skb->dst;
2587         struct rtmsg *r;
2588         struct nlmsghdr  *nlh;
2589         unsigned char    *b = skb->tail;
2590         struct rta_cacheinfo ci;
2591 #ifdef CONFIG_IP_MROUTE
2592         struct rtattr *eptr;
2593 #endif
2594         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2595         r = NLMSG_DATA(nlh);
2596         r->rtm_family    = AF_INET;
2597         r->rtm_dst_len  = 32;
2598         r->rtm_src_len  = 0;
2599         r->rtm_tos      = rt->fl.fl4_tos;
2600         r->rtm_table    = RT_TABLE_MAIN;
2601         r->rtm_type     = rt->rt_type;
2602         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2603         r->rtm_protocol = RTPROT_UNSPEC;
2604         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605         if (rt->rt_flags & RTCF_NOTIFY)
2606                 r->rtm_flags |= RTM_F_NOTIFY;
2607         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608         if (rt->fl.fl4_src) {
2609                 r->rtm_src_len = 32;
2610                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2611         }
2612         if (rt->u.dst.dev)
2613                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614 #ifdef CONFIG_NET_CLS_ROUTE
2615         if (rt->u.dst.tclassid)
2616                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617 #endif
2618 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620                 __u32 alg = rt->rt_multipath_alg;
2621
2622                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2623         }
2624 #endif
2625         if (rt->fl.iif)
2626                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627         else if (rt->rt_src != rt->fl.fl4_src)
2628                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629         if (rt->rt_dst != rt->rt_gateway)
2630                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632                 goto rtattr_failure;
2633         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634         ci.rta_used     = rt->u.dst.__use;
2635         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2636         if (rt->u.dst.expires)
2637                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638         else
2639                 ci.rta_expires = 0;
2640         ci.rta_error    = rt->u.dst.error;
2641         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2642         if (rt->peer) {
2643                 ci.rta_id = rt->peer->ip_id_count;
2644                 if (rt->peer->tcp_ts_stamp) {
2645                         ci.rta_ts = rt->peer->tcp_ts;
2646                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2647                 }
2648         }
2649 #ifdef CONFIG_IP_MROUTE
2650         eptr = (struct rtattr*)skb->tail;
2651 #endif
2652         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653         if (rt->fl.iif) {
2654 #ifdef CONFIG_IP_MROUTE
2655                 u32 dst = rt->rt_dst;
2656
2657                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658                     ipv4_devconf.mc_forwarding) {
2659                         int err = ipmr_get_route(skb, r, nowait);
2660                         if (err <= 0) {
2661                                 if (!nowait) {
2662                                         if (err == 0)
2663                                                 return 0;
2664                                         goto nlmsg_failure;
2665                                 } else {
2666                                         if (err == -EMSGSIZE)
2667                                                 goto nlmsg_failure;
2668                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2669                                 }
2670                         }
2671                 } else
2672 #endif
2673                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2674         }
2675
2676         nlh->nlmsg_len = skb->tail - b;
2677         return skb->len;
2678
2679 nlmsg_failure:
2680 rtattr_failure:
2681         skb_trim(skb, b - skb->data);
2682         return -1;
2683 }
2684
2685 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2686 {
2687         struct rtattr **rta = arg;
2688         struct rtmsg *rtm = NLMSG_DATA(nlh);
2689         struct rtable *rt = NULL;
2690         u32 dst = 0;
2691         u32 src = 0;
2692         int iif = 0;
2693         int err = -ENOBUFS;
2694         struct sk_buff *skb;
2695
2696         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697         if (!skb)
2698                 goto out;
2699
2700         /* Reserve room for dummy headers, this skb can pass
2701            through good chunk of routing engine.
2702          */
2703         skb->mac.raw = skb->data;
2704         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705
2706         if (rta[RTA_SRC - 1])
2707                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708         if (rta[RTA_DST - 1])
2709                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710         if (rta[RTA_IIF - 1])
2711                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2712
2713         if (iif) {
2714                 struct net_device *dev = __dev_get_by_index(iif);
2715                 err = -ENODEV;
2716                 if (!dev)
2717                         goto out_free;
2718                 skb->protocol   = htons(ETH_P_IP);
2719                 skb->dev        = dev;
2720                 local_bh_disable();
2721                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722                 local_bh_enable();
2723                 rt = (struct rtable*)skb->dst;
2724                 if (!err && rt->u.dst.error)
2725                         err = -rt->u.dst.error;
2726         } else {
2727                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728                                                          .saddr = src,
2729                                                          .tos = rtm->rtm_tos } } };
2730                 int oif = 0;
2731                 if (rta[RTA_OIF - 1])
2732                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733                 fl.oif = oif;
2734                 err = ip_route_output_key(&rt, &fl);
2735         }
2736         if (err)
2737                 goto out_free;
2738
2739         skb->dst = &rt->u.dst;
2740         if (rtm->rtm_flags & RTM_F_NOTIFY)
2741                 rt->rt_flags |= RTCF_NOTIFY;
2742
2743         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2744
2745         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746                                 RTM_NEWROUTE, 0, 0);
2747         if (!err)
2748                 goto out_free;
2749         if (err < 0) {
2750                 err = -EMSGSIZE;
2751                 goto out_free;
2752         }
2753
2754         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755         if (err > 0)
2756                 err = 0;
2757 out:    return err;
2758
2759 out_free:
2760         kfree_skb(skb);
2761         goto out;
2762 }
2763
2764 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2765 {
2766         struct rtable *rt;
2767         int h, s_h;
2768         int idx, s_idx;
2769
2770         s_h = cb->args[0];
2771         s_idx = idx = cb->args[1];
2772         for (h = 0; h <= rt_hash_mask; h++) {
2773                 if (h < s_h) continue;
2774                 if (h > s_h)
2775                         s_idx = 0;
2776                 rcu_read_lock_bh();
2777                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2779                         if (idx < s_idx)
2780                                 continue;
2781                         skb->dst = dst_clone(&rt->u.dst);
2782                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2784                                          1, NLM_F_MULTI) <= 0) {
2785                                 dst_release(xchg(&skb->dst, NULL));
2786                                 rcu_read_unlock_bh();
2787                                 goto done;
2788                         }
2789                         dst_release(xchg(&skb->dst, NULL));
2790                 }
2791                 rcu_read_unlock_bh();
2792         }
2793
2794 done:
2795         cb->args[0] = h;
2796         cb->args[1] = idx;
2797         return skb->len;
2798 }
2799
/*
 * Multicast event hook: flush the routing cache, passing a delay
 * argument of 0 to rt_cache_flush().  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2804
2805 #ifdef CONFIG_SYSCTL
2806 static int flush_delay;
2807
2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809                                         struct file *filp, void __user *buffer,
2810                                         size_t *lenp, loff_t *ppos)
2811 {
2812         if (write) {
2813                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814                 rt_cache_flush(flush_delay);
2815                 return 0;
2816         }
2817
2818         return -EINVAL;
2819 }
2820
2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822                                                 int __user *name,
2823                                                 int nlen,
2824                                                 void __user *oldval,
2825                                                 size_t __user *oldlenp,
2826                                                 void __user *newval,
2827                                                 size_t newlen,
2828                                                 void **context)
2829 {
2830         int delay;
2831         if (newlen != sizeof(int))
2832                 return -EINVAL;
2833         if (get_user(delay, (int __user *)newval))
2834                 return -EFAULT;
2835         rt_cache_flush(delay);
2836         return 0;
2837 }
2838
2839 ctl_table ipv4_route_table[] = {
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2842                 .procname       = "flush",
2843                 .data           = &flush_delay,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0200,
2846                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2847                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2848         },
2849         {
2850                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2851                 .procname       = "min_delay",
2852                 .data           = &ip_rt_min_delay,
2853                 .maxlen         = sizeof(int),
2854                 .mode           = 0644,
2855                 .proc_handler   = &proc_dointvec_jiffies,
2856                 .strategy       = &sysctl_jiffies,
2857         },
2858         {
2859                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2860                 .procname       = "max_delay",
2861                 .data           = &ip_rt_max_delay,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = &proc_dointvec_jiffies,
2865                 .strategy       = &sysctl_jiffies,
2866         },
2867         {
2868                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2869                 .procname       = "gc_thresh",
2870                 .data           = &ipv4_dst_ops.gc_thresh,
2871                 .maxlen         = sizeof(int),
2872                 .mode           = 0644,
2873                 .proc_handler   = &proc_dointvec,
2874         },
2875         {
2876                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2877                 .procname       = "max_size",
2878                 .data           = &ip_rt_max_size,
2879                 .maxlen         = sizeof(int),
2880                 .mode           = 0644,
2881                 .proc_handler   = &proc_dointvec,
2882         },
2883         {
2884                 /*  Deprecated. Use gc_min_interval_ms */
2885
2886                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887                 .procname       = "gc_min_interval",
2888                 .data           = &ip_rt_gc_min_interval,
2889                 .maxlen         = sizeof(int),
2890                 .mode           = 0644,
2891                 .proc_handler   = &proc_dointvec_jiffies,
2892                 .strategy       = &sysctl_jiffies,
2893         },
2894         {
2895                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896                 .procname       = "gc_min_interval_ms",
2897                 .data           = &ip_rt_gc_min_interval,
2898                 .maxlen         = sizeof(int),
2899                 .mode           = 0644,
2900                 .proc_handler   = &proc_dointvec_ms_jiffies,
2901                 .strategy       = &sysctl_ms_jiffies,
2902         },
2903         {
2904                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2905                 .procname       = "gc_timeout",
2906                 .data           = &ip_rt_gc_timeout,
2907                 .maxlen         = sizeof(int),
2908                 .mode           = 0644,
2909                 .proc_handler   = &proc_dointvec_jiffies,
2910                 .strategy       = &sysctl_jiffies,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2914                 .procname       = "gc_interval",
2915                 .data           = &ip_rt_gc_interval,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec_jiffies,
2919                 .strategy       = &sysctl_jiffies,
2920         },
2921         {
2922                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2923                 .procname       = "redirect_load",
2924                 .data           = &ip_rt_redirect_load,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = &proc_dointvec,
2928         },
2929         {
2930                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931                 .procname       = "redirect_number",
2932                 .data           = &ip_rt_redirect_number,
2933                 .maxlen         = sizeof(int),
2934                 .mode           = 0644,
2935                 .proc_handler   = &proc_dointvec,
2936         },
2937         {
2938                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939                 .procname       = "redirect_silence",
2940                 .data           = &ip_rt_redirect_silence,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec,
2944         },
2945         {
2946                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2947                 .procname       = "error_cost",
2948                 .data           = &ip_rt_error_cost,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = &proc_dointvec,
2952         },
2953         {
2954                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2955                 .procname       = "error_burst",
2956                 .data           = &ip_rt_error_burst,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = &proc_dointvec,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2963                 .procname       = "gc_elasticity",
2964                 .data           = &ip_rt_gc_elasticity,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2971                 .procname       = "mtu_expires",
2972                 .data           = &ip_rt_mtu_expires,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec_jiffies,
2976                 .strategy       = &sysctl_jiffies,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2980                 .procname       = "min_pmtu",
2981                 .data           = &ip_rt_min_pmtu,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2988                 .procname       = "min_adv_mss",
2989                 .data           = &ip_rt_min_advmss,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2996                 .procname       = "secret_interval",
2997                 .data           = &ip_rt_secret_interval,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec_jiffies,
3001                 .strategy       = &sysctl_jiffies,
3002         },
3003         { .ctl_name = 0 }
3004 };
3005 #endif
3006
3007 #ifdef CONFIG_NET_CLS_ROUTE
3008 struct ip_rt_acct *ip_rt_acct;
3009
3010 /* This code sucks.  But you should have seen it before! --RR */
3011
/* IP route accounting ptr for this logical cpu number: each cpu owns a
 * run of 256 consecutive entries in ip_rt_acct.  The argument is
 * parenthesized so that expressions such as IP_RT_ACCT_CPU(a + b)
 * expand correctly.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3014
3015 #ifdef CONFIG_PROC_FS
3016 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017                            int length, int *eof, void *data)
3018 {
3019         unsigned int i;
3020
3021         if ((offset & 3) || (length & 3))
3022                 return -EIO;
3023
3024         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025                 *eof = 1;
3026                 return 0;
3027         }
3028
3029         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3031                 *eof = 1;
3032         }
3033
3034         offset /= sizeof(u32);
3035
3036         if (length > 0) {
3037                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3038                 u32 *dst = (u32 *) buffer;
3039
3040                 /* Copy first cpu. */
3041                 *start = buffer;
3042                 memcpy(dst, src, length);
3043
3044                 /* Add the other cpus in, one int at a time */
3045                 for_each_cpu(i) {
3046                         unsigned int j;
3047
3048                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3049
3050                         for (j = 0; j < length/4; j++)
3051                                 dst[j] += src[j];
3052                 }
3053         }
3054         return length;
3055 }
3056 #endif /* CONFIG_PROC_FS */
3057 #endif /* CONFIG_NET_CLS_ROUTE */
3058
3059 static __initdata unsigned long rhash_entries;
3060 static int __init set_rhash_entries(char *str)
3061 {
3062         if (!str)
3063                 return 0;
3064         rhash_entries = simple_strtoul(str, &str, 0);
3065         return 1;
3066 }
3067 __setup("rhash_entries=", set_rhash_entries);
3068
3069 int __init ip_rt_init(void)
3070 {
3071         int i, order, goal, rc = 0;
3072
3073         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074                              (jiffies ^ (jiffies >> 7)));
3075
3076 #ifdef CONFIG_NET_CLS_ROUTE
3077         for (order = 0;
3078              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079                 /* NOTHING */;
3080         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081         if (!ip_rt_acct)
3082                 panic("IP: failed to allocate ip_rt_acct\n");
3083         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084 #endif
3085
3086         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087                                                      sizeof(struct rtable),
3088                                                      0, SLAB_HWCACHE_ALIGN,
3089                                                      NULL, NULL);
3090
3091         if (!ipv4_dst_ops.kmem_cachep)
3092                 panic("IP: failed to allocate ip_dst_cache\n");
3093
3094         goal = num_physpages >> (26 - PAGE_SHIFT);
3095         if (rhash_entries)
3096                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097         for (order = 0; (1UL << order) < goal; order++)
3098                 /* NOTHING */;
3099
3100         do {
3101                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102                         sizeof(struct rt_hash_bucket);
3103                 while (rt_hash_mask & (rt_hash_mask - 1))
3104                         rt_hash_mask--;
3105                 rt_hash_table = (struct rt_hash_bucket *)
3106                         __get_free_pages(GFP_ATOMIC, order);
3107         } while (rt_hash_table == NULL && --order > 0);
3108
3109         if (!rt_hash_table)
3110                 panic("Failed to allocate IP route cache hash table\n");
3111
3112         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113                rt_hash_mask,
3114                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117                 /* NOTHING */;
3118
3119         rt_hash_mask--;
3120         for (i = 0; i <= rt_hash_mask; i++) {
3121                 spin_lock_init(&rt_hash_table[i].lock);
3122                 rt_hash_table[i].chain = NULL;
3123         }
3124
3125         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3127
3128         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129         if (!rt_cache_stat)
3130                 return -ENOMEM;
3131
3132         devinet_init();
3133         ip_fib_init();
3134
3135         init_timer(&rt_flush_timer);
3136         rt_flush_timer.function = rt_run_flush;
3137         init_timer(&rt_periodic_timer);
3138         rt_periodic_timer.function = rt_check_expire;
3139         init_timer(&rt_secret_timer);
3140         rt_secret_timer.function = rt_secret_rebuild;
3141
3142         /* All the timers, started at system startup tend
3143            to synchronize. Perturb it a bit.
3144          */
3145         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146                                         ip_rt_gc_interval;
3147         add_timer(&rt_periodic_timer);
3148
3149         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150                 ip_rt_secret_interval;
3151         add_timer(&rt_secret_timer);
3152
3153 #ifdef CONFIG_PROC_FS
3154         {
3155         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3158                                              proc_net_stat))) {
3159                 free_percpu(rt_cache_stat);
3160                 return -ENOMEM;
3161         }
3162         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3163         }
3164 #ifdef CONFIG_NET_CLS_ROUTE
3165         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166 #endif
3167 #endif
3168 #ifdef CONFIG_XFRM
3169         xfrm_init();
3170         xfrm4_init();
3171 #endif
3172         return rc;
3173 }
3174
3175 EXPORT_SYMBOL(__ip_select_ident);
3176 EXPORT_SYMBOL(ip_route_input);
3177 EXPORT_SYMBOL(ip_route_output_key);