git.oblomov.eu Git - linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *
  58  *              This program is free software; you can redistribute it and/or
  59  *              modify it under the terms of the GNU General Public License
  60  *              as published by the Free Software Foundation; either version
  61  *              2 of the License, or (at your option) any later version.
  62  */
  63
  64 #include <linux/config.h>
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/sched.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/rtnetlink.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/protocol.h>
  94 #include <net/ip.h>
  95 #include <net/route.h>
  96 #include <net/inetpeer.h>
  97 #include <net/sock.h>
  98 #include <net/ip_fib.h>
  99 #include <net/arp.h>
 100 #include <net/tcp.h>
 101 #include <net/icmp.h>
 102 #include <net/xfrm.h>
 103 #include <net/ip_mp_alg.h>
 104 #ifdef CONFIG_SYSCTL
 105 #include <linux/sysctl.h>
 106 #endif
 107
 108 #define RT_FL_TOS(oldflp) \
 109     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 110
 111 #define IP_MAX_MTU      0xFFF0
 112
 113 #define RT_GC_TIMEOUT (300*HZ)
 114
 115 static int ip_rt_min_delay              = 2 * HZ;
 116 static int ip_rt_max_delay              = 10 * HZ;
 117 static int ip_rt_max_size;
 118 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 119 static int ip_rt_gc_interval            = 60 * HZ;
 120 static int ip_rt_gc_min_interval        = HZ / 2;
 121 static int ip_rt_redirect_number        = 9;
 122 static int ip_rt_redirect_load          = HZ / 50;
 123 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 124 static int ip_rt_error_cost             = HZ;
 125 static int ip_rt_error_burst            = 5 * HZ;
 126 static int ip_rt_gc_elasticity          = 8;
 127 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 128 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 129 static int ip_rt_min_advmss             = 256;
 130 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 131 static unsigned long rt_deadline;
 132
 133 #define RTprint(a...)   printk(KERN_DEBUG a)
 134
 135 static struct timer_list rt_flush_timer;
 136 static struct timer_list rt_periodic_timer;
 137 static struct timer_list rt_secret_timer;
 138
 139 /*
 140  *      Interface to generic destination cache.
 141  */
 142
 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144 static void              ipv4_dst_destroy(struct dst_entry *dst);
 145 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 146                                          struct net_device *dev, int how);
 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148 static void              ipv4_link_failure(struct sk_buff *skb);
 149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150 static int rt_garbage_collect(void);
 151
 152
 153 static struct dst_ops ipv4_dst_ops = {
 154         .family =               AF_INET,
 155         .protocol =             __constant_htons(ETH_P_IP),
 156         .gc =                   rt_garbage_collect,
 157         .check =                ipv4_dst_check,
 158         .destroy =              ipv4_dst_destroy,
 159         .ifdown =               ipv4_dst_ifdown,
 160         .negative_advice =      ipv4_negative_advice,
 161         .link_failure =         ipv4_link_failure,
 162         .update_pmtu =          ip_rt_update_pmtu,
 163         .entry_size =           sizeof(struct rtable),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straight forward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
 203         struct rtable   *chain;
 204         spinlock_t      lock;
 205 } __attribute__((__aligned__(8)));
 206
 207 static struct rt_hash_bucket    *rt_hash_table;
 208 static unsigned                 rt_hash_mask;
 209 static int                      rt_hash_log;
 210 static unsigned int             rt_hash_rnd;
 211
 212 struct rt_cache_stat *rt_cache_stat;
 213
 214 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 215                                 struct rtable **res);
 216
 217 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 218 {
 219         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
 220                 & rt_hash_mask);
 221 }
 222
 223 #ifdef CONFIG_PROC_FS
 224 struct rt_cache_iter_state {
 225         int bucket;
 226 };
 227
 228 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 229 {
 230         struct rtable *r = NULL;
 231         struct rt_cache_iter_state *st = seq->private;
 232
 233         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 234                 rcu_read_lock_bh();
 235                 r = rt_hash_table[st->bucket].chain;
 236                 if (r)
 237                         break;
 238                 rcu_read_unlock_bh();
 239         }
 240         return r;
 241 }
 242
 243 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 244 {
 245         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 246
 247         r = r->u.rt_next;
 248         while (!r) {
 249                 rcu_read_unlock_bh();
 250                 if (--st->bucket < 0)
 251                         break;
 252                 rcu_read_lock_bh();
 253                 r = rt_hash_table[st->bucket].chain;
 254         }
 255         return r;
 256 }
 257
 258 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 259 {
 260         struct rtable *r = rt_cache_get_first(seq);
 261
 262         if (r)
 263                 while (pos && (r = rt_cache_get_next(seq, r)))
 264                         --pos;
 265         return pos ? NULL : r;
 266 }
 267
 268 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 269 {
 270         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 271 }
 272
 273 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 274 {
 275         struct rtable *r = NULL;
 276
 277         if (v == SEQ_START_TOKEN)
 278                 r = rt_cache_get_first(seq);
 279         else
 280                 r = rt_cache_get_next(seq, v);
 281         ++*pos;
 282         return r;
 283 }
 284
 285 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 286 {
 287         if (v && v != SEQ_START_TOKEN)
 288                 rcu_read_unlock_bh();
 289 }
 290
 291 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 292 {
 293         if (v == SEQ_START_TOKEN)
 294                 seq_printf(seq, "%-127s\n",
 295                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 296                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 297                            "HHUptod\tSpecDst");
 298         else {
 299                 struct rtable *r = v;
 300                 char temp[256];
 301
 302                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 303                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 304                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 305                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 306                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 307                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 308                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 309                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 310                         dst_metric(&r->u.dst, RTAX_WINDOW),
 311                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 312                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 313                         r->fl.fl4_tos,
 314                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 315                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 316                                        dev_queue_xmit) : 0,
 317                         r->rt_spec_dst);
 318                 seq_printf(seq, "%-127s\n", temp);
 319         }
 320         return 0;
 321 }
 322
 323 static struct seq_operations rt_cache_seq_ops = {
 324         .start  = rt_cache_seq_start,
 325         .next   = rt_cache_seq_next,
 326         .stop   = rt_cache_seq_stop,
 327         .show   = rt_cache_seq_show,
 328 };
 329
 330 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 331 {
 332         struct seq_file *seq;
 333         int rc = -ENOMEM;
 334         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 335
 336         if (!s)
 337                 goto out;
 338         rc = seq_open(file, &rt_cache_seq_ops);
 339         if (rc)
 340                 goto out_kfree;
 341         seq          = file->private_data;
 342         seq->private = s;
 343         memset(s, 0, sizeof(*s));
 344 out:
 345         return rc;
 346 out_kfree:
 347         kfree(s);
 348         goto out;
 349 }
 350
 351 static struct file_operations rt_cache_seq_fops = {
 352         .owner   = THIS_MODULE,
 353         .open    = rt_cache_seq_open,
 354         .read    = seq_read,
 355         .llseek  = seq_lseek,
 356         .release = seq_release_private,
 357 };
 358
 359
 360 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 361 {
 362         int cpu;
 363
 364         if (*pos == 0)
 365                 return SEQ_START_TOKEN;
 366
 367         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 368                 if (!cpu_possible(cpu))
 369                         continue;
 370                 *pos = cpu+1;
 371                 return per_cpu_ptr(rt_cache_stat, cpu);
 372         }
 373         return NULL;
 374 }
 375
 376 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 377 {
 378         int cpu;
 379
 380         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 381                 if (!cpu_possible(cpu))
 382                         continue;
 383                 *pos = cpu+1;
 384                 return per_cpu_ptr(rt_cache_stat, cpu);
 385         }
 386         return NULL;
 387
 388 }
 389
 390 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 391 {
 392
 393 }
 394
 395 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 396 {
 397         struct rt_cache_stat *st = v;
 398
 399         if (v == SEQ_START_TOKEN) {
 400                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 401                 return 0;
 402         }
 403
 404         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 405                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 406                    atomic_read(&ipv4_dst_ops.entries),
 407                    st->in_hit,
 408                    st->in_slow_tot,
 409                    st->in_slow_mc,
 410                    st->in_no_route,
 411                    st->in_brd,
 412                    st->in_martian_dst,
 413                    st->in_martian_src,
 414
 415                    st->out_hit,
 416                    st->out_slow_tot,
 417                    st->out_slow_mc,
 418
 419                    st->gc_total,
 420                    st->gc_ignored,
 421                    st->gc_goal_miss,
 422                    st->gc_dst_overflow,
 423                    st->in_hlist_search,
 424                    st->out_hlist_search
 425                 );
 426         return 0;
 427 }
 428
 429 static struct seq_operations rt_cpu_seq_ops = {
 430         .start  = rt_cpu_seq_start,
 431         .next   = rt_cpu_seq_next,
 432         .stop   = rt_cpu_seq_stop,
 433         .show   = rt_cpu_seq_show,
 434 };
 435
 436
 437 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 438 {
 439         return seq_open(file, &rt_cpu_seq_ops);
 440 }
 441
 442 static struct file_operations rt_cpu_seq_fops = {
 443         .owner   = THIS_MODULE,
 444         .open    = rt_cpu_seq_open,
 445         .read    = seq_read,
 446         .llseek  = seq_lseek,
 447         .release = seq_release,
 448 };
 449
 450 #endif /* CONFIG_PROC_FS */
 451
 452 static __inline__ void rt_free(struct rtable *rt)
 453 {
 454         multipath_remove(rt);
 455         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 456 }
 457
 458 static __inline__ void rt_drop(struct rtable *rt)
 459 {
 460         multipath_remove(rt);
 461         ip_rt_put(rt);
 462         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 463 }
 464
 465 static __inline__ int rt_fast_clean(struct rtable *rth)
 466 {
 467         /* Kill broadcast/multicast entries very aggresively, if they
 468            collide in hash table with more useful entries */
 469         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 470                 rth->fl.iif && rth->u.rt_next;
 471 }
 472
 473 static __inline__ int rt_valuable(struct rtable *rth)
 474 {
 475         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 476                 rth->u.dst.expires;
 477 }
 478
 479 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 480 {
 481         unsigned long age;
 482         int ret = 0;
 483
 484         if (atomic_read(&rth->u.dst.__refcnt))
 485                 goto out;
 486
 487         ret = 1;
 488         if (rth->u.dst.expires &&
 489             time_after_eq(jiffies, rth->u.dst.expires))
 490                 goto out;
 491
 492         age = jiffies - rth->u.dst.lastuse;
 493         ret = 0;
 494         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 495             (age <= tmo2 && rt_valuable(rth)))
 496                 goto out;
 497         ret = 1;
 498 out:    return ret;
 499 }
 500
 501 /* Bits of score are:
 502  * 31: very valuable
 503  * 30: not quite useless
 504  * 29..0: usage counter
 505  */
 506 static inline u32 rt_score(struct rtable *rt)
 507 {
 508         u32 score = jiffies - rt->u.dst.lastuse;
 509
 510         score = ~score & ~(3<<30);
 511
 512         if (rt_valuable(rt))
 513                 score |= (1<<31);
 514
 515         if (!rt->fl.iif ||
 516             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 517                 score |= (1<<30);
 518
 519         return score;
 520 }
 521
 522 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 523 {
 524         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 525                fl1->oif     == fl2->oif &&
 526                fl1->iif     == fl2->iif;
 527 }
 528
 529 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 530 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 531                                                 struct rtable *expentry,
 532                                                 int *removed_count)
 533 {
 534         int passedexpired = 0;
 535         struct rtable **nextstep = NULL;
 536         struct rtable **rthp = chain_head;
 537         struct rtable *rth;
 538
 539         if (removed_count)
 540                 *removed_count = 0;
 541
 542         while ((rth = *rthp) != NULL) {
 543                 if (rth == expentry)
 544                         passedexpired = 1;
 545
 546                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 547                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 548                         if (*rthp == expentry) {
 549                                 *rthp = rth->u.rt_next;
 550                                 continue;
 551                         } else {
 552                                 *rthp = rth->u.rt_next;
 553                                 rt_free(rth);
 554                                 if (removed_count)
 555                                         ++(*removed_count);
 556                         }
 557                 } else {
 558                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 559                             passedexpired && !nextstep)
 560                                 nextstep = &rth->u.rt_next;
 561
 562                         rthp = &rth->u.rt_next;
 563                 }
 564         }
 565
 566         rt_free(expentry);
 567         if (removed_count)
 568                 ++(*removed_count);
 569
 570         return nextstep;
 571 }
 572 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 573
 574
 575 /* This runs via a timer and thus is always in BH context. */
 576 static void rt_check_expire(unsigned long dummy)
 577 {
 578         static int rover;
 579         int i = rover, t;
 580         struct rtable *rth, **rthp;
 581         unsigned long now = jiffies;
 582
 583         for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
 584              t -= ip_rt_gc_timeout) {
 585                 unsigned long tmo = ip_rt_gc_timeout;
 586
 587                 i = (i + 1) & rt_hash_mask;
 588                 rthp = &rt_hash_table[i].chain;
 589
 590                 spin_lock(&rt_hash_table[i].lock);
 591                 while ((rth = *rthp) != NULL) {
 592                         if (rth->u.dst.expires) {
 593                                 /* Entry is expired even if it is in use */
 594                                 if (time_before_eq(now, rth->u.dst.expires)) {
 595                                         tmo >>= 1;
 596                                         rthp = &rth->u.rt_next;
 597                                         continue;
 598                                 }
 599                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 600                                 tmo >>= 1;
 601                                 rthp = &rth->u.rt_next;
 602                                 continue;
 603                         }
 604
 605                         /* Cleanup aged off entries. */
 606 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 607                         /* remove all related balanced entries if necessary */
 608                         if (rth->u.dst.flags & DST_BALANCED) {
 609                                 rthp = rt_remove_balanced_route(
 610                                         &rt_hash_table[i].chain,
 611                                         rth, NULL);
 612                                 if (!rthp)
 613                                         break;
 614                         } else {
 615                                 *rthp = rth->u.rt_next;
 616                                 rt_free(rth);
 617                         }
 618 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 619                         *rthp = rth->u.rt_next;
 620                         rt_free(rth);
 621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 622                 }
 623                 spin_unlock(&rt_hash_table[i].lock);
 624
 625                 /* Fallback loop breaker. */
 626                 if (time_after(jiffies, now))
 627                         break;
 628         }
 629         rover = i;
 630         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
 631 }
 632
 633 /* This can run from both BH and non-BH contexts, the latter
 634  * in the case of a forced flush event.
 635  */
 636 static void rt_run_flush(unsigned long dummy)
 637 {
 638         int i;
 639         struct rtable *rth, *next;
 640
 641         rt_deadline = 0;
 642
 643         get_random_bytes(&rt_hash_rnd, 4);
 644
 645         for (i = rt_hash_mask; i >= 0; i--) {
 646                 spin_lock_bh(&rt_hash_table[i].lock);
 647                 rth = rt_hash_table[i].chain;
 648                 if (rth)
 649                         rt_hash_table[i].chain = NULL;
 650                 spin_unlock_bh(&rt_hash_table[i].lock);
 651
 652                 for (; rth; rth = next) {
 653                         next = rth->u.rt_next;
 654                         rt_free(rth);
 655                 }
 656         }
 657 }
 658
 659 static DEFINE_SPINLOCK(rt_flush_lock);
 660
 661 void rt_cache_flush(int delay)
 662 {
 663         unsigned long now = jiffies;
 664         int user_mode = !in_softirq();
 665
 666         if (delay < 0)
 667                 delay = ip_rt_min_delay;
 668
 669         /* flush existing multipath state*/
 670         multipath_flush();
 671
 672         spin_lock_bh(&rt_flush_lock);
 673
 674         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 675                 long tmo = (long)(rt_deadline - now);
 676
 677                 /* If flush timer is already running
 678                    and flush request is not immediate (delay > 0):
 679
 680                    if deadline is not achieved, prolongate timer to "delay",
 681                    otherwise fire it at deadline time.
 682                  */
 683
 684                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 685                         tmo = 0;
 686
 687                 if (delay > tmo)
 688                         delay = tmo;
 689         }
 690
 691         if (delay <= 0) {
 692                 spin_unlock_bh(&rt_flush_lock);
 693                 rt_run_flush(0);
 694                 return;
 695         }
 696
 697         if (rt_deadline == 0)
 698                 rt_deadline = now + ip_rt_max_delay;
 699
 700         mod_timer(&rt_flush_timer, now+delay);
 701         spin_unlock_bh(&rt_flush_lock);
 702 }
 703
 704 static void rt_secret_rebuild(unsigned long dummy)
 705 {
 706         unsigned long now = jiffies;
 707
 708         rt_cache_flush(0);
 709         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 710 }
 711
 712 /*
 713    Short description of GC goals.
 714
 715    We want to build algorithm, which will keep routing cache
 716    at some equilibrium point, when number of aged off entries
 717    is kept approximately equal to newly generated ones.
 718
 719    Current expiration strength is variable "expire".
 720    We try to adjust it dynamically, so that if networking
 721    is idle expires is large enough to keep enough of warm entries,
 722    and when load increases it reduces to limit cache size.
 723  */
 724
 725 static int rt_garbage_collect(void)
 726 {
 727         static unsigned long expire = RT_GC_TIMEOUT;
 728         static unsigned long last_gc;
 729         static int rover;
 730         static int equilibrium;
 731         struct rtable *rth, **rthp;
 732         unsigned long now = jiffies;
 733         int goal;
 734
 735         /*
 736          * Garbage collection is pretty expensive,
 737          * do not make it too frequently.
 738          */
 739
 740         RT_CACHE_STAT_INC(gc_total);
 741
 742         if (now - last_gc < ip_rt_gc_min_interval &&
 743             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 744                 RT_CACHE_STAT_INC(gc_ignored);
 745                 goto out;
 746         }
 747
 748         /* Calculate number of entries, which we want to expire now. */
 749         goal = atomic_read(&ipv4_dst_ops.entries) -
 750                 (ip_rt_gc_elasticity << rt_hash_log);
 751         if (goal <= 0) {
 752                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 753                         equilibrium = ipv4_dst_ops.gc_thresh;
 754                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 755                 if (goal > 0) {
 756                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 757                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 758                 }
 759         } else {
 760                 /* We are in dangerous area. Try to reduce cache really
 761                  * aggressively.
 762                  */
 763                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 764                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 765         }
 766
 767         if (now - last_gc >= ip_rt_gc_min_interval)
 768                 last_gc = now;
 769
 770         if (goal <= 0) {
 771                 equilibrium += goal;
 772                 goto work_done;
 773         }
 774
 775         do {
 776                 int i, k;
 777
 778                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 779                         unsigned long tmo = expire;
 780
 781                         k = (k + 1) & rt_hash_mask;
 782                         rthp = &rt_hash_table[k].chain;
 783                         spin_lock_bh(&rt_hash_table[k].lock);
 784                         while ((rth = *rthp) != NULL) {
 785                                 if (!rt_may_expire(rth, tmo, expire)) {
 786                                         tmo >>= 1;
 787                                         rthp = &rth->u.rt_next;
 788                                         continue;
 789                                 }
 790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 791                                 /* remove all related balanced entries
 792                                  * if necessary
 793                                  */
 794                                 if (rth->u.dst.flags & DST_BALANCED) {
 795                                         int r;
 796
 797                                         rthp = rt_remove_balanced_route(
 798                                                 &rt_hash_table[i].chain,
 799                                                 rth,
 800                                                 &r);
 801                                         goal -= r;
 802                                         if (!rthp)
 803                                                 break;
 804                                 } else {
 805                                         *rthp = rth->u.rt_next;
 806                                         rt_free(rth);
 807                                         goal--;
 808                                 }
 809 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 810                                 *rthp = rth->u.rt_next;
 811                                 rt_free(rth);
 812                                 goal--;
 813 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 814                         }
 815                         spin_unlock_bh(&rt_hash_table[k].lock);
 816                         if (goal <= 0)
 817                                 break;
 818                 }
 819                 rover = k;
 820
 821                 if (goal <= 0)
 822                         goto work_done;
 823
 824                 /* Goal is not achieved. We stop process if:
 825
 826                    - if expire reduced to zero. Otherwise, expire is halfed.
 827                    - if table is not full.
 828                    - if we are called from interrupt.
 829                    - jiffies check is just fallback/debug loop breaker.
 830                      We will not spin here for long time in any case.
 831                  */
 832
 833                 RT_CACHE_STAT_INC(gc_goal_miss);
 834
 835                 if (expire == 0)
 836                         break;
 837
 838                 expire >>= 1;
 839 #if RT_CACHE_DEBUG >= 2
 840                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 841                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 842 #endif
 843
 844                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 845                         goto out;
 846         } while (!in_softirq() && time_before_eq(jiffies, now));
 847
 848         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 849                 goto out;
 850         if (net_ratelimit())
 851                 printk(KERN_WARNING "dst cache overflow\n");
 852         RT_CACHE_STAT_INC(gc_dst_overflow);
 853         return 1;
 854
 855 work_done:
 856         expire += ip_rt_gc_min_interval;
 857         if (expire > ip_rt_gc_timeout ||
 858             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 859                 expire = ip_rt_gc_timeout;
 860 #if RT_CACHE_DEBUG >= 2
 861         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 862                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 863 #endif
 864 out:    return 0;
 865 }
 866
 867 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 868 {
 869         struct rtable   *rth, **rthp;
 870         unsigned long   now;
 871         struct rtable *cand, **candp;
 872         u32             min_score;
 873         int             chain_length;
 874         int attempts = !in_softirq();
 875
 876 restart:
 877         chain_length = 0;
 878         min_score = ~(u32)0;
 879         cand = NULL;
 880         candp = NULL;
 881         now = jiffies;
 882
 883         rthp = &rt_hash_table[hash].chain;
 884
 885         spin_lock_bh(&rt_hash_table[hash].lock);
 886         while ((rth = *rthp) != NULL) {
 887 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 888                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 889                     compare_keys(&rth->fl, &rt->fl)) {
 890 #else
 891                 if (compare_keys(&rth->fl, &rt->fl)) {
 892 #endif
 893                         /* Put it first */
 894                         *rthp = rth->u.rt_next;
 895                         /*
 896                          * Since lookup is lockfree, the deletion
 897                          * must be visible to another weakly ordered CPU before
 898                          * the insertion at the start of the hash chain.
 899                          */
 900                         rcu_assign_pointer(rth->u.rt_next,
 901                                            rt_hash_table[hash].chain);
 902                         /*
 903                          * Since lookup is lockfree, the update writes
 904                          * must be ordered for consistency on SMP.
 905                          */
 906                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 907
 908                         rth->u.dst.__use++;
 909                         dst_hold(&rth->u.dst);
 910                         rth->u.dst.lastuse = now;
 911                         spin_unlock_bh(&rt_hash_table[hash].lock);
 912
 913                         rt_drop(rt);
 914                         *rp = rth;
 915                         return 0;
 916                 }
 917
 918                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 919                         u32 score = rt_score(rth);
 920
 921                         if (score <= min_score) {
 922                                 cand = rth;
 923                                 candp = rthp;
 924                                 min_score = score;
 925                         }
 926                 }
 927
 928                 chain_length++;
 929
 930                 rthp = &rth->u.rt_next;
 931         }
 932
 933         if (cand) {
 934                 /* ip_rt_gc_elasticity used to be average length of chain
 935                  * length, when exceeded gc becomes really aggressive.
 936                  *
 937                  * The second limit is less certain. At the moment it allows
 938                  * only 2 entries per bucket. We will see.
 939                  */
 940                 if (chain_length > ip_rt_gc_elasticity) {
 941                         *candp = cand->u.rt_next;
 942                         rt_free(cand);
 943                 }
 944         }
 945
 946         /* Try to bind route to arp only if it is output
 947            route or unicast forwarding path.
 948          */
 949         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 950                 int err = arp_bind_neighbour(&rt->u.dst);
 951                 if (err) {
 952                         spin_unlock_bh(&rt_hash_table[hash].lock);
 953
 954                         if (err != -ENOBUFS) {
 955                                 rt_drop(rt);
 956                                 return err;
 957                         }
 958
 959                         /* Neighbour tables are full and nothing
 960                            can be released. Try to shrink route cache,
 961                            it is most likely it holds some neighbour records.
 962                          */
 963                         if (attempts-- > 0) {
 964                                 int saved_elasticity = ip_rt_gc_elasticity;
 965                                 int saved_int = ip_rt_gc_min_interval;
 966                                 ip_rt_gc_elasticity     = 1;
 967                                 ip_rt_gc_min_interval   = 0;
 968                                 rt_garbage_collect();
 969                                 ip_rt_gc_min_interval   = saved_int;
 970                                 ip_rt_gc_elasticity     = saved_elasticity;
 971                                 goto restart;
 972                         }
 973
 974                         if (net_ratelimit())
 975                                 printk(KERN_WARNING "Neighbour table overflow.\n");
 976                         rt_drop(rt);
 977                         return -ENOBUFS;
 978                 }
 979         }
 980
 981         rt->u.rt_next = rt_hash_table[hash].chain;
 982 #if RT_CACHE_DEBUG >= 2
 983         if (rt->u.rt_next) {
 984                 struct rtable *trt;
 985                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 986                        NIPQUAD(rt->rt_dst));
 987                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 988                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 989                 printk("\n");
 990         }
 991 #endif
 992         rt_hash_table[hash].chain = rt;
 993         spin_unlock_bh(&rt_hash_table[hash].lock);
 994         *rp = rt;
 995         return 0;
 996 }
 997
 998 void rt_bind_peer(struct rtable *rt, int create)
 999 {
1000         static DEFINE_SPINLOCK(rt_peer_lock);
1001         struct inet_peer *peer;
1002
1003         peer = inet_getpeer(rt->rt_dst, create);
1004
1005         spin_lock_bh(&rt_peer_lock);
1006         if (rt->peer == NULL) {
1007                 rt->peer = peer;
1008                 peer = NULL;
1009         }
1010         spin_unlock_bh(&rt_peer_lock);
1011         if (peer)
1012                 inet_putpeer(peer);
1013 }
1014
1015 /*
1016  * Peer allocation may fail only in serious out-of-memory conditions.  However
1017  * we can still generate some output.
1018  * Random ID selection looks a bit dangerous because we have no chance of
1019  * selecting an ID that is unique within a reasonable period of time.
1020  * But a broken packet identifier may be better than no packet at all.
1021  */
1022 static void ip_select_fb_ident(struct iphdr *iph)
1023 {
1024         static DEFINE_SPINLOCK(ip_fb_id_lock);
1025         static u32 ip_fallback_id;
1026         u32 salt;
1027
1028         spin_lock_bh(&ip_fb_id_lock);
1029         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030         iph->id = htons(salt & 0xFFFF);
1031         ip_fallback_id = salt;
1032         spin_unlock_bh(&ip_fb_id_lock);
1033 }
1034
1035 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1036 {
1037         struct rtable *rt = (struct rtable *) dst;
1038
1039         if (rt) {
1040                 if (rt->peer == NULL)
1041                         rt_bind_peer(rt, 1);
1042
1043                 /* If peer is attached to destination, it is never detached,
1044                    so that we need not to grab a lock to dereference it.
1045                  */
1046                 if (rt->peer) {
1047                         iph->id = htons(inet_getid(rt->peer, more));
1048                         return;
1049                 }
1050         } else
1051                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1052                        __builtin_return_address(0));
1053
1054         ip_select_fb_ident(iph);
1055 }
1056
1057 static void rt_del(unsigned hash, struct rtable *rt)
1058 {
1059         struct rtable **rthp;
1060
1061         spin_lock_bh(&rt_hash_table[hash].lock);
1062         ip_rt_put(rt);
1063         for (rthp = &rt_hash_table[hash].chain; *rthp;
1064              rthp = &(*rthp)->u.rt_next)
1065                 if (*rthp == rt) {
1066                         *rthp = rt->u.rt_next;
1067                         rt_free(rt);
1068                         break;
1069                 }
1070         spin_unlock_bh(&rt_hash_table[hash].lock);
1071 }
1072
     /*
      * Handle a redirect received from @old_gw on @dev that advises @new_gw
      * as the gateway for @daddr (presumably driven by an ICMP redirect; the
      * caller is outside this view -- confirm).
      *
      * The redirect is rejected when the gateway does not change, when the
      * device does not accept redirects, when @new_gw is a multicast,
      * bad-class or zero-net address, or when it fails the on-link /
      * unicast-type checks below.
      *
      * Otherwise, for every combination of source key {saddr, 0} and
      * interface key {dev->ifindex, 0}, the cached output route (fl.iif == 0)
      * that currently goes through @old_gw on @dev is cloned, the clone gets
      * rt_gateway = @new_gw and RTCF_REDIRECTED, and it replaces the old entry
      * in the route cache.
      */
1073 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1074                     u32 saddr, u8 tos, struct net_device *dev)
1075 {
1076         int i, k;
1077         struct in_device *in_dev = in_dev_get(dev);
1078         struct rtable *rth, **rthp;
1079         u32  skeys[2] = { saddr, 0 };
1080         int  ikeys[2] = { dev->ifindex, 0 };
1081
1082         tos &= IPTOS_RT_MASK;
1083
1084         if (!in_dev)
1085                 return;
1086
1087         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1088             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1089                 goto reject_redirect;
1090
1091         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1092                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1093                         goto reject_redirect;
1094                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1095                         goto reject_redirect;
1096         } else {
1097                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1098                         goto reject_redirect;
1099         }
1100
1101         for (i = 0; i < 2; i++) {
1102                 for (k = 0; k < 2; k++) {
1103                         unsigned hash = rt_hash_code(daddr,
1104                                                      skeys[i] ^ (ikeys[k] << 5),
1105                                                      tos);
1106
1107                         rthp=&rt_hash_table[hash].chain;
1108
1109                         rcu_read_lock();
1110                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1111                                 struct rtable *rt;
1112
                                     /* Skip entries whose flow key does not match. */
1113                                 if (rth->fl.fl4_dst != daddr ||
1114                                     rth->fl.fl4_src != skeys[i] ||
1115                                     rth->fl.fl4_tos != tos ||
1116                                     rth->fl.oif != ikeys[k] ||
1117                                     rth->fl.iif != 0) {
1118                                         rthp = &rth->u.rt_next;
1119                                         continue;
1120                                 }
1121
                                     /*
                                      * The key matches but the entry is not the
                                      * route the redirect talks about: stop
                                      * searching this bucket.
                                      */
1122                                 if (rth->rt_dst != daddr ||
1123                                     rth->rt_src != saddr ||
1124                                     rth->u.dst.error ||
1125                                     rth->rt_gateway != old_gw ||
1126                                     rth->u.dst.dev != dev)
1127                                         break;
1128
                                     /*
                                      * Take a reference on rth before leaving
                                      * the RCU read-side section; it is used
                                      * below outside of it.
                                      */
1129                                 dst_hold(&rth->u.dst);
1130                                 rcu_read_unlock();
1131
1132                                 rt = dst_alloc(&ipv4_dst_ops);
1133                                 if (rt == NULL) {
1134                                         ip_rt_put(rth);
1135                                         in_dev_put(in_dev);
1136                                         return;
1137                                 }
1138
1139                                 /* Copy all the information. */
1140                                 *rt = *rth;
                                     /*
                                      * The struct copy duplicated rth's state;
                                      * reset the per-entry fields and take the
                                      * clone's own device/idev references.
                                      */
1141                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1142                                 rt->u.dst.__use         = 1;
1143                                 atomic_set(&rt->u.dst.__refcnt, 1);
1144                                 rt->u.dst.child         = NULL;
1145                                 if (rt->u.dst.dev)
1146                                         dev_hold(rt->u.dst.dev);
1147                                 if (rt->idev)
1148                                         in_dev_hold(rt->idev);
1149                                 rt->u.dst.obsolete      = 0;
1150                                 rt->u.dst.lastuse       = jiffies;
1151                                 rt->u.dst.path          = &rt->u.dst;
1152                                 rt->u.dst.neighbour     = NULL;
1153                                 rt->u.dst.hh            = NULL;
1154                                 rt->u.dst.xfrm          = NULL;
1155
1156                                 rt->rt_flags            |= RTCF_REDIRECTED;
1157
1158                                 /* Gateway is different ... */
1159                                 rt->rt_gateway          = new_gw;
1160
1161                                 /* Redirect received -> path was valid */
1162                                 dst_confirm(&rth->u.dst);
1163
                                     /* The copied peer pointer needs its own reference. */
1164                                 if (rt->peer)
1165                                         atomic_inc(&rt->peer->refcnt);
1166
                                     /*
                                      * Switch over only if the new gateway has
                                      * a neighbour entry in a NUD_VALID state.
                                      * Otherwise send a neighbour event (if an
                                      * entry was bound), drop the clone and
                                      * keep the old route.
                                      */
1167                                 if (arp_bind_neighbour(&rt->u.dst) ||
1168                                     !(rt->u.dst.neighbour->nud_state &
1169                                             NUD_VALID)) {
1170                                         if (rt->u.dst.neighbour)
1171                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1172                                         ip_rt_put(rth);
1173                                         rt_drop(rt);
1174                                         goto do_next;
1175                                 }
1176
                                     /*
                                      * rt_del() releases the reference taken
                                      * above and unlinks the old entry; the
                                      * clone is then inserted into the same
                                      * bucket and the local reference on the
                                      * returned route is released.
                                      */
1177                                 rt_del(hash, rth);
1178                                 if (!rt_intern_hash(hash, rt, &rt))
1179                                         ip_rt_put(rt);
1180                                 goto do_next;
1181                         }
                             /* Reached only when no entry was processed above. */
1182                         rcu_read_unlock();
1183                 do_next:
1184                         ;
1185                 }
1186         }
1187         in_dev_put(in_dev);
1188         return;
1189
1190 reject_redirect:
1191 #ifdef CONFIG_IP_ROUTE_VERBOSE
1192         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1193                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1194                         "%u.%u.%u.%u ignored.\n"
1195                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1196                         "tos %02x\n",
1197                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1198                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1199 #endif
1200         in_dev_put(in_dev);
1201 }
1202
1203 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1204 {
1205         struct rtable *rt = (struct rtable*)dst;
1206         struct dst_entry *ret = dst;
1207
1208         if (rt) {
1209                 if (dst->obsolete) {
1210                         ip_rt_put(rt);
1211                         ret = NULL;
1212                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1213                            rt->u.dst.expires) {
1214                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1215                                                      rt->fl.fl4_src ^
1216                                                         (rt->fl.oif << 5),
1217                                                      rt->fl.fl4_tos);
1218 #if RT_CACHE_DEBUG >= 1
1219                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1220                                           "%u.%u.%u.%u/%02x dropped\n",
1221                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1222 #endif
1223                         rt_del(hash, rt);
1224                         ret = NULL;
1225                 }
1226         }
1227         return ret;
1228 }
1229
1230 /*
1231  * Algorithm:
1232  *      1. The first ip_rt_redirect_number redirects are sent
1233  *         with exponential backoff, then we stop sending them at all,
1234  *         assuming that the host ignores our redirects.
1235  *      2. If we did not see packets requiring redirects
1236  *         during ip_rt_redirect_silence, we assume that the host
1237  *         forgot redirected route and start to send redirects again.
1238  *
1239  * This algorithm is much cheaper and more intelligent than dumb load limiting
1240  * in icmp.c.
1241  *
1242  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1243  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1244  */
1245
1246 void ip_rt_send_redirect(struct sk_buff *skb)
1247 {
1248         struct rtable *rt = (struct rtable*)skb->dst;
1249         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1250
1251         if (!in_dev)
1252                 return;
1253
1254         if (!IN_DEV_TX_REDIRECTS(in_dev))
1255                 goto out;
1256
1257         /* No redirected packets during ip_rt_redirect_silence;
1258          * reset the algorithm.
1259          */
1260         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1261                 rt->u.dst.rate_tokens = 0;
1262
1263         /* Too many ignored redirects; do not send anything
1264          * set u.dst.rate_last to the last seen redirected packet.
1265          */
1266         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1267                 rt->u.dst.rate_last = jiffies;
1268                 goto out;
1269         }
1270
1271         /* Check for load limit; set rate_last to the latest sent
1272          * redirect.
1273          */
1274         if (time_after(jiffies,
1275                        (rt->u.dst.rate_last +
1276                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1277                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1278                 rt->u.dst.rate_last = jiffies;
1279                 ++rt->u.dst.rate_tokens;
1280 #ifdef CONFIG_IP_ROUTE_VERBOSE
1281                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1282                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1283                     net_ratelimit())
1284                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1285                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1286                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1287                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1288 #endif
1289         }
1290 out:
1291         in_dev_put(in_dev);
1292 }
1293
1294 static int ip_error(struct sk_buff *skb)
1295 {
1296         struct rtable *rt = (struct rtable*)skb->dst;
1297         unsigned long now;
1298         int code;
1299
1300         switch (rt->u.dst.error) {
1301                 case EINVAL:
1302                 default:
1303                         goto out;
1304                 case EHOSTUNREACH:
1305                         code = ICMP_HOST_UNREACH;
1306                         break;
1307                 case ENETUNREACH:
1308                         code = ICMP_NET_UNREACH;
1309                         break;
1310                 case EACCES:
1311                         code = ICMP_PKT_FILTERED;
1312                         break;
1313         }
1314
1315         now = jiffies;
1316         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1317         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1318                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1319         rt->u.dst.rate_last = now;
1320         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1321                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1322                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1323         }
1324
1325 out:    kfree_skb(skb);
1326         return 0;
1327 }
1328
1329 /*
1330  *      The last two values are not from the RFC but
1331  *      are needed for AMPRnet AX.25 paths.
1332  */
1333
/* Plateau MTU values in descending order; never modified at run time. */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value that is strictly smaller than @old_mtu,
 * or 68 when @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
1346
     /*
      * Lower the cached path MTU for the flow described by @iph after a
      * "fragmentation needed" report carrying @new_mtu (the caller is outside
      * this view; presumably the ICMP code -- confirm).
      *
      * Cached output routes (fl.iif == 0) for iph->daddr are looked up under
      * the source keys {iph->saddr, 0}; entries whose MTU metric is locked are
      * skipped.  Returns 0 when PMTU discovery is disabled, otherwise the MTU
      * estimate that was applied, or @new_mtu if no estimate was made.
      */
1347 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1348 {
1349         int i;
         /* Total length of the packet described by @iph. */
1350         unsigned short old_mtu = ntohs(iph->tot_len);
1351         struct rtable *rth;
1352         u32  skeys[2] = { iph->saddr, 0, };
1353         u32  daddr = iph->daddr;
1354         u8   tos = iph->tos & IPTOS_RT_MASK;
1355         unsigned short est_mtu = 0;
1356
1357         if (ipv4_config.no_pmtu_disc)
1358                 return 0;
1359
1360         for (i = 0; i < 2; i++) {
1361                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1362
1363                 rcu_read_lock();
1364                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1365                      rth = rcu_dereference(rth->u.rt_next)) {
1366                         if (rth->fl.fl4_dst == daddr &&
1367                             rth->fl.fl4_src == skeys[i] &&
1368                             rth->rt_dst  == daddr &&
1369                             rth->rt_src  == iph->saddr &&
1370                             rth->fl.fl4_tos == tos &&
1371                             rth->fl.iif == 0 &&
1372                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1373                                 unsigned short mtu = new_mtu;
1374
                                     /*
                                      * The reported MTU is unusable (below 68
                                      * or not smaller than the packet itself):
                                      * guess the next lower plateau instead.
                                      */
1375                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1376
1377                                         /* BSD 4.2 compatibility hack :-( */
                                             /*
                                              * With no MTU reported at all, take
                                              * the header length off old_mtu
                                              * before guessing, provided that
                                              * still leaves at least 68.
                                              */
1378                                         if (mtu == 0 &&
1379                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1380                                             old_mtu >= 68 + (iph->ihl << 2))
1381                                                 old_mtu -= iph->ihl << 2;
1382
1383                                         mtu = guess_mtu(old_mtu);
1384                                 }
1385                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1386                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1387                                                 dst_confirm(&rth->u.dst);
                                                     /*
                                                      * Never go below
                                                      * ip_rt_min_pmtu; when
                                                      * clamping, lock the MTU
                                                      * metric as well.
                                                      */
1388                                                 if (mtu < ip_rt_min_pmtu) {
1389                                                         mtu = ip_rt_min_pmtu;
1390                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1391                                                                 (1 << RTAX_MTU);
1392                                                 }
1393                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1394                                                 dst_set_expires(&rth->u.dst,
1395                                                         ip_rt_mtu_expires);
1396                                         }
1397                                         est_mtu = mtu;
1398                                 }
1399                         }
1400                 }
1401                 rcu_read_unlock();
1402         }
         /* GNU "?:" shorthand: est_mtu if non-zero, else new_mtu. */
1403         return est_mtu ? : new_mtu;
1404 }
1405
1406 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1407 {
1408         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1409             !(dst_metric_locked(dst, RTAX_MTU))) {
1410                 if (mtu < ip_rt_min_pmtu) {
1411                         mtu = ip_rt_min_pmtu;
1412                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1413                 }
1414                 dst->metrics[RTAX_MTU-1] = mtu;
1415                 dst_set_expires(dst, ip_rt_mtu_expires);
1416         }
1417 }
1418
     /*
      * Validity check stub for IPv4 routes: both arguments are ignored and
      * NULL is always returned.
      * NOTE(review): presumably installed as the ->check hook of
      * ipv4_dst_ops, so a checked entry is never reported as still valid --
      * confirm against the ops table, which is outside this view.
      */
1419 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1420 {
1421         return NULL;
1422 }
1423
1424 static void ipv4_dst_destroy(struct dst_entry *dst)
1425 {
1426         struct rtable *rt = (struct rtable *) dst;
1427         struct inet_peer *peer = rt->peer;
1428         struct in_device *idev = rt->idev;
1429
1430         if (peer) {
1431                 rt->peer = NULL;
1432                 inet_putpeer(peer);
1433         }
1434
1435         if (idev) {
1436                 rt->idev = NULL;
1437                 in_dev_put(idev);
1438         }
1439 }
1440
1441 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1442                             int how)
1443 {
1444         struct rtable *rt = (struct rtable *) dst;
1445         struct in_device *idev = rt->idev;
1446         if (dev != &loopback_dev && idev && idev->dev == dev) {
1447                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1448                 if (loopback_idev) {
1449                         rt->idev = loopback_idev;
1450                         in_dev_put(idev);
1451                 }
1452         }
1453 }
1454
1455 static void ipv4_link_failure(struct sk_buff *skb)
1456 {
1457         struct rtable *rt;
1458
1459         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1460
1461         rt = (struct rtable *) skb->dst;
1462         if (rt)
1463                 dst_set_expires(&rt->u.dst, 0);
1464 }
1465
1466 static int ip_rt_bug(struct sk_buff *skb)
1467 {
1468         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1469                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1470                 skb->dev ? skb->dev->name : "?");
1471         kfree_skb(skb);
1472         return 0;
1473 }
1474
1475 /*
1476    We do not cache the source address of the outgoing interface,
1477    because it is used only by the IP RR, TS and SRR options,
1478    so it is off the fast path.
1479
1480    BTW remember: "addr" is allowed to be not aligned
1481    in IP options!
1482  */
1483
1484 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1485 {
1486         u32 src;
1487         struct fib_result res;
1488
1489         if (rt->fl.iif == 0)
1490                 src = rt->rt_src;
1491         else if (fib_lookup(&rt->fl, &res) == 0) {
1492                 src = FIB_RES_PREFSRC(res);
1493                 fib_res_put(&res);
1494         } else
1495                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1496                                         RT_SCOPE_UNIVERSE);
1497         memcpy(addr, &src, 4);
1498 }
1499
1500 #ifdef CONFIG_NET_CLS_ROUTE
1501 static void set_class_tag(struct rtable *rt, u32 tag)
1502 {
1503         if (!(rt->u.dst.tclassid & 0xFFFF))
1504                 rt->u.dst.tclassid |= tag & 0xFFFF;
1505         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1506                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1507 }
1508 #endif
1509
     /*
      * Fill in the next-hop dependent parts of @rt from the FIB lookup result
      * @res: gateway, metrics, (with CONFIG_NET_CLS_ROUTE) the class tag, and
      * finally rt_type.  @itag is merged into the class tag through
      * set_class_tag().
      */
1510 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1511 {
1512         struct fib_info *fi = res->fi;
1513
1514         if (fi) {
                 /* Take the FIB gateway only for a link-scope next hop. */
1515                 if (FIB_RES_GW(*res) &&
1516                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1517                         rt->rt_gateway = FIB_RES_GW(*res);
1518                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1519                        sizeof(rt->u.dst.metrics));
                 /*
                  * No MTU in the FIB entry: use the device MTU.  If the MTU
                  * metric is locked, the destination is reached through a
                  * gateway and the device MTU exceeds 576, cap it at 576.
                  */
1520                 if (fi->fib_mtu == 0) {
1521                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1522                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1523                             rt->rt_gateway != rt->rt_dst &&
1524                             rt->u.dst.dev->mtu > 576)
1525                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1526                 }
1527 #ifdef CONFIG_NET_CLS_ROUTE
1528                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1529 #endif
1530         } else
                 /* No fib_info: only the MTU is set here, from the device. */
1531                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1532
             /* Defaults and upper bounds for metrics that are unset or too big. */
1533         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1534                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1535         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1536                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
             /* NOTE(review): 40 is presumably IP + TCP header size -- confirm. */
1537         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1538                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1539                                        ip_rt_min_advmss);
1540         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1541                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1542
1543 #ifdef CONFIG_NET_CLS_ROUTE
1544 #ifdef CONFIG_IP_MULTIPLE_TABLES
1545         set_class_tag(rt, fib_rules_tclass(res));
1546 #endif
1547         set_class_tag(rt, itag);
1548 #endif
1549         rt->rt_type = res->type;
1550 }
1551
     /*
      * Build an input route for a multicast packet received on @dev and
      * insert it into the route cache; the resulting route is stored in
      * skb->dst by rt_intern_hash().
      *
      * @our non-zero marks the route RTCF_LOCAL and makes it deliver through
      * ip_local_deliver().
      *
      * Returns the result of rt_intern_hash(); -EINVAL if @dev has no
      * in_device, the source address is not acceptable or the skb is not
      * ETH_P_IP; -ENOBUFS if the route cannot be allocated.
      */
1552 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1553                                 u8 tos, struct net_device *dev, int our)
1554 {
1555         unsigned hash;
1556         struct rtable *rth;
1557         u32 spec_dst;
1558         struct in_device *in_dev = in_dev_get(dev);
1559         u32 itag = 0;
1560
1561         /* Primary sanity checks. */
1562
1563         if (in_dev == NULL)
1564                 return -EINVAL;
1565
1566         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1567             skb->protocol != htons(ETH_P_IP))
1568                 goto e_inval;
1569
             /*
              * A zero-net source is accepted only for LOCAL_MCAST
              * destinations; any other source must pass
              * fib_validate_source().
              */
1570         if (ZERONET(saddr)) {
1571                 if (!LOCAL_MCAST(daddr))
1572                         goto e_inval;
1573                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1574         } else if (fib_validate_source(saddr, 0, tos, 0,
1575                                         dev, &spec_dst, &itag) < 0)
1576                 goto e_inval;
1577
1578         rth = dst_alloc(&ipv4_dst_ops);
1579         if (!rth)
1580                 goto e_nobufs;
1581
             /* Output through this route just logs and frees (ip_rt_bug). */
1582         rth->u.dst.output= ip_rt_bug;
1583
1584         atomic_set(&rth->u.dst.__refcnt, 1);
1585         rth->u.dst.flags= DST_HOST;
1586         if (in_dev->cnf.no_policy)
1587                 rth->u.dst.flags |= DST_NOPOLICY;
1588         rth->fl.fl4_dst = daddr;
1589         rth->rt_dst     = daddr;
1590         rth->fl.fl4_tos = tos;
1591 #ifdef CONFIG_IP_ROUTE_FWMARK
1592         rth->fl.fl4_fwmark= skb->nfmark;
1593 #endif
1594         rth->fl.fl4_src = saddr;
1595         rth->rt_src     = saddr;
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597         rth->u.dst.tclassid = itag;
1598 #endif
             /*
              * The receiving interface is recorded in rt_iif / fl.iif, while
              * the route's device is the loopback device.
              */
1599         rth->rt_iif     =
1600         rth->fl.iif     = dev->ifindex;
1601         rth->u.dst.dev  = &loopback_dev;
1602         dev_hold(rth->u.dst.dev);
1603         rth->idev       = in_dev_get(rth->u.dst.dev);
1604         rth->fl.oif     = 0;
1605         rth->rt_gateway = daddr;
1606         rth->rt_spec_dst= spec_dst;
1607         rth->rt_type    = RTN_MULTICAST;
1608         rth->rt_flags   = RTCF_MULTICAST;
1609         if (our) {
1610                 rth->u.dst.input= ip_local_deliver;
1611                 rth->rt_flags |= RTCF_LOCAL;
1612         }
1613
1614 #ifdef CONFIG_IP_MROUTE
             /* This assignment replaces the ip_local_deliver input set above. */
1615         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1616                 rth->u.dst.input = ip_mr_input;
1617 #endif
1618         RT_CACHE_STAT_INC(in_slow_mc);
1619
1620         in_dev_put(in_dev);
1621         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1622         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1623
1624 e_nobufs:
1625         in_dev_put(in_dev);
1626         return -ENOBUFS;
1627
1628 e_inval:
1629         in_dev_put(in_dev);
1630         return -EINVAL;
1631 }
1632
1633
1634 static void ip_handle_martian_source(struct net_device *dev,
1635                                      struct in_device *in_dev,
1636                                      struct sk_buff *skb,
1637                                      u32 daddr,
1638                                      u32 saddr)
1639 {
1640         RT_CACHE_STAT_INC(in_martian_src);
1641 #ifdef CONFIG_IP_ROUTE_VERBOSE
1642         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1643                 /*
1644                  *      RFC1812 recommendation, if source is martian,
1645                  *      the only hint is MAC header.
1646                  */
1647                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648                         "%u.%u.%u.%u, on dev %s\n",
1649                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650                 if (dev->hard_header_len) {
1651                         int i;
1652                         unsigned char *p = skb->mac.raw;
1653                         printk(KERN_WARNING "ll header: ");
1654                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1655                                 printk("%02x", *p);
1656                                 if (i < (dev->hard_header_len - 1))
1657                                         printk(":");
1658                         }
1659                         printk("\n");
1660                 }
1661         }
1662 #endif
1663 }
1664
1665 static inline int __mkroute_input(struct sk_buff *skb,
1666                                   struct fib_result* res,
1667                                   struct in_device *in_dev,
1668                                   u32 daddr, u32 saddr, u32 tos,
1669                                   struct rtable **result)
1670 {
1671
1672         struct rtable *rth;
1673         int err;
1674         struct in_device *out_dev;
1675         unsigned flags = 0;
1676         u32 spec_dst, itag;
1677
1678         /* get a working reference to the output device */
1679         out_dev = in_dev_get(FIB_RES_DEV(*res));
1680         if (out_dev == NULL) {
1681                 if (net_ratelimit())
1682                         printk(KERN_CRIT "Bug in ip_route_input" \
1683                                "_slow(). Please, report\n");
1684                 return -EINVAL;
1685         }
1686
1687
1688         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1689                                   in_dev->dev, &spec_dst, &itag);
1690         if (err < 0) {
1691                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1692                                          saddr);
1693
1694                 err = -EINVAL;
1695                 goto cleanup;
1696         }
1697
1698         if (err)
1699                 flags |= RTCF_DIRECTSRC;
1700
1701         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1702             (IN_DEV_SHARED_MEDIA(out_dev) ||
1703              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1704                 flags |= RTCF_DOREDIRECT;
1705
1706         if (skb->protocol != htons(ETH_P_IP)) {
1707                 /* Not IP (i.e. ARP). Do not create route, if it is
1708                  * invalid for proxy arp. DNAT routes are always valid.
1709                  */
1710                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1711                         err = -EINVAL;
1712                         goto cleanup;
1713                 }
1714         }
1715
1716
1717         rth = dst_alloc(&ipv4_dst_ops);
1718         if (!rth) {
1719                 err = -ENOBUFS;
1720                 goto cleanup;
1721         }
1722
1723         rth->u.dst.flags= DST_HOST;
1724 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1725         if (res->fi->fib_nhs > 1)
1726                 rth->u.dst.flags |= DST_BALANCED;
1727 #endif
1728         if (in_dev->cnf.no_policy)
1729                 rth->u.dst.flags |= DST_NOPOLICY;
1730         if (in_dev->cnf.no_xfrm)
1731                 rth->u.dst.flags |= DST_NOXFRM;
1732         rth->fl.fl4_dst = daddr;
1733         rth->rt_dst     = daddr;
1734         rth->fl.fl4_tos = tos;
1735 #ifdef CONFIG_IP_ROUTE_FWMARK
1736         rth->fl.fl4_fwmark= skb->nfmark;
1737 #endif
1738         rth->fl.fl4_src = saddr;
1739         rth->rt_src     = saddr;
1740         rth->rt_gateway = daddr;
1741         rth->rt_iif     =
1742                 rth->fl.iif     = in_dev->dev->ifindex;
1743         rth->u.dst.dev  = (out_dev)->dev;
1744         dev_hold(rth->u.dst.dev);
1745         rth->idev       = in_dev_get(rth->u.dst.dev);
1746         rth->fl.oif     = 0;
1747         rth->rt_spec_dst= spec_dst;
1748
1749         rth->u.dst.input = ip_forward;
1750         rth->u.dst.output = ip_output;
1751
1752         rt_set_nexthop(rth, res, itag);
1753
1754         rth->rt_flags = flags;
1755
1756         *result = rth;
1757         err = 0;
1758  cleanup:
1759         /* release the working reference to the output device */
1760         in_dev_put(out_dev);
1761         return err;
1762 }
1763
1764 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1765                                        struct fib_result* res,
1766                                        const struct flowi *fl,
1767                                        struct in_device *in_dev,
1768                                        u32 daddr, u32 saddr, u32 tos)
1769 {
1770         struct rtable* rth = NULL;
1771         int err;
1772         unsigned hash;
1773
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1775         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1776                 fib_select_multipath(fl, res);
1777 #endif
1778
1779         /* create a routing cache entry */
1780         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1781         if (err)
1782                 return err;
1783         atomic_set(&rth->u.dst.__refcnt, 1);
1784
1785         /* put it into the cache */
1786         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1787         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1788 }
1789
1790 static inline int ip_mkroute_input(struct sk_buff *skb,
1791                                    struct fib_result* res,
1792                                    const struct flowi *fl,
1793                                    struct in_device *in_dev,
1794                                    u32 daddr, u32 saddr, u32 tos)
1795 {
1796 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797         struct rtable* rth = NULL;
1798         unsigned char hop, hopcount, lasthop;
1799         int err = -EINVAL;
1800         unsigned int hash;
1801
1802         if (res->fi)
1803                 hopcount = res->fi->fib_nhs;
1804         else
1805                 hopcount = 1;
1806
1807         lasthop = hopcount - 1;
1808
1809         /* distinguish between multipath and singlepath */
1810         if (hopcount < 2)
1811                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1812                                             saddr, tos);
1813
1814         /* add all alternatives to the routing cache */
1815         for (hop = 0; hop < hopcount; hop++) {
1816                 res->nh_sel = hop;
1817
1818                 /* create a routing cache entry */
1819                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1820                                       &rth);
1821                 if (err)
1822                         return err;
1823
1824                 /* put it into the cache */
1825                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1826                 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1827                 if (err)
1828                         return err;
1829
1830                 /* forward hop information to multipath impl. */
1831                 multipath_set_nhinfo(rth,
1832                                      FIB_RES_NETWORK(*res),
1833                                      FIB_RES_NETMASK(*res),
1834                                      res->prefixlen,
1835                                      &FIB_RES_NH(*res));
1836
1837                 /* only for the last hop the reference count is handled
1838                  * outside
1839                  */
1840                 if (hop == lasthop)
1841                         atomic_set(&(skb->dst->__refcnt), 1);
1842         }
1843         return err;
1844 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1845         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1846 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1847 }
1848
1849
1850 /*
1851  *      NOTE. We drop all the packets that has local source
1852  *      addresses, because every properly looped back packet
1853  *      must have correct destination already attached by output routine.
1854  *
1855  *      Such approach solves two big problems:
1856  *      1. Not simplex devices are handled properly.
1857  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1858  */
1859
1860 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1861                                u8 tos, struct net_device *dev)
1862 {
1863         struct fib_result res;
1864         struct in_device *in_dev = in_dev_get(dev);
1865         struct flowi fl = { .nl_u = { .ip4_u =
1866                                       { .daddr = daddr,
1867                                         .saddr = saddr,
1868                                         .tos = tos,
1869                                         .scope = RT_SCOPE_UNIVERSE,
1870 #ifdef CONFIG_IP_ROUTE_FWMARK
1871                                         .fwmark = skb->nfmark
1872 #endif
1873                                       } },
1874                             .iif = dev->ifindex };
1875         unsigned        flags = 0;
1876         u32             itag = 0;
1877         struct rtable * rth;
1878         unsigned        hash;
1879         u32             spec_dst;
1880         int             err = -EINVAL;
1881         int             free_res = 0;
1882
1883         /* IP on this device is disabled. */
1884
1885         if (!in_dev)
1886                 goto out;
1887
1888         /* Check for the most weird martians, which can be not detected
1889            by fib_lookup.
1890          */
1891
1892         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1893                 goto martian_source;
1894
1895         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1896                 goto brd_input;
1897
1898         /* Accept zero addresses only to limited broadcast;
1899          * I even do not know to fix it or not. Waiting for complains :-)
1900          */
1901         if (ZERONET(saddr))
1902                 goto martian_source;
1903
1904         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1905                 goto martian_destination;
1906
1907         /*
1908          *      Now we are ready to route packet.
1909          */
1910         if ((err = fib_lookup(&fl, &res)) != 0) {
1911                 if (!IN_DEV_FORWARD(in_dev))
1912                         goto e_hostunreach;
1913                 goto no_route;
1914         }
1915         free_res = 1;
1916
1917         RT_CACHE_STAT_INC(in_slow_tot);
1918
1919         if (res.type == RTN_BROADCAST)
1920                 goto brd_input;
1921
1922         if (res.type == RTN_LOCAL) {
1923                 int result;
1924                 result = fib_validate_source(saddr, daddr, tos,
1925                                              loopback_dev.ifindex,
1926                                              dev, &spec_dst, &itag);
1927                 if (result < 0)
1928                         goto martian_source;
1929                 if (result)
1930                         flags |= RTCF_DIRECTSRC;
1931                 spec_dst = daddr;
1932                 goto local_input;
1933         }
1934
1935         if (!IN_DEV_FORWARD(in_dev))
1936                 goto e_hostunreach;
1937         if (res.type != RTN_UNICAST)
1938                 goto martian_destination;
1939
1940         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1941         if (err == -ENOBUFS)
1942                 goto e_nobufs;
1943         if (err == -EINVAL)
1944                 goto e_inval;
1945
1946 done:
1947         in_dev_put(in_dev);
1948         if (free_res)
1949                 fib_res_put(&res);
1950 out:    return err;
1951
1952 brd_input:
1953         if (skb->protocol != htons(ETH_P_IP))
1954                 goto e_inval;
1955
1956         if (ZERONET(saddr))
1957                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958         else {
1959                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1960                                           &itag);
1961                 if (err < 0)
1962                         goto martian_source;
1963                 if (err)
1964                         flags |= RTCF_DIRECTSRC;
1965         }
1966         flags |= RTCF_BROADCAST;
1967         res.type = RTN_BROADCAST;
1968         RT_CACHE_STAT_INC(in_brd);
1969
1970 local_input:
1971         rth = dst_alloc(&ipv4_dst_ops);
1972         if (!rth)
1973                 goto e_nobufs;
1974
1975         rth->u.dst.output= ip_rt_bug;
1976
1977         atomic_set(&rth->u.dst.__refcnt, 1);
1978         rth->u.dst.flags= DST_HOST;
1979         if (in_dev->cnf.no_policy)
1980                 rth->u.dst.flags |= DST_NOPOLICY;
1981         rth->fl.fl4_dst = daddr;
1982         rth->rt_dst     = daddr;
1983         rth->fl.fl4_tos = tos;
1984 #ifdef CONFIG_IP_ROUTE_FWMARK
1985         rth->fl.fl4_fwmark= skb->nfmark;
1986 #endif
1987         rth->fl.fl4_src = saddr;
1988         rth->rt_src     = saddr;
1989 #ifdef CONFIG_NET_CLS_ROUTE
1990         rth->u.dst.tclassid = itag;
1991 #endif
1992         rth->rt_iif     =
1993         rth->fl.iif     = dev->ifindex;
1994         rth->u.dst.dev  = &loopback_dev;
1995         dev_hold(rth->u.dst.dev);
1996         rth->idev       = in_dev_get(rth->u.dst.dev);
1997         rth->rt_gateway = daddr;
1998         rth->rt_spec_dst= spec_dst;
1999         rth->u.dst.input= ip_local_deliver;
2000         rth->rt_flags   = flags|RTCF_LOCAL;
2001         if (res.type == RTN_UNREACHABLE) {
2002                 rth->u.dst.input= ip_error;
2003                 rth->u.dst.error= -err;
2004                 rth->rt_flags   &= ~RTCF_LOCAL;
2005         }
2006         rth->rt_type    = res.type;
2007         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2008         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2009         goto done;
2010
2011 no_route:
2012         RT_CACHE_STAT_INC(in_no_route);
2013         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2014         res.type = RTN_UNREACHABLE;
2015         goto local_input;
2016
2017         /*
2018          *      Do not cache martian addresses: they should be logged (RFC1812)
2019          */
2020 martian_destination:
2021         RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025                         "%u.%u.%u.%u, dev %s\n",
2026                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027 #endif
2028
2029 e_hostunreach:
2030         err = -EHOSTUNREACH;
2031         goto done;
2032
2033 e_inval:
2034         err = -EINVAL;
2035         goto done;
2036
2037 e_nobufs:
2038         err = -ENOBUFS;
2039         goto done;
2040
2041 martian_source:
2042         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043         goto e_inval;
2044 }
2045
2046 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2047                    u8 tos, struct net_device *dev)
2048 {
2049         struct rtable * rth;
2050         unsigned        hash;
2051         int iif = dev->ifindex;
2052
2053         tos &= IPTOS_RT_MASK;
2054         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2055
2056         rcu_read_lock();
2057         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2058              rth = rcu_dereference(rth->u.rt_next)) {
2059                 if (rth->fl.fl4_dst == daddr &&
2060                     rth->fl.fl4_src == saddr &&
2061                     rth->fl.iif == iif &&
2062                     rth->fl.oif == 0 &&
2063 #ifdef CONFIG_IP_ROUTE_FWMARK
2064                     rth->fl.fl4_fwmark == skb->nfmark &&
2065 #endif
2066                     rth->fl.fl4_tos == tos) {
2067                         rth->u.dst.lastuse = jiffies;
2068                         dst_hold(&rth->u.dst);
2069                         rth->u.dst.__use++;
2070                         RT_CACHE_STAT_INC(in_hit);
2071                         rcu_read_unlock();
2072                         skb->dst = (struct dst_entry*)rth;
2073                         return 0;
2074                 }
2075                 RT_CACHE_STAT_INC(in_hlist_search);
2076         }
2077         rcu_read_unlock();
2078
2079         /* Multicast recognition logic is moved from route cache to here.
2080            The problem was that too many Ethernet cards have broken/missing
2081            hardware multicast filters :-( As result the host on multicasting
2082            network acquires a lot of useless route cache entries, sort of
2083            SDR messages from all the world. Now we try to get rid of them.
2084            Really, provided software IP multicast filter is organized
2085            reasonably (at least, hashed), it does not result in a slowdown
2086            comparing with route cache reject entries.
2087            Note, that multicast routers are not affected, because
2088            route cache entry is created eventually.
2089          */
2090         if (MULTICAST(daddr)) {
2091                 struct in_device *in_dev;
2092
2093                 rcu_read_lock();
2094                 if ((in_dev = __in_dev_get(dev)) != NULL) {
2095                         int our = ip_check_mc(in_dev, daddr, saddr,
2096                                 skb->nh.iph->protocol);
2097                         if (our
2098 #ifdef CONFIG_IP_MROUTE
2099                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2100 #endif
2101                             ) {
2102                                 rcu_read_unlock();
2103                                 return ip_route_input_mc(skb, daddr, saddr,
2104                                                          tos, dev, our);
2105                         }
2106                 }
2107                 rcu_read_unlock();
2108                 return -EINVAL;
2109         }
2110         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2111 }
2112
2113 static inline int __mkroute_output(struct rtable **result,
2114                                    struct fib_result* res,
2115                                    const struct flowi *fl,
2116                                    const struct flowi *oldflp,
2117                                    struct net_device *dev_out,
2118                                    unsigned flags)
2119 {
2120         struct rtable *rth;
2121         struct in_device *in_dev;
2122         u32 tos = RT_FL_TOS(oldflp);
2123         int err = 0;
2124
2125         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2126                 return -EINVAL;
2127
2128         if (fl->fl4_dst == 0xFFFFFFFF)
2129                 res->type = RTN_BROADCAST;
2130         else if (MULTICAST(fl->fl4_dst))
2131                 res->type = RTN_MULTICAST;
2132         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2133                 return -EINVAL;
2134
2135         if (dev_out->flags & IFF_LOOPBACK)
2136                 flags |= RTCF_LOCAL;
2137
2138         /* get work reference to inet device */
2139         in_dev = in_dev_get(dev_out);
2140         if (!in_dev)
2141                 return -EINVAL;
2142
2143         if (res->type == RTN_BROADCAST) {
2144                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2145                 if (res->fi) {
2146                         fib_info_put(res->fi);
2147                         res->fi = NULL;
2148                 }
2149         } else if (res->type == RTN_MULTICAST) {
2150                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2151                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2152                                  oldflp->proto))
2153                         flags &= ~RTCF_LOCAL;
2154                 /* If multicast route do not exist use
2155                    default one, but do not gateway in this case.
2156                    Yes, it is hack.
2157                  */
2158                 if (res->fi && res->prefixlen < 4) {
2159                         fib_info_put(res->fi);
2160                         res->fi = NULL;
2161                 }
2162         }
2163
2164
2165         rth = dst_alloc(&ipv4_dst_ops);
2166         if (!rth) {
2167                 err = -ENOBUFS;
2168                 goto cleanup;
2169         }
2170
2171         rth->u.dst.flags= DST_HOST;
2172 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2173         if (res->fi) {
2174                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2175                 if (res->fi->fib_nhs > 1)
2176                         rth->u.dst.flags |= DST_BALANCED;
2177         }
2178 #endif
2179         if (in_dev->cnf.no_xfrm)
2180                 rth->u.dst.flags |= DST_NOXFRM;
2181         if (in_dev->cnf.no_policy)
2182                 rth->u.dst.flags |= DST_NOPOLICY;
2183
2184         rth->fl.fl4_dst = oldflp->fl4_dst;
2185         rth->fl.fl4_tos = tos;
2186         rth->fl.fl4_src = oldflp->fl4_src;
2187         rth->fl.oif     = oldflp->oif;
2188 #ifdef CONFIG_IP_ROUTE_FWMARK
2189         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2190 #endif
2191         rth->rt_dst     = fl->fl4_dst;
2192         rth->rt_src     = fl->fl4_src;
2193         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2194         /* get references to the devices that are to be hold by the routing
2195            cache entry */
2196         rth->u.dst.dev  = dev_out;
2197         dev_hold(dev_out);
2198         rth->idev       = in_dev_get(dev_out);
2199         rth->rt_gateway = fl->fl4_dst;
2200         rth->rt_spec_dst= fl->fl4_src;
2201
2202         rth->u.dst.output=ip_output;
2203
2204         RT_CACHE_STAT_INC(out_slow_tot);
2205
2206         if (flags & RTCF_LOCAL) {
2207                 rth->u.dst.input = ip_local_deliver;
2208                 rth->rt_spec_dst = fl->fl4_dst;
2209         }
2210         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2211                 rth->rt_spec_dst = fl->fl4_src;
2212                 if (flags & RTCF_LOCAL &&
2213                     !(dev_out->flags & IFF_LOOPBACK)) {
2214                         rth->u.dst.output = ip_mc_output;
2215                         RT_CACHE_STAT_INC(out_slow_mc);
2216                 }
2217 #ifdef CONFIG_IP_MROUTE
2218                 if (res->type == RTN_MULTICAST) {
2219                         if (IN_DEV_MFORWARD(in_dev) &&
2220                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2221                                 rth->u.dst.input = ip_mr_input;
2222                                 rth->u.dst.output = ip_mc_output;
2223                         }
2224                 }
2225 #endif
2226         }
2227
2228         rt_set_nexthop(rth, res, 0);
2229
2230         rth->rt_flags = flags;
2231
2232         *result = rth;
2233  cleanup:
2234         /* release work reference to inet device */
2235         in_dev_put(in_dev);
2236
2237         return err;
2238 }
2239
2240 static inline int ip_mkroute_output_def(struct rtable **rp,
2241                                         struct fib_result* res,
2242                                         const struct flowi *fl,
2243                                         const struct flowi *oldflp,
2244                                         struct net_device *dev_out,
2245                                         unsigned flags)
2246 {
2247         struct rtable *rth = NULL;
2248         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2249         unsigned hash;
2250         if (err == 0) {
2251                 u32 tos = RT_FL_TOS(oldflp);
2252
2253                 atomic_set(&rth->u.dst.__refcnt, 1);
2254
2255                 hash = rt_hash_code(oldflp->fl4_dst,
2256                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2257                 err = rt_intern_hash(hash, rth, rp);
2258         }
2259
2260         return err;
2261 }
2262
/*
 * Create the output route cache entry (or entries) for a FIB result.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the result has at most
 * one next hop, this is ip_mkroute_output_def().  Otherwise one cache entry
 * is created and hashed per next hop, each on that hop's own device, and
 * each successfully hashed entry is reported to the multipath
 * implementation through multipath_set_nhinfo().
 *
 * @rp:      result slot handed to rt_intern_hash()
 * @res:     FIB lookup result; res->nh_sel is rewritten while iterating
 * @fl:      resolved flow
 * @oldflp:  flow as originally requested; used for the cache key
 * @dev_out: output device (used by the single-path case only)
 * @flags:   initial RTCF_* flags
 *
 * Returns 0 on success or the first error reported by __mkroute_output()
 * or rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not register an entry the cache refused to
			 * take: rth must not be used after a failed
			 * rt_intern_hash().  This mirrors the error
			 * handling in ip_mkroute_input().
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		/* The reference count of the returned entry is set here. */
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2323
2324 /*
2325  * Major route resolver routine.
2326  */
2327
2328 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2329 {
2330         u32 tos = RT_FL_TOS(oldflp);
2331         struct flowi fl = { .nl_u = { .ip4_u =
2332                                       { .daddr = oldflp->fl4_dst,
2333                                         .saddr = oldflp->fl4_src,
2334                                         .tos = tos & IPTOS_RT_MASK,
2335                                         .scope = ((tos & RTO_ONLINK) ?
2336                                                   RT_SCOPE_LINK :
2337                                                   RT_SCOPE_UNIVERSE),
2338 #ifdef CONFIG_IP_ROUTE_FWMARK
2339                                         .fwmark = oldflp->fl4_fwmark
2340 #endif
2341                                       } },
2342                             .iif = loopback_dev.ifindex,
2343                             .oif = oldflp->oif };
2344         struct fib_result res;
2345         unsigned flags = 0;
2346         struct net_device *dev_out = NULL;
2347         int free_res = 0;
2348         int err;
2349
2350
2351         res.fi          = NULL;
2352 #ifdef CONFIG_IP_MULTIPLE_TABLES
2353         res.r           = NULL;
2354 #endif
2355
2356         if (oldflp->fl4_src) {
2357                 err = -EINVAL;
2358                 if (MULTICAST(oldflp->fl4_src) ||
2359                     BADCLASS(oldflp->fl4_src) ||
2360                     ZERONET(oldflp->fl4_src))
2361                         goto out;
2362
2363                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2364                 dev_out = ip_dev_find(oldflp->fl4_src);
2365                 if (dev_out == NULL)
2366                         goto out;
2367
2368                 /* I removed check for oif == dev_out->oif here.
2369                    It was wrong for two reasons:
2370                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2371                       assigned to multiple interfaces.
2372                    2. Moreover, we are allowed to send packets with saddr
2373                       of another iface. --ANK
2374                  */
2375
2376                 if (oldflp->oif == 0
2377                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2378                         /* Special hack: user can direct multicasts
2379                            and limited broadcast via necessary interface
2380                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2381                            This hack is not just for fun, it allows
2382                            vic,vat and friends to work.
2383                            They bind socket to loopback, set ttl to zero
2384                            and expect that it will work.
2385                            From the viewpoint of routing cache they are broken,
2386                            because we are not allowed to build multicast path
2387                            with loopback source addr (look, routing cache
2388                            cannot know, that ttl is zero, so that packet
2389                            will not leave this host and route is valid).
2390                            Luckily, this hack is good workaround.
2391                          */
2392
2393                         fl.oif = dev_out->ifindex;
2394                         goto make_route;
2395                 }
2396                 if (dev_out)
2397                         dev_put(dev_out);
2398                 dev_out = NULL;
2399         }
2400
2401
2402         if (oldflp->oif) {
2403                 dev_out = dev_get_by_index(oldflp->oif);
2404                 err = -ENODEV;
2405                 if (dev_out == NULL)
2406                         goto out;
2407                 if (__in_dev_get(dev_out) == NULL) {
2408                         dev_put(dev_out);
2409                         goto out;       /* Wrong error code */
2410                 }
2411
2412                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2413                         if (!fl.fl4_src)
2414                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2415                                                               RT_SCOPE_LINK);
2416                         goto make_route;
2417                 }
2418                 if (!fl.fl4_src) {
2419                         if (MULTICAST(oldflp->fl4_dst))
2420                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2421                                                               fl.fl4_scope);
2422                         else if (!oldflp->fl4_dst)
2423                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2424                                                               RT_SCOPE_HOST);
2425                 }
2426         }
2427
2428         if (!fl.fl4_dst) {
2429                 fl.fl4_dst = fl.fl4_src;
2430                 if (!fl.fl4_dst)
2431                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2432                 if (dev_out)
2433                         dev_put(dev_out);
2434                 dev_out = &loopback_dev;
2435                 dev_hold(dev_out);
2436                 fl.oif = loopback_dev.ifindex;
2437                 res.type = RTN_LOCAL;
2438                 flags |= RTCF_LOCAL;
2439                 goto make_route;
2440         }
2441
2442         if (fib_lookup(&fl, &res)) {
2443                 res.fi = NULL;
2444                 if (oldflp->oif) {
2445                         /* Apparently, routing tables are wrong. Assume,
2446                            that the destination is on link.
2447
2448                            WHY? DW.
2449                            Because we are allowed to send to iface
2450                            even if it has NO routes and NO assigned
2451                            addresses. When oif is specified, routing
2452                            tables are looked up with only one purpose:
2453                            to catch if destination is gatewayed, rather than
2454                            direct. Moreover, if MSG_DONTROUTE is set,
2455                            we send packet, ignoring both routing tables
2456                            and ifaddr state. --ANK
2457
2458
2459                            We could make it even if oif is unknown,
2460                            likely IPv6, but we do not.
2461                          */
2462
2463                         if (fl.fl4_src == 0)
2464                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2465                                                               RT_SCOPE_LINK);
2466                         res.type = RTN_UNICAST;
2467                         goto make_route;
2468                 }
2469                 if (dev_out)
2470                         dev_put(dev_out);
2471                 err = -ENETUNREACH;
2472                 goto out;
2473         }
2474         free_res = 1;
2475
2476         if (res.type == RTN_LOCAL) {
2477                 if (!fl.fl4_src)
2478                         fl.fl4_src = fl.fl4_dst;
2479                 if (dev_out)
2480                         dev_put(dev_out);
2481                 dev_out = &loopback_dev;
2482                 dev_hold(dev_out);
2483                 fl.oif = dev_out->ifindex;
2484                 if (res.fi)
2485                         fib_info_put(res.fi);
2486                 res.fi = NULL;
2487                 flags |= RTCF_LOCAL;
2488                 goto make_route;
2489         }
2490
2491 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2492         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2493                 fib_select_multipath(&fl, &res);
2494         else
2495 #endif
2496         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2497                 fib_select_default(&fl, &res);
2498
2499         if (!fl.fl4_src)
2500                 fl.fl4_src = FIB_RES_PREFSRC(res);
2501
2502         if (dev_out)
2503                 dev_put(dev_out);
2504         dev_out = FIB_RES_DEV(res);
2505         dev_hold(dev_out);
2506         fl.oif = dev_out->ifindex;
2507
2508
2509 make_route:
2510         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2511
2512
2513         if (free_res)
2514                 fib_res_put(&res);
2515         if (dev_out)
2516                 dev_put(dev_out);
2517 out:    return err;
2518 }
2519
2520 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2521 {
2522         unsigned hash;
2523         struct rtable *rth;
2524
2525         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2526
2527         rcu_read_lock_bh();
2528         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2529                 rth = rcu_dereference(rth->u.rt_next)) {
2530                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2531                     rth->fl.fl4_src == flp->fl4_src &&
2532                     rth->fl.iif == 0 &&
2533                     rth->fl.oif == flp->oif &&
2534 #ifdef CONFIG_IP_ROUTE_FWMARK
2535                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2536 #endif
2537                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2538                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2539
2540                         /* check for multipath routes and choose one if
2541                          * necessary
2542                          */
2543                         if (multipath_select_route(flp, rth, rp)) {
2544                                 dst_hold(&(*rp)->u.dst);
2545                                 RT_CACHE_STAT_INC(out_hit);
2546                                 rcu_read_unlock_bh();
2547                                 return 0;
2548                         }
2549
2550                         rth->u.dst.lastuse = jiffies;
2551                         dst_hold(&rth->u.dst);
2552                         rth->u.dst.__use++;
2553                         RT_CACHE_STAT_INC(out_hit);
2554                         rcu_read_unlock_bh();
2555                         *rp = rth;
2556                         return 0;
2557                 }
2558                 RT_CACHE_STAT_INC(out_hlist_search);
2559         }
2560         rcu_read_unlock_bh();
2561
2562         return ip_route_output_slow(rp, flp);
2563 }
2564
2565 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2566 {
2567         int err;
2568
2569         if ((err = __ip_route_output_key(rp, flp)) != 0)
2570                 return err;
2571
2572         if (flp->proto) {
2573                 if (!flp->fl4_src)
2574                         flp->fl4_src = (*rp)->rt_src;
2575                 if (!flp->fl4_dst)
2576                         flp->fl4_dst = (*rp)->rt_dst;
2577                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2578         }
2579
2580         return 0;
2581 }
2582
/*
 * Convenience wrapper: ip_route_output_flow() with a NULL socket and
 * flags == 0.  Returns whatever ip_route_output_flow() returns.
 */
2583 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2584 {
2585         return ip_route_output_flow(rp, flp, NULL, 0);
2586 }
2587
/*
 * Append an rtnetlink route message describing the cached route attached
 * to @skb (skb->dst) to @skb.
 *
 * @pid, @seq, @event and @flags are passed to NLMSG_NEW for the message
 * header; @nowait is forwarded to ipmr_get_route() and selects how its
 * errors are handled.
 *
 * Returns skb->len once the message is complete, -1 after trimming the
 * skb back to its original tail on failure, and 0 when ipmr_get_route()
 * returned 0 with !nowait.
 */
2588 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2589                         int nowait, unsigned int flags)
2590 {
2591         struct rtable *rt = (struct rtable*)skb->dst;
2592         struct rtmsg *r;
2593         struct nlmsghdr  *nlh;
             /* Remember the tail so a partial message can be trimmed off again. */
2594         unsigned char    *b = skb->tail;
2595         struct rta_cacheinfo ci;
2596 #ifdef CONFIG_IP_MROUTE
2597         struct rtattr *eptr;
2598 #endif
             /*
              * NOTE(review): NLMSG_NEW and RTA_PUT presumably jump to
              * nlmsg_failure / rtattr_failure when the skb has no room;
              * confirm against the macro definitions.
              */
2599         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2600         r = NLMSG_DATA(nlh);
2601         r->rtm_family    = AF_INET;
2602         r->rtm_dst_len  = 32;
2603         r->rtm_src_len  = 0;
2604         r->rtm_tos      = rt->fl.fl4_tos;
2605         r->rtm_table    = RT_TABLE_MAIN;
2606         r->rtm_type     = rt->rt_type;
2607         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2608         r->rtm_protocol = RTPROT_UNSPEC;
             /* Keep only the upper bits of rt_flags and mark the entry as cloned. */
2609         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2610         if (rt->rt_flags & RTCF_NOTIFY)
2611                 r->rtm_flags |= RTM_F_NOTIFY;
2612         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2613         if (rt->fl.fl4_src) {
2614                 r->rtm_src_len = 32;
2615                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2616         }
2617         if (rt->u.dst.dev)
2618                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2619 #ifdef CONFIG_NET_CLS_ROUTE
2620         if (rt->u.dst.tclassid)
2621                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2622 #endif
2623 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2624         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2625                 __u32 alg = rt->rt_multipath_alg;
2626
2627                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2628         }
2629 #endif
             /*
              * Preferred source: rt_spec_dst for entries with an input
              * interface, otherwise rt_src when it differs from the key.
              */
2630         if (rt->fl.iif)
2631                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2632         else if (rt->rt_src != rt->fl.fl4_src)
2633                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2634         if (rt->rt_dst != rt->rt_gateway)
2635                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2636         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2637                 goto rtattr_failure;
             /* Cache bookkeeping; jiffies-based values are converted to clock_t. */
2638         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2639         ci.rta_used     = rt->u.dst.__use;
2640         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2641         if (rt->u.dst.expires)
2642                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2643         else
2644                 ci.rta_expires = 0;
2645         ci.rta_error    = rt->u.dst.error;
2646         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2647         if (rt->peer) {
2648                 ci.rta_id = rt->peer->ip_id_count;
2649                 if (rt->peer->tcp_ts_stamp) {
2650                         ci.rta_ts = rt->peer->tcp_ts;
2651                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2652                 }
2653         }
2654 #ifdef CONFIG_IP_MROUTE
             /* Position of the cacheinfo attribute, so rta_error can be patched below. */
2655         eptr = (struct rtattr*)skb->tail;
2656 #endif
2657         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2658         if (rt->fl.iif) {
2659 #ifdef CONFIG_IP_MROUTE
2660                 u32 dst = rt->rt_dst;
2661
2662                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2663                     ipv4_devconf.mc_forwarding) {
2664                         int err = ipmr_get_route(skb, r, nowait);
2665                         if (err <= 0) {
2666                                 if (!nowait) {
2667                                         if (err == 0)
2668                                                 return 0;
2669                                         goto nlmsg_failure;
2670                                 } else {
2671                                         if (err == -EMSGSIZE)
2672                                                 goto nlmsg_failure;
                                             /* Report the error through the cacheinfo attribute already written. */
2673                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2674                                 }
2675                         }
2676                 } else
2677 #endif
2678                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2679         }
2680
2681         nlh->nlmsg_len = skb->tail - b;
2682         return skb->len;
2683
2684 nlmsg_failure:
2685 rtattr_failure:
             /* Drop everything appended since entry. */
2686         skb_trim(skb, b - skb->data);
2687         return -1;
2688 }
2689
/*
 * rtnetlink handler for a route query.
 *
 * Builds a scratch skb, resolves the route described by the RTA_SRC,
 * RTA_DST, RTA_IIF and RTA_OIF attributes (via ip_route_input() when
 * RTA_IIF is non-zero, via ip_route_output_key() otherwise), fills the
 * reply with rt_fill_info() and unicasts it to the requesting pid.
 *
 * @arg is used as the array of parsed route attributes.
 *
 * Returns 0 on success, otherwise an error code (e.g. -ENOBUFS, -ENODEV,
 * -EMSGSIZE or the routing error).  The skb is freed on every path that
 * does not hand it to netlink_unicast().
 */
2690 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2691 {
2692         struct rtattr **rta = arg;
2693         struct rtmsg *rtm = NLMSG_DATA(nlh);
2694         struct rtable *rt = NULL;
2695         u32 dst = 0;
2696         u32 src = 0;
2697         int iif = 0;
2698         int err = -ENOBUFS;
2699         struct sk_buff *skb;
2700
2701         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2702         if (!skb)
2703                 goto out;
2704
2705         /* Reserve room for dummy headers, this skb can pass
2706            through good chunk of routing engine.
2707          */
2708         skb->mac.raw = skb->data;
2709         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
             /* Attributes that are absent leave src/dst/iif at zero. */
2711         if (rta[RTA_SRC - 1])
2712                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2713         if (rta[RTA_DST - 1])
2714                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2715         if (rta[RTA_IIF - 1])
2716                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2717
2718         if (iif) {
                     /* Input route: run the dummy skb through ip_route_input(). */
2719                 struct net_device *dev = __dev_get_by_index(iif);
2720                 err = -ENODEV;
2721                 if (!dev)
2722                         goto out_free;
2723                 skb->protocol   = htons(ETH_P_IP);
2724                 skb->dev        = dev;
2725                 local_bh_disable();
2726                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2727                 local_bh_enable();
2728                 rt = (struct rtable*)skb->dst;
                     /* A route that carries an error is reported as that error. */
2729                 if (!err && rt->u.dst.error)
2730                         err = -rt->u.dst.error;
2731         } else {
                     /* Output route: look up by dst/src/tos and the optional RTA_OIF. */
2732                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2733                                                          .saddr = src,
2734                                                          .tos = rtm->rtm_tos } } };
2735                 int oif = 0;
2736                 if (rta[RTA_OIF - 1])
2737                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2738                 fl.oif = oif;
2739                 err = ip_route_output_key(&rt, &fl);
2740         }
2741         if (err)
2742                 goto out_free;
2743
             /* rt_fill_info() reads the route from skb->dst. */
2744         skb->dst = &rt->u.dst;
2745         if (rtm->rtm_flags & RTM_F_NOTIFY)
2746                 rt->rt_flags |= RTCF_NOTIFY;
2747
2748         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2749
2750         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2751                                 RTM_NEWROUTE, 0, 0);
             /* 0: nothing to send; negative: the message could not be built. */
2752         if (!err)
2753                 goto out_free;
2754         if (err < 0) {
2755                 err = -EMSGSIZE;
2756                 goto out_free;
2757         }
2758
2759         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
             /* A positive return from netlink_unicast() is mapped to 0. */
2760         if (err > 0)
2761                 err = 0;
2762 out:    return err;
2763
2764 out_free:
2765         kfree_skb(skb);
2766         goto out;
2767 }
2768
2769 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2770 {
2771         struct rtable *rt;
2772         int h, s_h;
2773         int idx, s_idx;
2774
2775         s_h = cb->args[0];
2776         s_idx = idx = cb->args[1];
2777         for (h = 0; h <= rt_hash_mask; h++) {
2778                 if (h < s_h) continue;
2779                 if (h > s_h)
2780                         s_idx = 0;
2781                 rcu_read_lock_bh();
2782                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2783                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2784                         if (idx < s_idx)
2785                                 continue;
2786                         skb->dst = dst_clone(&rt->u.dst);
2787                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2788                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2789                                          1, NLM_F_MULTI) <= 0) {
2790                                 dst_release(xchg(&skb->dst, NULL));
2791                                 rcu_read_unlock_bh();
2792                                 goto done;
2793                         }
2794                         dst_release(xchg(&skb->dst, NULL));
2795                 }
2796                 rcu_read_unlock_bh();
2797         }
2798
2799 done:
2800         cb->args[0] = h;
2801         cb->args[1] = idx;
2802         return skb->len;
2803 }
2804
/*
 * Multicast event hook: calls rt_cache_flush() with a delay of 0.
 * @in_dev is not used.
 */
2805 void ip_rt_multicast_event(struct in_device *in_dev)
2806 {
2807         rt_cache_flush(0);
2808 }
2809
2810 #ifdef CONFIG_SYSCTL
2811 static int flush_delay;
2812
2813 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2814                                         struct file *filp, void __user *buffer,
2815                                         size_t *lenp, loff_t *ppos)
2816 {
2817         if (write) {
2818                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2819                 rt_cache_flush(flush_delay);
2820                 return 0;
2821         }
2822
2823         return -EINVAL;
2824 }
2825
2826 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2827                                                 int __user *name,
2828                                                 int nlen,
2829                                                 void __user *oldval,
2830                                                 size_t __user *oldlenp,
2831                                                 void __user *newval,
2832                                                 size_t newlen,
2833                                                 void **context)
2834 {
2835         int delay;
2836         if (newlen != sizeof(int))
2837                 return -EINVAL;
2838         if (get_user(delay, (int __user *)newval))
2839                 return -EFAULT;
2840         rt_cache_flush(delay);
2841         return 0;
2842 }
2843
/*
 * sysctl table for the IPv4 routing cache tunables.  Each entry binds a
 * procname to one of this file's variables; entries using the *_jiffies
 * handlers go through proc_dointvec_jiffies/sysctl_jiffies, the rest
 * through plain proc_dointvec.
 */
2844 ctl_table ipv4_route_table[] = {
2845         {
                     /* Write-only (mode 0200); handled by the rtcache flush routines. */
2846                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2847                 .procname       = "flush",
2848                 .data           = &flush_delay,
2849                 .maxlen         = sizeof(int),
2850                 .mode           = 0200,
2851                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2852                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2853         },
2854         {
2855                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2856                 .procname       = "min_delay",
2857                 .data           = &ip_rt_min_delay,
2858                 .maxlen         = sizeof(int),
2859                 .mode           = 0644,
2860                 .proc_handler   = &proc_dointvec_jiffies,
2861                 .strategy       = &sysctl_jiffies,
2862         },
2863         {
2864                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2865                 .procname       = "max_delay",
2866                 .data           = &ip_rt_max_delay,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0644,
2869                 .proc_handler   = &proc_dointvec_jiffies,
2870                 .strategy       = &sysctl_jiffies,
2871         },
2872         {
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2874                 .procname       = "gc_thresh",
2875                 .data           = &ipv4_dst_ops.gc_thresh,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec,
2879         },
2880         {
2881                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2882                 .procname       = "max_size",
2883                 .data           = &ip_rt_max_size,
2884                 .maxlen         = sizeof(int),
2885                 .mode           = 0644,
2886                 .proc_handler   = &proc_dointvec,
2887         },
2888         {
2889                 /*  Deprecated. Use gc_min_interval_ms */
2890
2891                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2892                 .procname       = "gc_min_interval",
2893                 .data           = &ip_rt_gc_min_interval,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec_jiffies,
2897                 .strategy       = &sysctl_jiffies,
2898         },
2899         {
                     /*
                      * Same variable as "gc_min_interval", accessed through
                      * the ms_jiffies handlers instead.
                      */
2900                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2901                 .procname       = "gc_min_interval_ms",
2902                 .data           = &ip_rt_gc_min_interval,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_ms_jiffies,
2906                 .strategy       = &sysctl_ms_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2910                 .procname       = "gc_timeout",
2911                 .data           = &ip_rt_gc_timeout,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec_jiffies,
2915                 .strategy       = &sysctl_jiffies,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2919                 .procname       = "gc_interval",
2920                 .data           = &ip_rt_gc_interval,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec_jiffies,
2924                 .strategy       = &sysctl_jiffies,
2925         },
2926         {
2927                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2928                 .procname       = "redirect_load",
2929                 .data           = &ip_rt_redirect_load,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = &proc_dointvec,
2933         },
2934         {
2935                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2936                 .procname       = "redirect_number",
2937                 .data           = &ip_rt_redirect_number,
2938                 .maxlen         = sizeof(int),
2939                 .mode           = 0644,
2940                 .proc_handler   = &proc_dointvec,
2941         },
2942         {
2943                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2944                 .procname       = "redirect_silence",
2945                 .data           = &ip_rt_redirect_silence,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = &proc_dointvec,
2949         },
2950         {
2951                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2952                 .procname       = "error_cost",
2953                 .data           = &ip_rt_error_cost,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = &proc_dointvec,
2957         },
2958         {
2959                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2960                 .procname       = "error_burst",
2961                 .data           = &ip_rt_error_burst,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = &proc_dointvec,
2965         },
2966         {
2967                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2968                 .procname       = "gc_elasticity",
2969                 .data           = &ip_rt_gc_elasticity,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = &proc_dointvec,
2973         },
2974         {
2975                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2976                 .procname       = "mtu_expires",
2977                 .data           = &ip_rt_mtu_expires,
2978                 .maxlen         = sizeof(int),
2979                 .mode           = 0644,
2980                 .proc_handler   = &proc_dointvec_jiffies,
2981                 .strategy       = &sysctl_jiffies,
2982         },
2983         {
2984                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2985                 .procname       = "min_pmtu",
2986                 .data           = &ip_rt_min_pmtu,
2987                 .maxlen         = sizeof(int),
2988                 .mode           = 0644,
2989                 .proc_handler   = &proc_dointvec,
2990         },
2991         {
2992                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2993                 .procname       = "min_adv_mss",
2994                 .data           = &ip_rt_min_advmss,
2995                 .maxlen         = sizeof(int),
2996                 .mode           = 0644,
2997                 .proc_handler   = &proc_dointvec,
2998         },
2999         {
3000                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3001                 .procname       = "secret_interval",
3002                 .data           = &ip_rt_secret_interval,
3003                 .maxlen         = sizeof(int),
3004                 .mode           = 0644,
3005                 .proc_handler   = &proc_dointvec_jiffies,
3006                 .strategy       = &sysctl_jiffies,
3007         },
             /* Last entry has ctl_name 0 (presumably the table terminator). */
3008         { .ctl_name = 0 }
3009 };
3010 #endif
3011
3012 #ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each cpu owns a run of 256 struct ip_rt_acct entries inside the
 * ip_rt_acct array.  The argument is parenthesised so that an expression
 * such as IP_RT_ACCT_CPU(a + b) selects cpu (a + b) rather than
 * expanding to ip_rt_acct + a + b * 256.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3019
3020 #ifdef CONFIG_PROC_FS
3021 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3022                            int length, int *eof, void *data)
3023 {
3024         unsigned int i;
3025
3026         if ((offset & 3) || (length & 3))
3027                 return -EIO;
3028
3029         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3030                 *eof = 1;
3031                 return 0;
3032         }
3033
3034         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3035                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3036                 *eof = 1;
3037         }
3038
3039         offset /= sizeof(u32);
3040
3041         if (length > 0) {
3042                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3043                 u32 *dst = (u32 *) buffer;
3044
3045                 /* Copy first cpu. */
3046                 *start = buffer;
3047                 memcpy(dst, src, length);
3048
3049                 /* Add the other cpus in, one int at a time */
3050                 for_each_cpu(i) {
3051                         unsigned int j;
3052
3053                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3054
3055                         for (j = 0; j < length/4; j++)
3056                                 dst[j] += src[j];
3057                 }
3058         }
3059         return length;
3060 }
3061 #endif /* CONFIG_PROC_FS */
3062 #endif /* CONFIG_NET_CLS_ROUTE */
3063
3064 static __initdata unsigned long rhash_entries;
3065 static int __init set_rhash_entries(char *str)
3066 {
3067         if (!str)
3068                 return 0;
3069         rhash_entries = simple_strtoul(str, &str, 0);
3070         return 1;
3071 }
3072 __setup("rhash_entries=", set_rhash_entries);
3073
3074 int __init ip_rt_init(void)
3075 {
3076         int i, order, goal, rc = 0;
3077
3078         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3079                              (jiffies ^ (jiffies >> 7)));
3080
3081 #ifdef CONFIG_NET_CLS_ROUTE
3082         for (order = 0;
3083              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3084                 /* NOTHING */;
3085         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3086         if (!ip_rt_acct)
3087                 panic("IP: failed to allocate ip_rt_acct\n");
3088         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3089 #endif
3090
3091         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3092                                                      sizeof(struct rtable),
3093                                                      0, SLAB_HWCACHE_ALIGN,
3094                                                      NULL, NULL);
3095
3096         if (!ipv4_dst_ops.kmem_cachep)
3097                 panic("IP: failed to allocate ip_dst_cache\n");
3098
3099         goal = num_physpages >> (26 - PAGE_SHIFT);
3100         if (rhash_entries)
3101                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3102         for (order = 0; (1UL << order) < goal; order++)
3103                 /* NOTHING */;
3104
3105         do {
3106                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3107                         sizeof(struct rt_hash_bucket);
3108                 while (rt_hash_mask & (rt_hash_mask - 1))
3109                         rt_hash_mask--;
3110                 rt_hash_table = (struct rt_hash_bucket *)
3111                         __get_free_pages(GFP_ATOMIC, order);
3112         } while (rt_hash_table == NULL && --order > 0);
3113
3114         if (!rt_hash_table)
3115                 panic("Failed to allocate IP route cache hash table\n");
3116
3117         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3118                rt_hash_mask,
3119                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3120
3121         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3122                 /* NOTHING */;
3123
3124         rt_hash_mask--;
3125         for (i = 0; i <= rt_hash_mask; i++) {
3126                 spin_lock_init(&rt_hash_table[i].lock);
3127                 rt_hash_table[i].chain = NULL;
3128         }
3129
3130         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3131         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3132
3133         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3134         if (!rt_cache_stat)
3135                 return -ENOMEM;
3136
3137         devinet_init();
3138         ip_fib_init();
3139
3140         init_timer(&rt_flush_timer);
3141         rt_flush_timer.function = rt_run_flush;
3142         init_timer(&rt_periodic_timer);
3143         rt_periodic_timer.function = rt_check_expire;
3144         init_timer(&rt_secret_timer);
3145         rt_secret_timer.function = rt_secret_rebuild;
3146
3147         /* All the timers, started at system startup tend
3148            to synchronize. Perturb it a bit.
3149          */
3150         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3151                                         ip_rt_gc_interval;
3152         add_timer(&rt_periodic_timer);
3153
3154         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3155                 ip_rt_secret_interval;
3156         add_timer(&rt_secret_timer);
3157
3158 #ifdef CONFIG_PROC_FS
3159         {
3160         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3161         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3162             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3163                                              proc_net_stat))) {
3164                 free_percpu(rt_cache_stat);
3165                 return -ENOMEM;
3166         }
3167         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3168         }
3169 #ifdef CONFIG_NET_CLS_ROUTE
3170         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3171 #endif
3172 #endif
3173 #ifdef CONFIG_XFRM
3174         xfrm_init();
3175         xfrm4_init();
3176 #endif
3177         return rc;
3178 }
3179
3180 EXPORT_SYMBOL(__ip_select_ident);
3181 EXPORT_SYMBOL(ip_route_input);
3182 EXPORT_SYMBOL(ip_route_output_key);