Pull move-iosapic-to-acpi into release branch
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K 
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *              
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109
/*
 * RT_FL_TOS - mask the TOS field of a flow key.
 * @oldflp: pointer expression yielding a flow key with an fl4_tos member
 *
 * Evaluates to fl4_tos masked with (IPTOS_RT_MASK | RTO_ONLINK), as a u32.
 * The argument is parenthesised so that arbitrary pointer expressions
 * (for example "flp + 1" or "&fl") expand with the intended precedence.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112
113 #define IP_MAX_MTU      0xFFF0
114
115 #define RT_GC_TIMEOUT (300*HZ)
116
117 static int ip_rt_min_delay              = 2 * HZ;
118 static int ip_rt_max_delay              = 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval            = 60 * HZ;
122 static int ip_rt_gc_min_interval        = HZ / 2;
123 static int ip_rt_redirect_number        = 9;
124 static int ip_rt_redirect_load          = HZ / 50;
125 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost             = HZ;
127 static int ip_rt_error_burst            = 5 * HZ;
128 static int ip_rt_gc_elasticity          = 8;
129 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu               = 512 + 20 + 20;
131 static int ip_rt_min_advmss             = 256;
132 static int ip_rt_secret_interval        = 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .entry_size =           sizeof(struct rtable),
166 };
167
168 #define ECN_OR_COST(class)      TC_PRIO_##class
169
170 __u8 ip_tos2prio[16] = {
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(FILLER),
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(BESTEFFORT),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK)
187 };
188
189
190 /*
191  * Route cache.
192  */
193
194 /* The locking scheme is rather straight forward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203
/* One bucket of the route cache hash: head of a singly linked rtable chain. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock guarding bucket @slot.  The expansion is fully
 * parenthesised so it can be used inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if the allocation fails.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
# define rt_hash_lock_init()	do { \
		int lock_idx; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (lock_idx = 0; lock_idx < RT_HASH_LOCK_SZ; lock_idx++) \
			spin_lock_init(&rt_hash_locks[lock_idx]); \
	} while (0)
#else
/* No lock table is built in this configuration. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
237
/* Bucket array of the route cache hash. */
static struct rt_hash_bucket	*rt_hash_table;
/* Mask applied to hash values; also the highest valid bucket index. */
static unsigned			rt_hash_mask;
/* Shift count used to scale GC goals by the table size. */
static int			rt_hash_log;
/* Seed mixed into rt_hash_code(); refreshed by rt_run_flush(). */
static unsigned int		rt_hash_rnd;

/* Per-CPU route cache statistics. */
static struct rt_cache_stat *rt_cache_stat;

/* Bump counter @field in the statistics slot of the executing CPU. */
#define RT_CACHE_STAT_INC(field)					  \
		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);
249
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251 {
252         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253                 & rt_hash_mask);
254 }
255
256 #ifdef CONFIG_PROC_FS
/* Cursor of the route cache seq_file walk, stored in seq->private. */
struct rt_cache_iter_state {
	int bucket;	/* hash bucket currently being walked */
};
260
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
262 {
263         struct rtable *r = NULL;
264         struct rt_cache_iter_state *st = seq->private;
265
266         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267                 rcu_read_lock_bh();
268                 r = rt_hash_table[st->bucket].chain;
269                 if (r)
270                         break;
271                 rcu_read_unlock_bh();
272         }
273         return r;
274 }
275
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277 {
278         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279
280         r = r->u.rt_next;
281         while (!r) {
282                 rcu_read_unlock_bh();
283                 if (--st->bucket < 0)
284                         break;
285                 rcu_read_lock_bh();
286                 r = rt_hash_table[st->bucket].chain;
287         }
288         return r;
289 }
290
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292 {
293         struct rtable *r = rt_cache_get_first(seq);
294
295         if (r)
296                 while (pos && (r = rt_cache_get_next(seq, r)))
297                         --pos;
298         return pos ? NULL : r;
299 }
300
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302 {
303         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 }
305
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307 {
308         struct rtable *r = NULL;
309
310         if (v == SEQ_START_TOKEN)
311                 r = rt_cache_get_first(seq);
312         else
313                 r = rt_cache_get_next(seq, v);
314         ++*pos;
315         return r;
316 }
317
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319 {
320         if (v && v != SEQ_START_TOKEN)
321                 rcu_read_unlock_bh();
322 }
323
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
325 {
326         if (v == SEQ_START_TOKEN)
327                 seq_printf(seq, "%-127s\n",
328                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330                            "HHUptod\tSpecDst");
331         else {
332                 struct rtable *r = v;
333                 char temp[256];
334
335                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337                         r->u.dst.dev ? r->u.dst.dev->name : "*",
338                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
341                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343                         dst_metric(&r->u.dst, RTAX_WINDOW),
344                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
346                         r->fl.fl4_tos,
347                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349                                        dev_queue_xmit) : 0,
350                         r->rt_spec_dst);
351                 seq_printf(seq, "%-127s\n", temp);
352         }
353         return 0;
354 }
355
356 static struct seq_operations rt_cache_seq_ops = {
357         .start  = rt_cache_seq_start,
358         .next   = rt_cache_seq_next,
359         .stop   = rt_cache_seq_stop,
360         .show   = rt_cache_seq_show,
361 };
362
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
364 {
365         struct seq_file *seq;
366         int rc = -ENOMEM;
367         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368
369         if (!s)
370                 goto out;
371         rc = seq_open(file, &rt_cache_seq_ops);
372         if (rc)
373                 goto out_kfree;
374         seq          = file->private_data;
375         seq->private = s;
376         memset(s, 0, sizeof(*s));
377 out:
378         return rc;
379 out_kfree:
380         kfree(s);
381         goto out;
382 }
383
384 static struct file_operations rt_cache_seq_fops = {
385         .owner   = THIS_MODULE,
386         .open    = rt_cache_seq_open,
387         .read    = seq_read,
388         .llseek  = seq_lseek,
389         .release = seq_release_private,
390 };
391
392
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394 {
395         int cpu;
396
397         if (*pos == 0)
398                 return SEQ_START_TOKEN;
399
400         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401                 if (!cpu_possible(cpu))
402                         continue;
403                 *pos = cpu+1;
404                 return per_cpu_ptr(rt_cache_stat, cpu);
405         }
406         return NULL;
407 }
408
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411         int cpu;
412
413         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return per_cpu_ptr(rt_cache_stat, cpu);
418         }
419         return NULL;
420         
421 }
422
/* seq_file ->stop for the per-CPU statistics: the walk holds nothing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* Intentionally empty. */
}
427
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429 {
430         struct rt_cache_stat *st = v;
431
432         if (v == SEQ_START_TOKEN) {
433                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
434                 return 0;
435         }
436         
437         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
438                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439                    atomic_read(&ipv4_dst_ops.entries),
440                    st->in_hit,
441                    st->in_slow_tot,
442                    st->in_slow_mc,
443                    st->in_no_route,
444                    st->in_brd,
445                    st->in_martian_dst,
446                    st->in_martian_src,
447
448                    st->out_hit,
449                    st->out_slow_tot,
450                    st->out_slow_mc, 
451
452                    st->gc_total,
453                    st->gc_ignored,
454                    st->gc_goal_miss,
455                    st->gc_dst_overflow,
456                    st->in_hlist_search,
457                    st->out_hlist_search
458                 );
459         return 0;
460 }
461
462 static struct seq_operations rt_cpu_seq_ops = {
463         .start  = rt_cpu_seq_start,
464         .next   = rt_cpu_seq_next,
465         .stop   = rt_cpu_seq_stop,
466         .show   = rt_cpu_seq_show,
467 };
468
469
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471 {
472         return seq_open(file, &rt_cpu_seq_ops);
473 }
474
475 static struct file_operations rt_cpu_seq_fops = {
476         .owner   = THIS_MODULE,
477         .open    = rt_cpu_seq_open,
478         .read    = seq_read,
479         .llseek  = seq_lseek,
480         .release = seq_release,
481 };
482
483 #endif /* CONFIG_PROC_FS */
484   
485 static __inline__ void rt_free(struct rtable *rt)
486 {
487         multipath_remove(rt);
488         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 }
490
491 static __inline__ void rt_drop(struct rtable *rt)
492 {
493         multipath_remove(rt);
494         ip_rt_put(rt);
495         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 }
497
498 static __inline__ int rt_fast_clean(struct rtable *rth)
499 {
500         /* Kill broadcast/multicast entries very aggresively, if they
501            collide in hash table with more useful entries */
502         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503                 rth->fl.iif && rth->u.rt_next;
504 }
505
506 static __inline__ int rt_valuable(struct rtable *rth)
507 {
508         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509                 rth->u.dst.expires;
510 }
511
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 {
514         unsigned long age;
515         int ret = 0;
516
517         if (atomic_read(&rth->u.dst.__refcnt))
518                 goto out;
519
520         ret = 1;
521         if (rth->u.dst.expires &&
522             time_after_eq(jiffies, rth->u.dst.expires))
523                 goto out;
524
525         age = jiffies - rth->u.dst.lastuse;
526         ret = 0;
527         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528             (age <= tmo2 && rt_valuable(rth)))
529                 goto out;
530         ret = 1;
531 out:    return ret;
532 }
533
534 /* Bits of score are:
535  * 31: very valuable
536  * 30: not quite useless
537  * 29..0: usage counter
538  */
539 static inline u32 rt_score(struct rtable *rt)
540 {
541         u32 score = jiffies - rt->u.dst.lastuse;
542
543         score = ~score & ~(3<<30);
544
545         if (rt_valuable(rt))
546                 score |= (1<<31);
547
548         if (!rt->fl.iif ||
549             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550                 score |= (1<<30);
551
552         return score;
553 }
554
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 {
557         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558                fl1->oif     == fl2->oif &&
559                fl1->iif     == fl2->iif;
560 }
561
562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564                                                 struct rtable *expentry,
565                                                 int *removed_count)
566 {
567         int passedexpired = 0;
568         struct rtable **nextstep = NULL;
569         struct rtable **rthp = chain_head;
570         struct rtable *rth;
571
572         if (removed_count)
573                 *removed_count = 0;
574
575         while ((rth = *rthp) != NULL) {
576                 if (rth == expentry)
577                         passedexpired = 1;
578
579                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
580                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
581                         if (*rthp == expentry) {
582                                 *rthp = rth->u.rt_next;
583                                 continue;
584                         } else {
585                                 *rthp = rth->u.rt_next;
586                                 rt_free(rth);
587                                 if (removed_count)
588                                         ++(*removed_count);
589                         }
590                 } else {
591                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592                             passedexpired && !nextstep)
593                                 nextstep = &rth->u.rt_next;
594
595                         rthp = &rth->u.rt_next;
596                 }
597         }
598
599         rt_free(expentry);
600         if (removed_count)
601                 ++(*removed_count);
602
603         return nextstep;
604 }
605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606
607
608 /* This runs via a timer and thus is always in BH context. */
609 static void rt_check_expire(unsigned long dummy)
610 {
611         static unsigned int rover;
612         unsigned int i = rover, goal;
613         struct rtable *rth, **rthp;
614         unsigned long now = jiffies;
615         u64 mult;
616
617         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618         if (ip_rt_gc_timeout > 1)
619                 do_div(mult, ip_rt_gc_timeout);
620         goal = (unsigned int)mult;
621         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622         for (; goal > 0; goal--) {
623                 unsigned long tmo = ip_rt_gc_timeout;
624
625                 i = (i + 1) & rt_hash_mask;
626                 rthp = &rt_hash_table[i].chain;
627
628                 if (*rthp == 0)
629                         continue;
630                 spin_lock(rt_hash_lock_addr(i));
631                 while ((rth = *rthp) != NULL) {
632                         if (rth->u.dst.expires) {
633                                 /* Entry is expired even if it is in use */
634                                 if (time_before_eq(now, rth->u.dst.expires)) {
635                                         tmo >>= 1;
636                                         rthp = &rth->u.rt_next;
637                                         continue;
638                                 }
639                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
640                                 tmo >>= 1;
641                                 rthp = &rth->u.rt_next;
642                                 continue;
643                         }
644
645                         /* Cleanup aged off entries. */
646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647                         /* remove all related balanced entries if necessary */
648                         if (rth->u.dst.flags & DST_BALANCED) {
649                                 rthp = rt_remove_balanced_route(
650                                         &rt_hash_table[i].chain,
651                                         rth, NULL);
652                                 if (!rthp)
653                                         break;
654                         } else {
655                                 *rthp = rth->u.rt_next;
656                                 rt_free(rth);
657                         }
658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659                         *rthp = rth->u.rt_next;
660                         rt_free(rth);
661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
662                 }
663                 spin_unlock(rt_hash_lock_addr(i));
664
665                 /* Fallback loop breaker. */
666                 if (time_after(jiffies, now))
667                         break;
668         }
669         rover = i;
670         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
671 }
672
673 /* This can run from both BH and non-BH contexts, the latter
674  * in the case of a forced flush event.
675  */
676 static void rt_run_flush(unsigned long dummy)
677 {
678         int i;
679         struct rtable *rth, *next;
680
681         rt_deadline = 0;
682
683         get_random_bytes(&rt_hash_rnd, 4);
684
685         for (i = rt_hash_mask; i >= 0; i--) {
686                 spin_lock_bh(rt_hash_lock_addr(i));
687                 rth = rt_hash_table[i].chain;
688                 if (rth)
689                         rt_hash_table[i].chain = NULL;
690                 spin_unlock_bh(rt_hash_lock_addr(i));
691
692                 for (; rth; rth = next) {
693                         next = rth->u.rt_next;
694                         rt_free(rth);
695                 }
696         }
697 }
698
699 static DEFINE_SPINLOCK(rt_flush_lock);
700
701 void rt_cache_flush(int delay)
702 {
703         unsigned long now = jiffies;
704         int user_mode = !in_softirq();
705
706         if (delay < 0)
707                 delay = ip_rt_min_delay;
708
709         /* flush existing multipath state*/
710         multipath_flush();
711
712         spin_lock_bh(&rt_flush_lock);
713
714         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715                 long tmo = (long)(rt_deadline - now);
716
717                 /* If flush timer is already running
718                    and flush request is not immediate (delay > 0):
719
720                    if deadline is not achieved, prolongate timer to "delay",
721                    otherwise fire it at deadline time.
722                  */
723
724                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
725                         tmo = 0;
726                 
727                 if (delay > tmo)
728                         delay = tmo;
729         }
730
731         if (delay <= 0) {
732                 spin_unlock_bh(&rt_flush_lock);
733                 rt_run_flush(0);
734                 return;
735         }
736
737         if (rt_deadline == 0)
738                 rt_deadline = now + ip_rt_max_delay;
739
740         mod_timer(&rt_flush_timer, now+delay);
741         spin_unlock_bh(&rt_flush_lock);
742 }
743
744 static void rt_secret_rebuild(unsigned long dummy)
745 {
746         unsigned long now = jiffies;
747
748         rt_cache_flush(0);
749         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750 }
751
/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
764
765 static int rt_garbage_collect(void)
766 {
767         static unsigned long expire = RT_GC_TIMEOUT;
768         static unsigned long last_gc;
769         static int rover;
770         static int equilibrium;
771         struct rtable *rth, **rthp;
772         unsigned long now = jiffies;
773         int goal;
774
775         /*
776          * Garbage collection is pretty expensive,
777          * do not make it too frequently.
778          */
779
780         RT_CACHE_STAT_INC(gc_total);
781
782         if (now - last_gc < ip_rt_gc_min_interval &&
783             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784                 RT_CACHE_STAT_INC(gc_ignored);
785                 goto out;
786         }
787
788         /* Calculate number of entries, which we want to expire now. */
789         goal = atomic_read(&ipv4_dst_ops.entries) -
790                 (ip_rt_gc_elasticity << rt_hash_log);
791         if (goal <= 0) {
792                 if (equilibrium < ipv4_dst_ops.gc_thresh)
793                         equilibrium = ipv4_dst_ops.gc_thresh;
794                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795                 if (goal > 0) {
796                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798                 }
799         } else {
800                 /* We are in dangerous area. Try to reduce cache really
801                  * aggressively.
802                  */
803                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805         }
806
807         if (now - last_gc >= ip_rt_gc_min_interval)
808                 last_gc = now;
809
810         if (goal <= 0) {
811                 equilibrium += goal;
812                 goto work_done;
813         }
814
815         do {
816                 int i, k;
817
818                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819                         unsigned long tmo = expire;
820
821                         k = (k + 1) & rt_hash_mask;
822                         rthp = &rt_hash_table[k].chain;
823                         spin_lock_bh(rt_hash_lock_addr(k));
824                         while ((rth = *rthp) != NULL) {
825                                 if (!rt_may_expire(rth, tmo, expire)) {
826                                         tmo >>= 1;
827                                         rthp = &rth->u.rt_next;
828                                         continue;
829                                 }
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831                                 /* remove all related balanced entries
832                                  * if necessary
833                                  */
834                                 if (rth->u.dst.flags & DST_BALANCED) {
835                                         int r;
836
837                                         rthp = rt_remove_balanced_route(
838                                                 &rt_hash_table[i].chain,
839                                                 rth,
840                                                 &r);
841                                         goal -= r;
842                                         if (!rthp)
843                                                 break;
844                                 } else {
845                                         *rthp = rth->u.rt_next;
846                                         rt_free(rth);
847                                         goal--;
848                                 }
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850                                 *rthp = rth->u.rt_next;
851                                 rt_free(rth);
852                                 goal--;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854                         }
855                         spin_unlock_bh(rt_hash_lock_addr(k));
856                         if (goal <= 0)
857                                 break;
858                 }
859                 rover = k;
860
861                 if (goal <= 0)
862                         goto work_done;
863
864                 /* Goal is not achieved. We stop process if:
865
866                    - if expire reduced to zero. Otherwise, expire is halfed.
867                    - if table is not full.
868                    - if we are called from interrupt.
869                    - jiffies check is just fallback/debug loop breaker.
870                      We will not spin here for long time in any case.
871                  */
872
873                 RT_CACHE_STAT_INC(gc_goal_miss);
874
875                 if (expire == 0)
876                         break;
877
878                 expire >>= 1;
879 #if RT_CACHE_DEBUG >= 2
880                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
882 #endif
883
884                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885                         goto out;
886         } while (!in_softirq() && time_before_eq(jiffies, now));
887
888         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889                 goto out;
890         if (net_ratelimit())
891                 printk(KERN_WARNING "dst cache overflow\n");
892         RT_CACHE_STAT_INC(gc_dst_overflow);
893         return 1;
894
895 work_done:
896         expire += ip_rt_gc_min_interval;
897         if (expire > ip_rt_gc_timeout ||
898             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899                 expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
903 #endif
904 out:    return 0;
905 }
906
907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
908 {
909         struct rtable   *rth, **rthp;
910         unsigned long   now;
911         struct rtable *cand, **candp;
912         u32             min_score;
913         int             chain_length;
914         int attempts = !in_softirq();
915
916 restart:
917         chain_length = 0;
918         min_score = ~(u32)0;
919         cand = NULL;
920         candp = NULL;
921         now = jiffies;
922
923         rthp = &rt_hash_table[hash].chain;
924
925         spin_lock_bh(rt_hash_lock_addr(hash));
926         while ((rth = *rthp) != NULL) {
927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928                 if (!(rth->u.dst.flags & DST_BALANCED) &&
929                     compare_keys(&rth->fl, &rt->fl)) {
930 #else
931                 if (compare_keys(&rth->fl, &rt->fl)) {
932 #endif
933                         /* Put it first */
934                         *rthp = rth->u.rt_next;
935                         /*
936                          * Since lookup is lockfree, the deletion
937                          * must be visible to another weakly ordered CPU before
938                          * the insertion at the start of the hash chain.
939                          */
940                         rcu_assign_pointer(rth->u.rt_next,
941                                            rt_hash_table[hash].chain);
942                         /*
943                          * Since lookup is lockfree, the update writes
944                          * must be ordered for consistency on SMP.
945                          */
946                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947
948                         rth->u.dst.__use++;
949                         dst_hold(&rth->u.dst);
950                         rth->u.dst.lastuse = now;
951                         spin_unlock_bh(rt_hash_lock_addr(hash));
952
953                         rt_drop(rt);
954                         *rp = rth;
955                         return 0;
956                 }
957
958                 if (!atomic_read(&rth->u.dst.__refcnt)) {
959                         u32 score = rt_score(rth);
960
961                         if (score <= min_score) {
962                                 cand = rth;
963                                 candp = rthp;
964                                 min_score = score;
965                         }
966                 }
967
968                 chain_length++;
969
970                 rthp = &rth->u.rt_next;
971         }
972
973         if (cand) {
974                 /* ip_rt_gc_elasticity used to be average length of chain
975                  * length, when exceeded gc becomes really aggressive.
976                  *
977                  * The second limit is less certain. At the moment it allows
978                  * only 2 entries per bucket. We will see.
979                  */
980                 if (chain_length > ip_rt_gc_elasticity) {
981                         *candp = cand->u.rt_next;
982                         rt_free(cand);
983                 }
984         }
985
986         /* Try to bind route to arp only if it is output
987            route or unicast forwarding path.
988          */
989         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990                 int err = arp_bind_neighbour(&rt->u.dst);
991                 if (err) {
992                         spin_unlock_bh(rt_hash_lock_addr(hash));
993
994                         if (err != -ENOBUFS) {
995                                 rt_drop(rt);
996                                 return err;
997                         }
998
999                         /* Neighbour tables are full and nothing
1000                            can be released. Try to shrink route cache,
1001                            it is most likely it holds some neighbour records.
1002                          */
1003                         if (attempts-- > 0) {
1004                                 int saved_elasticity = ip_rt_gc_elasticity;
1005                                 int saved_int = ip_rt_gc_min_interval;
1006                                 ip_rt_gc_elasticity     = 1;
1007                                 ip_rt_gc_min_interval   = 0;
1008                                 rt_garbage_collect();
1009                                 ip_rt_gc_min_interval   = saved_int;
1010                                 ip_rt_gc_elasticity     = saved_elasticity;
1011                                 goto restart;
1012                         }
1013
1014                         if (net_ratelimit())
1015                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1016                         rt_drop(rt);
1017                         return -ENOBUFS;
1018                 }
1019         }
1020
1021         rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023         if (rt->u.rt_next) {
1024                 struct rtable *trt;
1025                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026                        NIPQUAD(rt->rt_dst));
1027                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029                 printk("\n");
1030         }
1031 #endif
1032         rt_hash_table[hash].chain = rt;
1033         spin_unlock_bh(rt_hash_lock_addr(hash));
1034         *rp = rt;
1035         return 0;
1036 }
1037
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040         static DEFINE_SPINLOCK(rt_peer_lock);
1041         struct inet_peer *peer;
1042
1043         peer = inet_getpeer(rt->rt_dst, create);
1044
1045         spin_lock_bh(&rt_peer_lock);
1046         if (rt->peer == NULL) {
1047                 rt->peer = peer;
1048                 peer = NULL;
1049         }
1050         spin_unlock_bh(&rt_peer_lock);
1051         if (peer)
1052                 inet_putpeer(peer);
1053 }
1054
1055 /*
1056  * Peer allocation may fail only in serious out-of-memory conditions.  However
1057  * we still can generate some output.
1058  * Random ID selection looks a bit dangerous because we have no chances to
1059  * select ID being unique in a reasonable period of time.
1060  * But broken packet identifier may be better than no packet at all.
1061  */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064         static DEFINE_SPINLOCK(ip_fb_id_lock);
1065         static u32 ip_fallback_id;
1066         u32 salt;
1067
1068         spin_lock_bh(&ip_fb_id_lock);
1069         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070         iph->id = htons(salt & 0xFFFF);
1071         ip_fallback_id = salt;
1072         spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077         struct rtable *rt = (struct rtable *) dst;
1078
1079         if (rt) {
1080                 if (rt->peer == NULL)
1081                         rt_bind_peer(rt, 1);
1082
1083                 /* If peer is attached to destination, it is never detached,
1084                    so that we need not to grab a lock to dereference it.
1085                  */
1086                 if (rt->peer) {
1087                         iph->id = htons(inet_getid(rt->peer, more));
1088                         return;
1089                 }
1090         } else
1091                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 
1092                        __builtin_return_address(0));
1093
1094         ip_select_fb_ident(iph);
1095 }
1096
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099         struct rtable **rthp;
1100
1101         spin_lock_bh(rt_hash_lock_addr(hash));
1102         ip_rt_put(rt);
1103         for (rthp = &rt_hash_table[hash].chain; *rthp;
1104              rthp = &(*rthp)->u.rt_next)
1105                 if (*rthp == rt) {
1106                         *rthp = rt->u.rt_next;
1107                         rt_free(rt);
1108                         break;
1109                 }
1110         spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114                     u32 saddr, u8 tos, struct net_device *dev)
1115 {
1116         int i, k;
1117         struct in_device *in_dev = in_dev_get(dev);
1118         struct rtable *rth, **rthp;
1119         u32  skeys[2] = { saddr, 0 };
1120         int  ikeys[2] = { dev->ifindex, 0 };
1121
1122         tos &= IPTOS_RT_MASK;
1123
1124         if (!in_dev)
1125                 return;
1126
1127         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129                 goto reject_redirect;
1130
1131         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133                         goto reject_redirect;
1134                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135                         goto reject_redirect;
1136         } else {
1137                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1138                         goto reject_redirect;
1139         }
1140
1141         for (i = 0; i < 2; i++) {
1142                 for (k = 0; k < 2; k++) {
1143                         unsigned hash = rt_hash_code(daddr,
1144                                                      skeys[i] ^ (ikeys[k] << 5),
1145                                                      tos);
1146
1147                         rthp=&rt_hash_table[hash].chain;
1148
1149                         rcu_read_lock();
1150                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1151                                 struct rtable *rt;
1152
1153                                 if (rth->fl.fl4_dst != daddr ||
1154                                     rth->fl.fl4_src != skeys[i] ||
1155                                     rth->fl.fl4_tos != tos ||
1156                                     rth->fl.oif != ikeys[k] ||
1157                                     rth->fl.iif != 0) {
1158                                         rthp = &rth->u.rt_next;
1159                                         continue;
1160                                 }
1161
1162                                 if (rth->rt_dst != daddr ||
1163                                     rth->rt_src != saddr ||
1164                                     rth->u.dst.error ||
1165                                     rth->rt_gateway != old_gw ||
1166                                     rth->u.dst.dev != dev)
1167                                         break;
1168
1169                                 dst_hold(&rth->u.dst);
1170                                 rcu_read_unlock();
1171
1172                                 rt = dst_alloc(&ipv4_dst_ops);
1173                                 if (rt == NULL) {
1174                                         ip_rt_put(rth);
1175                                         in_dev_put(in_dev);
1176                                         return;
1177                                 }
1178
1179                                 /* Copy all the information. */
1180                                 *rt = *rth;
1181                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182                                 rt->u.dst.__use         = 1;
1183                                 atomic_set(&rt->u.dst.__refcnt, 1);
1184                                 rt->u.dst.child         = NULL;
1185                                 if (rt->u.dst.dev)
1186                                         dev_hold(rt->u.dst.dev);
1187                                 if (rt->idev)
1188                                         in_dev_hold(rt->idev);
1189                                 rt->u.dst.obsolete      = 0;
1190                                 rt->u.dst.lastuse       = jiffies;
1191                                 rt->u.dst.path          = &rt->u.dst;
1192                                 rt->u.dst.neighbour     = NULL;
1193                                 rt->u.dst.hh            = NULL;
1194                                 rt->u.dst.xfrm          = NULL;
1195
1196                                 rt->rt_flags            |= RTCF_REDIRECTED;
1197
1198                                 /* Gateway is different ... */
1199                                 rt->rt_gateway          = new_gw;
1200
1201                                 /* Redirect received -> path was valid */
1202                                 dst_confirm(&rth->u.dst);
1203
1204                                 if (rt->peer)
1205                                         atomic_inc(&rt->peer->refcnt);
1206
1207                                 if (arp_bind_neighbour(&rt->u.dst) ||
1208                                     !(rt->u.dst.neighbour->nud_state &
1209                                             NUD_VALID)) {
1210                                         if (rt->u.dst.neighbour)
1211                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1212                                         ip_rt_put(rth);
1213                                         rt_drop(rt);
1214                                         goto do_next;
1215                                 }
1216
1217                                 rt_del(hash, rth);
1218                                 if (!rt_intern_hash(hash, rt, &rt))
1219                                         ip_rt_put(rt);
1220                                 goto do_next;
1221                         }
1222                         rcu_read_unlock();
1223                 do_next:
1224                         ;
1225                 }
1226         }
1227         in_dev_put(in_dev);
1228         return;
1229
1230 reject_redirect:
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234                         "%u.%u.%u.%u ignored.\n"
1235                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1236                         "tos %02x\n",
1237                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1239 #endif
1240         in_dev_put(in_dev);
1241 }
1242
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244 {
1245         struct rtable *rt = (struct rtable*)dst;
1246         struct dst_entry *ret = dst;
1247
1248         if (rt) {
1249                 if (dst->obsolete) {
1250                         ip_rt_put(rt);
1251                         ret = NULL;
1252                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253                            rt->u.dst.expires) {
1254                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255                                                      rt->fl.fl4_src ^
1256                                                         (rt->fl.oif << 5),
1257                                                      rt->fl.fl4_tos);
1258 #if RT_CACHE_DEBUG >= 1
1259                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260                                           "%u.%u.%u.%u/%02x dropped\n",
1261                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262 #endif
1263                         rt_del(hash, rt);
1264                         ret = NULL;
1265                 }
1266         }
1267         return ret;
1268 }
1269
1270 /*
1271  * Algorithm:
1272  *      1. The first ip_rt_redirect_number redirects are sent
1273  *         with exponential backoff, then we stop sending them at all,
1274  *         assuming that the host ignores our redirects.
1275  *      2. If we did not see packets requiring redirects
1276  *         during ip_rt_redirect_silence, we assume that the host
1277  *         forgot redirected route and start to send redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
1285
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1287 {
1288         struct rtable *rt = (struct rtable*)skb->dst;
1289         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290
1291         if (!in_dev)
1292                 return;
1293
1294         if (!IN_DEV_TX_REDIRECTS(in_dev))
1295                 goto out;
1296
1297         /* No redirected packets during ip_rt_redirect_silence;
1298          * reset the algorithm.
1299          */
1300         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301                 rt->u.dst.rate_tokens = 0;
1302
1303         /* Too many ignored redirects; do not send anything
1304          * set u.dst.rate_last to the last seen redirected packet.
1305          */
1306         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307                 rt->u.dst.rate_last = jiffies;
1308                 goto out;
1309         }
1310
1311         /* Check for load limit; set rate_last to the latest sent
1312          * redirect.
1313          */
1314         if (time_after(jiffies,
1315                        (rt->u.dst.rate_last +
1316                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318                 rt->u.dst.rate_last = jiffies;
1319                 ++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323                     net_ratelimit())
1324                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1327                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328 #endif
1329         }
1330 out:
1331         in_dev_put(in_dev);
1332 }
1333
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336         struct rtable *rt = (struct rtable*)skb->dst;
1337         unsigned long now;
1338         int code;
1339
1340         switch (rt->u.dst.error) {
1341                 case EINVAL:
1342                 default:
1343                         goto out;
1344                 case EHOSTUNREACH:
1345                         code = ICMP_HOST_UNREACH;
1346                         break;
1347                 case ENETUNREACH:
1348                         code = ICMP_NET_UNREACH;
1349                         break;
1350                 case EACCES:
1351                         code = ICMP_PKT_FILTERED;
1352                         break;
1353         }
1354
1355         now = jiffies;
1356         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1359         rt->u.dst.rate_last = now;
1360         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363         }
1364
1365 out:    kfree_skb(skb);
1366         return 0;
1367
1368
1369 /*
1370  *      The last two values are not from the RFC but
1371  *      are needed for AMPRnet AX.25 paths.
1372  */
1373
/* Candidate MTU values in descending order; only ever read. */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest table entry strictly below @old_mtu, or 68 when
 * @old_mtu is not above any entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1386
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 {
1389         int i;
1390         unsigned short old_mtu = ntohs(iph->tot_len);
1391         struct rtable *rth;
1392         u32  skeys[2] = { iph->saddr, 0, };
1393         u32  daddr = iph->daddr;
1394         u8   tos = iph->tos & IPTOS_RT_MASK;
1395         unsigned short est_mtu = 0;
1396
1397         if (ipv4_config.no_pmtu_disc)
1398                 return 0;
1399
1400         for (i = 0; i < 2; i++) {
1401                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402
1403                 rcu_read_lock();
1404                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405                      rth = rcu_dereference(rth->u.rt_next)) {
1406                         if (rth->fl.fl4_dst == daddr &&
1407                             rth->fl.fl4_src == skeys[i] &&
1408                             rth->rt_dst  == daddr &&
1409                             rth->rt_src  == iph->saddr &&
1410                             rth->fl.fl4_tos == tos &&
1411                             rth->fl.iif == 0 &&
1412                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413                                 unsigned short mtu = new_mtu;
1414
1415                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1416
1417                                         /* BSD 4.2 compatibility hack :-( */
1418                                         if (mtu == 0 &&
1419                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420                                             old_mtu >= 68 + (iph->ihl << 2))
1421                                                 old_mtu -= iph->ihl << 2;
1422
1423                                         mtu = guess_mtu(old_mtu);
1424                                 }
1425                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1427                                                 dst_confirm(&rth->u.dst);
1428                                                 if (mtu < ip_rt_min_pmtu) {
1429                                                         mtu = ip_rt_min_pmtu;
1430                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1431                                                                 (1 << RTAX_MTU);
1432                                                 }
1433                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434                                                 dst_set_expires(&rth->u.dst,
1435                                                         ip_rt_mtu_expires);
1436                                         }
1437                                         est_mtu = mtu;
1438                                 }
1439                         }
1440                 }
1441                 rcu_read_unlock();
1442         }
1443         return est_mtu ? : new_mtu;
1444 }
1445
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449             !(dst_metric_locked(dst, RTAX_MTU))) {
1450                 if (mtu < ip_rt_min_pmtu) {
1451                         mtu = ip_rt_min_pmtu;
1452                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453                 }
1454                 dst->metrics[RTAX_MTU-1] = mtu;
1455                 dst_set_expires(dst, ip_rt_mtu_expires);
1456         }
1457 }
1458
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1460 {
1461         return NULL;
1462 }
1463
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466         struct rtable *rt = (struct rtable *) dst;
1467         struct inet_peer *peer = rt->peer;
1468         struct in_device *idev = rt->idev;
1469
1470         if (peer) {
1471                 rt->peer = NULL;
1472                 inet_putpeer(peer);
1473         }
1474
1475         if (idev) {
1476                 rt->idev = NULL;
1477                 in_dev_put(idev);
1478         }
1479 }
1480
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482                             int how)
1483 {
1484         struct rtable *rt = (struct rtable *) dst;
1485         struct in_device *idev = rt->idev;
1486         if (dev != &loopback_dev && idev && idev->dev == dev) {
1487                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488                 if (loopback_idev) {
1489                         rt->idev = loopback_idev;
1490                         in_dev_put(idev);
1491                 }
1492         }
1493 }
1494
1495 static void ipv4_link_failure(struct sk_buff *skb)
1496 {
1497         struct rtable *rt;
1498
1499         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500
1501         rt = (struct rtable *) skb->dst;
1502         if (rt)
1503                 dst_set_expires(&rt->u.dst, 0);
1504 }
1505
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510                 skb->dev ? skb->dev->name : "?");
1511         kfree_skb(skb);
1512         return 0;
1513 }
1514
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1523
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526         u32 src;
1527         struct fib_result res;
1528
1529         if (rt->fl.iif == 0)
1530                 src = rt->rt_src;
1531         else if (fib_lookup(&rt->fl, &res) == 0) {
1532                 src = FIB_RES_PREFSRC(res);
1533                 fib_res_put(&res);
1534         } else
1535                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536                                         RT_SCOPE_UNIVERSE);
1537         memcpy(addr, &src, 4);
1538 }
1539
1540 #ifdef CONFIG_NET_CLS_ROUTE
1541 static void set_class_tag(struct rtable *rt, u32 tag)
1542 {
1543         if (!(rt->u.dst.tclassid & 0xFFFF))
1544                 rt->u.dst.tclassid |= tag & 0xFFFF;
1545         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1546                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1547 }
1548 #endif
1549
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 {
1552         struct fib_info *fi = res->fi;
1553
1554         if (fi) {
1555                 if (FIB_RES_GW(*res) &&
1556                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557                         rt->rt_gateway = FIB_RES_GW(*res);
1558                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559                        sizeof(rt->u.dst.metrics));
1560                 if (fi->fib_mtu == 0) {
1561                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563                             rt->rt_gateway != rt->rt_dst &&
1564                             rt->u.dst.dev->mtu > 576)
1565                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566                 }
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 #endif
1570         } else
1571                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572
1573         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579                                        ip_rt_min_advmss);
1580         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585         set_class_tag(rt, fib_rules_tclass(res));
1586 #endif
1587         set_class_tag(rt, itag);
1588 #endif
1589         rt->rt_type = res->type;
1590 }
1591
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593                                 u8 tos, struct net_device *dev, int our)
1594 {
1595         unsigned hash;
1596         struct rtable *rth;
1597         u32 spec_dst;
1598         struct in_device *in_dev = in_dev_get(dev);
1599         u32 itag = 0;
1600
1601         /* Primary sanity checks. */
1602
1603         if (in_dev == NULL)
1604                 return -EINVAL;
1605
1606         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607             skb->protocol != htons(ETH_P_IP))
1608                 goto e_inval;
1609
1610         if (ZERONET(saddr)) {
1611                 if (!LOCAL_MCAST(daddr))
1612                         goto e_inval;
1613                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614         } else if (fib_validate_source(saddr, 0, tos, 0,
1615                                         dev, &spec_dst, &itag) < 0)
1616                 goto e_inval;
1617
1618         rth = dst_alloc(&ipv4_dst_ops);
1619         if (!rth)
1620                 goto e_nobufs;
1621
1622         rth->u.dst.output= ip_rt_bug;
1623
1624         atomic_set(&rth->u.dst.__refcnt, 1);
1625         rth->u.dst.flags= DST_HOST;
1626         if (in_dev->cnf.no_policy)
1627                 rth->u.dst.flags |= DST_NOPOLICY;
1628         rth->fl.fl4_dst = daddr;
1629         rth->rt_dst     = daddr;
1630         rth->fl.fl4_tos = tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632         rth->fl.fl4_fwmark= skb->nfmark;
1633 #endif
1634         rth->fl.fl4_src = saddr;
1635         rth->rt_src     = saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637         rth->u.dst.tclassid = itag;
1638 #endif
1639         rth->rt_iif     =
1640         rth->fl.iif     = dev->ifindex;
1641         rth->u.dst.dev  = &loopback_dev;
1642         dev_hold(rth->u.dst.dev);
1643         rth->idev       = in_dev_get(rth->u.dst.dev);
1644         rth->fl.oif     = 0;
1645         rth->rt_gateway = daddr;
1646         rth->rt_spec_dst= spec_dst;
1647         rth->rt_type    = RTN_MULTICAST;
1648         rth->rt_flags   = RTCF_MULTICAST;
1649         if (our) {
1650                 rth->u.dst.input= ip_local_deliver;
1651                 rth->rt_flags |= RTCF_LOCAL;
1652         }
1653
1654 #ifdef CONFIG_IP_MROUTE
1655         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656                 rth->u.dst.input = ip_mr_input;
1657 #endif
1658         RT_CACHE_STAT_INC(in_slow_mc);
1659
1660         in_dev_put(in_dev);
1661         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1663
1664 e_nobufs:
1665         in_dev_put(in_dev);
1666         return -ENOBUFS;
1667
1668 e_inval:
1669         in_dev_put(in_dev);
1670         return -EINVAL;
1671 }
1672
1673
1674 static void ip_handle_martian_source(struct net_device *dev,
1675                                      struct in_device *in_dev,
1676                                      struct sk_buff *skb,
1677                                      u32 daddr,
1678                                      u32 saddr) 
1679 {
1680         RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683                 /*
1684                  *      RFC1812 recommendation, if source is martian,
1685                  *      the only hint is MAC header.
1686                  */
1687                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688                         "%u.%u.%u.%u, on dev %s\n",
1689                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690                 if (dev->hard_header_len && skb->mac.raw) {
1691                         int i;
1692                         unsigned char *p = skb->mac.raw;
1693                         printk(KERN_WARNING "ll header: ");
1694                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1695                                 printk("%02x", *p);
1696                                 if (i < (dev->hard_header_len - 1))
1697                                         printk(":");
1698                         }
1699                         printk("\n");
1700                 }
1701         }
1702 #endif
1703 }
1704
1705 static inline int __mkroute_input(struct sk_buff *skb, 
1706                                   struct fib_result* res, 
1707                                   struct in_device *in_dev, 
1708                                   u32 daddr, u32 saddr, u32 tos, 
1709                                   struct rtable **result) 
1710 {
1711
1712         struct rtable *rth;
1713         int err;
1714         struct in_device *out_dev;
1715         unsigned flags = 0;
1716         u32 spec_dst, itag;
1717
1718         /* get a working reference to the output device */
1719         out_dev = in_dev_get(FIB_RES_DEV(*res));
1720         if (out_dev == NULL) {
1721                 if (net_ratelimit())
1722                         printk(KERN_CRIT "Bug in ip_route_input" \
1723                                "_slow(). Please, report\n");
1724                 return -EINVAL;
1725         }
1726
1727
1728         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1729                                   in_dev->dev, &spec_dst, &itag);
1730         if (err < 0) {
1731                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1732                                          saddr);
1733                 
1734                 err = -EINVAL;
1735                 goto cleanup;
1736         }
1737
1738         if (err)
1739                 flags |= RTCF_DIRECTSRC;
1740
1741         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742             (IN_DEV_SHARED_MEDIA(out_dev) ||
1743              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744                 flags |= RTCF_DOREDIRECT;
1745
1746         if (skb->protocol != htons(ETH_P_IP)) {
1747                 /* Not IP (i.e. ARP). Do not create route, if it is
1748                  * invalid for proxy arp. DNAT routes are always valid.
1749                  */
1750                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1751                         err = -EINVAL;
1752                         goto cleanup;
1753                 }
1754         }
1755
1756
1757         rth = dst_alloc(&ipv4_dst_ops);
1758         if (!rth) {
1759                 err = -ENOBUFS;
1760                 goto cleanup;
1761         }
1762
1763         atomic_set(&rth->u.dst.__refcnt, 1);
1764         rth->u.dst.flags= DST_HOST;
1765 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1766         if (res->fi->fib_nhs > 1)
1767                 rth->u.dst.flags |= DST_BALANCED;
1768 #endif
1769         if (in_dev->cnf.no_policy)
1770                 rth->u.dst.flags |= DST_NOPOLICY;
1771         if (in_dev->cnf.no_xfrm)
1772                 rth->u.dst.flags |= DST_NOXFRM;
1773         rth->fl.fl4_dst = daddr;
1774         rth->rt_dst     = daddr;
1775         rth->fl.fl4_tos = tos;
1776 #ifdef CONFIG_IP_ROUTE_FWMARK
1777         rth->fl.fl4_fwmark= skb->nfmark;
1778 #endif
1779         rth->fl.fl4_src = saddr;
1780         rth->rt_src     = saddr;
1781         rth->rt_gateway = daddr;
1782         rth->rt_iif     =
1783                 rth->fl.iif     = in_dev->dev->ifindex;
1784         rth->u.dst.dev  = (out_dev)->dev;
1785         dev_hold(rth->u.dst.dev);
1786         rth->idev       = in_dev_get(rth->u.dst.dev);
1787         rth->fl.oif     = 0;
1788         rth->rt_spec_dst= spec_dst;
1789
1790         rth->u.dst.input = ip_forward;
1791         rth->u.dst.output = ip_output;
1792
1793         rt_set_nexthop(rth, res, itag);
1794
1795         rth->rt_flags = flags;
1796
1797         *result = rth;
1798         err = 0;
1799  cleanup:
1800         /* release the working reference to the output device */
1801         in_dev_put(out_dev);
1802         return err;
1803 }                                               
1804
1805 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1806                                        struct fib_result* res, 
1807                                        const struct flowi *fl,
1808                                        struct in_device *in_dev,
1809                                        u32 daddr, u32 saddr, u32 tos)
1810 {
1811         struct rtable* rth = NULL;
1812         int err;
1813         unsigned hash;
1814
1815 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1816         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1817                 fib_select_multipath(fl, res);
1818 #endif
1819
1820         /* create a routing cache entry */
1821         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1822         if (err)
1823                 return err;
1824
1825         /* put it into the cache */
1826         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1828 }
1829
1830 static inline int ip_mkroute_input(struct sk_buff *skb, 
1831                                    struct fib_result* res, 
1832                                    const struct flowi *fl,
1833                                    struct in_device *in_dev,
1834                                    u32 daddr, u32 saddr, u32 tos)
1835 {
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837         struct rtable* rth = NULL, *rtres;
1838         unsigned char hop, hopcount;
1839         int err = -EINVAL;
1840         unsigned int hash;
1841
1842         if (res->fi)
1843                 hopcount = res->fi->fib_nhs;
1844         else
1845                 hopcount = 1;
1846
1847         /* distinguish between multipath and singlepath */
1848         if (hopcount < 2)
1849                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850                                             saddr, tos);
1851         
1852         /* add all alternatives to the routing cache */
1853         for (hop = 0; hop < hopcount; hop++) {
1854                 res->nh_sel = hop;
1855
1856                 /* put reference to previous result */
1857                 if (hop)
1858                         ip_rt_put(rtres);
1859
1860                 /* create a routing cache entry */
1861                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1862                                       &rth);
1863                 if (err)
1864                         return err;
1865
1866                 /* put it into the cache */
1867                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1868                 err = rt_intern_hash(hash, rth, &rtres);
1869                 if (err)
1870                         return err;
1871
1872                 /* forward hop information to multipath impl. */
1873                 multipath_set_nhinfo(rth,
1874                                      FIB_RES_NETWORK(*res),
1875                                      FIB_RES_NETMASK(*res),
1876                                      res->prefixlen,
1877                                      &FIB_RES_NH(*res));
1878         }
1879         skb->dst = &rtres->u.dst;
1880         return err;
1881 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1882         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1883 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1884 }
1885
1886
1887 /*
1888  *      NOTE. We drop all the packets that have local source
1889  *      addresses, because every properly looped back packet
1890  *      must have correct destination already attached by output routine.
1891  *
1892  *      Such approach solves two big problems:
1893  *      1. Not simplex devices are handled properly.
1894  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1895  */
1896
/*
 * Slow path of input route resolution; ip_route_input() calls it when the
 * route cache lookup missed and daddr is not multicast.
 *
 * Rejects martian source/destination addresses, consults fib_lookup() and
 * then either
 *   - hands a forwardable unicast result to ip_mkroute_input(), or
 *   - builds a cache entry bound to loopback_dev for local delivery,
 *     broadcast, or an unreachable destination, and inserts it with
 *     rt_intern_hash().
 *
 * Returns 0 on success, or -EINVAL (no in_device on dev, martian source,
 * non-IP broadcast), -EHOSTUNREACH (martian destination, forwarding
 * disabled), -ENOBUFS (dst_alloc() failed), or the error reported by
 * ip_mkroute_input() / rt_intern_hash().
 */
1897 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1898                                u8 tos, struct net_device *dev)
1899 {
1900         struct fib_result res;
1901         struct in_device *in_dev = in_dev_get(dev);
             /* lookup key handed to fib_lookup() */
1902         struct flowi fl = { .nl_u = { .ip4_u =
1903                                       { .daddr = daddr,
1904                                         .saddr = saddr,
1905                                         .tos = tos,
1906                                         .scope = RT_SCOPE_UNIVERSE,
1907 #ifdef CONFIG_IP_ROUTE_FWMARK
1908                                         .fwmark = skb->nfmark
1909 #endif
1910                                       } },
1911                             .iif = dev->ifindex };
1912         unsigned        flags = 0;
1913         u32             itag = 0;
1914         struct rtable * rth;
1915         unsigned        hash;
1916         u32             spec_dst;
1917         int             err = -EINVAL;
             /* set once fib_lookup() succeeded; res is then released at done */
1918         int             free_res = 0;
1919
1920         /* IP on this device is disabled. */
1921
1922         if (!in_dev)
1923                 goto out;
1924
1925         /* Check for the most weird martians, which can be not detected
1926            by fib_lookup.
1927          */
1928
1929         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1930                 goto martian_source;
1931
             /* handled without a FIB lookup; res is not filled in on this path */
1932         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1933                 goto brd_input;
1934
1935         /* Accept zero addresses only to limited broadcast;
1936          * I even do not know to fix it or not. Waiting for complains :-)
1937          */
1938         if (ZERONET(saddr))
1939                 goto martian_source;
1940
1941         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1942                 goto martian_destination;
1943
1944         /*
1945          *      Now we are ready to route packet.
1946          */
1947         if ((err = fib_lookup(&fl, &res)) != 0) {
1948                 if (!IN_DEV_FORWARD(in_dev))
1949                         goto e_hostunreach;
                     /* err keeps the fib_lookup() code; no_route stores -err in the entry */
1950                 goto no_route;
1951         }
1952         free_res = 1;
1953
1954         RT_CACHE_STAT_INC(in_slow_tot);
1955
1956         if (res.type == RTN_BROADCAST)
1957                 goto brd_input;
1958
1959         if (res.type == RTN_LOCAL) {
1960                 int result;
1961                 result = fib_validate_source(saddr, daddr, tos,
1962                                              loopback_dev.ifindex,
1963                                              dev, &spec_dst, &itag);
1964                 if (result < 0)
1965                         goto martian_source;
1966                 if (result)
1967                         flags |= RTCF_DIRECTSRC;
                     /* overrides the value fib_validate_source() stored above */
1968                 spec_dst = daddr;
1969                 goto local_input;
1970         }
1971
1972         if (!IN_DEV_FORWARD(in_dev))
1973                 goto e_hostunreach;
1974         if (res.type != RTN_UNICAST)
1975                 goto martian_destination;
1976
1977         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
             /* both labels re-assign the same value to err and jump to done */
1978         if (err == -ENOBUFS)
1979                 goto e_nobufs;
1980         if (err == -EINVAL)
1981                 goto e_inval;
1982         
             /* common exit: drop the in_dev reference and, if taken, the FIB result */
1983 done:
1984         in_dev_put(in_dev);
1985         if (free_res)
1986                 fib_res_put(&res);
             /* entered directly only when in_dev is NULL, so nothing to release */
1987 out:    return err;
1988
1989 brd_input:
1990         if (skb->protocol != htons(ETH_P_IP))
1991                 goto e_inval;
1992
1993         if (ZERONET(saddr))
1994                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1995         else {
1996                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1997                                           &itag);
1998                 if (err < 0)
1999                         goto martian_source;
2000                 if (err)
2001                         flags |= RTCF_DIRECTSRC;
2002         }
2003         flags |= RTCF_BROADCAST;
2004         res.type = RTN_BROADCAST;
2005         RT_CACHE_STAT_INC(in_brd);
2006
             /*
              * Shared by the RTN_LOCAL, broadcast and no_route paths: build a
              * cache entry whose device is loopback_dev and insert it.
              */
2007 local_input:
2008         rth = dst_alloc(&ipv4_dst_ops);
2009         if (!rth)
2010                 goto e_nobufs;
2011
2012         rth->u.dst.output= ip_rt_bug;
2013
2014         atomic_set(&rth->u.dst.__refcnt, 1);
2015         rth->u.dst.flags= DST_HOST;
2016         if (in_dev->cnf.no_policy)
2017                 rth->u.dst.flags |= DST_NOPOLICY;
2018         rth->fl.fl4_dst = daddr;
2019         rth->rt_dst     = daddr;
2020         rth->fl.fl4_tos = tos;
2021 #ifdef CONFIG_IP_ROUTE_FWMARK
2022         rth->fl.fl4_fwmark= skb->nfmark;
2023 #endif
2024         rth->fl.fl4_src = saddr;
2025         rth->rt_src     = saddr;
2026 #ifdef CONFIG_NET_CLS_ROUTE
2027         rth->u.dst.tclassid = itag;
2028 #endif
2029         rth->rt_iif     =
2030         rth->fl.iif     = dev->ifindex;
2031         rth->u.dst.dev  = &loopback_dev;
2032         dev_hold(rth->u.dst.dev);
2033         rth->idev       = in_dev_get(rth->u.dst.dev);
2034         rth->rt_gateway = daddr;
2035         rth->rt_spec_dst= spec_dst;
2036         rth->u.dst.input= ip_local_deliver;
2037         rth->rt_flags   = flags|RTCF_LOCAL;
             /* only the no_route path sets this type; err is still the fib_lookup() code */
2038         if (res.type == RTN_UNREACHABLE) {
2039                 rth->u.dst.input= ip_error;
2040                 rth->u.dst.error= -err;
2041                 rth->rt_flags   &= ~RTCF_LOCAL;
2042         }
2043         rth->rt_type    = res.type;
2044         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2045         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2046         goto done;
2047
             /* fib_lookup() failed while forwarding is enabled: cache an error entry */
2048 no_route:
2049         RT_CACHE_STAT_INC(in_no_route);
2050         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2051         res.type = RTN_UNREACHABLE;
2052         goto local_input;
2053
2054         /*
2055          *      Do not cache martian addresses: they should be logged (RFC1812)
2056          */
2057 martian_destination:
2058         RT_CACHE_STAT_INC(in_martian_dst);
2059 #ifdef CONFIG_IP_ROUTE_VERBOSE
2060         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2061                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2062                         "%u.%u.%u.%u, dev %s\n",
2063                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2064 #endif
2065
             /* martian_destination falls through to here */
2066 e_hostunreach:
2067         err = -EHOSTUNREACH;
2068         goto done;
2069
2070 e_inval:
2071         err = -EINVAL;
2072         goto done;
2073
2074 e_nobufs:
2075         err = -ENOBUFS;
2076         goto done;
2077
2078 martian_source:
2079         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2080         goto e_inval;
2081 }
2082
2083 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2084                    u8 tos, struct net_device *dev)
2085 {
2086         struct rtable * rth;
2087         unsigned        hash;
2088         int iif = dev->ifindex;
2089
2090         tos &= IPTOS_RT_MASK;
2091         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2092
2093         rcu_read_lock();
2094         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2095              rth = rcu_dereference(rth->u.rt_next)) {
2096                 if (rth->fl.fl4_dst == daddr &&
2097                     rth->fl.fl4_src == saddr &&
2098                     rth->fl.iif == iif &&
2099                     rth->fl.oif == 0 &&
2100 #ifdef CONFIG_IP_ROUTE_FWMARK
2101                     rth->fl.fl4_fwmark == skb->nfmark &&
2102 #endif
2103                     rth->fl.fl4_tos == tos) {
2104                         rth->u.dst.lastuse = jiffies;
2105                         dst_hold(&rth->u.dst);
2106                         rth->u.dst.__use++;
2107                         RT_CACHE_STAT_INC(in_hit);
2108                         rcu_read_unlock();
2109                         skb->dst = (struct dst_entry*)rth;
2110                         return 0;
2111                 }
2112                 RT_CACHE_STAT_INC(in_hlist_search);
2113         }
2114         rcu_read_unlock();
2115
2116         /* Multicast recognition logic is moved from route cache to here.
2117            The problem was that too many Ethernet cards have broken/missing
2118            hardware multicast filters :-( As result the host on multicasting
2119            network acquires a lot of useless route cache entries, sort of
2120            SDR messages from all the world. Now we try to get rid of them.
2121            Really, provided software IP multicast filter is organized
2122            reasonably (at least, hashed), it does not result in a slowdown
2123            comparing with route cache reject entries.
2124            Note, that multicast routers are not affected, because
2125            route cache entry is created eventually.
2126          */
2127         if (MULTICAST(daddr)) {
2128                 struct in_device *in_dev;
2129
2130                 rcu_read_lock();
2131                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2132                         int our = ip_check_mc(in_dev, daddr, saddr,
2133                                 skb->nh.iph->protocol);
2134                         if (our
2135 #ifdef CONFIG_IP_MROUTE
2136                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2137 #endif
2138                             ) {
2139                                 rcu_read_unlock();
2140                                 return ip_route_input_mc(skb, daddr, saddr,
2141                                                          tos, dev, our);
2142                         }
2143                 }
2144                 rcu_read_unlock();
2145                 return -EINVAL;
2146         }
2147         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2148 }
2149
2150 static inline int __mkroute_output(struct rtable **result,
2151                                    struct fib_result* res, 
2152                                    const struct flowi *fl,
2153                                    const struct flowi *oldflp, 
2154                                    struct net_device *dev_out, 
2155                                    unsigned flags) 
2156 {
2157         struct rtable *rth;
2158         struct in_device *in_dev;
2159         u32 tos = RT_FL_TOS(oldflp);
2160         int err = 0;
2161
2162         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2163                 return -EINVAL;
2164
2165         if (fl->fl4_dst == 0xFFFFFFFF)
2166                 res->type = RTN_BROADCAST;
2167         else if (MULTICAST(fl->fl4_dst))
2168                 res->type = RTN_MULTICAST;
2169         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2170                 return -EINVAL;
2171
2172         if (dev_out->flags & IFF_LOOPBACK)
2173                 flags |= RTCF_LOCAL;
2174
2175         /* get work reference to inet device */
2176         in_dev = in_dev_get(dev_out);
2177         if (!in_dev)
2178                 return -EINVAL;
2179
2180         if (res->type == RTN_BROADCAST) {
2181                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182                 if (res->fi) {
2183                         fib_info_put(res->fi);
2184                         res->fi = NULL;
2185                 }
2186         } else if (res->type == RTN_MULTICAST) {
2187                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2188                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2189                                  oldflp->proto))
2190                         flags &= ~RTCF_LOCAL;
2191                 /* If multicast route do not exist use
2192                    default one, but do not gateway in this case.
2193                    Yes, it is hack.
2194                  */
2195                 if (res->fi && res->prefixlen < 4) {
2196                         fib_info_put(res->fi);
2197                         res->fi = NULL;
2198                 }
2199         }
2200
2201
2202         rth = dst_alloc(&ipv4_dst_ops);
2203         if (!rth) {
2204                 err = -ENOBUFS;
2205                 goto cleanup;
2206         }               
2207
2208         atomic_set(&rth->u.dst.__refcnt, 1);
2209         rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211         if (res->fi) {
2212                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213                 if (res->fi->fib_nhs > 1)
2214                         rth->u.dst.flags |= DST_BALANCED;
2215         }
2216 #endif
2217         if (in_dev->cnf.no_xfrm)
2218                 rth->u.dst.flags |= DST_NOXFRM;
2219         if (in_dev->cnf.no_policy)
2220                 rth->u.dst.flags |= DST_NOPOLICY;
2221
2222         rth->fl.fl4_dst = oldflp->fl4_dst;
2223         rth->fl.fl4_tos = tos;
2224         rth->fl.fl4_src = oldflp->fl4_src;
2225         rth->fl.oif     = oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229         rth->rt_dst     = fl->fl4_dst;
2230         rth->rt_src     = fl->fl4_src;
2231         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2232         /* get references to the devices that are to be hold by the routing 
2233            cache entry */
2234         rth->u.dst.dev  = dev_out;
2235         dev_hold(dev_out);
2236         rth->idev       = in_dev_get(dev_out);
2237         rth->rt_gateway = fl->fl4_dst;
2238         rth->rt_spec_dst= fl->fl4_src;
2239
2240         rth->u.dst.output=ip_output;
2241
2242         RT_CACHE_STAT_INC(out_slow_tot);
2243
2244         if (flags & RTCF_LOCAL) {
2245                 rth->u.dst.input = ip_local_deliver;
2246                 rth->rt_spec_dst = fl->fl4_dst;
2247         }
2248         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249                 rth->rt_spec_dst = fl->fl4_src;
2250                 if (flags & RTCF_LOCAL && 
2251                     !(dev_out->flags & IFF_LOOPBACK)) {
2252                         rth->u.dst.output = ip_mc_output;
2253                         RT_CACHE_STAT_INC(out_slow_mc);
2254                 }
2255 #ifdef CONFIG_IP_MROUTE
2256                 if (res->type == RTN_MULTICAST) {
2257                         if (IN_DEV_MFORWARD(in_dev) &&
2258                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2259                                 rth->u.dst.input = ip_mr_input;
2260                                 rth->u.dst.output = ip_mc_output;
2261                         }
2262                 }
2263 #endif
2264         }
2265
2266         rt_set_nexthop(rth, res, 0);
2267
2268         rth->rt_flags = flags;
2269
2270         *result = rth;
2271  cleanup:
2272         /* release work reference to inet device */
2273         in_dev_put(in_dev);
2274
2275         return err;
2276 }
2277
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279                                         struct fib_result* res,
2280                                         const struct flowi *fl,
2281                                         const struct flowi *oldflp,
2282                                         struct net_device *dev_out,
2283                                         unsigned flags)
2284 {
2285         struct rtable *rth = NULL;
2286         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287         unsigned hash;
2288         if (err == 0) {
2289                 u32 tos = RT_FL_TOS(oldflp);
2290
2291                 hash = rt_hash_code(oldflp->fl4_dst, 
2292                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2293                 err = rt_intern_hash(hash, rth, rp);
2294         }
2295         
2296         return err;
2297 }
2298
/*
 * Create the output route cache entry (or entries) for a flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has at
 * most one next hop, this is ip_mkroute_output_def().  Otherwise one cache
 * entry per next hop is built on that hop's device, inserted into the
 * cache and announced to the multipath implementation; *rp ends up
 * referring to the entry returned for the last hop.
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not hand an entry that failed to be inserted
			 * to the multipath code; ip_mkroute_input() bails
			 * out at the same point.
			 * NOTE(review): rt_intern_hash() is expected to
			 * release rth on failure -- confirm against its
			 * definition.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2362
2363 /*
2364  * Major route resolver routine.
2365  */
2366
2367 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368 {
2369         u32 tos = RT_FL_TOS(oldflp);
2370         struct flowi fl = { .nl_u = { .ip4_u =
2371                                       { .daddr = oldflp->fl4_dst,
2372                                         .saddr = oldflp->fl4_src,
2373                                         .tos = tos & IPTOS_RT_MASK,
2374                                         .scope = ((tos & RTO_ONLINK) ?
2375                                                   RT_SCOPE_LINK :
2376                                                   RT_SCOPE_UNIVERSE),
2377 #ifdef CONFIG_IP_ROUTE_FWMARK
2378                                         .fwmark = oldflp->fl4_fwmark
2379 #endif
2380                                       } },
2381                             .iif = loopback_dev.ifindex,
2382                             .oif = oldflp->oif };
2383         struct fib_result res;
2384         unsigned flags = 0;
2385         struct net_device *dev_out = NULL;
2386         int free_res = 0;
2387         int err;
2388
2389
2390         res.fi          = NULL;
2391 #ifdef CONFIG_IP_MULTIPLE_TABLES
2392         res.r           = NULL;
2393 #endif
2394
2395         if (oldflp->fl4_src) {
2396                 err = -EINVAL;
2397                 if (MULTICAST(oldflp->fl4_src) ||
2398                     BADCLASS(oldflp->fl4_src) ||
2399                     ZERONET(oldflp->fl4_src))
2400                         goto out;
2401
2402                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2403                 dev_out = ip_dev_find(oldflp->fl4_src);
2404                 if (dev_out == NULL)
2405                         goto out;
2406
2407                 /* I removed check for oif == dev_out->oif here.
2408                    It was wrong for two reasons:
2409                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2410                       assigned to multiple interfaces.
2411                    2. Moreover, we are allowed to send packets with saddr
2412                       of another iface. --ANK
2413                  */
2414
2415                 if (oldflp->oif == 0
2416                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2417                         /* Special hack: user can direct multicasts
2418                            and limited broadcast via necessary interface
2419                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2420                            This hack is not just for fun, it allows
2421                            vic,vat and friends to work.
2422                            They bind socket to loopback, set ttl to zero
2423                            and expect that it will work.
2424                            From the viewpoint of routing cache they are broken,
2425                            because we are not allowed to build multicast path
2426                            with loopback source addr (look, routing cache
2427                            cannot know, that ttl is zero, so that packet
2428                            will not leave this host and route is valid).
2429                            Luckily, this hack is good workaround.
2430                          */
2431
2432                         fl.oif = dev_out->ifindex;
2433                         goto make_route;
2434                 }
2435                 if (dev_out)
2436                         dev_put(dev_out);
2437                 dev_out = NULL;
2438         }
2439
2440
2441         if (oldflp->oif) {
2442                 dev_out = dev_get_by_index(oldflp->oif);
2443                 err = -ENODEV;
2444                 if (dev_out == NULL)
2445                         goto out;
2446
2447                 /* RACE: Check return value of inet_select_addr instead. */
2448                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2449                         dev_put(dev_out);
2450                         goto out;       /* Wrong error code */
2451                 }
2452
2453                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2454                         if (!fl.fl4_src)
2455                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2456                                                               RT_SCOPE_LINK);
2457                         goto make_route;
2458                 }
2459                 if (!fl.fl4_src) {
2460                         if (MULTICAST(oldflp->fl4_dst))
2461                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2462                                                               fl.fl4_scope);
2463                         else if (!oldflp->fl4_dst)
2464                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2465                                                               RT_SCOPE_HOST);
2466                 }
2467         }
2468
2469         if (!fl.fl4_dst) {
2470                 fl.fl4_dst = fl.fl4_src;
2471                 if (!fl.fl4_dst)
2472                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2473                 if (dev_out)
2474                         dev_put(dev_out);
2475                 dev_out = &loopback_dev;
2476                 dev_hold(dev_out);
2477                 fl.oif = loopback_dev.ifindex;
2478                 res.type = RTN_LOCAL;
2479                 flags |= RTCF_LOCAL;
2480                 goto make_route;
2481         }
2482
2483         if (fib_lookup(&fl, &res)) {
2484                 res.fi = NULL;
2485                 if (oldflp->oif) {
2486                         /* Apparently, routing tables are wrong. Assume,
2487                            that the destination is on link.
2488
2489                            WHY? DW.
2490                            Because we are allowed to send to iface
2491                            even if it has NO routes and NO assigned
2492                            addresses. When oif is specified, routing
2493                            tables are looked up with only one purpose:
2494                            to catch if destination is gatewayed, rather than
2495                            direct. Moreover, if MSG_DONTROUTE is set,
2496                            we send packet, ignoring both routing tables
2497                            and ifaddr state. --ANK
2498
2499
2500                            We could make it even if oif is unknown,
2501                            likely IPv6, but we do not.
2502                          */
2503
2504                         if (fl.fl4_src == 0)
2505                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2506                                                               RT_SCOPE_LINK);
2507                         res.type = RTN_UNICAST;
2508                         goto make_route;
2509                 }
2510                 if (dev_out)
2511                         dev_put(dev_out);
2512                 err = -ENETUNREACH;
2513                 goto out;
2514         }
2515         free_res = 1;
2516
2517         if (res.type == RTN_LOCAL) {
2518                 if (!fl.fl4_src)
2519                         fl.fl4_src = fl.fl4_dst;
2520                 if (dev_out)
2521                         dev_put(dev_out);
2522                 dev_out = &loopback_dev;
2523                 dev_hold(dev_out);
2524                 fl.oif = dev_out->ifindex;
2525                 if (res.fi)
2526                         fib_info_put(res.fi);
2527                 res.fi = NULL;
2528                 flags |= RTCF_LOCAL;
2529                 goto make_route;
2530         }
2531
2532 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2533         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2534                 fib_select_multipath(&fl, &res);
2535         else
2536 #endif
2537         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2538                 fib_select_default(&fl, &res);
2539
2540         if (!fl.fl4_src)
2541                 fl.fl4_src = FIB_RES_PREFSRC(res);
2542
2543         if (dev_out)
2544                 dev_put(dev_out);
2545         dev_out = FIB_RES_DEV(res);
2546         dev_hold(dev_out);
2547         fl.oif = dev_out->ifindex;
2548
2549
2550 make_route:
2551         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2552
2553
2554         if (free_res)
2555                 fib_res_put(&res);
2556         if (dev_out)
2557                 dev_put(dev_out);
2558 out:    return err;
2559 }
2560
2561 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2562 {
2563         unsigned hash;
2564         struct rtable *rth;
2565
2566         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2567
2568         rcu_read_lock_bh();
2569         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2570                 rth = rcu_dereference(rth->u.rt_next)) {
2571                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2572                     rth->fl.fl4_src == flp->fl4_src &&
2573                     rth->fl.iif == 0 &&
2574                     rth->fl.oif == flp->oif &&
2575 #ifdef CONFIG_IP_ROUTE_FWMARK
2576                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2577 #endif
2578                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2579                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2580
2581                         /* check for multipath routes and choose one if
2582                          * necessary
2583                          */
2584                         if (multipath_select_route(flp, rth, rp)) {
2585                                 dst_hold(&(*rp)->u.dst);
2586                                 RT_CACHE_STAT_INC(out_hit);
2587                                 rcu_read_unlock_bh();
2588                                 return 0;
2589                         }
2590
2591                         rth->u.dst.lastuse = jiffies;
2592                         dst_hold(&rth->u.dst);
2593                         rth->u.dst.__use++;
2594                         RT_CACHE_STAT_INC(out_hit);
2595                         rcu_read_unlock_bh();
2596                         *rp = rth;
2597                         return 0;
2598                 }
2599                 RT_CACHE_STAT_INC(out_hlist_search);
2600         }
2601         rcu_read_unlock_bh();
2602
2603         return ip_route_output_slow(rp, flp);
2604 }
2605
2606 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2607
2608 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2609 {
2610         int err;
2611
2612         if ((err = __ip_route_output_key(rp, flp)) != 0)
2613                 return err;
2614
2615         if (flp->proto) {
2616                 if (!flp->fl4_src)
2617                         flp->fl4_src = (*rp)->rt_src;
2618                 if (!flp->fl4_dst)
2619                         flp->fl4_dst = (*rp)->rt_dst;
2620                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2621         }
2622
2623         return 0;
2624 }
2625
2626 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2627
/*
 * Convenience form of ip_route_output_flow() for callers that have no
 * socket and no flags to pass: both are supplied as NULL / 0.
 */
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	struct sock *no_sock = NULL;

	return ip_route_output_flow(rp, flp, no_sock, 0);
}
2632
/*
 * Append one netlink route message to @skb describing the route cache
 * entry attached to skb->dst.
 *
 * The message consists of the header created by NLMSG_NEW (using @pid,
 * @seq, @event and @flags), a struct rtmsg and a series of RTA_*
 * attributes (destination, optional source, output device, preferred
 * source, gateway, metrics, cache info and, for routes with fl.iif set,
 * the input interface or multicast routing data).
 *
 * Returns skb->len on success.  On failure the skb is trimmed back to
 * where it was on entry and -1 is returned.  With CONFIG_IP_MROUTE, 0 is
 * returned when ipmr_get_route() returns 0 and @nowait is 0.
 *
 * NOTE(review): NLMSG_NEW and RTA_PUT are macros defined elsewhere;
 * presumably they jump to nlmsg_failure / rtattr_failure when the skb has
 * no room -- confirm against their definitions.
 */
2633 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2634                         int nowait, unsigned int flags)
2635 {
2636         struct rtable *rt = (struct rtable*)skb->dst;
2637         struct rtmsg *r;
2638         struct nlmsghdr  *nlh;
             /* Tail on entry: used to compute nlmsg_len and to undo on failure. */
2639         unsigned char    *b = skb->tail;
2640         struct rta_cacheinfo ci;
2641 #ifdef CONFIG_IP_MROUTE
2642         struct rtattr *eptr;
2643 #endif
2644         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2645         r = NLMSG_DATA(nlh);
             /* Fixed rtmsg fields: every entry is reported as an AF_INET /32. */
2646         r->rtm_family    = AF_INET;
2647         r->rtm_dst_len  = 32;
2648         r->rtm_src_len  = 0;
2649         r->rtm_tos      = rt->fl.fl4_tos;
2650         r->rtm_table    = RT_TABLE_MAIN;
2651         r->rtm_type     = rt->rt_type;
2652         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2653         r->rtm_protocol = RTPROT_UNSPEC;
             /* Low 16 bits of rt_flags are masked out before RTM_F_CLONED is set. */
2654         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2655         if (rt->rt_flags & RTCF_NOTIFY)
2656                 r->rtm_flags |= RTM_F_NOTIFY;
2657         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2658         if (rt->fl.fl4_src) {
2659                 r->rtm_src_len = 32;
2660                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2661         }
2662         if (rt->u.dst.dev)
2663                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2664 #ifdef CONFIG_NET_CLS_ROUTE
2665         if (rt->u.dst.tclassid)
2666                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2667 #endif
2668 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2669         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2670                 __u32 alg = rt->rt_multipath_alg;
2671
2672                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2673         }
2674 #endif
             /* Preferred source: rt_spec_dst when fl.iif is set; otherwise
              * rt_src, and only when it differs from the flow's source.
              */
2675         if (rt->fl.iif)
2676                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2677         else if (rt->rt_src != rt->fl.fl4_src)
2678                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2679         if (rt->rt_dst != rt->rt_gateway)
2680                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2681         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2682                 goto rtattr_failure;
             /* Cache statistics; jiffies deltas are converted with
              * jiffies_to_clock_t().
              */
2683         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2684         ci.rta_used     = rt->u.dst.__use;
2685         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2686         if (rt->u.dst.expires)
2687                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2688         else
2689                 ci.rta_expires = 0;
2690         ci.rta_error    = rt->u.dst.error;
2691         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2692         if (rt->peer) {
2693                 ci.rta_id = rt->peer->ip_id_count;
2694                 if (rt->peer->tcp_ts_stamp) {
2695                         ci.rta_ts = rt->peer->tcp_ts;
2696                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2697                 }
2698         }
2699 #ifdef CONFIG_IP_MROUTE
             /* Remember where RTA_CACHEINFO will land so that its rta_error
              * field can be patched after ipmr_get_route() below.
              */
2700         eptr = (struct rtattr*)skb->tail;
2701 #endif
2702         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2703         if (rt->fl.iif) {
2704 #ifdef CONFIG_IP_MROUTE
2705                 u32 dst = rt->rt_dst;
2706
2707                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2708                     ipv4_devconf.mc_forwarding) {
2709                         int err = ipmr_get_route(skb, r, nowait);
2710                         if (err <= 0) {
2711                                 if (!nowait) {
                                             /* NOTE(review): this return leaves
                                              * nlmsg_len unset and the skb untrimmed;
                                              * presumably ipmr_get_route() has taken
                                              * over the request -- confirm.
                                              */
2712                                         if (err == 0)
2713                                                 return 0;
2714                                         goto nlmsg_failure;
2715                                 } else {
2716                                         if (err == -EMSGSIZE)
2717                                                 goto nlmsg_failure;
                                             /* Other errors are reported inside the
                                              * already-written cache info attribute.
                                              */
2718                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2719                                 }
2720                         }
2721                 } else
2722 #endif
2723                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2724         }
2725
             /* Message length is everything appended since entry. */
2726         nlh->nlmsg_len = skb->tail - b;
2727         return skb->len;
2728
2729 nlmsg_failure:
2730 rtattr_failure:
             /* Drop the partially built message. */
2731         skb_trim(skb, b - skb->data);
2732         return -1;
2733 }
2734
2735 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2736 {
2737         struct rtattr **rta = arg;
2738         struct rtmsg *rtm = NLMSG_DATA(nlh);
2739         struct rtable *rt = NULL;
2740         u32 dst = 0;
2741         u32 src = 0;
2742         int iif = 0;
2743         int err = -ENOBUFS;
2744         struct sk_buff *skb;
2745
2746         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2747         if (!skb)
2748                 goto out;
2749
2750         /* Reserve room for dummy headers, this skb can pass
2751            through good chunk of routing engine.
2752          */
2753         skb->mac.raw = skb->data;
2754         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755
2756         if (rta[RTA_SRC - 1])
2757                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2758         if (rta[RTA_DST - 1])
2759                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2760         if (rta[RTA_IIF - 1])
2761                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2762
2763         if (iif) {
2764                 struct net_device *dev = __dev_get_by_index(iif);
2765                 err = -ENODEV;
2766                 if (!dev)
2767                         goto out_free;
2768                 skb->protocol   = htons(ETH_P_IP);
2769                 skb->dev        = dev;
2770                 local_bh_disable();
2771                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772                 local_bh_enable();
2773                 rt = (struct rtable*)skb->dst;
2774                 if (!err && rt->u.dst.error)
2775                         err = -rt->u.dst.error;
2776         } else {
2777                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2778                                                          .saddr = src,
2779                                                          .tos = rtm->rtm_tos } } };
2780                 int oif = 0;
2781                 if (rta[RTA_OIF - 1])
2782                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2783                 fl.oif = oif;
2784                 err = ip_route_output_key(&rt, &fl);
2785         }
2786         if (err)
2787                 goto out_free;
2788
2789         skb->dst = &rt->u.dst;
2790         if (rtm->rtm_flags & RTM_F_NOTIFY)
2791                 rt->rt_flags |= RTCF_NOTIFY;
2792
2793         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794
2795         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2796                                 RTM_NEWROUTE, 0, 0);
2797         if (!err)
2798                 goto out_free;
2799         if (err < 0) {
2800                 err = -EMSGSIZE;
2801                 goto out_free;
2802         }
2803
2804         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2805         if (err > 0)
2806                 err = 0;
2807 out:    return err;
2808
2809 out_free:
2810         kfree_skb(skb);
2811         goto out;
2812 }
2813
2814 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2815 {
2816         struct rtable *rt;
2817         int h, s_h;
2818         int idx, s_idx;
2819
2820         s_h = cb->args[0];
2821         s_idx = idx = cb->args[1];
2822         for (h = 0; h <= rt_hash_mask; h++) {
2823                 if (h < s_h) continue;
2824                 if (h > s_h)
2825                         s_idx = 0;
2826                 rcu_read_lock_bh();
2827                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2829                         if (idx < s_idx)
2830                                 continue;
2831                         skb->dst = dst_clone(&rt->u.dst);
2832                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2834                                          1, NLM_F_MULTI) <= 0) {
2835                                 dst_release(xchg(&skb->dst, NULL));
2836                                 rcu_read_unlock_bh();
2837                                 goto done;
2838                         }
2839                         dst_release(xchg(&skb->dst, NULL));
2840                 }
2841                 rcu_read_unlock_bh();
2842         }
2843
2844 done:
2845         cb->args[0] = h;
2846         cb->args[1] = idx;
2847         return skb->len;
2848 }
2849
/*
 * Multicast event hook: flushes the route cache with a delay of 0.
 * The @in_dev argument is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2854
2855 #ifdef CONFIG_SYSCTL
2856 static int flush_delay;
2857
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859                                         struct file *filp, void __user *buffer,
2860                                         size_t *lenp, loff_t *ppos)
2861 {
2862         if (write) {
2863                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864                 rt_cache_flush(flush_delay);
2865                 return 0;
2866         } 
2867
2868         return -EINVAL;
2869 }
2870
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872                                                 int __user *name,
2873                                                 int nlen,
2874                                                 void __user *oldval,
2875                                                 size_t __user *oldlenp,
2876                                                 void __user *newval,
2877                                                 size_t newlen,
2878                                                 void **context)
2879 {
2880         int delay;
2881         if (newlen != sizeof(int))
2882                 return -EINVAL;
2883         if (get_user(delay, (int __user *)newval))
2884                 return -EFAULT; 
2885         rt_cache_flush(delay); 
2886         return 0;
2887 }
2888
/*
 * sysctl table for the IPv4 routing cache tunables.
 *
 * Every entry is int-sized.  "flush" is write-only (mode 0200) and uses
 * the two custom handlers defined above; all other entries are mode 0644
 * and expose a variable directly through a generic handler.  Entries
 * whose handler is a *_jiffies variant also name a matching *_jiffies
 * strategy routine.  The table ends with an entry whose ctl_name is 0.
 */
2889 ctl_table ipv4_route_table[] = {
2890         {
2891                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2892                 .procname       = "flush",
2893                 .data           = &flush_delay,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0200,
2896                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2897                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2898         },
2899         {
2900                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2901                 .procname       = "min_delay",
2902                 .data           = &ip_rt_min_delay,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_jiffies,
2906                 .strategy       = &sysctl_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2910                 .procname       = "max_delay",
2911                 .data           = &ip_rt_max_delay,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec_jiffies,
2915                 .strategy       = &sysctl_jiffies,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2919                 .procname       = "gc_thresh",
2920                 .data           = &ipv4_dst_ops.gc_thresh,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec,
2924         },
2925         {
2926                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2927                 .procname       = "max_size",
2928                 .data           = &ip_rt_max_size,
2929                 .maxlen         = sizeof(int),
2930                 .mode           = 0644,
2931                 .proc_handler   = &proc_dointvec,
2932         },
2933         {
2934                 /*  Deprecated. Use gc_min_interval_ms */
2935  
2936                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2937                 .procname       = "gc_min_interval",
2938                 .data           = &ip_rt_gc_min_interval,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = &proc_dointvec_jiffies,
2942                 .strategy       = &sysctl_jiffies,
2943         },
2944         {
                     /* Same variable as gc_min_interval above, accessed
                      * through the *_ms_jiffies handlers instead.
                      */
2945                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2946                 .procname       = "gc_min_interval_ms",
2947                 .data           = &ip_rt_gc_min_interval,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = &proc_dointvec_ms_jiffies,
2951                 .strategy       = &sysctl_ms_jiffies,
2952         },
2953         {
2954                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2955                 .procname       = "gc_timeout",
2956                 .data           = &ip_rt_gc_timeout,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = &proc_dointvec_jiffies,
2960                 .strategy       = &sysctl_jiffies,
2961         },
2962         {
2963                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2964                 .procname       = "gc_interval",
2965                 .data           = &ip_rt_gc_interval,
2966                 .maxlen         = sizeof(int),
2967                 .mode           = 0644,
2968                 .proc_handler   = &proc_dointvec_jiffies,
2969                 .strategy       = &sysctl_jiffies,
2970         },
2971         {
2972                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2973                 .procname       = "redirect_load",
2974                 .data           = &ip_rt_redirect_load,
2975                 .maxlen         = sizeof(int),
2976                 .mode           = 0644,
2977                 .proc_handler   = &proc_dointvec,
2978         },
2979         {
2980                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2981                 .procname       = "redirect_number",
2982                 .data           = &ip_rt_redirect_number,
2983                 .maxlen         = sizeof(int),
2984                 .mode           = 0644,
2985                 .proc_handler   = &proc_dointvec,
2986         },
2987         {
2988                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2989                 .procname       = "redirect_silence",
2990                 .data           = &ip_rt_redirect_silence,
2991                 .maxlen         = sizeof(int),
2992                 .mode           = 0644,
2993                 .proc_handler   = &proc_dointvec,
2994         },
2995         {
2996                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2997                 .procname       = "error_cost",
2998                 .data           = &ip_rt_error_cost,
2999                 .maxlen         = sizeof(int),
3000                 .mode           = 0644,
3001                 .proc_handler   = &proc_dointvec,
3002         },
3003         {
3004                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3005                 .procname       = "error_burst",
3006                 .data           = &ip_rt_error_burst,
3007                 .maxlen         = sizeof(int),
3008                 .mode           = 0644,
3009                 .proc_handler   = &proc_dointvec,
3010         },
3011         {
3012                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3013                 .procname       = "gc_elasticity",
3014                 .data           = &ip_rt_gc_elasticity,
3015                 .maxlen         = sizeof(int),
3016                 .mode           = 0644,
3017                 .proc_handler   = &proc_dointvec,
3018         },
3019         {
3020                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3021                 .procname       = "mtu_expires",
3022                 .data           = &ip_rt_mtu_expires,
3023                 .maxlen         = sizeof(int),
3024                 .mode           = 0644,
3025                 .proc_handler   = &proc_dointvec_jiffies,
3026                 .strategy       = &sysctl_jiffies,
3027         },
3028         {
3029                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3030                 .procname       = "min_pmtu",
3031                 .data           = &ip_rt_min_pmtu,
3032                 .maxlen         = sizeof(int),
3033                 .mode           = 0644,
3034                 .proc_handler   = &proc_dointvec,
3035         },
3036         {
3037                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3038                 .procname       = "min_adv_mss",
3039                 .data           = &ip_rt_min_advmss,
3040                 .maxlen         = sizeof(int),
3041                 .mode           = 0644,
3042                 .proc_handler   = &proc_dointvec,
3043         },
3044         {
3045                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3046                 .procname       = "secret_interval",
3047                 .data           = &ip_rt_secret_interval,
3048                 .maxlen         = sizeof(int),
3049                 .mode           = 0644,
3050                 .proc_handler   = &proc_dointvec_jiffies,
3051                 .strategy       = &sysctl_jiffies,
3052         },
             /* Terminating entry. */
3053         { .ctl_name = 0 }
3054 };
3055 #endif
3056
3057 #ifdef CONFIG_NET_CLS_ROUTE
3058 struct ip_rt_acct *ip_rt_acct;
3059
3060 /* This code sucks.  But you should have seen it before! --RR */
3061
/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each cpu owns a run of 256 consecutive struct ip_rt_acct slots inside
 * ip_rt_acct.  The argument is parenthesized so that an expression such
 * as IP_RT_ACCT_CPU(a + b) scales the whole sum, not just its last term.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3064
3065 #ifdef CONFIG_PROC_FS
3066 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3067                            int length, int *eof, void *data)
3068 {
3069         unsigned int i;
3070
3071         if ((offset & 3) || (length & 3))
3072                 return -EIO;
3073
3074         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3075                 *eof = 1;
3076                 return 0;
3077         }
3078
3079         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3080                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3081                 *eof = 1;
3082         }
3083
3084         offset /= sizeof(u32);
3085
3086         if (length > 0) {
3087                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3088                 u32 *dst = (u32 *) buffer;
3089
3090                 /* Copy first cpu. */
3091                 *start = buffer;
3092                 memcpy(dst, src, length);
3093
3094                 /* Add the other cpus in, one int at a time */
3095                 for_each_cpu(i) {
3096                         unsigned int j;
3097
3098                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099
3100                         for (j = 0; j < length/4; j++)
3101                                 dst[j] += src[j];
3102                 }
3103         }
3104         return length;
3105 }
3106 #endif /* CONFIG_PROC_FS */
3107 #endif /* CONFIG_NET_CLS_ROUTE */
3108
3109 static __initdata unsigned long rhash_entries;
3110 static int __init set_rhash_entries(char *str)
3111 {
3112         if (!str)
3113                 return 0;
3114         rhash_entries = simple_strtoul(str, &str, 0);
3115         return 1;
3116 }
3117 __setup("rhash_entries=", set_rhash_entries);
3118
/*
 * Boot-time initialization of the IPv4 routing cache.
 *
 * In order: seeds rt_hash_rnd, allocates the per-cpu accounting table
 * (CONFIG_NET_CLS_ROUTE), creates the "ip_dst_cache" slab, allocates and
 * clears the route cache hash table, derives gc_thresh and ip_rt_max_size
 * from the hash size, allocates the per-cpu cache statistics, initializes
 * devinet and the FIB, arms the periodic and secret timers, registers the
 * proc entries (CONFIG_PROC_FS) and initializes xfrm (CONFIG_XFRM).
 *
 * Returns 0 on success or -ENOMEM if the statistics or proc entries cannot
 * be set up; failure to allocate the accounting table or the slab cache
 * panics instead.
 */
3119 int __init ip_rt_init(void)
3120 {
             /* rc is never changed below; the success path returns 0. */
3121         int rc = 0;
3122
3123         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3124                              (jiffies ^ (jiffies >> 7)));
3125
3126 #ifdef CONFIG_NET_CLS_ROUTE
3127         {
             /* Find the smallest page order that holds 256 accounting
              * entries for each of NR_CPUS cpus.
              */
3128         int order;
3129         for (order = 0;
3130              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3131                 /* NOTHING */;
3132         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133         if (!ip_rt_acct)
3134                 panic("IP: failed to allocate ip_rt_acct\n");
3135         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3136         }
3137 #endif
3138
3139         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3140                                                      sizeof(struct rtable),
3141                                                      0, SLAB_HWCACHE_ALIGN,
3142                                                      NULL, NULL);
3143
3144         if (!ipv4_dst_ops.kmem_cachep)
3145                 panic("IP: failed to allocate ip_dst_cache\n");
3146
             /* Hash size comes from the rhash_entries boot parameter; the
              * scale argument depends on whether num_physpages reaches
              * 128 * 1024.
              */
3147         rt_hash_table = (struct rt_hash_bucket *)
3148                 alloc_large_system_hash("IP route cache",
3149                                         sizeof(struct rt_hash_bucket),
3150                                         rhash_entries,
3151                                         (num_physpages >= 128 * 1024) ?
3152                                                 (27 - PAGE_SHIFT) :
3153                                                 (29 - PAGE_SHIFT),
3154                                         HASH_HIGHMEM,
3155                                         &rt_hash_log,
3156                                         &rt_hash_mask,
3157                                         0);
3158         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3159         rt_hash_lock_init();
3160
             /* GC threshold is one entry per bucket; hard limit is 16 per bucket. */
3161         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3162         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3163
3164         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3165         if (!rt_cache_stat)
3166                 return -ENOMEM;
3167
3168         devinet_init();
3169         ip_fib_init();
3170
3171         init_timer(&rt_flush_timer);
3172         rt_flush_timer.function = rt_run_flush;
3173         init_timer(&rt_periodic_timer);
3174         rt_periodic_timer.function = rt_check_expire;
3175         init_timer(&rt_secret_timer);
3176         rt_secret_timer.function = rt_secret_rebuild;
3177
3178         /* All the timers, started at system startup tend
3179            to synchronize. Perturb it a bit.
3180          */
3181         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3182                                         ip_rt_gc_interval;
3183         add_timer(&rt_periodic_timer);
3184
3185         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3186                 ip_rt_secret_interval;
3187         add_timer(&rt_secret_timer);
3188
3189 #ifdef CONFIG_PROC_FS
3190         {
3191         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
             /* NOTE(review): this error path frees rt_cache_stat but leaves
              * the two timers armed above running, and does not remove the
              * first proc entry if only the second creation fails.
              */
3192         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3193             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3194                                              proc_net_stat))) {
3195                 free_percpu(rt_cache_stat);
3196                 return -ENOMEM;
3197         }
3198         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3199         }
3200 #ifdef CONFIG_NET_CLS_ROUTE
             /* The return value of create_proc_read_entry() is not checked. */
3201         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3202 #endif
3203 #endif
3204 #ifdef CONFIG_XFRM
3205         xfrm_init();
3206         xfrm4_init();
3207 #endif
3208         return rc;
3209 }
3210
3211 EXPORT_SYMBOL(__ip_select_ident);
3212 EXPORT_SYMBOL(ip_route_input);
3213 EXPORT_SYMBOL(ip_route_output_key);