/* linux-2.6 — net/ipv4/route.c (manual merge with Linus' tree) */
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K 
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *              
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109
/* Extract the routing-relevant TOS bits (plus RTO_ONLINK, which
 * travels in the same field) from a flow key. */
#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Upper bound for any cached path MTU (just below 64K). */
#define IP_MAX_MTU      0xFFF0

/* Default ageing timeout for unused cache entries. */
#define RT_GC_TIMEOUT (300*HZ)

/* Tunables; all time values are in jiffies. */
static int ip_rt_min_delay              = 2 * HZ;       /* min delay for a scheduled flush */
static int ip_rt_max_delay              = 10 * HZ;      /* hard deadline for a delayed flush */
static int ip_rt_max_size;                              /* cache entry limit, set at init */
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;      /* period of rt_check_expire() runs */
static int ip_rt_gc_min_interval        = HZ / 2;       /* GC rate limit */
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;        /* 512 payload + IP + TCP hdrs */
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ; /* re-key period for rt_hash_rnd */
static unsigned long rt_deadline;       /* jiffies deadline of a pending delayed flush */

#define RTprint(a...)   printk(KERN_DEBUG a)

/* rt_flush_timer is armed by rt_cache_flush(); its handler is set at
 * init (not in this view) — presumably rt_run_flush().  The other two
 * are re-armed by rt_check_expire() / rt_secret_rebuild() themselves. */
static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);
154
/* Hooks the IPv4 routing cache into the protocol-independent dst
 * machinery; entry_size tells the allocator each dst is a full
 * struct rtable. */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};
167
#define ECN_OR_COST(class)      TC_PRIO_##class

/* 16-slot lookup table from an IP-TOS-derived index (derivation at the
 * call sites, not in view) to a traffic-control priority band.  Odd
 * slots expand via ECN_OR_COST to the same TC_PRIO_* value as the
 * named class. */
__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
188
189
190 /*
191  * Route cache.
192  */
193
/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

/* One singly-linked chain of cached routes per hash slot. */
struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ 4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ 2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ 1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ 512
#else
#define RT_HASH_LOCK_SZ 256
#endif

static spinlock_t       *rt_hash_locks;
/* Bucket 'slot' shares the lock at slot mod RT_HASH_LOCK_SZ. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
/* UP, no spinlock debugging: bucket locks compile away entirely. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
237
static struct rt_hash_bucket    *rt_hash_table;         /* the central route cache */
static unsigned                 rt_hash_mask;           /* table size - 1 */
static int                      rt_hash_log;            /* log2 of table size */
static unsigned int             rt_hash_rnd;            /* hash secret, re-keyed on flush */

static struct rt_cache_stat *rt_cache_stat;
/* Bump a per-cpu statistics counter.  Uses raw_smp_processor_id(), so
 * presumably callers may be preemptible and a rare mis-bucketed count
 * is acceptable — the counters are statistical only. */
#define RT_CACHE_STAT_INC(field)                                          \
                (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);
249
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251 {
252         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253                 & rt_hash_mask);
254 }
255
#ifdef CONFIG_PROC_FS
/* Cursor for the /proc/net/rt_cache seq_file walk: the hash bucket
 * currently being traversed (walked from rt_hash_mask down to 0). */
struct rt_cache_iter_state {
        int bucket;
};
260
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
262 {
263         struct rtable *r = NULL;
264         struct rt_cache_iter_state *st = seq->private;
265
266         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267                 rcu_read_lock_bh();
268                 r = rt_hash_table[st->bucket].chain;
269                 if (r)
270                         break;
271                 rcu_read_unlock_bh();
272         }
273         return r;
274 }
275
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277 {
278         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279
280         r = r->u.rt_next;
281         while (!r) {
282                 rcu_read_unlock_bh();
283                 if (--st->bucket < 0)
284                         break;
285                 rcu_read_lock_bh();
286                 r = rt_hash_table[st->bucket].chain;
287         }
288         return r;
289 }
290
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292 {
293         struct rtable *r = rt_cache_get_first(seq);
294
295         if (r)
296                 while (pos && (r = rt_cache_get_next(seq, r)))
297                         --pos;
298         return pos ? NULL : r;
299 }
300
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302 {
303         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 }
305
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307 {
308         struct rtable *r = NULL;
309
310         if (v == SEQ_START_TOKEN)
311                 r = rt_cache_get_first(seq);
312         else
313                 r = rt_cache_get_next(seq, v);
314         ++*pos;
315         return r;
316 }
317
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319 {
320         if (v && v != SEQ_START_TOKEN)
321                 rcu_read_unlock_bh();
322 }
323
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
325 {
326         if (v == SEQ_START_TOKEN)
327                 seq_printf(seq, "%-127s\n",
328                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330                            "HHUptod\tSpecDst");
331         else {
332                 struct rtable *r = v;
333                 char temp[256];
334
335                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337                         r->u.dst.dev ? r->u.dst.dev->name : "*",
338                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
341                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343                         dst_metric(&r->u.dst, RTAX_WINDOW),
344                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
346                         r->fl.fl4_tos,
347                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349                                        dev_queue_xmit) : 0,
350                         r->rt_spec_dst);
351                 seq_printf(seq, "%-127s\n", temp);
352         }
353         return 0;
354 }
355
/* seq_file iterator for /proc/net/rt_cache. */
static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
362
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
364 {
365         struct seq_file *seq;
366         int rc = -ENOMEM;
367         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368
369         if (!s)
370                 goto out;
371         rc = seq_open(file, &rt_cache_seq_ops);
372         if (rc)
373                 goto out_kfree;
374         seq          = file->private_data;
375         seq->private = s;
376         memset(s, 0, sizeof(*s));
377 out:
378         return rc;
379 out_kfree:
380         kfree(s);
381         goto out;
382 }
383
/* /proc/net/rt_cache file ops; seq_release_private also frees the
 * rt_cache_iter_state allocated in rt_cache_seq_open(). */
static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
391
392
/* seq_file start for the per-cpu stats: position 0 yields the header
 * token; position N yields the stats of the first possible CPU
 * numbered N-1 or higher, updating *pos so _next resumes after it. */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}
408
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411         int cpu;
412
413         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return per_cpu_ptr(rt_cache_stat, cpu);
418         }
419         return NULL;
420         
421 }
422
/* Nothing to release: the walk over per-cpu counters takes no locks
 * or references. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
427
/* Emit one /proc/net/stat/rt_cache line: the column header for the
 * start token, otherwise the 16 hex counters of one CPU prefixed by
 * the current total cache entry count. */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}
461
/* seq_file iterator for /proc/net/stat/rt_cache. */
static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
468
469
/* Open /proc/net/stat/rt_cache; no per-open state needed. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
474
/* /proc/net/stat/rt_cache file ops; plain seq_release — nothing was
 * allocated at open time. */
static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
482
483 #endif /* CONFIG_PROC_FS */
484   
/* Release a cache entry: detach its multipath state and defer the
 * actual dst free past an RCU-bh grace period, since readers may
 * still be traversing the chain under rcu_read_lock_bh(). */
static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
490
/* Like rt_free(), but additionally drops the caller's reference via
 * ip_rt_put() before scheduling the RCU-deferred free. */
static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
497
498 static __inline__ int rt_fast_clean(struct rtable *rth)
499 {
500         /* Kill broadcast/multicast entries very aggresively, if they
501            collide in hash table with more useful entries */
502         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503                 rth->fl.iif && rth->u.rt_next;
504 }
505
506 static __inline__ int rt_valuable(struct rtable *rth)
507 {
508         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509                 rth->u.dst.expires;
510 }
511
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 {
514         unsigned long age;
515         int ret = 0;
516
517         if (atomic_read(&rth->u.dst.__refcnt))
518                 goto out;
519
520         ret = 1;
521         if (rth->u.dst.expires &&
522             time_after_eq(jiffies, rth->u.dst.expires))
523                 goto out;
524
525         age = jiffies - rth->u.dst.lastuse;
526         ret = 0;
527         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528             (age <= tmo2 && rt_valuable(rth)))
529                 goto out;
530         ret = 1;
531 out:    return ret;
532 }
533
534 /* Bits of score are:
535  * 31: very valuable
536  * 30: not quite useless
537  * 29..0: usage counter
538  */
539 static inline u32 rt_score(struct rtable *rt)
540 {
541         u32 score = jiffies - rt->u.dst.lastuse;
542
543         score = ~score & ~(3<<30);
544
545         if (rt_valuable(rt))
546                 score |= (1<<31);
547
548         if (!rt->fl.iif ||
549             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550                 score |= (1<<30);
551
552         return score;
553 }
554
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 {
557         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558                fl1->oif     == fl2->oif &&
559                fl1->iif     == fl2->iif;
560 }
561
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/*
 * Remove 'expentry' and every DST_BALANCED entry sharing its flow key
 * from the chain at *chain_head.  Caller must hold the bucket lock.
 *
 * Returns the link through which an outer chain walk should resume
 * (the first surviving non-balanced entry after 'expentry'), or NULL
 * if no such resume point exists.  If 'removed_count' is non-NULL it
 * receives the number of entries freed, 'expentry' included.
 */
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;          /* have we walked past expentry yet? */
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                /* Unlink only; expentry is freed once,
                                 * after the loop. */
                                *rthp = rth->u.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        /* Remember the first surviving non-balanced
                         * entry after expentry as the resume point. */
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.rt_next;

                        rthp = &rth->u.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606
607
/* This runs via a timer and thus is always in BH context.
 *
 * Periodic ager: visits a slice of the hash table per run, sized so
 * the whole table is covered roughly once every ip_rt_gc_timeout
 * given runs every ip_rt_gc_interval, and frees entries that
 * rt_may_expire() (or a passed hard expiry) allows.
 */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;      /* bucket where the previous run stopped */
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        /* goal = buckets per run = table_size * interval / timeout */
        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == 0)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        /* Halving tmo makes the ager
                                         * stricter deeper in the chain. */
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker: stop once a jiffy has passed. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
672
673 /* This can run from both BH and non-BH contexts, the latter
674  * in the case of a forced flush event.
675  */
676 static void rt_run_flush(unsigned long dummy)
677 {
678         int i;
679         struct rtable *rth, *next;
680
681         rt_deadline = 0;
682
683         get_random_bytes(&rt_hash_rnd, 4);
684
685         for (i = rt_hash_mask; i >= 0; i--) {
686                 spin_lock_bh(rt_hash_lock_addr(i));
687                 rth = rt_hash_table[i].chain;
688                 if (rth)
689                         rt_hash_table[i].chain = NULL;
690                 spin_unlock_bh(rt_hash_lock_addr(i));
691
692                 for (; rth; rth = next) {
693                         next = rth->u.rt_next;
694                         rt_free(rth);
695                 }
696         }
697 }
698
/* Serializes rt_cache_flush()'s flush-timer and rt_deadline updates. */
static DEFINE_SPINLOCK(rt_flush_lock);
700
/*
 * Flush the route cache.  delay <= 0 flushes synchronously right here;
 * delay > 0 schedules rt_flush_timer; delay < 0 means "use
 * ip_rt_min_delay".  A pending delayed flush is never pushed past
 * rt_deadline, which is set at most ip_rt_max_delay after the first
 * request.
 */
void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state*/
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If flush timer is already running
                   and flush request is not immediate (delay > 0):

                   if deadline is not achieved, prolongate timer to "delay",
                   otherwise fire it at deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                /* Immediate flush: do the work outside the lock. */
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        /* First delayed request since the last flush: fix the deadline. */
        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
743
744 static void rt_secret_rebuild(unsigned long dummy)
745 {
746         unsigned long now = jiffies;
747
748         rt_cache_flush(0);
749         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750 }
751
752 /*
753    Short description of GC goals.
754
   We want to build an algorithm which keeps the routing cache
   at an equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is
   idle "expire" is large enough to keep plenty of warm entries,
   and when load increases it shrinks to limit the cache size.
762    and when load increases it reduces to limit cache size.
763  */
764
765 static int rt_garbage_collect(void)
766 {
767         static unsigned long expire = RT_GC_TIMEOUT;
768         static unsigned long last_gc;
769         static int rover;
770         static int equilibrium;
771         struct rtable *rth, **rthp;
772         unsigned long now = jiffies;
773         int goal;
774
775         /*
776          * Garbage collection is pretty expensive,
777          * do not make it too frequently.
778          */
779
780         RT_CACHE_STAT_INC(gc_total);
781
782         if (now - last_gc < ip_rt_gc_min_interval &&
783             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784                 RT_CACHE_STAT_INC(gc_ignored);
785                 goto out;
786         }
787
788         /* Calculate number of entries, which we want to expire now. */
789         goal = atomic_read(&ipv4_dst_ops.entries) -
790                 (ip_rt_gc_elasticity << rt_hash_log);
791         if (goal <= 0) {
792                 if (equilibrium < ipv4_dst_ops.gc_thresh)
793                         equilibrium = ipv4_dst_ops.gc_thresh;
794                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795                 if (goal > 0) {
796                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798                 }
799         } else {
800                 /* We are in dangerous area. Try to reduce cache really
801                  * aggressively.
802                  */
803                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805         }
806
807         if (now - last_gc >= ip_rt_gc_min_interval)
808                 last_gc = now;
809
810         if (goal <= 0) {
811                 equilibrium += goal;
812                 goto work_done;
813         }
814
815         do {
816                 int i, k;
817
818                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819                         unsigned long tmo = expire;
820
821                         k = (k + 1) & rt_hash_mask;
822                         rthp = &rt_hash_table[k].chain;
823                         spin_lock_bh(rt_hash_lock_addr(k));
824                         while ((rth = *rthp) != NULL) {
825                                 if (!rt_may_expire(rth, tmo, expire)) {
826                                         tmo >>= 1;
827                                         rthp = &rth->u.rt_next;
828                                         continue;
829                                 }
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831                                 /* remove all related balanced entries
832                                  * if necessary
833                                  */
834                                 if (rth->u.dst.flags & DST_BALANCED) {
835                                         int r;
836
837                                         rthp = rt_remove_balanced_route(
838                                                 &rt_hash_table[i].chain,
839                                                 rth,
840                                                 &r);
841                                         goal -= r;
842                                         if (!rthp)
843                                                 break;
844                                 } else {
845                                         *rthp = rth->u.rt_next;
846                                         rt_free(rth);
847                                         goal--;
848                                 }
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850                                 *rthp = rth->u.rt_next;
851                                 rt_free(rth);
852                                 goal--;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854                         }
855                         spin_unlock_bh(rt_hash_lock_addr(k));
856                         if (goal <= 0)
857                                 break;
858                 }
859                 rover = k;
860
861                 if (goal <= 0)
862                         goto work_done;
863
		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */
872
873                 RT_CACHE_STAT_INC(gc_goal_miss);
874
875                 if (expire == 0)
876                         break;
877
878                 expire >>= 1;
879 #if RT_CACHE_DEBUG >= 2
880                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
882 #endif
883
884                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885                         goto out;
886         } while (!in_softirq() && time_before_eq(jiffies, now));
887
888         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889                 goto out;
890         if (net_ratelimit())
891                 printk(KERN_WARNING "dst cache overflow\n");
892         RT_CACHE_STAT_INC(gc_dst_overflow);
893         return 1;
894
895 work_done:
896         expire += ip_rt_gc_min_interval;
897         if (expire > ip_rt_gc_timeout ||
898             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899                 expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
903 #endif
904 out:    return 0;
905 }
906
/*
 * Insert @rt into the routing cache chain for bucket @hash, or return
 * an existing entry that matches the same flow key.
 *
 * On a hit the cached entry is moved to the head of the chain, a
 * reference is taken on it, *rp points at it and @rt is dropped.
 * On a miss @rt is bound to an ARP neighbour (for output/unicast
 * forwarding routes) and linked in at the head of the chain.
 * Returns 0 on success or a negative errno from neighbour binding.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();	/* only retry GC from process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		/* balanced (multipath) entries never count as a plain hit */
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		/* Remember the lowest-scoring unreferenced entry as an
		 * eviction candidate in case the chain grows too long. */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily force aggressive GC, restore
				 * the tunables, then retry the insert. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
1037
1038 void rt_bind_peer(struct rtable *rt, int create)
1039 {
1040         static DEFINE_SPINLOCK(rt_peer_lock);
1041         struct inet_peer *peer;
1042
1043         peer = inet_getpeer(rt->rt_dst, create);
1044
1045         spin_lock_bh(&rt_peer_lock);
1046         if (rt->peer == NULL) {
1047                 rt->peer = peer;
1048                 peer = NULL;
1049         }
1050         spin_unlock_bh(&rt_peer_lock);
1051         if (peer)
1052                 inet_putpeer(peer);
1053 }
1054
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1063 {
1064         static DEFINE_SPINLOCK(ip_fb_id_lock);
1065         static u32 ip_fallback_id;
1066         u32 salt;
1067
1068         spin_lock_bh(&ip_fb_id_lock);
1069         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070         iph->id = htons(salt & 0xFFFF);
1071         ip_fallback_id = salt;
1072         spin_unlock_bh(&ip_fb_id_lock);
1073 }
1074
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076 {
1077         struct rtable *rt = (struct rtable *) dst;
1078
1079         if (rt) {
1080                 if (rt->peer == NULL)
1081                         rt_bind_peer(rt, 1);
1082
1083                 /* If peer is attached to destination, it is never detached,
1084                    so that we need not to grab a lock to dereference it.
1085                  */
1086                 if (rt->peer) {
1087                         iph->id = htons(inet_getid(rt->peer, more));
1088                         return;
1089                 }
1090         } else
1091                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 
1092                        __builtin_return_address(0));
1093
1094         ip_select_fb_ident(iph);
1095 }
1096
1097 static void rt_del(unsigned hash, struct rtable *rt)
1098 {
1099         struct rtable **rthp;
1100
1101         spin_lock_bh(rt_hash_lock_addr(hash));
1102         ip_rt_put(rt);
1103         for (rthp = &rt_hash_table[hash].chain; *rthp;
1104              rthp = &(*rthp)->u.rt_next)
1105                 if (*rthp == rt) {
1106                         *rthp = rt->u.rt_next;
1107                         rt_free(rt);
1108                         break;
1109                 }
1110         spin_unlock_bh(rt_hash_lock_addr(hash));
1111 }
1112
/*
 * Handle an ICMP redirect for @daddr: gateway @old_gw advised us to
 * use @new_gw instead.  After sanity-checking the advised gateway,
 * every matching cache entry is cloned, the clone is pointed at the
 * new gateway, bound to a neighbour and swapped into the hash table.
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };		/* exact source, then wildcard */
	int  ikeys[2] = { dev->ifindex, 0 };	/* exact ifindex, then wildcard */

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	/* Refuse redirects that are disabled or obviously bogus
	 * (unchanged, multicast, reserved or zero-net gateways). */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Visit all four (source, ifindex) key combinations. */
	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Pin the entry before leaving the RCU
				 * read-side critical section. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/* The clone is useless without a valid
				 * neighbour entry; kick off resolution
				 * and discard the clone in that case. */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
1242
/*
 * dst "negative advice" hook, called when a caller suspects the route
 * is bad.  Obsolete entries just lose our reference; redirected or
 * expiring entries are removed from the cache outright.  Returns the
 * (possibly NULL) dst the caller should keep using.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			/* Recompute the bucket from the flow key so the
			 * entry can be unlinked from the hash table. */
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
1269
1270 /*
1271  * Algorithm:
1272  *      1. The first ip_rt_redirect_number redirects are sent
1273  *         with exponential backoff, then we stop sending them at all,
1274  *         assuming that the host ignores our redirects.
1275  *      2. If we did not see packets requiring redirects
1276  *         during ip_rt_redirect_silence, we assume that the host
1277  *         forgot redirected route and start to send redirects again.
1278  *
1279  * This algorithm is much cheaper and more intelligent than dumb load limiting
1280  * in icmp.c.
1281  *
1282  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284  */
1285
/*
 * Send an ICMP redirect for the route attached to @skb, using the
 * exponential-backoff algorithm described above.  rate_tokens counts
 * redirects already sent; rate_last records when the last one went out.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  The delay doubles with every redirect sent.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
1333
1334 static int ip_error(struct sk_buff *skb)
1335 {
1336         struct rtable *rt = (struct rtable*)skb->dst;
1337         unsigned long now;
1338         int code;
1339
1340         switch (rt->u.dst.error) {
1341                 case EINVAL:
1342                 default:
1343                         goto out;
1344                 case EHOSTUNREACH:
1345                         code = ICMP_HOST_UNREACH;
1346                         break;
1347                 case ENETUNREACH:
1348                         code = ICMP_NET_UNREACH;
1349                         break;
1350                 case EACCES:
1351                         code = ICMP_PKT_FILTERED;
1352                         break;
1353         }
1354
1355         now = jiffies;
1356         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1359         rt->u.dst.rate_last = now;
1360         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363         }
1364
1365 out:    kfree_skb(skb);
1366         return 0;
1367
1368
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/* Return the largest plateau value strictly below @old_mtu, falling
 * back to the 68-byte IPv4 minimum when no plateau fits.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int idx;

	for (idx = 0; idx < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); idx++) {
		if (old_mtu > mtu_plateau[idx])
			return mtu_plateau[idx];
	}
	return 68;
}
1386
/*
 * Process an ICMP "fragmentation needed" for the flow described by
 * @iph: lower the cached path MTU of every matching output route to
 * @new_mtu (clamped to ip_rt_min_pmtu, or guessed from the plateau
 * table when new_mtu is implausible).  Returns the estimated MTU, or
 * new_mtu unchanged when no cache entry matched.
 */
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };	/* exact source, then wildcard */
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				/* Out-of-range MTU in the ICMP: the router
				 * did not tell us; guess from the plateau
				 * table instead. */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
						dst_confirm(&rth->u.dst);
						/* Never go below the floor;
						 * lock the metric instead. */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
1445
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 {
1448         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449             !(dst_metric_locked(dst, RTAX_MTU))) {
1450                 if (mtu < ip_rt_min_pmtu) {
1451                         mtu = ip_rt_min_pmtu;
1452                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453                 }
1454                 dst->metrics[RTAX_MTU-1] = mtu;
1455                 dst_set_expires(dst, ip_rt_mtu_expires);
1456         }
1457 }
1458
/* dst "check" hook.  NOTE(review): always returns NULL, which appears
 * to tell callers the cached entry can never be revalidated once
 * checked — confirm against the dst_ops callers' expectations.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1463
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 {
1466         struct rtable *rt = (struct rtable *) dst;
1467         struct inet_peer *peer = rt->peer;
1468         struct in_device *idev = rt->idev;
1469
1470         if (peer) {
1471                 rt->peer = NULL;
1472                 inet_putpeer(peer);
1473         }
1474
1475         if (idev) {
1476                 rt->idev = NULL;
1477                 in_dev_put(idev);
1478         }
1479 }
1480
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482                             int how)
1483 {
1484         struct rtable *rt = (struct rtable *) dst;
1485         struct in_device *idev = rt->idev;
1486         if (dev != &loopback_dev && idev && idev->dev == dev) {
1487                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488                 if (loopback_idev) {
1489                         rt->idev = loopback_idev;
1490                         in_dev_put(idev);
1491                 }
1492         }
1493 }
1494
/* Report an unreachable host back to the sender of @skb and make the
 * attached route eligible for immediate expiry.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}
1505
1506 static int ip_rt_bug(struct sk_buff *skb)
1507 {
1508         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510                 skb->dev ? skb->dev->name : "?");
1511         kfree_skb(skb);
1512         return 0;
1513 }
1514
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1523
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 {
1526         u32 src;
1527         struct fib_result res;
1528
1529         if (rt->fl.iif == 0)
1530                 src = rt->rt_src;
1531         else if (fib_lookup(&rt->fl, &res) == 0) {
1532                 src = FIB_RES_PREFSRC(res);
1533                 fib_res_put(&res);
1534         } else
1535                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536                                         RT_SCOPE_UNIVERSE);
1537         memcpy(addr, &src, 4);
1538 }
1539
#ifdef CONFIG_NET_CLS_ROUTE
/* Fill in the low and high 16-bit halves of the route's tclassid from
 * @tag, touching only halves that are still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1549
/*
 * Fill in the nexthop-derived fields of a freshly built cache entry
 * @rt from the FIB result @res: gateway, metrics (with MTU, hoplimit
 * and advmss defaults and clamps) and, when configured, the routing
 * class tag derived from @itag.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			/* No MTU configured: take it from the device, but
			 * fall back to 576 on locked-MTU gatewayed paths. */
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	/* Apply defaults and upper bounds for the derived metrics. */
	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1591
/*
 * Build and cache an input route for a multicast destination @daddr
 * received from @saddr on @dev.  @our is non-zero when the local host
 * is a member of the group, in which case the packet is also
 * delivered locally.  Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		/* Source 0.0.0.0 is only acceptable for link-local groups. */
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Multicast input routes must never be used for output. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups may be forwarded by the mrouted path. */
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
1672
1673
1674 static void ip_handle_martian_source(struct net_device *dev,
1675                                      struct in_device *in_dev,
1676                                      struct sk_buff *skb,
1677                                      u32 daddr,
1678                                      u32 saddr) 
1679 {
1680         RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683                 /*
1684                  *      RFC1812 recommendation, if source is martian,
1685                  *      the only hint is MAC header.
1686                  */
1687                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688                         "%u.%u.%u.%u, on dev %s\n",
1689                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690                 if (dev->hard_header_len && skb->mac.raw) {
1691                         int i;
1692                         unsigned char *p = skb->mac.raw;
1693                         printk(KERN_WARNING "ll header: ");
1694                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1695                                 printk("%02x", *p);
1696                                 if (i < (dev->hard_header_len - 1))
1697                                         printk(":");
1698                         }
1699                         printk("\n");
1700                 }
1701         }
1702 #endif
1703 }
1704
/*
 * Build a forwarding (input-to-output) routing cache entry for the
 * nexthop currently selected in @res.
 *
 * On success *@result holds the new entry (with one reference taken)
 * and 0 is returned.  Returns -EINVAL for martian sources or invalid
 * proxy-ARP situations, -ENOBUFS on allocation failure.
 */
static inline int __mkroute_input(struct sk_buff *skb, 
				  struct fib_result* res, 
				  struct in_device *in_dev, 
				  u32 daddr, u32 saddr, u32 tos, 
				  struct rtable **result) 
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* A unicast fib result without an inet output device
		 * should be impossible; complain (rate limited).
		 */
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Reverse-path check: is saddr reachable back through in_dev? */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
					 saddr);
		
		err = -EINVAL;
		goto cleanup;
	}

	/* Positive err means the source is on-link but via another path. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave through the interface it arrived on: send an
	 * ICMP redirect if the media is shared or the sender can reach the
	 * gateway directly.
	 */
	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	/* Mark multipath entries so the cache treats them specially. */
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst     = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src     = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	/* Forwarding path: ip_forward() on input, ip_output() on output. */
	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	/* Fill in gateway, metrics and classification from the fib result. */
	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
1804
/*
 * Default (non-multipath-cached) path for creating an input route:
 * optionally pick one nexthop among equal-cost alternatives, build a
 * single cache entry and insert it, attaching it to skb->dst.
 *
 * Returns 0 on success or the negative errno from __mkroute_input() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_input_def(struct sk_buff *skb, 
				       struct fib_result* res, 
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Choose one nexthop when the route has several and no output
	 * interface was forced by the caller.
	 */
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
1829
/*
 * Create the input route(s) for a forwarded packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this simply delegates to
 * ip_mkroute_input_def().  With it, every nexthop of a multipath route
 * gets its own cache entry so the cached-multipath algorithms can pick
 * among them later; the last interned entry becomes skb->dst.
 */
static inline int ip_mkroute_input(struct sk_buff *skb, 
				   struct fib_result* res, 
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL, *rtres;
	unsigned char hop, hopcount;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);
	
	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* put reference to previous result */
		if (hop)
			ip_rt_put(rtres);

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, &rtres);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		/* NOTE(review): rth is passed here after rt_intern_hash();
		 * this presumes the interned entry is still valid at this
		 * point — confirm against rt_intern_hash() semantics.
		 */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));
	}
	/* The last alternative interned becomes the packet's route. */
	skb->dst = &rtres->u.dst;
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
}
1885
1886
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1896
/*
 * Slow path for input route resolution: called on a routing cache miss.
 * Validates the addresses (martian filtering), consults the FIB, and
 * creates the appropriate cache entry: forwarded unicast, broadcast,
 * local delivery, or an error ("unreachable") entry for caching
 * negative results.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EHOSTUNREACH,
 * -ENOBUFS).  On success skb->dst is set by rt_intern_hash().
 */
static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		/* No route: hosts get "unreachable" cached, routers may
		 * still generate ICMP, handled via no_route.
		 */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		/* Reverse-path validate the source before delivering
		 * locally; a positive result marks a direct source.
		 */
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarded unicast: build the cache entry/entries. */
	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;
	
done:
	/* Common exit: drop the in_dev reference and the fib result. */
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	/* Broadcast (or zero-destination) packets: IP frames only. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a local-delivery (or unreachable) cache entry. */
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	/* Local delivery goes via loopback; hold device + inet refs. */
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/* Negative result: cache an error-generating entry. */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2082
/*
 * Input route lookup entry point.  Fast path: walk the routing cache
 * hash chain under RCU and reuse a matching entry.  On a miss, handle
 * multicast destinations specially (see comment below) and otherwise
 * fall through to the slow path.
 *
 * Returns 0 on success (skb->dst set) or a negative errno.
 */
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		/* An entry matches only if every flow-key field does. */
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			/* Cache hit: refresh usage stats, take a reference
			 * before leaving the RCU section, and attach.
			 */
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			/* Route it if we are a group member, or (with
			 * multicast routing) if we may need to forward it.
			 */
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
2149
/*
 * Build an output routing cache entry for the flow @fl resolved over
 * @dev_out.  @oldflp is the caller's original (pre-resolution) flow key
 * and is what the entry is keyed on; @fl carries the resolved
 * source/destination.
 *
 * On success *@result holds the new entry (one reference taken) and 0
 * is returned; otherwise -EINVAL for invalid address combinations or
 * -ENOBUFS on allocation failure.
 */
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res, 
				   const struct flowi *fl,
				   const struct flowi *oldflp, 
				   struct net_device *dev_out, 
				   unsigned flags) 
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	/* A loopback source may only leave via the loopback device. */
	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		/* Broadcast never routes through a gateway: drop the fib
		 * info so rt_set_nexthop() uses device defaults.
		 */
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		   Yes, it is hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	/* Key the cache entry on the caller's original flow... */
	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	/* ...but route with the resolved addresses. */
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be hold by the routing 
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		/* Locally-originated bcast/mcast that also leaves the host
		 * needs the multicast output path (loops a copy back).
		 */
		if (flags & RTCF_LOCAL && 
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
2277
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279                                         struct fib_result* res,
2280                                         const struct flowi *fl,
2281                                         const struct flowi *oldflp,
2282                                         struct net_device *dev_out,
2283                                         unsigned flags)
2284 {
2285         struct rtable *rth = NULL;
2286         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287         unsigned hash;
2288         if (err == 0) {
2289                 u32 tos = RT_FL_TOS(oldflp);
2290
2291                 hash = rt_hash_code(oldflp->fl4_dst, 
2292                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2293                 err = rt_intern_hash(hash, rth, rp);
2294         }
2295         
2296         return err;
2297 }
2298
/*
 * Create the output route(s) for a flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is just
 * ip_mkroute_output_def().  With it, a multipath route gets one cache
 * entry per nexthop so the cached-multipath algorithms can choose among
 * them later; *@rp ends up pointing at the last interned entry.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst, 
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			/* NOTE(review): rth is used here after
			 * rt_intern_hash(), and before err is checked;
			 * this presumes the entry is still valid —
			 * confirm against rt_intern_hash() semantics.
			 */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		/* Single nexthop: fall back to the default path. */
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2362
2363 /*
2364  * Major route resolver routine.
2365  */
2366
2367 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368 {
2369         u32 tos = RT_FL_TOS(oldflp);
2370         struct flowi fl = { .nl_u = { .ip4_u =
2371                                       { .daddr = oldflp->fl4_dst,
2372                                         .saddr = oldflp->fl4_src,
2373                                         .tos = tos & IPTOS_RT_MASK,
2374                                         .scope = ((tos & RTO_ONLINK) ?
2375                                                   RT_SCOPE_LINK :
2376                                                   RT_SCOPE_UNIVERSE),
2377 #ifdef CONFIG_IP_ROUTE_FWMARK
2378                                         .fwmark = oldflp->fl4_fwmark
2379 #endif
2380                                       } },
2381                             .iif = loopback_dev.ifindex,
2382                             .oif = oldflp->oif };
2383         struct fib_result res;
2384         unsigned flags = 0;
2385         struct net_device *dev_out = NULL;
2386         int free_res = 0;
2387         int err;
2388
2389
2390         res.fi          = NULL;
2391 #ifdef CONFIG_IP_MULTIPLE_TABLES
2392         res.r           = NULL;
2393 #endif
2394
2395         if (oldflp->fl4_src) {
2396                 err = -EINVAL;
2397                 if (MULTICAST(oldflp->fl4_src) ||
2398                     BADCLASS(oldflp->fl4_src) ||
2399                     ZERONET(oldflp->fl4_src))
2400                         goto out;
2401
2402                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2403                 dev_out = ip_dev_find(oldflp->fl4_src);
2404                 if (dev_out == NULL)
2405                         goto out;
2406
2407                 /* I removed check for oif == dev_out->oif here.
2408                    It was wrong for two reasons:
2409                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2410                       assigned to multiple interfaces.
2411                    2. Moreover, we are allowed to send packets with saddr
2412                       of another iface. --ANK
2413                  */
2414
2415                 if (oldflp->oif == 0
2416                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2417                         /* Special hack: user can direct multicasts
2418                            and limited broadcast via necessary interface
2419                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2420                            This hack is not just for fun, it allows
2421                            vic,vat and friends to work.
2422                            They bind socket to loopback, set ttl to zero
2423                            and expect that it will work.
2424                            From the viewpoint of routing cache they are broken,
2425                            because we are not allowed to build multicast path
2426                            with loopback source addr (look, routing cache
2427                            cannot know, that ttl is zero, so that packet
2428                            will not leave this host and route is valid).
2429                            Luckily, this hack is good workaround.
2430                          */
2431
2432                         fl.oif = dev_out->ifindex;
2433                         goto make_route;
2434                 }
2435                 if (dev_out)
2436                         dev_put(dev_out);
2437                 dev_out = NULL;
2438         }
2439
2440
2441         if (oldflp->oif) {
2442                 dev_out = dev_get_by_index(oldflp->oif);
2443                 err = -ENODEV;
2444                 if (dev_out == NULL)
2445                         goto out;
2446                 if (__in_dev_get(dev_out) == NULL) {
2447                         dev_put(dev_out);
2448                         goto out;       /* Wrong error code */
2449                 }
2450
2451                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2452                         if (!fl.fl4_src)
2453                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2454                                                               RT_SCOPE_LINK);
2455                         goto make_route;
2456                 }
2457                 if (!fl.fl4_src) {
2458                         if (MULTICAST(oldflp->fl4_dst))
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               fl.fl4_scope);
2461                         else if (!oldflp->fl4_dst)
2462                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2463                                                               RT_SCOPE_HOST);
2464                 }
2465         }
2466
2467         if (!fl.fl4_dst) {
2468                 fl.fl4_dst = fl.fl4_src;
2469                 if (!fl.fl4_dst)
2470                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2471                 if (dev_out)
2472                         dev_put(dev_out);
2473                 dev_out = &loopback_dev;
2474                 dev_hold(dev_out);
2475                 fl.oif = loopback_dev.ifindex;
2476                 res.type = RTN_LOCAL;
2477                 flags |= RTCF_LOCAL;
2478                 goto make_route;
2479         }
2480
2481         if (fib_lookup(&fl, &res)) {
2482                 res.fi = NULL;
2483                 if (oldflp->oif) {
2484                         /* Apparently, routing tables are wrong. Assume,
2485                            that the destination is on link.
2486
2487                            WHY? DW.
2488                            Because we are allowed to send to iface
2489                            even if it has NO routes and NO assigned
2490                            addresses. When oif is specified, routing
2491                            tables are looked up with only one purpose:
2492                            to catch if destination is gatewayed, rather than
2493                            direct. Moreover, if MSG_DONTROUTE is set,
2494                            we send packet, ignoring both routing tables
2495                            and ifaddr state. --ANK
2496
2497
2498                            We could make it even if oif is unknown,
2499                            likely IPv6, but we do not.
2500                          */
2501
2502                         if (fl.fl4_src == 0)
2503                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2504                                                               RT_SCOPE_LINK);
2505                         res.type = RTN_UNICAST;
2506                         goto make_route;
2507                 }
2508                 if (dev_out)
2509                         dev_put(dev_out);
2510                 err = -ENETUNREACH;
2511                 goto out;
2512         }
2513         free_res = 1;
2514
2515         if (res.type == RTN_LOCAL) {
2516                 if (!fl.fl4_src)
2517                         fl.fl4_src = fl.fl4_dst;
2518                 if (dev_out)
2519                         dev_put(dev_out);
2520                 dev_out = &loopback_dev;
2521                 dev_hold(dev_out);
2522                 fl.oif = dev_out->ifindex;
2523                 if (res.fi)
2524                         fib_info_put(res.fi);
2525                 res.fi = NULL;
2526                 flags |= RTCF_LOCAL;
2527                 goto make_route;
2528         }
2529
2530 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2531         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2532                 fib_select_multipath(&fl, &res);
2533         else
2534 #endif
2535         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2536                 fib_select_default(&fl, &res);
2537
2538         if (!fl.fl4_src)
2539                 fl.fl4_src = FIB_RES_PREFSRC(res);
2540
2541         if (dev_out)
2542                 dev_put(dev_out);
2543         dev_out = FIB_RES_DEV(res);
2544         dev_hold(dev_out);
2545         fl.oif = dev_out->ifindex;
2546
2547
2548 make_route:
2549         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2550
2551
2552         if (free_res)
2553                 fib_res_put(&res);
2554         if (dev_out)
2555                 dev_put(dev_out);
2556 out:    return err;
2557 }
2558
/*
 * Resolve an output route for the flow key @flp.  The routing cache is
 * consulted first; on a miss we fall through to ip_route_output_slow(),
 * which performs the full FIB lookup and installs a cache entry.
 * On success *rp points to a referenced rtable and 0 is returned;
 * otherwise a negative errno from the slow path is returned.
 */
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.rt_next)) {
		/* A cache hit must be an output route (iif == 0) with an
		 * exact dst/src/oif (and fwmark, when configured) match;
		 * TOS is compared only in the bits routing actually uses. */
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				/* take the reference before leaving the
				 * RCU read-side critical section */
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);	/* ref taken while still under RCU */
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	/* Cache miss: do the real work. */
	return ip_route_output_slow(rp, flp);
}
2603
2604 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2605
2606 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2607 {
2608         int err;
2609
2610         if ((err = __ip_route_output_key(rp, flp)) != 0)
2611                 return err;
2612
2613         if (flp->proto) {
2614                 if (!flp->fl4_src)
2615                         flp->fl4_src = (*rp)->rt_src;
2616                 if (!flp->fl4_dst)
2617                         flp->fl4_dst = (*rp)->rt_dst;
2618                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2619         }
2620
2621         return 0;
2622 }
2623
2624 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2625
/* Convenience wrapper: output route lookup by key only, with no socket
 * context and no extra xfrm flags. */
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
2630
/*
 * Build one RTM_NEWROUTE-style netlink message describing the route
 * attached to skb->dst.  Returns skb->len on success, -1 if the skb ran
 * out of room (the partially written message is trimmed off), and 0 may
 * be propagated from ipmr_get_route() in the !nowait multicast case.
 *
 * Note: NLMSG_NEW and RTA_PUT are macros that jump to nlmsg_failure /
 * rtattr_failure respectively when the skb has insufficient tailroom.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;	/* rollback point on failure */
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
	r = NLMSG_DATA(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries are clones; only the upper flag bits are exported. */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
		__u32 alg = rt->rt_multipath_alg;

		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
	}
#endif
	/* Input routes report the selected special destination; output
	 * routes report the source only when it differs from the flow key. */
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	/* Cache bookkeeping, expressed in clock ticks for userspace. */
	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	/* Remember where RTA_CACHEINFO lands so the multicast path below
	 * can patch rta_error in after the fact. */
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					/* report the error via cacheinfo
					 * rather than failing the dump */
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Out of skb room: drop everything written since entry. */
	skb_trim(skb, b - skb->data);
	return -1;
}
2732
/*
 * RTM_GETROUTE handler: resolve the route described by the request's
 * RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF attributes and unicast a single
 * RTM_NEWROUTE reply back to the requester.
 *
 * When an input interface is given, the lookup is done by pushing a
 * dummy skb through ip_route_input() as if the packet had arrived on
 * that device; otherwise a plain output-key lookup is performed.
 */
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err = -ENOBUFS;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		/* ip_route_input() attaches the result to skb->dst; a
		 * blackhole/error route succeeds but records dst.error. */
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	/* rt_fill_info() reads the route via skb->dst; kfree_skb() on the
	 * error paths below releases this reference too. */
	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
				RTM_NEWROUTE, 0, 0);
	/* rt_fill_info() returns skb->len (>0) on success, <= 0 otherwise */
	if (!err)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
out:	return err;

out_free:
	kfree_skb(skb);
	goto out;
}
2811
/*
 * Netlink dump callback: walk the entire route cache and emit one
 * RTM_NEWROUTE message per cached entry.  Resume state across dump
 * continuations lives in cb->args[0] (hash bucket) and cb->args[1]
 * (entry index within that bucket).
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;	/* bucket already dumped */
		if (h > s_h)
			s_idx = 0;	/* fresh bucket: start at its head */
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			/* rt_fill_info() expects the route on skb->dst */
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: record resume point and stop */
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
2847
/* A multicast configuration change on @in_dev can invalidate cached
 * routes; flush the whole route cache immediately (delay 0). */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2852
2853 #ifdef CONFIG_SYSCTL
2854 static int flush_delay;
2855
2856 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857                                         struct file *filp, void __user *buffer,
2858                                         size_t *lenp, loff_t *ppos)
2859 {
2860         if (write) {
2861                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2862                 rt_cache_flush(flush_delay);
2863                 return 0;
2864         } 
2865
2866         return -EINVAL;
2867 }
2868
/*
 * sysctl(2) binary-interface counterpart of the "flush" knob: read the
 * int supplied by the caller and flush the route cache with that delay.
 * Only an exact sizeof(int) write is accepted.
 */
static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen,
						void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT; 
	rt_cache_flush(delay); 
	return 0;
}
2886
/*
 * Sysctl knobs exported under /proc/sys/net/ipv4/route/.  Most entries
 * are plain integer tunables backed by the ip_rt_* module variables;
 * jiffies-valued ones use the jiffies proc handler/strategy pair so
 * userspace sees seconds.  "flush" is the write-only cache-flush
 * trigger handled above.
 */
ctl_table ipv4_route_table[] = {
	{
		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */
 
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		/* Millisecond-resolution view of the same variable. */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }	/* sentinel */
};
3053 #endif
3054
3055 #ifdef CONFIG_NET_CLS_ROUTE
3056 struct ip_rt_acct *ip_rt_acct;
3057
3058 /* This code sucks.  But you should have seen it before! --RR */
3059
3060 /* IP route accounting ptr for this logical cpu number. */
3061 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3062
3063 #ifdef CONFIG_PROC_FS
3064 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065                            int length, int *eof, void *data)
3066 {
3067         unsigned int i;
3068
3069         if ((offset & 3) || (length & 3))
3070                 return -EIO;
3071
3072         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073                 *eof = 1;
3074                 return 0;
3075         }
3076
3077         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3079                 *eof = 1;
3080         }
3081
3082         offset /= sizeof(u32);
3083
3084         if (length > 0) {
3085                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086                 u32 *dst = (u32 *) buffer;
3087
3088                 /* Copy first cpu. */
3089                 *start = buffer;
3090                 memcpy(dst, src, length);
3091
3092                 /* Add the other cpus in, one int at a time */
3093                 for_each_cpu(i) {
3094                         unsigned int j;
3095
3096                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098                         for (j = 0; j < length/4; j++)
3099                                 dst[j] += src[j];
3100                 }
3101         }
3102         return length;
3103 }
3104 #endif /* CONFIG_PROC_FS */
3105 #endif /* CONFIG_NET_CLS_ROUTE */
3106
3107 static __initdata unsigned long rhash_entries;
3108 static int __init set_rhash_entries(char *str)
3109 {
3110         if (!str)
3111                 return 0;
3112         rhash_entries = simple_strtoul(str, &str, 0);
3113         return 1;
3114 }
3115 __setup("rhash_entries=", set_rhash_entries);
3116
/*
 * Boot-time initialization of the IPv4 routing subsystem: seeds the
 * cache hash secret, allocates the dst slab and the route-cache hash
 * table (panicking on failure, as this is essential), sizes the GC
 * thresholds from the table size, allocates per-cpu stats, initializes
 * devinet/FIB, arms the flush/GC/secret-rehash timers, and registers
 * the /proc entries.  Returns 0 on success or -ENOMEM.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

	/* Seed for the keyed route-cache hash (rebuilt periodically). */
	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	/* Round the per-cpu accounting area up to a whole page order. */
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

	/* Size the hash by memory unless overridden by rhash_entries=. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
						(27 - PAGE_SHIFT) :
						(29 - PAGE_SHIFT),
					HASH_HIGHMEM,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds derived from the actual table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	/* /proc/net/rt_cache (dump) and /proc/net/stat/rt_cache (stats). */
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
					     proc_net_stat))) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	return rc;
}
3208
3209 EXPORT_SYMBOL(__ip_select_ident);
3210 EXPORT_SYMBOL(ip_route_input);
3211 EXPORT_SYMBOL(ip_route_output_key);