/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
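
/*
 * Example (sketch, not built): the table above is indexed with the four
 * TOS bits shifted down by one; rt_tos2priority() in include/net/route.h
 * performs the same mapping for real traffic.
 */
#if 0
static inline char example_tos2prio(u8 tos)
{
        /* IPTOS_TOS() keeps bits 1..4 of the TOS byte; >> 1 turns the
         * even values 0..30 into a 0..15 index into ip_tos2prio[]. */
        return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif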


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ 4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ 2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ 1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ 512
#else
#define RT_HASH_LOCK_SZ 256
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
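
/*
 * A minimal sketch (not built) of the reader side of the locking scheme
 * described above: hash the flow key, walk the bucket chain inside
 * rcu_read_lock_bh(), and take a reference with dst_hold() before
 * leaving the read-side critical section.  The real lookups in
 * ip_route_input()/ip_route_output() also match iif/oif; this is
 * illustration only.
 */
#if 0
static struct rtable *example_cache_lookup(u32 daddr, u32 saddr, u8 tos)
{
        unsigned hash = rt_hash_code(daddr, saddr, tos);
        struct rtable *rth;

        rcu_read_lock_bh();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.rt_next)) {
                if (rth->fl.fl4_dst == daddr &&
                    rth->fl.fl4_src == saddr &&
                    rth->fl.fl4_tos == tos) {
                        /* atomic refcount increment, rule 3 above */
                        dst_hold(&rth->u.dst);
                        rcu_read_unlock_bh();
                        return rth;
                }
        }
        rcu_read_unlock_bh();
        return NULL;
}
#endif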

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
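
/*
 * Worked example (illustration only): for an entry last used "now"
 * (age 0), ~age & ~(3<<30) leaves 0x3FFFFFFF in the low 30 bits, so
 * fresher entries score higher and older ones decay toward zero.  A
 * redirected or expiring entry sets bit 31; an output or plain unicast
 * entry sets bit 30.  rt_intern_hash() below evicts the entry with the
 * minimum score when a chain grows too long.
 */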

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                *rthp = rth->u.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.rt_next;

                        rthp = &rth->u.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == NULL)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state */
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer to "delay",
                   otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now + delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not make it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate number of entries, which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in dangerous area. Try to reduce cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                                /* remove all related balanced entries
                                 * if necessary
                                 */
                                if (rth->u.dst.flags & DST_BALANCED) {
                                        int r;

                                        rthp = rt_remove_balanced_route(
                                                &rt_hash_table[k].chain,
                                                rth,
                                                &r);
                                        goal -= r;
                                        if (!rthp)
                                                break;
                                } else {
                                        *rthp = rth->u.rt_next;
                                        rt_free(rth);
                                        goal--;
                                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire is reduced to zero; otherwise, expire is halved.
                   - the table is not full.
                   - we are called from interrupt (softirq) context.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be average length of chain
                 * length, when exceeded gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If peer is attached to destination, it is never detached,
                   so that we need not to grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
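/*
 * Worked example (illustration only) with the default sysctls above:
 * ip_rt_redirect_load is HZ/50, so redirect N (rate_tokens == N) may
 * only be sent once jiffies pass rate_last + (HZ/50 << N), i.e. the gap
 * doubles each time.  After ip_rt_redirect_number (9) redirects the
 * host is presumed deaf and nothing more is sent until
 * ip_rt_redirect_silence ((HZ/50) << 10) jiffies of quiet reset
 * rate_tokens to zero.
 */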

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything
         * set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
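
/*
 * Worked example (illustration only): a Frag Needed ICMP carrying no
 * next-hop MTU forces us to guess from the total length of the packet
 * that did not fit.  guess_mtu(1500) returns 1492, the next plateau
 * strictly below 1500; guess_mtu(576) returns 296; anything at or below
 * the last plateau (128) falls through to the RFC 791 minimum of 68.
 */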
1384
1385 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1386 {
1387         int i;
1388         unsigned short old_mtu = ntohs(iph->tot_len);
1389         struct rtable *rth;
1390         u32  skeys[2] = { iph->saddr, 0, };
1391         u32  daddr = iph->daddr;
1392         u8   tos = iph->tos & IPTOS_RT_MASK;
1393         unsigned short est_mtu = 0;
1394
1395         if (ipv4_config.no_pmtu_disc)
1396                 return 0;
1397
1398         for (i = 0; i < 2; i++) {
1399                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1400
1401                 rcu_read_lock();
1402                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1403                      rth = rcu_dereference(rth->u.rt_next)) {
1404                         if (rth->fl.fl4_dst == daddr &&
1405                             rth->fl.fl4_src == skeys[i] &&
1406                             rth->rt_dst  == daddr &&
1407                             rth->rt_src  == iph->saddr &&
1408                             rth->fl.fl4_tos == tos &&
1409                             rth->fl.iif == 0 &&
1410                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1411                                 unsigned short mtu = new_mtu;
1412
1413                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1414
1415                                         /* BSD 4.2 compatibility hack :-( */
1416                                         if (mtu == 0 &&
1417                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1418                                             old_mtu >= 68 + (iph->ihl << 2))
1419                                                 old_mtu -= iph->ihl << 2;
1420
1421                                         mtu = guess_mtu(old_mtu);
1422                                 }
1423                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1424                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1425                                                 dst_confirm(&rth->u.dst);
1426                                                 if (mtu < ip_rt_min_pmtu) {
1427                                                         mtu = ip_rt_min_pmtu;
1428                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1429                                                                 (1 << RTAX_MTU);
1430                                                 }
1431                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1432                                                 dst_set_expires(&rth->u.dst,
1433                                                         ip_rt_mtu_expires);
1434                                         }
1435                                         est_mtu = mtu;
1436                                 }
1437                         }
1438                 }
1439                 rcu_read_unlock();
1440         }
1441         return est_mtu ? : new_mtu;
1442 }
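/*
 * Note on the double probe above (illustrative): skeys[] makes the
 * cache lookup run twice, once keyed on the packet's real source
 * address and once on a wildcard source of 0, so cached routes created
 * without a specific source also get their RTAX_MTU clamped when a
 * Fragmentation Needed message arrives.
 */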
1443
1444 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1445 {
1446         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1447             !(dst_metric_locked(dst, RTAX_MTU))) {
1448                 if (mtu < ip_rt_min_pmtu) {
1449                         mtu = ip_rt_min_pmtu;
1450                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1451                 }
1452                 dst->metrics[RTAX_MTU-1] = mtu;
1453                 dst_set_expires(dst, ip_rt_mtu_expires);
1454         }
1455 }
1456
1457 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1458 {
1459         return NULL;
1460 }
1461
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1463 {
1464         struct rtable *rt = (struct rtable *) dst;
1465         struct inet_peer *peer = rt->peer;
1466         struct in_device *idev = rt->idev;
1467
1468         if (peer) {
1469                 rt->peer = NULL;
1470                 inet_putpeer(peer);
1471         }
1472
1473         if (idev) {
1474                 rt->idev = NULL;
1475                 in_dev_put(idev);
1476         }
1477 }
1478
1479 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1480                             int how)
1481 {
1482         struct rtable *rt = (struct rtable *) dst;
1483         struct in_device *idev = rt->idev;
1484         if (dev != &loopback_dev && idev && idev->dev == dev) {
1485                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1486                 if (loopback_idev) {
1487                         rt->idev = loopback_idev;
1488                         in_dev_put(idev);
1489                 }
1490         }
1491 }
1492
1493 static void ipv4_link_failure(struct sk_buff *skb)
1494 {
1495         struct rtable *rt;
1496
1497         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1498
1499         rt = (struct rtable *) skb->dst;
1500         if (rt)
1501                 dst_set_expires(&rt->u.dst, 0);
1502 }
1503
1504 static int ip_rt_bug(struct sk_buff *skb)
1505 {
1506         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1507                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1508                 skb->dev ? skb->dev->name : "?");
1509         kfree_skb(skb);
1510         return 0;
1511 }
1512
1513 /*
1514    We do not cache the source address of the outgoing interface,
1515    because it is used only by the IP RR, TS and SRR options,
1516    so it is out of the fast path.
1517
1518    BTW remember: "addr" is allowed to be unaligned
1519    in IP options!
1520  */
1521
1522 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1523 {
1524         u32 src;
1525         struct fib_result res;
1526
1527         if (rt->fl.iif == 0)
1528                 src = rt->rt_src;
1529         else if (fib_lookup(&rt->fl, &res) == 0) {
1530                 src = FIB_RES_PREFSRC(res);
1531                 fib_res_put(&res);
1532         } else
1533                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1534                                         RT_SCOPE_UNIVERSE);
1535         memcpy(addr, &src, 4);
1536 }
1537
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539 static void set_class_tag(struct rtable *rt, u32 tag)
1540 {
1541         if (!(rt->u.dst.tclassid & 0xFFFF))
1542                 rt->u.dst.tclassid |= tag & 0xFFFF;
1543         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1544                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1545 }
1546 #endif
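/*
 * Illustrative example (values are hypothetical): set_class_tag() fills
 * each 16-bit half of tclassid only while that half is still zero, so
 * starting from tclassid == 0x00001234, set_class_tag(rt, 0xABCD5678)
 * keeps the low half and yields 0xABCD1234.
 */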
1547
1548 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1549 {
1550         struct fib_info *fi = res->fi;
1551
1552         if (fi) {
1553                 if (FIB_RES_GW(*res) &&
1554                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1555                         rt->rt_gateway = FIB_RES_GW(*res);
1556                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1557                        sizeof(rt->u.dst.metrics));
1558                 if (fi->fib_mtu == 0) {
1559                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1560                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1561                             rt->rt_gateway != rt->rt_dst &&
1562                             rt->u.dst.dev->mtu > 576)
1563                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1564                 }
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1567 #endif
1568         } else
1569                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1570
1571         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1572                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1573         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1574                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1575         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1576                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1577                                        ip_rt_min_advmss);
1578         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1579                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1580
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 #ifdef CONFIG_IP_MULTIPLE_TABLES
1583         set_class_tag(rt, fib_rules_tclass(res));
1584 #endif
1585         set_class_tag(rt, itag);
1586 #endif
1587         rt->rt_type = res->type;
1588 }
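/*
 * Worked example (illustrative, assuming a stock Ethernet nexthop with
 * no per-route metrics): rt_set_nexthop() above leaves RTAX_MTU at the
 * device MTU (1500), defaults RTAX_HOPLIMIT to sysctl_ip_default_ttl
 * (64 unless tuned) and sets RTAX_ADVMSS to mtu - 40 == 1460, i.e. the
 * MTU less 40 bytes of minimal IPv4 + TCP headers.
 */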
1589
1590 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1591                                 u8 tos, struct net_device *dev, int our)
1592 {
1593         unsigned hash;
1594         struct rtable *rth;
1595         u32 spec_dst;
1596         struct in_device *in_dev = in_dev_get(dev);
1597         u32 itag = 0;
1598
1599         /* Primary sanity checks. */
1600
1601         if (in_dev == NULL)
1602                 return -EINVAL;
1603
1604         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1605             skb->protocol != htons(ETH_P_IP))
1606                 goto e_inval;
1607
1608         if (ZERONET(saddr)) {
1609                 if (!LOCAL_MCAST(daddr))
1610                         goto e_inval;
1611                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1612         } else if (fib_validate_source(saddr, 0, tos, 0,
1613                                         dev, &spec_dst, &itag) < 0)
1614                 goto e_inval;
1615
1616         rth = dst_alloc(&ipv4_dst_ops);
1617         if (!rth)
1618                 goto e_nobufs;
1619
1620         rth->u.dst.output= ip_rt_bug;
1621
1622         atomic_set(&rth->u.dst.__refcnt, 1);
1623         rth->u.dst.flags= DST_HOST;
1624         if (in_dev->cnf.no_policy)
1625                 rth->u.dst.flags |= DST_NOPOLICY;
1626         rth->fl.fl4_dst = daddr;
1627         rth->rt_dst     = daddr;
1628         rth->fl.fl4_tos = tos;
1629 #ifdef CONFIG_IP_ROUTE_FWMARK
1630         rth->fl.fl4_fwmark= skb->nfmark;
1631 #endif
1632         rth->fl.fl4_src = saddr;
1633         rth->rt_src     = saddr;
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635         rth->u.dst.tclassid = itag;
1636 #endif
1637         rth->rt_iif     =
1638         rth->fl.iif     = dev->ifindex;
1639         rth->u.dst.dev  = &loopback_dev;
1640         dev_hold(rth->u.dst.dev);
1641         rth->idev       = in_dev_get(rth->u.dst.dev);
1642         rth->fl.oif     = 0;
1643         rth->rt_gateway = daddr;
1644         rth->rt_spec_dst= spec_dst;
1645         rth->rt_type    = RTN_MULTICAST;
1646         rth->rt_flags   = RTCF_MULTICAST;
1647         if (our) {
1648                 rth->u.dst.input= ip_local_deliver;
1649                 rth->rt_flags |= RTCF_LOCAL;
1650         }
1651
1652 #ifdef CONFIG_IP_MROUTE
1653         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1654                 rth->u.dst.input = ip_mr_input;
1655 #endif
1656         RT_CACHE_STAT_INC(in_slow_mc);
1657
1658         in_dev_put(in_dev);
1659         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1660         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1661
1662 e_nobufs:
1663         in_dev_put(in_dev);
1664         return -ENOBUFS;
1665
1666 e_inval:
1667         in_dev_put(in_dev);
1668         return -EINVAL;
1669 }
1670
1671
1672 static void ip_handle_martian_source(struct net_device *dev,
1673                                      struct in_device *in_dev,
1674                                      struct sk_buff *skb,
1675                                      u32 daddr,
1676                                      u32 saddr) 
1677 {
1678         RT_CACHE_STAT_INC(in_martian_src);
1679 #ifdef CONFIG_IP_ROUTE_VERBOSE
1680         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1681                 /*
1682                  *      RFC1812 recommendation: if the source is martian,
1683                  *      the only hint is the MAC header.
1684                  */
1685                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686                         "%u.%u.%u.%u, on dev %s\n",
1687                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1688                 if (dev->hard_header_len) {
1689                         int i;
1690                         unsigned char *p = skb->mac.raw;
1691                         printk(KERN_WARNING "ll header: ");
1692                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1693                                 printk("%02x", *p);
1694                                 if (i < (dev->hard_header_len - 1))
1695                                         printk(":");
1696                         }
1697                         printk("\n");
1698                 }
1699         }
1700 #endif
1701 }
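/*
 * Example of the resulting log (addresses are hypothetical): a packet
 * claiming source 127.0.0.1 towards 10.0.0.1 on eth0 yields
 * "martian source 10.0.0.1 from 127.0.0.1, on dev eth0", followed by
 * the link-layer header dumped as colon-separated hex bytes.
 */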
1702
1703 static inline int __mkroute_input(struct sk_buff *skb, 
1704                                   struct fib_result* res, 
1705                                   struct in_device *in_dev, 
1706                                   u32 daddr, u32 saddr, u32 tos, 
1707                                   struct rtable **result) 
1708 {
1709
1710         struct rtable *rth;
1711         int err;
1712         struct in_device *out_dev;
1713         unsigned flags = 0;
1714         u32 spec_dst, itag;
1715
1716         /* get a working reference to the output device */
1717         out_dev = in_dev_get(FIB_RES_DEV(*res));
1718         if (out_dev == NULL) {
1719                 if (net_ratelimit())
1720                         printk(KERN_CRIT "Bug in ip_route_input" \
1721                                "_slow(). Please, report\n");
1722                 return -EINVAL;
1723         }
1724
1725
1726         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1727                                   in_dev->dev, &spec_dst, &itag);
1728         if (err < 0) {
1729                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1730                                          saddr);
1731                 
1732                 err = -EINVAL;
1733                 goto cleanup;
1734         }
1735
1736         if (err)
1737                 flags |= RTCF_DIRECTSRC;
1738
1739         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1740             (IN_DEV_SHARED_MEDIA(out_dev) ||
1741              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1742                 flags |= RTCF_DOREDIRECT;
1743
1744         if (skb->protocol != htons(ETH_P_IP)) {
1745                 /* Not IP (i.e. ARP). Do not create a route if it is
1746                  * invalid for proxy arp. DNAT routes are always valid.
1747                  */
1748                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1749                         err = -EINVAL;
1750                         goto cleanup;
1751                 }
1752         }
1753
1754
1755         rth = dst_alloc(&ipv4_dst_ops);
1756         if (!rth) {
1757                 err = -ENOBUFS;
1758                 goto cleanup;
1759         }
1760
1761         rth->u.dst.flags= DST_HOST;
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1763         if (res->fi->fib_nhs > 1)
1764                 rth->u.dst.flags |= DST_BALANCED;
1765 #endif
1766         if (in_dev->cnf.no_policy)
1767                 rth->u.dst.flags |= DST_NOPOLICY;
1768         if (in_dev->cnf.no_xfrm)
1769                 rth->u.dst.flags |= DST_NOXFRM;
1770         rth->fl.fl4_dst = daddr;
1771         rth->rt_dst     = daddr;
1772         rth->fl.fl4_tos = tos;
1773 #ifdef CONFIG_IP_ROUTE_FWMARK
1774         rth->fl.fl4_fwmark= skb->nfmark;
1775 #endif
1776         rth->fl.fl4_src = saddr;
1777         rth->rt_src     = saddr;
1778         rth->rt_gateway = daddr;
1779         rth->rt_iif     =
1780                 rth->fl.iif     = in_dev->dev->ifindex;
1781         rth->u.dst.dev  = (out_dev)->dev;
1782         dev_hold(rth->u.dst.dev);
1783         rth->idev       = in_dev_get(rth->u.dst.dev);
1784         rth->fl.oif     = 0;
1785         rth->rt_spec_dst= spec_dst;
1786
1787         rth->u.dst.input = ip_forward;
1788         rth->u.dst.output = ip_output;
1789
1790         rt_set_nexthop(rth, res, itag);
1791
1792         rth->rt_flags = flags;
1793
1794         *result = rth;
1795         err = 0;
1796  cleanup:
1797         /* release the working reference to the output device */
1798         in_dev_put(out_dev);
1799         return err;
1800 }
1801
1802 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1803                                        struct fib_result* res, 
1804                                        const struct flowi *fl,
1805                                        struct in_device *in_dev,
1806                                        u32 daddr, u32 saddr, u32 tos)
1807 {
1808         struct rtable* rth = NULL;
1809         int err;
1810         unsigned hash;
1811
1812 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1813         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1814                 fib_select_multipath(fl, res);
1815 #endif
1816
1817         /* create a routing cache entry */
1818         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1819         if (err)
1820                 return err;
1821         atomic_set(&rth->u.dst.__refcnt, 1);
1822
1823         /* put it into the cache */
1824         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1826 }
1827
1828 static inline int ip_mkroute_input(struct sk_buff *skb, 
1829                                    struct fib_result* res, 
1830                                    const struct flowi *fl,
1831                                    struct in_device *in_dev,
1832                                    u32 daddr, u32 saddr, u32 tos)
1833 {
1834 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1835         struct rtable* rth = NULL;
1836         unsigned char hop, hopcount, lasthop;
1837         int err = -EINVAL;
1838         unsigned int hash;
1839
1840         if (res->fi)
1841                 hopcount = res->fi->fib_nhs;
1842         else
1843                 hopcount = 1;
1844
1845         lasthop = hopcount - 1;
1846
1847         /* distinguish between multipath and singlepath */
1848         if (hopcount < 2)
1849                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850                                             saddr, tos);
1851         
1852         /* add all alternatives to the routing cache */
1853         for (hop = 0; hop < hopcount; hop++) {
1854                 res->nh_sel = hop;
1855
1856                 /* create a routing cache entry */
1857                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1858                                       &rth);
1859                 if (err)
1860                         return err;
1861
1862                 /* put it into the cache */
1863                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1864                 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1865                 if (err)
1866                         return err;
1867
1868                 /* forward hop information to multipath impl. */
1869                 multipath_set_nhinfo(rth,
1870                                      FIB_RES_NETWORK(*res),
1871                                      FIB_RES_NETMASK(*res),
1872                                      res->prefixlen,
1873                                      &FIB_RES_NH(*res));
1874
1875                 /* only for the last hop is the reference count handled
1876                  * outside
1877                  */
1878                 if (hop == lasthop)
1879                         atomic_set(&(skb->dst->__refcnt), 1);
1880         }
1881         return err;
1882 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1883         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1884 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1885 }
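/*
 * Illustrative summary (not from the source): with multipath caching
 * compiled in, a route with three nexthops interns three separate cache
 * entries in the loop above; skb->dst ends up pointing at the last one,
 * which is why only the final hop takes the __refcnt.
 */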
1886
1887
1888 /*
1889  *      NOTE. We drop all packets that have local source
1890  *      addresses, because every properly looped back packet
1891  *      must have the correct destination already attached by the output routine.
1892  *
1893  *      This approach solves two big problems:
1894  *      1. Non-simplex devices are handled properly.
1895  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1896  */
1897
1898 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1899                                u8 tos, struct net_device *dev)
1900 {
1901         struct fib_result res;
1902         struct in_device *in_dev = in_dev_get(dev);
1903         struct flowi fl = { .nl_u = { .ip4_u =
1904                                       { .daddr = daddr,
1905                                         .saddr = saddr,
1906                                         .tos = tos,
1907                                         .scope = RT_SCOPE_UNIVERSE,
1908 #ifdef CONFIG_IP_ROUTE_FWMARK
1909                                         .fwmark = skb->nfmark
1910 #endif
1911                                       } },
1912                             .iif = dev->ifindex };
1913         unsigned        flags = 0;
1914         u32             itag = 0;
1915         struct rtable * rth;
1916         unsigned        hash;
1917         u32             spec_dst;
1918         int             err = -EINVAL;
1919         int             free_res = 0;
1920
1921         /* IP on this device is disabled. */
1922
1923         if (!in_dev)
1924                 goto out;
1925
1926         /* Check for the most weird martians, which cannot be detected
1927            by fib_lookup.
1928          */
1929
1930         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1931                 goto martian_source;
1932
1933         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1934                 goto brd_input;
1935
1936         /* Accept zero addresses only to limited broadcast;
1937          * I do not even know whether to fix it or not. Waiting for complaints :-)
1938          */
1939         if (ZERONET(saddr))
1940                 goto martian_source;
1941
1942         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1943                 goto martian_destination;
1944
1945         /*
1946          *      Now we are ready to route packet.
1947          */
1948         if ((err = fib_lookup(&fl, &res)) != 0) {
1949                 if (!IN_DEV_FORWARD(in_dev))
1950                         goto e_hostunreach;
1951                 goto no_route;
1952         }
1953         free_res = 1;
1954
1955         RT_CACHE_STAT_INC(in_slow_tot);
1956
1957         if (res.type == RTN_BROADCAST)
1958                 goto brd_input;
1959
1960         if (res.type == RTN_LOCAL) {
1961                 int result;
1962                 result = fib_validate_source(saddr, daddr, tos,
1963                                              loopback_dev.ifindex,
1964                                              dev, &spec_dst, &itag);
1965                 if (result < 0)
1966                         goto martian_source;
1967                 if (result)
1968                         flags |= RTCF_DIRECTSRC;
1969                 spec_dst = daddr;
1970                 goto local_input;
1971         }
1972
1973         if (!IN_DEV_FORWARD(in_dev))
1974                 goto e_hostunreach;
1975         if (res.type != RTN_UNICAST)
1976                 goto martian_destination;
1977
1978         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1979         if (err == -ENOBUFS)
1980                 goto e_nobufs;
1981         if (err == -EINVAL)
1982                 goto e_inval;
1983         
1984 done:
1985         in_dev_put(in_dev);
1986         if (free_res)
1987                 fib_res_put(&res);
1988 out:    return err;
1989
1990 brd_input:
1991         if (skb->protocol != htons(ETH_P_IP))
1992                 goto e_inval;
1993
1994         if (ZERONET(saddr))
1995                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1996         else {
1997                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1998                                           &itag);
1999                 if (err < 0)
2000                         goto martian_source;
2001                 if (err)
2002                         flags |= RTCF_DIRECTSRC;
2003         }
2004         flags |= RTCF_BROADCAST;
2005         res.type = RTN_BROADCAST;
2006         RT_CACHE_STAT_INC(in_brd);
2007
2008 local_input:
2009         rth = dst_alloc(&ipv4_dst_ops);
2010         if (!rth)
2011                 goto e_nobufs;
2012
2013         rth->u.dst.output= ip_rt_bug;
2014
2015         atomic_set(&rth->u.dst.__refcnt, 1);
2016         rth->u.dst.flags= DST_HOST;
2017         if (in_dev->cnf.no_policy)
2018                 rth->u.dst.flags |= DST_NOPOLICY;
2019         rth->fl.fl4_dst = daddr;
2020         rth->rt_dst     = daddr;
2021         rth->fl.fl4_tos = tos;
2022 #ifdef CONFIG_IP_ROUTE_FWMARK
2023         rth->fl.fl4_fwmark= skb->nfmark;
2024 #endif
2025         rth->fl.fl4_src = saddr;
2026         rth->rt_src     = saddr;
2027 #ifdef CONFIG_NET_CLS_ROUTE
2028         rth->u.dst.tclassid = itag;
2029 #endif
2030         rth->rt_iif     =
2031         rth->fl.iif     = dev->ifindex;
2032         rth->u.dst.dev  = &loopback_dev;
2033         dev_hold(rth->u.dst.dev);
2034         rth->idev       = in_dev_get(rth->u.dst.dev);
2035         rth->rt_gateway = daddr;
2036         rth->rt_spec_dst= spec_dst;
2037         rth->u.dst.input= ip_local_deliver;
2038         rth->rt_flags   = flags|RTCF_LOCAL;
2039         if (res.type == RTN_UNREACHABLE) {
2040                 rth->u.dst.input= ip_error;
2041                 rth->u.dst.error= -err;
2042                 rth->rt_flags   &= ~RTCF_LOCAL;
2043         }
2044         rth->rt_type    = res.type;
2045         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2046         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2047         goto done;
2048
2049 no_route:
2050         RT_CACHE_STAT_INC(in_no_route);
2051         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2052         res.type = RTN_UNREACHABLE;
2053         goto local_input;
2054
2055         /*
2056          *      Do not cache martian addresses: they should be logged (RFC1812)
2057          */
2058 martian_destination:
2059         RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2062                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2063                         "%u.%u.%u.%u, dev %s\n",
2064                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2065 #endif
2066
2067 e_hostunreach:
2068         err = -EHOSTUNREACH;
2069         goto done;
2070
2071 e_inval:
2072         err = -EINVAL;
2073         goto done;
2074
2075 e_nobufs:
2076         err = -ENOBUFS;
2077         goto done;
2078
2079 martian_source:
2080         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2081         goto e_inval;
2082 }
2083
2084 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2085                    u8 tos, struct net_device *dev)
2086 {
2087         struct rtable * rth;
2088         unsigned        hash;
2089         int iif = dev->ifindex;
2090
2091         tos &= IPTOS_RT_MASK;
2092         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2093
2094         rcu_read_lock();
2095         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2096              rth = rcu_dereference(rth->u.rt_next)) {
2097                 if (rth->fl.fl4_dst == daddr &&
2098                     rth->fl.fl4_src == saddr &&
2099                     rth->fl.iif == iif &&
2100                     rth->fl.oif == 0 &&
2101 #ifdef CONFIG_IP_ROUTE_FWMARK
2102                     rth->fl.fl4_fwmark == skb->nfmark &&
2103 #endif
2104                     rth->fl.fl4_tos == tos) {
2105                         rth->u.dst.lastuse = jiffies;
2106                         dst_hold(&rth->u.dst);
2107                         rth->u.dst.__use++;
2108                         RT_CACHE_STAT_INC(in_hit);
2109                         rcu_read_unlock();
2110                         skb->dst = (struct dst_entry*)rth;
2111                         return 0;
2112                 }
2113                 RT_CACHE_STAT_INC(in_hlist_search);
2114         }
2115         rcu_read_unlock();
2116
2117         /* Multicast recognition logic is moved from the route cache to here.
2118            The problem was that too many Ethernet cards have broken/missing
2119            hardware multicast filters :-( As a result, a host on a multicast
2120            network acquires a lot of useless route cache entries, e.g. for
2121            SDR messages from all over the world. Now we try to get rid of them.
2122            Really, provided the software IP multicast filter is organized
2123            reasonably (at least, hashed), it does not result in a slowdown
2124            compared with route cache reject entries.
2125            Note that multicast routers are not affected, because a
2126            route cache entry is created eventually.
2127          */
2128         if (MULTICAST(daddr)) {
2129                 struct in_device *in_dev;
2130
2131                 rcu_read_lock();
2132                 if ((in_dev = __in_dev_get(dev)) != NULL) {
2133                         int our = ip_check_mc(in_dev, daddr, saddr,
2134                                 skb->nh.iph->protocol);
2135                         if (our
2136 #ifdef CONFIG_IP_MROUTE
2137                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2138 #endif
2139                             ) {
2140                                 rcu_read_unlock();
2141                                 return ip_route_input_mc(skb, daddr, saddr,
2142                                                          tos, dev, our);
2143                         }
2144                 }
2145                 rcu_read_unlock();
2146                 return -EINVAL;
2147         }
2148         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2149 }
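/*
 * Note on the hash key above (illustrative): the incoming interface is
 * folded into the source address as saddr ^ (iif << 5), so the same
 * saddr arriving on ifindex 2 and ifindex 3 produces keys differing by
 * (2 << 5) ^ (3 << 5) == 0x20 and usually lands on different chains.
 */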
2150
2151 static inline int __mkroute_output(struct rtable **result,
2152                                    struct fib_result* res, 
2153                                    const struct flowi *fl,
2154                                    const struct flowi *oldflp, 
2155                                    struct net_device *dev_out, 
2156                                    unsigned flags) 
2157 {
2158         struct rtable *rth;
2159         struct in_device *in_dev;
2160         u32 tos = RT_FL_TOS(oldflp);
2161         int err = 0;
2162
2163         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2164                 return -EINVAL;
2165
2166         if (fl->fl4_dst == 0xFFFFFFFF)
2167                 res->type = RTN_BROADCAST;
2168         else if (MULTICAST(fl->fl4_dst))
2169                 res->type = RTN_MULTICAST;
2170         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2171                 return -EINVAL;
2172
2173         if (dev_out->flags & IFF_LOOPBACK)
2174                 flags |= RTCF_LOCAL;
2175
2176         /* get work reference to inet device */
2177         in_dev = in_dev_get(dev_out);
2178         if (!in_dev)
2179                 return -EINVAL;
2180
2181         if (res->type == RTN_BROADCAST) {
2182                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2183                 if (res->fi) {
2184                         fib_info_put(res->fi);
2185                         res->fi = NULL;
2186                 }
2187         } else if (res->type == RTN_MULTICAST) {
2188                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2189                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2190                                  oldflp->proto))
2191                         flags &= ~RTCF_LOCAL;
2192                 /* If a multicast route does not exist, use the
2193                    default one, but do not gateway in this case.
2194                    Yes, it is a hack.
2195                  */
2196                 if (res->fi && res->prefixlen < 4) {
2197                         fib_info_put(res->fi);
2198                         res->fi = NULL;
2199                 }
2200         }
2201
2202
2203         rth = dst_alloc(&ipv4_dst_ops);
2204         if (!rth) {
2205                 err = -ENOBUFS;
2206                 goto cleanup;
2207         }               
2208
2209         rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211         if (res->fi) {
2212                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213                 if (res->fi->fib_nhs > 1)
2214                         rth->u.dst.flags |= DST_BALANCED;
2215         }
2216 #endif
2217         if (in_dev->cnf.no_xfrm)
2218                 rth->u.dst.flags |= DST_NOXFRM;
2219         if (in_dev->cnf.no_policy)
2220                 rth->u.dst.flags |= DST_NOPOLICY;
2221
2222         rth->fl.fl4_dst = oldflp->fl4_dst;
2223         rth->fl.fl4_tos = tos;
2224         rth->fl.fl4_src = oldflp->fl4_src;
2225         rth->fl.oif     = oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229         rth->rt_dst     = fl->fl4_dst;
2230         rth->rt_src     = fl->fl4_src;
2231         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2232         /* get references to the devices that are to be held by the routing
2233            cache entry */
2234         rth->u.dst.dev  = dev_out;
2235         dev_hold(dev_out);
2236         rth->idev       = in_dev_get(dev_out);
2237         rth->rt_gateway = fl->fl4_dst;
2238         rth->rt_spec_dst= fl->fl4_src;
2239
2240         rth->u.dst.output=ip_output;
2241
2242         RT_CACHE_STAT_INC(out_slow_tot);
2243
2244         if (flags & RTCF_LOCAL) {
2245                 rth->u.dst.input = ip_local_deliver;
2246                 rth->rt_spec_dst = fl->fl4_dst;
2247         }
2248         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249                 rth->rt_spec_dst = fl->fl4_src;
2250                 if (flags & RTCF_LOCAL && 
2251                     !(dev_out->flags & IFF_LOOPBACK)) {
2252                         rth->u.dst.output = ip_mc_output;
2253                         RT_CACHE_STAT_INC(out_slow_mc);
2254                 }
2255 #ifdef CONFIG_IP_MROUTE
2256                 if (res->type == RTN_MULTICAST) {
2257                         if (IN_DEV_MFORWARD(in_dev) &&
2258                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2259                                 rth->u.dst.input = ip_mr_input;
2260                                 rth->u.dst.output = ip_mc_output;
2261                         }
2262                 }
2263 #endif
2264         }
2265
2266         rt_set_nexthop(rth, res, 0);
2267
2268         rth->rt_flags = flags;
2269
2270         *result = rth;
2271  cleanup:
2272         /* release work reference to inet device */
2273         in_dev_put(in_dev);
2274
2275         return err;
2276 }
2277
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279                                         struct fib_result* res,
2280                                         const struct flowi *fl,
2281                                         const struct flowi *oldflp,
2282                                         struct net_device *dev_out,
2283                                         unsigned flags)
2284 {
2285         struct rtable *rth = NULL;
2286         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287         unsigned hash;
2288         if (err == 0) {
2289                 u32 tos = RT_FL_TOS(oldflp);
2290
2291                 atomic_set(&rth->u.dst.__refcnt, 1);
2292                 
2293                 hash = rt_hash_code(oldflp->fl4_dst, 
2294                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2295                 err = rt_intern_hash(hash, rth, rp);
2296         }
2297         
2298         return err;
2299 }
2300
2301 static inline int ip_mkroute_output(struct rtable** rp,
2302                                     struct fib_result* res,
2303                                     const struct flowi *fl,
2304                                     const struct flowi *oldflp,
2305                                     struct net_device *dev_out,
2306                                     unsigned flags)
2307 {
2308 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309         u32 tos = RT_FL_TOS(oldflp);
2310         unsigned char hop;
2311         unsigned hash;
2312         int err = -EINVAL;
2313         struct rtable *rth = NULL;
2314
2315         if (res->fi && res->fi->fib_nhs > 1) {
2316                 unsigned char hopcount = res->fi->fib_nhs;
2317
2318                 for (hop = 0; hop < hopcount; hop++) {
2319                         struct net_device *dev2nexthop;
2320
2321                         res->nh_sel = hop;
2322
2323                         /* hold a work reference to the output device */
2324                         dev2nexthop = FIB_RES_DEV(*res);
2325                         dev_hold(dev2nexthop);
2326
2327                         err = __mkroute_output(&rth, res, fl, oldflp,
2328                                                dev2nexthop, flags);
2329
2330                         if (err != 0)
2331                                 goto cleanup;
2332
2333                         hash = rt_hash_code(oldflp->fl4_dst, 
2334                                             oldflp->fl4_src ^
2335                                             (oldflp->oif << 5), tos);
2336                         err = rt_intern_hash(hash, rth, rp);
2337
2338                         /* forward hop information to multipath impl. */
2339                         multipath_set_nhinfo(rth,
2340                                              FIB_RES_NETWORK(*res),
2341                                              FIB_RES_NETMASK(*res),
2342                                              res->prefixlen,
2343                                              &FIB_RES_NH(*res));
2344                 cleanup:
2345                         /* release work reference to output device */
2346                         dev_put(dev2nexthop);
2347
2348                         if (err != 0)
2349                                 return err;
2350                 }
2351                 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2352                 return err;
2353         } else {
2354                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2355                                              flags);
2356         }
2357 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2358         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2359 #endif
2360 }
2361
2362 /*
2363  * Major route resolver routine.
2364  */
2365
2366 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367 {
2368         u32 tos = RT_FL_TOS(oldflp);
2369         struct flowi fl = { .nl_u = { .ip4_u =
2370                                       { .daddr = oldflp->fl4_dst,
2371                                         .saddr = oldflp->fl4_src,
2372                                         .tos = tos & IPTOS_RT_MASK,
2373                                         .scope = ((tos & RTO_ONLINK) ?
2374                                                   RT_SCOPE_LINK :
2375                                                   RT_SCOPE_UNIVERSE),
2376 #ifdef CONFIG_IP_ROUTE_FWMARK
2377                                         .fwmark = oldflp->fl4_fwmark
2378 #endif
2379                                       } },
2380                             .iif = loopback_dev.ifindex,
2381                             .oif = oldflp->oif };
2382         struct fib_result res;
2383         unsigned flags = 0;
2384         struct net_device *dev_out = NULL;
2385         int free_res = 0;
2386         int err;
2387
2388
2389         res.fi          = NULL;
2390 #ifdef CONFIG_IP_MULTIPLE_TABLES
2391         res.r           = NULL;
2392 #endif
2393
2394         if (oldflp->fl4_src) {
2395                 err = -EINVAL;
2396                 if (MULTICAST(oldflp->fl4_src) ||
2397                     BADCLASS(oldflp->fl4_src) ||
2398                     ZERONET(oldflp->fl4_src))
2399                         goto out;
2400
2401                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2402                 dev_out = ip_dev_find(oldflp->fl4_src);
2403                 if (dev_out == NULL)
2404                         goto out;
2405
2406                 /* I removed the check for oif == dev_out->oif here.
2407                    It was wrong for two reasons:
2408                    1. ip_dev_find(saddr) can return the wrong iface if saddr
2409                       is assigned to multiple interfaces.
2410                    2. Moreover, we are allowed to send packets with the saddr
2411                       of another iface. --ANK
2412                  */
2413
2414                 if (oldflp->oif == 0
2415                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2416                         /* Special hack: the user can direct multicasts
2417                            and limited broadcast via the necessary interface
2418                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2419                            This hack is not just for fun, it allows
2420                            vic, vat and friends to work.
2421                            They bind the socket to loopback, set the ttl to zero
2422                            and expect that it will work.
2423                            From the viewpoint of the routing cache they are broken,
2424                            because we are not allowed to build a multicast path
2425                            with a loopback source addr (look, the routing cache
2426                            cannot know that the ttl is zero, so the packet
2427                            will not leave this host and the route is valid).
2428                            Luckily, this hack is a good workaround.
2429                          */
2430
2431                         fl.oif = dev_out->ifindex;
2432                         goto make_route;
2433                 }
2434                 if (dev_out)
2435                         dev_put(dev_out);
2436                 dev_out = NULL;
2437         }
2438
2439
2440         if (oldflp->oif) {
2441                 dev_out = dev_get_by_index(oldflp->oif);
2442                 err = -ENODEV;
2443                 if (dev_out == NULL)
2444                         goto out;
2445                 if (__in_dev_get(dev_out) == NULL) {
2446                         dev_put(dev_out);
2447                         goto out;       /* Wrong error code */
2448                 }
2449
2450                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2451                         if (!fl.fl4_src)
2452                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2453                                                               RT_SCOPE_LINK);
2454                         goto make_route;
2455                 }
2456                 if (!fl.fl4_src) {
2457                         if (MULTICAST(oldflp->fl4_dst))
2458                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2459                                                               fl.fl4_scope);
2460                         else if (!oldflp->fl4_dst)
2461                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2462                                                               RT_SCOPE_HOST);
2463                 }
2464         }
2465
2466         if (!fl.fl4_dst) {
2467                 fl.fl4_dst = fl.fl4_src;
2468                 if (!fl.fl4_dst)
2469                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2470                 if (dev_out)
2471                         dev_put(dev_out);
2472                 dev_out = &loopback_dev;
2473                 dev_hold(dev_out);
2474                 fl.oif = loopback_dev.ifindex;
2475                 res.type = RTN_LOCAL;
2476                 flags |= RTCF_LOCAL;
2477                 goto make_route;
2478         }
2479
2480         if (fib_lookup(&fl, &res)) {
2481                 res.fi = NULL;
2482                 if (oldflp->oif) {
2483                         /* Apparently, the routing tables are wrong. Assume
2484                            that the destination is on link.
2485
2486                            WHY? DW.
2487                            Because we are allowed to send to an iface
2488                            even if it has NO routes and NO assigned
2489                            addresses. When oif is specified, the routing
2490                            tables are looked up with only one purpose:
2491                            to catch whether the destination is gatewayed rather
2492                            than direct. Moreover, if MSG_DONTROUTE is set,
2493                            we send the packet, ignoring both routing tables
2494                            and ifaddr state. --ANK
2495
2496
2497                            We could do it even if oif is unknown,
2498                            as IPv6 likely does, but we do not.
2499                          */
2500
2501                         if (fl.fl4_src == 0)
2502                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2503                                                               RT_SCOPE_LINK);
2504                         res.type = RTN_UNICAST;
2505                         goto make_route;
2506                 }
2507                 if (dev_out)
2508                         dev_put(dev_out);
2509                 err = -ENETUNREACH;
2510                 goto out;
2511         }
2512         free_res = 1;
2513
2514         if (res.type == RTN_LOCAL) {
2515                 if (!fl.fl4_src)
2516                         fl.fl4_src = fl.fl4_dst;
2517                 if (dev_out)
2518                         dev_put(dev_out);
2519                 dev_out = &loopback_dev;
2520                 dev_hold(dev_out);
2521                 fl.oif = dev_out->ifindex;
2522                 if (res.fi)
2523                         fib_info_put(res.fi);
2524                 res.fi = NULL;
2525                 flags |= RTCF_LOCAL;
2526                 goto make_route;
2527         }
2528
2529 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2530         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2531                 fib_select_multipath(&fl, &res);
2532         else
2533 #endif
2534         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2535                 fib_select_default(&fl, &res);
2536
2537         if (!fl.fl4_src)
2538                 fl.fl4_src = FIB_RES_PREFSRC(res);
2539
2540         if (dev_out)
2541                 dev_put(dev_out);
2542         dev_out = FIB_RES_DEV(res);
2543         dev_hold(dev_out);
2544         fl.oif = dev_out->ifindex;
2545
2546
2547 make_route:
2548         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2549
2550
2551         if (free_res)
2552                 fib_res_put(&res);
2553         if (dev_out)
2554                 dev_put(dev_out);
2555 out:    return err;
2556 }
2557
2558 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559 {
2560         unsigned hash;
2561         struct rtable *rth;
2562
2563         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2564
2565         rcu_read_lock_bh();
2566         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2567                 rth = rcu_dereference(rth->u.rt_next)) {
2568                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2569                     rth->fl.fl4_src == flp->fl4_src &&
2570                     rth->fl.iif == 0 &&
2571                     rth->fl.oif == flp->oif &&
2572 #ifdef CONFIG_IP_ROUTE_FWMARK
2573                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2574 #endif
2575                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2576                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2577
2578                         /* check for multipath routes and choose one if
2579                          * necessary
2580                          */
2581                         if (multipath_select_route(flp, rth, rp)) {
2582                                 dst_hold(&(*rp)->u.dst);
2583                                 RT_CACHE_STAT_INC(out_hit);
2584                                 rcu_read_unlock_bh();
2585                                 return 0;
2586                         }
2587
2588                         rth->u.dst.lastuse = jiffies;
2589                         dst_hold(&rth->u.dst);
2590                         rth->u.dst.__use++;
2591                         RT_CACHE_STAT_INC(out_hit);
2592                         rcu_read_unlock_bh();
2593                         *rp = rth;
2594                         return 0;
2595                 }
2596                 RT_CACHE_STAT_INC(out_hlist_search);
2597         }
2598         rcu_read_unlock_bh();
2599
2600         return ip_route_output_slow(rp, flp);
2601 }
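/*
 * Caller sketch (illustrative, not from the source; dst here is a
 * hypothetical u32 destination address): a typical output lookup only
 * has to fill in a flow key, e.g.
 *
 *      struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst } } };
 *      struct rtable *rt;
 *      int err = __ip_route_output_key(&rt, &fl);
 *
 * A cache hit returns the entry with its refcount raised; a miss falls
 * through to ip_route_output_slow(), which builds and interns one.
 */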
2602
2603 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604 {
2605         int err;
2606
2607         if ((err = __ip_route_output_key(rp, flp)) != 0)
2608                 return err;
2609
2610         if (flp->proto) {
2611                 if (!flp->fl4_src)
2612                         flp->fl4_src = (*rp)->rt_src;
2613                 if (!flp->fl4_dst)
2614                         flp->fl4_dst = (*rp)->rt_dst;
2615                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2616         }
2617
2618         return 0;
2619 }
2620
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623         return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
2625
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627                         int nowait, unsigned int flags)
2628 {
2629         struct rtable *rt = (struct rtable*)skb->dst;
2630         struct rtmsg *r;
2631         struct nlmsghdr  *nlh;
2632         unsigned char    *b = skb->tail;
2633         struct rta_cacheinfo ci;
2634 #ifdef CONFIG_IP_MROUTE
2635         struct rtattr *eptr;
2636 #endif
2637         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2638         r = NLMSG_DATA(nlh);
2639         r->rtm_family    = AF_INET;
2640         r->rtm_dst_len  = 32;
2641         r->rtm_src_len  = 0;
2642         r->rtm_tos      = rt->fl.fl4_tos;
2643         r->rtm_table    = RT_TABLE_MAIN;
2644         r->rtm_type     = rt->rt_type;
2645         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2646         r->rtm_protocol = RTPROT_UNSPEC;
2647         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648         if (rt->rt_flags & RTCF_NOTIFY)
2649                 r->rtm_flags |= RTM_F_NOTIFY;
2650         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2651         if (rt->fl.fl4_src) {
2652                 r->rtm_src_len = 32;
2653                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2654         }
2655         if (rt->u.dst.dev)
2656                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2657 #ifdef CONFIG_NET_CLS_ROUTE
2658         if (rt->u.dst.tclassid)
2659                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2660 #endif
2661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2662         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2663                 __u32 alg = rt->rt_multipath_alg;
2664
2665                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2666         }
2667 #endif
2668         if (rt->fl.iif)
2669                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2670         else if (rt->rt_src != rt->fl.fl4_src)
2671                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2672         if (rt->rt_dst != rt->rt_gateway)
2673                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2674         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2675                 goto rtattr_failure;
2676         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2677         ci.rta_used     = rt->u.dst.__use;
2678         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2679         if (rt->u.dst.expires)
2680                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2681         else
2682                 ci.rta_expires = 0;
2683         ci.rta_error    = rt->u.dst.error;
2684         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2685         if (rt->peer) {
2686                 ci.rta_id = rt->peer->ip_id_count;
2687                 if (rt->peer->tcp_ts_stamp) {
2688                         ci.rta_ts = rt->peer->tcp_ts;
2689                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2690                 }
2691         }
2692 #ifdef CONFIG_IP_MROUTE
2693         eptr = (struct rtattr*)skb->tail;
2694 #endif
2695         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2696         if (rt->fl.iif) {
2697 #ifdef CONFIG_IP_MROUTE
2698                 u32 dst = rt->rt_dst;
2699
2700                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701                     ipv4_devconf.mc_forwarding) {
2702                         int err = ipmr_get_route(skb, r, nowait);
2703                         if (err <= 0) {
2704                                 if (!nowait) {
2705                                         if (err == 0)
2706                                                 return 0;
2707                                         goto nlmsg_failure;
2708                                 } else {
2709                                         if (err == -EMSGSIZE)
2710                                                 goto nlmsg_failure;
2711                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2712                                 }
2713                         }
2714                 } else
2715 #endif
2716                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2717         }
2718
2719         nlh->nlmsg_len = skb->tail - b;
2720         return skb->len;
2721
2722 nlmsg_failure:
2723 rtattr_failure:
2724         skb_trim(skb, b - skb->data);
2725         return -1;
2726 }
2727
2728 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2729 {
2730         struct rtattr **rta = arg;
2731         struct rtmsg *rtm = NLMSG_DATA(nlh);
2732         struct rtable *rt = NULL;
2733         u32 dst = 0;
2734         u32 src = 0;
2735         int iif = 0;
2736         int err = -ENOBUFS;
2737         struct sk_buff *skb;
2738
2739         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2740         if (!skb)
2741                 goto out;
2742
2743         /* Reserve room for dummy headers; this skb can pass
2744            through a good chunk of the routing engine.
2745          */
2746         skb->mac.raw = skb->data;
2747         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2748
2749         if (rta[RTA_SRC - 1])
2750                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751         if (rta[RTA_DST - 1])
2752                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753         if (rta[RTA_IIF - 1])
2754                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2755
2756         if (iif) {
2757                 struct net_device *dev = __dev_get_by_index(iif);
2758                 err = -ENODEV;
2759                 if (!dev)
2760                         goto out_free;
2761                 skb->protocol   = htons(ETH_P_IP);
2762                 skb->dev        = dev;
2763                 local_bh_disable();
2764                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765                 local_bh_enable();
2766                 rt = (struct rtable*)skb->dst;
2767                 if (!err && rt->u.dst.error)
2768                         err = -rt->u.dst.error;
2769         } else {
2770                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771                                                          .saddr = src,
2772                                                          .tos = rtm->rtm_tos } } };
2773                 int oif = 0;
2774                 if (rta[RTA_OIF - 1])
2775                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776                 fl.oif = oif;
2777                 err = ip_route_output_key(&rt, &fl);
2778         }
2779         if (err)
2780                 goto out_free;
2781
2782         skb->dst = &rt->u.dst;
2783         if (rtm->rtm_flags & RTM_F_NOTIFY)
2784                 rt->rt_flags |= RTCF_NOTIFY;
2785
2786         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2787
2788         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789                                 RTM_NEWROUTE, 0, 0);
2790         if (!err)
2791                 goto out_free;
2792         if (err < 0) {
2793                 err = -EMSGSIZE;
2794                 goto out_free;
2795         }
2796
2797         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798         if (err > 0)
2799                 err = 0;
2800 out:    return err;
2801
2802 out_free:
2803         kfree_skb(skb);
2804         goto out;
2805 }
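/*
 * Usage sketch (illustrative): this handler answers RTM_GETROUTE
 * requests such as iproute2's "ip route get 10.0.0.1".  With RTA_IIF
 * present it simulates an input lookup on that device; otherwise it
 * resolves an output route via ip_route_output_key() and unicasts a
 * single RTM_NEWROUTE message built by rt_fill_info().
 */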
2806
2807 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2808 {
2809         struct rtable *rt;
2810         int h, s_h;
2811         int idx, s_idx;
2812
2813         s_h = cb->args[0];
2814         s_idx = idx = cb->args[1];
2815         for (h = 0; h <= rt_hash_mask; h++) {
2816                 if (h < s_h) continue;
2817                 if (h > s_h)
2818                         s_idx = 0;
2819                 rcu_read_lock_bh();
2820                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2822                         if (idx < s_idx)
2823                                 continue;
2824                         skb->dst = dst_clone(&rt->u.dst);
2825                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2827                                          1, NLM_F_MULTI) <= 0) {
2828                                 dst_release(xchg(&skb->dst, NULL));
2829                                 rcu_read_unlock_bh();
2830                                 goto done;
2831                         }
2832                         dst_release(xchg(&skb->dst, NULL));
2833                 }
2834                 rcu_read_unlock_bh();
2835         }
2836
2837 done:
2838         cb->args[0] = h;
2839         cb->args[1] = idx;
2840         return skb->len;
2841 }
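/*
 * Note (illustrative): cb->args[0] and cb->args[1] persist the hash
 * bucket and chain index between dump calls, so a dump that fills one
 * skb resumes at exactly the entry where the previous pass stopped.
 */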
2842
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

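/*
 * /proc/sys/net/ipv4/route/flush is write-only: writing a delay value
 * triggers rt_cache_flush() with it, while reading yields -EINVAL.
 */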
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
                                        struct file *filp, void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        if (write) {
                int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);

                /* Only flush if the new delay was actually parsed. */
                if (!ret)
                        rt_cache_flush(flush_delay);
                return ret;
        }

        return -EINVAL;
}

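/*
 * Strategy routine for the same sysctl via the binary sysctl(2)
 * interface: the written int is passed straight to rt_cache_flush().
 */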
static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
                                                int __user *name,
                                                int nlen,
                                                void __user *oldval,
                                                size_t __user *oldlenp,
                                                void __user *newval,
                                                size_t newlen,
                                                void **context)
{
        int delay;
        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int __user *)newval))
                return -EFAULT;
        rt_cache_flush(delay);
        return 0;
}

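/*
 * Tunables exported under /proc/sys/net/ipv4/route/.  Entries using
 * proc_dointvec_jiffies are written in seconds and stored as jiffies.
 */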
ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                .procname       = "flush",
                .data           = &flush_delay,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = &ipv4_sysctl_rtcache_flush,
                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
                .procname       = "min_delay",
                .data           = &ip_rt_min_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
                .procname       = "max_delay",
                .data           = &ip_rt_max_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_ms_jiffies,
                .strategy       = &sysctl_ms_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
#endif

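/*
 * Route realm accounting for the routing classifier: 256 per-realm
 * byte/packet counters per cpu, exported through /proc/net/rt_acct.
 */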
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)

#ifdef CONFIG_PROC_FS
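/*
 * Old-style read_proc handler for /proc/net/rt_acct: the caller asks for
 * a byte range of the 256-entry counter array, and we return that range
 * summed across all cpus.  Offset and length must be u32-aligned, since
 * the counters are added one u32 at a time.
 */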
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *dst = (u32 *) buffer;

                *start = buffer;
                memset(dst, 0, length);

                /* Add each cpu in, one u32 at a time.  (Copying cpu 0 with
                 * memcpy and then iterating over every cpu, as the previous
                 * version did, counted cpu 0 twice.)
                 */
                for_each_cpu(i) {
                        unsigned int j;
                        u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

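/*
 * "rhash_entries=N" on the kernel command line overrides the
 * automatically computed size of the route cache hash table.
 */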
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);

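/*
 * Boot-time initialization: seed the hash, allocate the dst cache and
 * the route hash table, size the garbage collector, register the /proc
 * entries and arm the periodic timers.
 */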
int __init ip_rt_init(void)
{
        int rc = 0;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        {
        int order;
        for (order = 0;
             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
                /* NOTHING */;
        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
        memset(ip_rt_acct, 0, PAGE_SIZE << order);
        }
#endif

        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
                                                     sizeof(struct rtable),
                                                     0, SLAB_HWCACHE_ALIGN,
                                                     NULL, NULL);

        if (!ipv4_dst_ops.kmem_cachep)
                panic("IP: failed to allocate ip_dst_cache\n");

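        /*
         * The hash table is sized from the amount of physical memory
         * unless rhash_entries= was given on the command line; machines
         * with at least 128K pages of memory get the denser table.
         */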
        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                                (27 - PAGE_SHIFT) :
                                                (29 - PAGE_SHIFT),
                                        HASH_HIGHMEM,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        rt_cache_stat = alloc_percpu(struct rt_cache_stat);
        if (!rt_cache_stat)
                return -ENOMEM;

        devinet_init();
        ip_fib_init();

        init_timer(&rt_flush_timer);
        rt_flush_timer.function = rt_run_flush;
        init_timer(&rt_periodic_timer);
        rt_periodic_timer.function = rt_check_expire;
        init_timer(&rt_secret_timer);
        rt_secret_timer.function = rt_secret_rebuild;

        /* All the timers started at system boot tend to synchronize;
         * perturb them a bit.
         */
        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
                                        ip_rt_gc_interval;
        add_timer(&rt_periodic_timer);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
        {
        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
        if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
                                             proc_net_stat))) {
                free_percpu(rt_cache_stat);
                return -ENOMEM;
        }
        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
        }
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        return rc;
}


EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);