[NET]: Make rtnetlink infrastructure network namespace aware (v3)
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static unsigned long rt_deadline;
137
138 #define RTprint(a...)   printk(KERN_DEBUG a)
139
140 static struct timer_list rt_flush_timer;
141 static void rt_check_expire(struct work_struct *work);
142 static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
143 static struct timer_list rt_secret_timer;
144
145 /*
146  *      Interface to generic destination cache.
147  */
148
149 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
150 static void              ipv4_dst_destroy(struct dst_entry *dst);
151 static void              ipv4_dst_ifdown(struct dst_entry *dst,
152                                          struct net_device *dev, int how);
153 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
154 static void              ipv4_link_failure(struct sk_buff *skb);
155 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
156 static int rt_garbage_collect(void);
157
158
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .protocol =             __constant_htons(ETH_P_IP),
162         .gc =                   rt_garbage_collect,
163         .check =                ipv4_dst_check,
164         .destroy =              ipv4_dst_destroy,
165         .ifdown =               ipv4_dst_ifdown,
166         .negative_advice =      ipv4_negative_advice,
167         .link_failure =         ipv4_link_failure,
168         .update_pmtu =          ip_rt_update_pmtu,
169         .local_out =            ip_local_out,
170         .entry_size =           sizeof(struct rtable),
171 };
172
173 #define ECN_OR_COST(class)      TC_PRIO_##class
174
175 const __u8 ip_tos2prio[16] = {
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(FILLER),
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK)
192 };
193
194
195 /*
196  * Route cache.
197  */
198
199 /* The locking scheme is rather straight forward:
200  *
201  * 1) Read-Copy Update protects the buckets of the central route hash.
202  * 2) Only writers remove entries, and they hold the lock
203  *    as they look at rtable reference counts.
204  * 3) Only readers acquire references to rtable entries,
205  *    they do so with atomic increments and with the
206  *    rcu_read_lock held.
207  */
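/* A minimal reader-side sketch of the scheme above, as used by the lookup
 * paths later in this file (e.g. ip_rt_redirect()): no bucket spinlock is
 * taken; the reader relies on rcu_read_lock() and pins any entry it wants to
 * keep past rcu_read_unlock() with dst_hold().  keys_match() below is only a
 * hypothetical stand-in for the actual key comparison.
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		if (keys_match(rth)) {
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock();
 */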
208
209 struct rt_hash_bucket {
210         struct rtable   *chain;
211 };
212 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
213         defined(CONFIG_PROVE_LOCKING)
214 /*
215  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
216  * The size of this table is a power of two and depends on the number of CPUs.
217  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
218  */
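/* For illustration, in a non-lockdep build with NR_CPUS >= 32 the table
 * below ends up with RT_HASH_LOCK_SZ = 4096 spinlocks, and a hash bucket
 * "slot" maps to its lock via rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)],
 * so multiple buckets can share a single lock.
 */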
219 #ifdef CONFIG_LOCKDEP
220 # define RT_HASH_LOCK_SZ        256
221 #else
222 # if NR_CPUS >= 32
223 #  define RT_HASH_LOCK_SZ       4096
224 # elif NR_CPUS >= 16
225 #  define RT_HASH_LOCK_SZ       2048
226 # elif NR_CPUS >= 8
227 #  define RT_HASH_LOCK_SZ       1024
228 # elif NR_CPUS >= 4
229 #  define RT_HASH_LOCK_SZ       512
230 # else
231 #  define RT_HASH_LOCK_SZ       256
232 # endif
233 #endif
234
235 static spinlock_t       *rt_hash_locks;
236 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
237 # define rt_hash_lock_init()    { \
238                 int i; \
239                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
240                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
241                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
242                         spin_lock_init(&rt_hash_locks[i]); \
243                 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246 # define rt_hash_lock_init()
247 #endif
248
249 static struct rt_hash_bucket    *rt_hash_table;
250 static unsigned                 rt_hash_mask;
251 static unsigned int             rt_hash_log;
252 static unsigned int             rt_hash_rnd;
253
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) \
256         (__raw_get_cpu_var(rt_cache_stat).field++)
257
258 static int rt_intern_hash(unsigned hash, struct rtable *rth,
259                                 struct rtable **res);
260
261 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
262 {
263         return (jhash_2words(daddr, saddr, rt_hash_rnd)
264                 & rt_hash_mask);
265 }
266
267 #define rt_hash(daddr, saddr, idx) \
268         rt_hash_code((__force u32)(__be32)(daddr),\
269                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
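/* Usage sketch: the third argument (an interface index) is folded into the
 * source address before hashing, so entries for the same address pair but
 * different devices can end up in different buckets; see for instance the
 * rt_hash(daddr, skeys[i], ikeys[k]) lookups in ip_rt_redirect() below.
 */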
270
271 #ifdef CONFIG_PROC_FS
272 struct rt_cache_iter_state {
273         int bucket;
274 };
275
276 static struct rtable *rt_cache_get_first(struct seq_file *seq)
277 {
278         struct rtable *r = NULL;
279         struct rt_cache_iter_state *st = seq->private;
280
281         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
282                 rcu_read_lock_bh();
283                 r = rt_hash_table[st->bucket].chain;
284                 if (r)
285                         break;
286                 rcu_read_unlock_bh();
287         }
288         return rcu_dereference(r);
289 }
290
291 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
292 {
293         struct rt_cache_iter_state *st = seq->private;
294
295         r = r->u.dst.rt_next;
296         while (!r) {
297                 rcu_read_unlock_bh();
298                 if (--st->bucket < 0)
299                         break;
300                 rcu_read_lock_bh();
301                 r = rt_hash_table[st->bucket].chain;
302         }
303         return rcu_dereference(r);
304 }
305
306 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
307 {
308         struct rtable *r = rt_cache_get_first(seq);
309
310         if (r)
311                 while (pos && (r = rt_cache_get_next(seq, r)))
312                         --pos;
313         return pos ? NULL : r;
314 }
315
316 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
317 {
318         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
319 }
320
321 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
322 {
323         struct rtable *r = NULL;
324
325         if (v == SEQ_START_TOKEN)
326                 r = rt_cache_get_first(seq);
327         else
328                 r = rt_cache_get_next(seq, v);
329         ++*pos;
330         return r;
331 }
332
333 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
334 {
335         if (v && v != SEQ_START_TOKEN)
336                 rcu_read_unlock_bh();
337 }
338
339 static int rt_cache_seq_show(struct seq_file *seq, void *v)
340 {
341         if (v == SEQ_START_TOKEN)
342                 seq_printf(seq, "%-127s\n",
343                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
344                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
345                            "HHUptod\tSpecDst");
346         else {
347                 struct rtable *r = v;
348                 char temp[256];
349
350                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
351                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
352                         r->u.dst.dev ? r->u.dst.dev->name : "*",
353                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
354                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
355                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
356                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
357                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
358                         dst_metric(&r->u.dst, RTAX_WINDOW),
359                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
360                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
361                         r->fl.fl4_tos,
362                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
363                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
364                                        dev_queue_xmit) : 0,
365                         r->rt_spec_dst);
366                 seq_printf(seq, "%-127s\n", temp);
367         }
368         return 0;
369 }
370
371 static const struct seq_operations rt_cache_seq_ops = {
372         .start  = rt_cache_seq_start,
373         .next   = rt_cache_seq_next,
374         .stop   = rt_cache_seq_stop,
375         .show   = rt_cache_seq_show,
376 };
377
378 static int rt_cache_seq_open(struct inode *inode, struct file *file)
379 {
380         return seq_open_private(file, &rt_cache_seq_ops,
381                         sizeof(struct rt_cache_iter_state));
382 }
383
384 static const struct file_operations rt_cache_seq_fops = {
385         .owner   = THIS_MODULE,
386         .open    = rt_cache_seq_open,
387         .read    = seq_read,
388         .llseek  = seq_lseek,
389         .release = seq_release_private,
390 };
391
392
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394 {
395         int cpu;
396
397         if (*pos == 0)
398                 return SEQ_START_TOKEN;
399
400         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401                 if (!cpu_possible(cpu))
402                         continue;
403                 *pos = cpu+1;
404                 return &per_cpu(rt_cache_stat, cpu);
405         }
406         return NULL;
407 }
408
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410 {
411         int cpu;
412
413         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return &per_cpu(rt_cache_stat, cpu);
418         }
419         return NULL;
420
421 }
422
423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
424 {
425
426 }
427
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429 {
430         struct rt_cache_stat *st = v;
431
432         if (v == SEQ_START_TOKEN) {
433                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
434                 return 0;
435         }
436
437         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
438                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439                    atomic_read(&ipv4_dst_ops.entries),
440                    st->in_hit,
441                    st->in_slow_tot,
442                    st->in_slow_mc,
443                    st->in_no_route,
444                    st->in_brd,
445                    st->in_martian_dst,
446                    st->in_martian_src,
447
448                    st->out_hit,
449                    st->out_slow_tot,
450                    st->out_slow_mc,
451
452                    st->gc_total,
453                    st->gc_ignored,
454                    st->gc_goal_miss,
455                    st->gc_dst_overflow,
456                    st->in_hlist_search,
457                    st->out_hlist_search
458                 );
459         return 0;
460 }
461
462 static const struct seq_operations rt_cpu_seq_ops = {
463         .start  = rt_cpu_seq_start,
464         .next   = rt_cpu_seq_next,
465         .stop   = rt_cpu_seq_stop,
466         .show   = rt_cpu_seq_show,
467 };
468
469
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471 {
472         return seq_open(file, &rt_cpu_seq_ops);
473 }
474
475 static const struct file_operations rt_cpu_seq_fops = {
476         .owner   = THIS_MODULE,
477         .open    = rt_cpu_seq_open,
478         .read    = seq_read,
479         .llseek  = seq_lseek,
480         .release = seq_release,
481 };
482
483 #endif /* CONFIG_PROC_FS */
484
485 static __inline__ void rt_free(struct rtable *rt)
486 {
487         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
488 }
489
490 static __inline__ void rt_drop(struct rtable *rt)
491 {
492         ip_rt_put(rt);
493         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
494 }
495
496 static __inline__ int rt_fast_clean(struct rtable *rth)
497 {
498         /* Kill broadcast/multicast entries very aggressively, if they
499            collide in the hash table with more useful entries */
500         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
501                 rth->fl.iif && rth->u.dst.rt_next;
502 }
503
504 static __inline__ int rt_valuable(struct rtable *rth)
505 {
506         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
507                 rth->u.dst.expires;
508 }
509
510 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
511 {
512         unsigned long age;
513         int ret = 0;
514
515         if (atomic_read(&rth->u.dst.__refcnt))
516                 goto out;
517
518         ret = 1;
519         if (rth->u.dst.expires &&
520             time_after_eq(jiffies, rth->u.dst.expires))
521                 goto out;
522
523         age = jiffies - rth->u.dst.lastuse;
524         ret = 0;
525         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
526             (age <= tmo2 && rt_valuable(rth)))
527                 goto out;
528         ret = 1;
529 out:    return ret;
530 }
531
532 /* Bits of score are:
533  * 31: very valuable
534  * 30: not quite useless
535  * 29..0: usage counter
536  */
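/* Reading the layout above: a larger score means "more worth keeping".
 * rt_intern_hash() below remembers the lowest-scoring unreferenced entry in
 * an over-long chain as its eviction candidate, so, roughly, a recently used
 * output or unicast route outranks an idle broadcast/multicast clone.
 */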
537 static inline u32 rt_score(struct rtable *rt)
538 {
539         u32 score = jiffies - rt->u.dst.lastuse;
540
541         score = ~score & ~(3<<30);
542
543         if (rt_valuable(rt))
544                 score |= (1<<31);
545
546         if (!rt->fl.iif ||
547             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
548                 score |= (1<<30);
549
550         return score;
551 }
552
553 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
554 {
555         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
556                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
557                 (fl1->mark ^ fl2->mark) |
558                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
559                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
560                 (fl1->oif ^ fl2->oif) |
561                 (fl1->iif ^ fl2->iif)) == 0;
562 }
563
564 static void rt_check_expire(struct work_struct *work)
565 {
566         static unsigned int rover;
567         unsigned int i = rover, goal;
568         struct rtable *rth, **rthp;
569         u64 mult;
570
571         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
572         if (ip_rt_gc_timeout > 1)
573                 do_div(mult, ip_rt_gc_timeout);
574         goal = (unsigned int)mult;
575         if (goal > rt_hash_mask)
576                 goal = rt_hash_mask + 1;
577         for (; goal > 0; goal--) {
578                 unsigned long tmo = ip_rt_gc_timeout;
579
580                 i = (i + 1) & rt_hash_mask;
581                 rthp = &rt_hash_table[i].chain;
582
583                 if (need_resched())
584                         cond_resched();
585
586                 if (*rthp == NULL)
587                         continue;
588                 spin_lock_bh(rt_hash_lock_addr(i));
589                 while ((rth = *rthp) != NULL) {
590                         if (rth->u.dst.expires) {
591                                 /* Entry is expired even if it is in use */
592                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
593                                         tmo >>= 1;
594                                         rthp = &rth->u.dst.rt_next;
595                                         continue;
596                                 }
597                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
598                                 tmo >>= 1;
599                                 rthp = &rth->u.dst.rt_next;
600                                 continue;
601                         }
602
603                         /* Cleanup aged off entries. */
604                         *rthp = rth->u.dst.rt_next;
605                         rt_free(rth);
606                 }
607                 spin_unlock_bh(rt_hash_lock_addr(i));
608         }
609         rover = i;
610         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
611 }
612
613 /* This can run from both BH and non-BH contexts, the latter
614  * in the case of a forced flush event.
615  */
616 static void rt_run_flush(unsigned long dummy)
617 {
618         int i;
619         struct rtable *rth, *next;
620
621         rt_deadline = 0;
622
623         get_random_bytes(&rt_hash_rnd, 4);
624
625         for (i = rt_hash_mask; i >= 0; i--) {
626                 spin_lock_bh(rt_hash_lock_addr(i));
627                 rth = rt_hash_table[i].chain;
628                 if (rth)
629                         rt_hash_table[i].chain = NULL;
630                 spin_unlock_bh(rt_hash_lock_addr(i));
631
632                 for (; rth; rth = next) {
633                         next = rth->u.dst.rt_next;
634                         rt_free(rth);
635                 }
636         }
637 }
638
639 static DEFINE_SPINLOCK(rt_flush_lock);
640
641 void rt_cache_flush(int delay)
642 {
643         unsigned long now = jiffies;
644         int user_mode = !in_softirq();
645
646         if (delay < 0)
647                 delay = ip_rt_min_delay;
648
649         spin_lock_bh(&rt_flush_lock);
650
651         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
652                 long tmo = (long)(rt_deadline - now);
653
654                 /* If the flush timer is already running
655                    and the flush request is not immediate (delay > 0):
656
657                    if the deadline has not been reached yet, extend the timer to "delay",
658                    otherwise fire it at the deadline.
659                  */
660
661                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
662                         tmo = 0;
663
664                 if (delay > tmo)
665                         delay = tmo;
666         }
667
668         if (delay <= 0) {
669                 spin_unlock_bh(&rt_flush_lock);
670                 rt_run_flush(0);
671                 return;
672         }
673
674         if (rt_deadline == 0)
675                 rt_deadline = now + ip_rt_max_delay;
676
677         mod_timer(&rt_flush_timer, now+delay);
678         spin_unlock_bh(&rt_flush_lock);
679 }
680
681 static void rt_secret_rebuild(unsigned long dummy)
682 {
683         unsigned long now = jiffies;
684
685         rt_cache_flush(0);
686         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
687 }
688
689 /*
690    Short description of GC goals.
691
692    We want an algorithm that keeps the routing cache at some
693    equilibrium point, where the number of entries aged off is
694    approximately equal to the number newly generated.
695
696    The current expiration strength is the variable "expire".
697    We try to adjust it dynamically, so that when the network
698    is idle "expire" is large enough to keep plenty of warm entries,
699    and when load increases it shrinks to limit the cache size.
700  */
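/* A worked example of the trigger arithmetic in rt_garbage_collect() below,
 * assuming the default ip_rt_gc_elasticity of 8 and a hash table of 2^16
 * buckets (rt_hash_log = 16): goal = entries - (8 << 16), so aggressive
 * trimming only starts once the cache holds more than 524288 entries;
 * below that, "equilibrium" is raised to at least gc_thresh instead.
 */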
701
702 static int rt_garbage_collect(void)
703 {
704         static unsigned long expire = RT_GC_TIMEOUT;
705         static unsigned long last_gc;
706         static int rover;
707         static int equilibrium;
708         struct rtable *rth, **rthp;
709         unsigned long now = jiffies;
710         int goal;
711
712         /*
713          * Garbage collection is pretty expensive,
714          * do not make it too frequently.
715          */
716
717         RT_CACHE_STAT_INC(gc_total);
718
719         if (now - last_gc < ip_rt_gc_min_interval &&
720             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
721                 RT_CACHE_STAT_INC(gc_ignored);
722                 goto out;
723         }
724
725         /* Calculate the number of entries we want to expire now. */
726         goal = atomic_read(&ipv4_dst_ops.entries) -
727                 (ip_rt_gc_elasticity << rt_hash_log);
728         if (goal <= 0) {
729                 if (equilibrium < ipv4_dst_ops.gc_thresh)
730                         equilibrium = ipv4_dst_ops.gc_thresh;
731                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
732                 if (goal > 0) {
733                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
734                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
735                 }
736         } else {
737                 /* We are in a dangerous area. Try to reduce the cache
738                  * really aggressively.
739                  */
740                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
741                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
742         }
743
744         if (now - last_gc >= ip_rt_gc_min_interval)
745                 last_gc = now;
746
747         if (goal <= 0) {
748                 equilibrium += goal;
749                 goto work_done;
750         }
751
752         do {
753                 int i, k;
754
755                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
756                         unsigned long tmo = expire;
757
758                         k = (k + 1) & rt_hash_mask;
759                         rthp = &rt_hash_table[k].chain;
760                         spin_lock_bh(rt_hash_lock_addr(k));
761                         while ((rth = *rthp) != NULL) {
762                                 if (!rt_may_expire(rth, tmo, expire)) {
763                                         tmo >>= 1;
764                                         rthp = &rth->u.dst.rt_next;
765                                         continue;
766                                 }
767                                 *rthp = rth->u.dst.rt_next;
768                                 rt_free(rth);
769                                 goal--;
770                         }
771                         spin_unlock_bh(rt_hash_lock_addr(k));
772                         if (goal <= 0)
773                                 break;
774                 }
775                 rover = k;
776
777                 if (goal <= 0)
778                         goto work_done;
779
780                 /* The goal was not achieved. We stop the process if:
781
782                    - expire has been reduced to zero (otherwise expire is halved).
783                    - the table is not full.
784                    - we are called from interrupt context.
785                    - the jiffies check is just a fallback/debug loop breaker;
786                      we will not spin here for a long time in any case.
787                  */
788
789                 RT_CACHE_STAT_INC(gc_goal_miss);
790
791                 if (expire == 0)
792                         break;
793
794                 expire >>= 1;
795 #if RT_CACHE_DEBUG >= 2
796                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
797                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
798 #endif
799
800                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
801                         goto out;
802         } while (!in_softirq() && time_before_eq(jiffies, now));
803
804         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
805                 goto out;
806         if (net_ratelimit())
807                 printk(KERN_WARNING "dst cache overflow\n");
808         RT_CACHE_STAT_INC(gc_dst_overflow);
809         return 1;
810
811 work_done:
812         expire += ip_rt_gc_min_interval;
813         if (expire > ip_rt_gc_timeout ||
814             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
815                 expire = ip_rt_gc_timeout;
816 #if RT_CACHE_DEBUG >= 2
817         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
818                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
819 #endif
820 out:    return 0;
821 }
822
823 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
824 {
825         struct rtable   *rth, **rthp;
826         unsigned long   now;
827         struct rtable *cand, **candp;
828         u32             min_score;
829         int             chain_length;
830         int attempts = !in_softirq();
831
832 restart:
833         chain_length = 0;
834         min_score = ~(u32)0;
835         cand = NULL;
836         candp = NULL;
837         now = jiffies;
838
839         rthp = &rt_hash_table[hash].chain;
840
841         spin_lock_bh(rt_hash_lock_addr(hash));
842         while ((rth = *rthp) != NULL) {
843                 if (compare_keys(&rth->fl, &rt->fl)) {
844                         /* Put it first */
845                         *rthp = rth->u.dst.rt_next;
846                         /*
847                          * Since lookup is lockfree, the deletion
848                          * must be visible to another weakly ordered CPU before
849                          * the insertion at the start of the hash chain.
850                          */
851                         rcu_assign_pointer(rth->u.dst.rt_next,
852                                            rt_hash_table[hash].chain);
853                         /*
854                          * Since lookup is lockfree, the update writes
855                          * must be ordered for consistency on SMP.
856                          */
857                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
858
859                         dst_use(&rth->u.dst, now);
860                         spin_unlock_bh(rt_hash_lock_addr(hash));
861
862                         rt_drop(rt);
863                         *rp = rth;
864                         return 0;
865                 }
866
867                 if (!atomic_read(&rth->u.dst.__refcnt)) {
868                         u32 score = rt_score(rth);
869
870                         if (score <= min_score) {
871                                 cand = rth;
872                                 candp = rthp;
873                                 min_score = score;
874                         }
875                 }
876
877                 chain_length++;
878
879                 rthp = &rth->u.dst.rt_next;
880         }
881
882         if (cand) {
883                 /* ip_rt_gc_elasticity used to be the average chain length;
884                  * when it is exceeded, gc becomes really aggressive.
885                  *
886                  * The second limit is less certain. At the moment it allows
887                  * only 2 entries per bucket. We will see.
888                  */
889                 if (chain_length > ip_rt_gc_elasticity) {
890                         *candp = cand->u.dst.rt_next;
891                         rt_free(cand);
892                 }
893         }
894
895         /* Try to bind the route to an ARP neighbour only if it is an
896            output route or a unicast forwarding path.
897          */
898         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
899                 int err = arp_bind_neighbour(&rt->u.dst);
900                 if (err) {
901                         spin_unlock_bh(rt_hash_lock_addr(hash));
902
903                         if (err != -ENOBUFS) {
904                                 rt_drop(rt);
905                                 return err;
906                         }
907
908                         /* Neighbour tables are full and nothing
909                            can be released. Try to shrink the route cache;
910                            it most likely holds some neighbour records.
911                          */
912                         if (attempts-- > 0) {
913                                 int saved_elasticity = ip_rt_gc_elasticity;
914                                 int saved_int = ip_rt_gc_min_interval;
915                                 ip_rt_gc_elasticity     = 1;
916                                 ip_rt_gc_min_interval   = 0;
917                                 rt_garbage_collect();
918                                 ip_rt_gc_min_interval   = saved_int;
919                                 ip_rt_gc_elasticity     = saved_elasticity;
920                                 goto restart;
921                         }
922
923                         if (net_ratelimit())
924                                 printk(KERN_WARNING "Neighbour table overflow.\n");
925                         rt_drop(rt);
926                         return -ENOBUFS;
927                 }
928         }
929
930         rt->u.dst.rt_next = rt_hash_table[hash].chain;
931 #if RT_CACHE_DEBUG >= 2
932         if (rt->u.dst.rt_next) {
933                 struct rtable *trt;
934                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
935                        NIPQUAD(rt->rt_dst));
936                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
937                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
938                 printk("\n");
939         }
940 #endif
941         rt_hash_table[hash].chain = rt;
942         spin_unlock_bh(rt_hash_lock_addr(hash));
943         *rp = rt;
944         return 0;
945 }
946
947 void rt_bind_peer(struct rtable *rt, int create)
948 {
949         static DEFINE_SPINLOCK(rt_peer_lock);
950         struct inet_peer *peer;
951
952         peer = inet_getpeer(rt->rt_dst, create);
953
954         spin_lock_bh(&rt_peer_lock);
955         if (rt->peer == NULL) {
956                 rt->peer = peer;
957                 peer = NULL;
958         }
959         spin_unlock_bh(&rt_peer_lock);
960         if (peer)
961                 inet_putpeer(peer);
962 }
963
964 /*
965  * Peer allocation may fail only in serious out-of-memory conditions.  However,
966  * we can still generate some output.
967  * Random ID selection looks a bit dangerous because we have no chance of
968  * selecting an ID that stays unique for a reasonable period of time.
969  * But a broken packet identifier may be better than no packet at all.
970  */
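/* In practice this means __ip_select_ident() below prefers the
 * per-destination counter kept in the inet_peer (via inet_getid()) and only
 * falls back to this hashed global counter when no peer could be bound,
 * e.g. under memory pressure.
 */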
971 static void ip_select_fb_ident(struct iphdr *iph)
972 {
973         static DEFINE_SPINLOCK(ip_fb_id_lock);
974         static u32 ip_fallback_id;
975         u32 salt;
976
977         spin_lock_bh(&ip_fb_id_lock);
978         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
979         iph->id = htons(salt & 0xFFFF);
980         ip_fallback_id = salt;
981         spin_unlock_bh(&ip_fb_id_lock);
982 }
983
984 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
985 {
986         struct rtable *rt = (struct rtable *) dst;
987
988         if (rt) {
989                 if (rt->peer == NULL)
990                         rt_bind_peer(rt, 1);
991
992                 /* If a peer is attached to the destination, it is never detached,
993                    so we do not need to grab a lock to dereference it.
994                  */
995                 if (rt->peer) {
996                         iph->id = htons(inet_getid(rt->peer, more));
997                         return;
998                 }
999         } else
1000                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1001                        __builtin_return_address(0));
1002
1003         ip_select_fb_ident(iph);
1004 }
1005
1006 static void rt_del(unsigned hash, struct rtable *rt)
1007 {
1008         struct rtable **rthp;
1009
1010         spin_lock_bh(rt_hash_lock_addr(hash));
1011         ip_rt_put(rt);
1012         for (rthp = &rt_hash_table[hash].chain; *rthp;
1013              rthp = &(*rthp)->u.dst.rt_next)
1014                 if (*rthp == rt) {
1015                         *rthp = rt->u.dst.rt_next;
1016                         rt_free(rt);
1017                         break;
1018                 }
1019         spin_unlock_bh(rt_hash_lock_addr(hash));
1020 }
1021
1022 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1023                     __be32 saddr, struct net_device *dev)
1024 {
1025         int i, k;
1026         struct in_device *in_dev = in_dev_get(dev);
1027         struct rtable *rth, **rthp;
1028         __be32  skeys[2] = { saddr, 0 };
1029         int  ikeys[2] = { dev->ifindex, 0 };
1030         struct netevent_redirect netevent;
1031
1032         if (!in_dev)
1033                 return;
1034
1035         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1036             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1037                 goto reject_redirect;
1038
1039         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1040                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1041                         goto reject_redirect;
1042                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1043                         goto reject_redirect;
1044         } else {
1045                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1046                         goto reject_redirect;
1047         }
1048
1049         for (i = 0; i < 2; i++) {
1050                 for (k = 0; k < 2; k++) {
1051                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1052
1053                         rthp=&rt_hash_table[hash].chain;
1054
1055                         rcu_read_lock();
1056                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1057                                 struct rtable *rt;
1058
1059                                 if (rth->fl.fl4_dst != daddr ||
1060                                     rth->fl.fl4_src != skeys[i] ||
1061                                     rth->fl.oif != ikeys[k] ||
1062                                     rth->fl.iif != 0) {
1063                                         rthp = &rth->u.dst.rt_next;
1064                                         continue;
1065                                 }
1066
1067                                 if (rth->rt_dst != daddr ||
1068                                     rth->rt_src != saddr ||
1069                                     rth->u.dst.error ||
1070                                     rth->rt_gateway != old_gw ||
1071                                     rth->u.dst.dev != dev)
1072                                         break;
1073
1074                                 dst_hold(&rth->u.dst);
1075                                 rcu_read_unlock();
1076
1077                                 rt = dst_alloc(&ipv4_dst_ops);
1078                                 if (rt == NULL) {
1079                                         ip_rt_put(rth);
1080                                         in_dev_put(in_dev);
1081                                         return;
1082                                 }
1083
1084                                 /* Copy all the information. */
1085                                 *rt = *rth;
1086                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1087                                 rt->u.dst.__use         = 1;
1088                                 atomic_set(&rt->u.dst.__refcnt, 1);
1089                                 rt->u.dst.child         = NULL;
1090                                 if (rt->u.dst.dev)
1091                                         dev_hold(rt->u.dst.dev);
1092                                 if (rt->idev)
1093                                         in_dev_hold(rt->idev);
1094                                 rt->u.dst.obsolete      = 0;
1095                                 rt->u.dst.lastuse       = jiffies;
1096                                 rt->u.dst.path          = &rt->u.dst;
1097                                 rt->u.dst.neighbour     = NULL;
1098                                 rt->u.dst.hh            = NULL;
1099                                 rt->u.dst.xfrm          = NULL;
1100
1101                                 rt->rt_flags            |= RTCF_REDIRECTED;
1102
1103                                 /* Gateway is different ... */
1104                                 rt->rt_gateway          = new_gw;
1105
1106                                 /* Redirect received -> path was valid */
1107                                 dst_confirm(&rth->u.dst);
1108
1109                                 if (rt->peer)
1110                                         atomic_inc(&rt->peer->refcnt);
1111
1112                                 if (arp_bind_neighbour(&rt->u.dst) ||
1113                                     !(rt->u.dst.neighbour->nud_state &
1114                                             NUD_VALID)) {
1115                                         if (rt->u.dst.neighbour)
1116                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1117                                         ip_rt_put(rth);
1118                                         rt_drop(rt);
1119                                         goto do_next;
1120                                 }
1121
1122                                 netevent.old = &rth->u.dst;
1123                                 netevent.new = &rt->u.dst;
1124                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1125                                                         &netevent);
1126
1127                                 rt_del(hash, rth);
1128                                 if (!rt_intern_hash(hash, rt, &rt))
1129                                         ip_rt_put(rt);
1130                                 goto do_next;
1131                         }
1132                         rcu_read_unlock();
1133                 do_next:
1134                         ;
1135                 }
1136         }
1137         in_dev_put(in_dev);
1138         return;
1139
1140 reject_redirect:
1141 #ifdef CONFIG_IP_ROUTE_VERBOSE
1142         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1143                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1144                         "%u.%u.%u.%u ignored.\n"
1145                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1146                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1147                        NIPQUAD(saddr), NIPQUAD(daddr));
1148 #endif
1149         in_dev_put(in_dev);
1150 }
1151
1152 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1153 {
1154         struct rtable *rt = (struct rtable*)dst;
1155         struct dst_entry *ret = dst;
1156
1157         if (rt) {
1158                 if (dst->obsolete) {
1159                         ip_rt_put(rt);
1160                         ret = NULL;
1161                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1162                            rt->u.dst.expires) {
1163                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1164                                                 rt->fl.oif);
1165 #if RT_CACHE_DEBUG >= 1
1166                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1167                                           "%u.%u.%u.%u/%02x dropped\n",
1168                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1169 #endif
1170                         rt_del(hash, rt);
1171                         ret = NULL;
1172                 }
1173         }
1174         return ret;
1175 }
1176
1177 /*
1178  * Algorithm:
1179  *      1. The first ip_rt_redirect_number redirects are sent
1180  *         with exponential backoff, then we stop sending them at all,
1181  *         assuming that the host ignores our redirects.
1182  *      2. If we did not see packets requiring redirects
1183  *         during ip_rt_redirect_silence, we assume that the host
1184  *         forgot redirected route and start to send redirects again.
1185  *
1186  * This algorithm is much cheaper and more intelligent than dumb load limiting
1187  * in icmp.c.
1188  *
1189  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1190  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1191  */
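/* A rough worked example with the defaults declared near the top of this
 * file (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_silence = (HZ/50) << 10): after the n-th redirect the next
 * one is sent only once (HZ/50) << n jiffies have passed, i.e. roughly
 * 40 ms, 80 ms, 160 ms, ... doubling each time; after 9 redirects we stop
 * entirely, and the token count is reset only after about 1024 * HZ/50
 * jiffies (roughly 20 seconds) without packets that would have triggered a
 * redirect.
 */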
1192
1193 void ip_rt_send_redirect(struct sk_buff *skb)
1194 {
1195         struct rtable *rt = (struct rtable*)skb->dst;
1196         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1197
1198         if (!in_dev)
1199                 return;
1200
1201         if (!IN_DEV_TX_REDIRECTS(in_dev))
1202                 goto out;
1203
1204         /* No redirected packets during ip_rt_redirect_silence;
1205          * reset the algorithm.
1206          */
1207         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1208                 rt->u.dst.rate_tokens = 0;
1209
1210         /* Too many ignored redirects; do not send anything.
1211          * Set u.dst.rate_last to the last seen redirected packet.
1212          */
1213         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1214                 rt->u.dst.rate_last = jiffies;
1215                 goto out;
1216         }
1217
1218         /* Check for load limit; set rate_last to the latest sent
1219          * redirect.
1220          */
1221         if (rt->u.dst.rate_tokens == 0 ||
1222             time_after(jiffies,
1223                        (rt->u.dst.rate_last +
1224                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1225                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1226                 rt->u.dst.rate_last = jiffies;
1227                 ++rt->u.dst.rate_tokens;
1228 #ifdef CONFIG_IP_ROUTE_VERBOSE
1229                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1230                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1231                     net_ratelimit())
1232                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1233                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1234                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1235                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1236 #endif
1237         }
1238 out:
1239         in_dev_put(in_dev);
1240 }
1241
1242 static int ip_error(struct sk_buff *skb)
1243 {
1244         struct rtable *rt = (struct rtable*)skb->dst;
1245         unsigned long now;
1246         int code;
1247
1248         switch (rt->u.dst.error) {
1249                 case EINVAL:
1250                 default:
1251                         goto out;
1252                 case EHOSTUNREACH:
1253                         code = ICMP_HOST_UNREACH;
1254                         break;
1255                 case ENETUNREACH:
1256                         code = ICMP_NET_UNREACH;
1257                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1258                         break;
1259                 case EACCES:
1260                         code = ICMP_PKT_FILTERED;
1261                         break;
1262         }
1263
1264         now = jiffies;
1265         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1266         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1267                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1268         rt->u.dst.rate_last = now;
1269         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1270                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1271                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1272         }
1273
1274 out:    kfree_skb(skb);
1275         return 0;
1276 }
1277
1278 /*
1279  *      The last two values are not from the RFC but
1280  *      are needed for AMPRnet AX.25 paths.
1281  */
1282
1283 static const unsigned short mtu_plateau[] =
1284 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1285
1286 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1287 {
1288         int i;
1289
1290         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1291                 if (old_mtu > mtu_plateau[i])
1292                         return mtu_plateau[i];
1293         return 68;
1294 }
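/* For example: guess_mtu(1500) returns 1492 (the first plateau strictly
 * below the old MTU), guess_mtu(576) returns 296, and anything at or below
 * 128 falls through to the 68-byte minimum.
 */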
1295
1296 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1297 {
1298         int i;
1299         unsigned short old_mtu = ntohs(iph->tot_len);
1300         struct rtable *rth;
1301         __be32  skeys[2] = { iph->saddr, 0, };
1302         __be32  daddr = iph->daddr;
1303         unsigned short est_mtu = 0;
1304
1305         if (ipv4_config.no_pmtu_disc)
1306                 return 0;
1307
1308         for (i = 0; i < 2; i++) {
1309                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1310
1311                 rcu_read_lock();
1312                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1313                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1314                         if (rth->fl.fl4_dst == daddr &&
1315                             rth->fl.fl4_src == skeys[i] &&
1316                             rth->rt_dst  == daddr &&
1317                             rth->rt_src  == iph->saddr &&
1318                             rth->fl.iif == 0 &&
1319                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1320                                 unsigned short mtu = new_mtu;
1321
1322                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1323
1324                                         /* BSD 4.2 compatibility hack :-( */
1325                                         if (mtu == 0 &&
1326                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1327                                             old_mtu >= 68 + (iph->ihl << 2))
1328                                                 old_mtu -= iph->ihl << 2;
1329
1330                                         mtu = guess_mtu(old_mtu);
1331                                 }
1332                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1333                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1334                                                 dst_confirm(&rth->u.dst);
1335                                                 if (mtu < ip_rt_min_pmtu) {
1336                                                         mtu = ip_rt_min_pmtu;
1337                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1338                                                                 (1 << RTAX_MTU);
1339                                                 }
1340                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1341                                                 dst_set_expires(&rth->u.dst,
1342                                                         ip_rt_mtu_expires);
1343                                         }
1344                                         est_mtu = mtu;
1345                                 }
1346                         }
1347                 }
1348                 rcu_read_unlock();
1349         }
1350         return est_mtu ? : new_mtu;
1351 }
1352
1353 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1354 {
1355         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1356             !(dst_metric_locked(dst, RTAX_MTU))) {
1357                 if (mtu < ip_rt_min_pmtu) {
1358                         mtu = ip_rt_min_pmtu;
1359                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1360                 }
1361                 dst->metrics[RTAX_MTU-1] = mtu;
1362                 dst_set_expires(dst, ip_rt_mtu_expires);
1363                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1364         }
1365 }
1366
1367 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1368 {
1369         return NULL;
1370 }
1371
1372 static void ipv4_dst_destroy(struct dst_entry *dst)
1373 {
1374         struct rtable *rt = (struct rtable *) dst;
1375         struct inet_peer *peer = rt->peer;
1376         struct in_device *idev = rt->idev;
1377
1378         if (peer) {
1379                 rt->peer = NULL;
1380                 inet_putpeer(peer);
1381         }
1382
1383         if (idev) {
1384                 rt->idev = NULL;
1385                 in_dev_put(idev);
1386         }
1387 }
1388
1389 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1390                             int how)
1391 {
1392         struct rtable *rt = (struct rtable *) dst;
1393         struct in_device *idev = rt->idev;
1394         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1395                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1396                 if (loopback_idev) {
1397                         rt->idev = loopback_idev;
1398                         in_dev_put(idev);
1399                 }
1400         }
1401 }
1402
1403 static void ipv4_link_failure(struct sk_buff *skb)
1404 {
1405         struct rtable *rt;
1406
1407         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1408
1409         rt = (struct rtable *) skb->dst;
1410         if (rt)
1411                 dst_set_expires(&rt->u.dst, 0);
1412 }
1413
1414 static int ip_rt_bug(struct sk_buff *skb)
1415 {
1416         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1417                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1418                 skb->dev ? skb->dev->name : "?");
1419         kfree_skb(skb);
1420         return 0;
1421 }
1422
1423 /*
1424    We do not cache the source address of the outgoing interface,
1425    because it is used only by the IP RR, TS and SRR options,
1426    so it is out of the fast path.
1427
1428    BTW remember: "addr" is allowed to be unaligned
1429    in IP options!
1430  */
1431
1432 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1433 {
1434         __be32 src;
1435         struct fib_result res;
1436
1437         if (rt->fl.iif == 0)
1438                 src = rt->rt_src;
1439         else if (fib_lookup(&rt->fl, &res) == 0) {
1440                 src = FIB_RES_PREFSRC(res);
1441                 fib_res_put(&res);
1442         } else
1443                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1444                                         RT_SCOPE_UNIVERSE);
1445         memcpy(addr, &src, 4);
1446 }
1447
1448 #ifdef CONFIG_NET_CLS_ROUTE
1449 static void set_class_tag(struct rtable *rt, u32 tag)
1450 {
1451         if (!(rt->u.dst.tclassid & 0xFFFF))
1452                 rt->u.dst.tclassid |= tag & 0xFFFF;
1453         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1454                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1455 }
1456 #endif
1457
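/*
 * Fill the nexthop-derived fields of a freshly built cache entry: the
 * gateway, the metrics inherited from the FIB info (with defaults for
 * MTU, hoplimit and advmss), the optional classid tag and the route type.
 */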
1458 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1459 {
1460         struct fib_info *fi = res->fi;
1461
1462         if (fi) {
1463                 if (FIB_RES_GW(*res) &&
1464                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1465                         rt->rt_gateway = FIB_RES_GW(*res);
1466                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1467                        sizeof(rt->u.dst.metrics));
1468                 if (fi->fib_mtu == 0) {
1469                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1470                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1471                             rt->rt_gateway != rt->rt_dst &&
1472                             rt->u.dst.dev->mtu > 576)
1473                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1474                 }
1475 #ifdef CONFIG_NET_CLS_ROUTE
1476                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1477 #endif
1478         } else
1479                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1480
1481         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1482                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1483         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1484                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1485         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1486                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1487                                        ip_rt_min_advmss);
1488         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1489                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1490
1491 #ifdef CONFIG_NET_CLS_ROUTE
1492 #ifdef CONFIG_IP_MULTIPLE_TABLES
1493         set_class_tag(rt, fib_rules_tclass(res));
1494 #endif
1495         set_class_tag(rt, itag);
1496 #endif
1497         rt->rt_type = res->type;
1498 }
1499
1500 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1501                                 u8 tos, struct net_device *dev, int our)
1502 {
1503         unsigned hash;
1504         struct rtable *rth;
1505         __be32 spec_dst;
1506         struct in_device *in_dev = in_dev_get(dev);
1507         u32 itag = 0;
1508
1509         /* Primary sanity checks. */
1510
1511         if (in_dev == NULL)
1512                 return -EINVAL;
1513
1514         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1515             skb->protocol != htons(ETH_P_IP))
1516                 goto e_inval;
1517
1518         if (ZERONET(saddr)) {
1519                 if (!LOCAL_MCAST(daddr))
1520                         goto e_inval;
1521                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1522         } else if (fib_validate_source(saddr, 0, tos, 0,
1523                                         dev, &spec_dst, &itag) < 0)
1524                 goto e_inval;
1525
1526         rth = dst_alloc(&ipv4_dst_ops);
1527         if (!rth)
1528                 goto e_nobufs;
1529
1530         rth->u.dst.output= ip_rt_bug;
1531
1532         atomic_set(&rth->u.dst.__refcnt, 1);
1533         rth->u.dst.flags= DST_HOST;
1534         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1535                 rth->u.dst.flags |= DST_NOPOLICY;
1536         rth->fl.fl4_dst = daddr;
1537         rth->rt_dst     = daddr;
1538         rth->fl.fl4_tos = tos;
1539         rth->fl.mark    = skb->mark;
1540         rth->fl.fl4_src = saddr;
1541         rth->rt_src     = saddr;
1542 #ifdef CONFIG_NET_CLS_ROUTE
1543         rth->u.dst.tclassid = itag;
1544 #endif
1545         rth->rt_iif     =
1546         rth->fl.iif     = dev->ifindex;
1547         rth->u.dst.dev  = init_net.loopback_dev;
1548         dev_hold(rth->u.dst.dev);
1549         rth->idev       = in_dev_get(rth->u.dst.dev);
1550         rth->fl.oif     = 0;
1551         rth->rt_gateway = daddr;
1552         rth->rt_spec_dst= spec_dst;
1553         rth->rt_type    = RTN_MULTICAST;
1554         rth->rt_flags   = RTCF_MULTICAST;
1555         if (our) {
1556                 rth->u.dst.input= ip_local_deliver;
1557                 rth->rt_flags |= RTCF_LOCAL;
1558         }
1559
1560 #ifdef CONFIG_IP_MROUTE
1561         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1562                 rth->u.dst.input = ip_mr_input;
1563 #endif
1564         RT_CACHE_STAT_INC(in_slow_mc);
1565
1566         in_dev_put(in_dev);
1567         hash = rt_hash(daddr, saddr, dev->ifindex);
1568         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1569
1570 e_nobufs:
1571         in_dev_put(in_dev);
1572         return -ENOBUFS;
1573
1574 e_inval:
1575         in_dev_put(in_dev);
1576         return -EINVAL;
1577 }
1578
1579
1580 static void ip_handle_martian_source(struct net_device *dev,
1581                                      struct in_device *in_dev,
1582                                      struct sk_buff *skb,
1583                                      __be32 daddr,
1584                                      __be32 saddr)
1585 {
1586         RT_CACHE_STAT_INC(in_martian_src);
1587 #ifdef CONFIG_IP_ROUTE_VERBOSE
1588         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1589                 /*
1590                  *      RFC1812 recommendation: if the source is martian,
1591                  *      the only hint is the MAC header.
1592                  */
1593                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1594                         "%u.%u.%u.%u, on dev %s\n",
1595                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1596                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1597                         int i;
1598                         const unsigned char *p = skb_mac_header(skb);
1599                         printk(KERN_WARNING "ll header: ");
1600                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1601                                 printk("%02x", *p);
1602                                 if (i < (dev->hard_header_len - 1))
1603                                         printk(":");
1604                         }
1605                         printk("\n");
1606                 }
1607         }
1608 #endif
1609 }
1610
1611 static inline int __mkroute_input(struct sk_buff *skb,
1612                                   struct fib_result* res,
1613                                   struct in_device *in_dev,
1614                                   __be32 daddr, __be32 saddr, u32 tos,
1615                                   struct rtable **result)
1616 {
1617
1618         struct rtable *rth;
1619         int err;
1620         struct in_device *out_dev;
1621         unsigned flags = 0;
1622         __be32 spec_dst;
1623         u32 itag;
1624
1625         /* get a working reference to the output device */
1626         out_dev = in_dev_get(FIB_RES_DEV(*res));
1627         if (out_dev == NULL) {
1628                 if (net_ratelimit())
1629                         printk(KERN_CRIT "Bug in ip_route_input" \
1630                                "_slow(). Please, report\n");
1631                 return -EINVAL;
1632         }
1633
1634
1635         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1636                                   in_dev->dev, &spec_dst, &itag);
1637         if (err < 0) {
1638                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1639                                          saddr);
1640
1641                 err = -EINVAL;
1642                 goto cleanup;
1643         }
1644
1645         if (err)
1646                 flags |= RTCF_DIRECTSRC;
1647
1648         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1649             (IN_DEV_SHARED_MEDIA(out_dev) ||
1650              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1651                 flags |= RTCF_DOREDIRECT;
1652
1653         if (skb->protocol != htons(ETH_P_IP)) {
1654                 /* Not IP (i.e. ARP). Do not create a route if it is
1655                  * invalid for proxy arp. DNAT routes are always valid.
1656                  */
1657                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1658                         err = -EINVAL;
1659                         goto cleanup;
1660                 }
1661         }
1662
1663
1664         rth = dst_alloc(&ipv4_dst_ops);
1665         if (!rth) {
1666                 err = -ENOBUFS;
1667                 goto cleanup;
1668         }
1669
1670         atomic_set(&rth->u.dst.__refcnt, 1);
1671         rth->u.dst.flags= DST_HOST;
1672         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1673                 rth->u.dst.flags |= DST_NOPOLICY;
1674         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1675                 rth->u.dst.flags |= DST_NOXFRM;
1676         rth->fl.fl4_dst = daddr;
1677         rth->rt_dst     = daddr;
1678         rth->fl.fl4_tos = tos;
1679         rth->fl.mark    = skb->mark;
1680         rth->fl.fl4_src = saddr;
1681         rth->rt_src     = saddr;
1682         rth->rt_gateway = daddr;
1683         rth->rt_iif     =
1684                 rth->fl.iif     = in_dev->dev->ifindex;
1685         rth->u.dst.dev  = (out_dev)->dev;
1686         dev_hold(rth->u.dst.dev);
1687         rth->idev       = in_dev_get(rth->u.dst.dev);
1688         rth->fl.oif     = 0;
1689         rth->rt_spec_dst= spec_dst;
1690
1691         rth->u.dst.input = ip_forward;
1692         rth->u.dst.output = ip_output;
1693
1694         rt_set_nexthop(rth, res, itag);
1695
1696         rth->rt_flags = flags;
1697
1698         *result = rth;
1699         err = 0;
1700  cleanup:
1701         /* release the working reference to the output device */
1702         in_dev_put(out_dev);
1703         return err;
1704 }
1705
1706 static inline int ip_mkroute_input(struct sk_buff *skb,
1707                                    struct fib_result* res,
1708                                    const struct flowi *fl,
1709                                    struct in_device *in_dev,
1710                                    __be32 daddr, __be32 saddr, u32 tos)
1711 {
1712         struct rtable* rth = NULL;
1713         int err;
1714         unsigned hash;
1715
1716 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1717         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1718                 fib_select_multipath(fl, res);
1719 #endif
1720
1721         /* create a routing cache entry */
1722         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1723         if (err)
1724                 return err;
1725
1726         /* put it into the cache */
1727         hash = rt_hash(daddr, saddr, fl->iif);
1728         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1729 }
1730
1731 /*
1732  *      NOTE. We drop all packets that have local source
1733  *      addresses, because every properly looped-back packet
1734  *      must already have the correct destination attached by the output routine.
1735  *
1736  *      This approach solves two big problems:
1737  *      1. Non-simplex devices are handled properly.
1738  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1739  */
1740
1741 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1742                                u8 tos, struct net_device *dev)
1743 {
1744         struct fib_result res;
1745         struct in_device *in_dev = in_dev_get(dev);
1746         struct flowi fl = { .nl_u = { .ip4_u =
1747                                       { .daddr = daddr,
1748                                         .saddr = saddr,
1749                                         .tos = tos,
1750                                         .scope = RT_SCOPE_UNIVERSE,
1751                                       } },
1752                             .mark = skb->mark,
1753                             .iif = dev->ifindex };
1754         unsigned        flags = 0;
1755         u32             itag = 0;
1756         struct rtable * rth;
1757         unsigned        hash;
1758         __be32          spec_dst;
1759         int             err = -EINVAL;
1760         int             free_res = 0;
1761
1762         /* IP on this device is disabled. */
1763
1764         if (!in_dev)
1765                 goto out;
1766
1767         /* Check for the most weird martians, which may not be detected
1768            by fib_lookup.
1769          */
1770
1771         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1772                 goto martian_source;
1773
1774         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1775                 goto brd_input;
1776
1777         /* Accept zero addresses only for the limited broadcast;
1778          * I do not even know whether to fix this or not. Waiting for complaints :-)
1779          */
1780         if (ZERONET(saddr))
1781                 goto martian_source;
1782
1783         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1784                 goto martian_destination;
1785
1786         /*
1787          *      Now we are ready to route the packet.
1788          */
1789         if ((err = fib_lookup(&fl, &res)) != 0) {
1790                 if (!IN_DEV_FORWARD(in_dev))
1791                         goto e_hostunreach;
1792                 goto no_route;
1793         }
1794         free_res = 1;
1795
1796         RT_CACHE_STAT_INC(in_slow_tot);
1797
1798         if (res.type == RTN_BROADCAST)
1799                 goto brd_input;
1800
1801         if (res.type == RTN_LOCAL) {
1802                 int result;
1803                 result = fib_validate_source(saddr, daddr, tos,
1804                                              init_net.loopback_dev->ifindex,
1805                                              dev, &spec_dst, &itag);
1806                 if (result < 0)
1807                         goto martian_source;
1808                 if (result)
1809                         flags |= RTCF_DIRECTSRC;
1810                 spec_dst = daddr;
1811                 goto local_input;
1812         }
1813
1814         if (!IN_DEV_FORWARD(in_dev))
1815                 goto e_hostunreach;
1816         if (res.type != RTN_UNICAST)
1817                 goto martian_destination;
1818
1819         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1820 done:
1821         in_dev_put(in_dev);
1822         if (free_res)
1823                 fib_res_put(&res);
1824 out:    return err;
1825
1826 brd_input:
1827         if (skb->protocol != htons(ETH_P_IP))
1828                 goto e_inval;
1829
1830         if (ZERONET(saddr))
1831                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1832         else {
1833                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1834                                           &itag);
1835                 if (err < 0)
1836                         goto martian_source;
1837                 if (err)
1838                         flags |= RTCF_DIRECTSRC;
1839         }
1840         flags |= RTCF_BROADCAST;
1841         res.type = RTN_BROADCAST;
1842         RT_CACHE_STAT_INC(in_brd);
1843
1844 local_input:
1845         rth = dst_alloc(&ipv4_dst_ops);
1846         if (!rth)
1847                 goto e_nobufs;
1848
1849         rth->u.dst.output= ip_rt_bug;
1850
1851         atomic_set(&rth->u.dst.__refcnt, 1);
1852         rth->u.dst.flags= DST_HOST;
1853         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1854                 rth->u.dst.flags |= DST_NOPOLICY;
1855         rth->fl.fl4_dst = daddr;
1856         rth->rt_dst     = daddr;
1857         rth->fl.fl4_tos = tos;
1858         rth->fl.mark    = skb->mark;
1859         rth->fl.fl4_src = saddr;
1860         rth->rt_src     = saddr;
1861 #ifdef CONFIG_NET_CLS_ROUTE
1862         rth->u.dst.tclassid = itag;
1863 #endif
1864         rth->rt_iif     =
1865         rth->fl.iif     = dev->ifindex;
1866         rth->u.dst.dev  = init_net.loopback_dev;
1867         dev_hold(rth->u.dst.dev);
1868         rth->idev       = in_dev_get(rth->u.dst.dev);
1869         rth->rt_gateway = daddr;
1870         rth->rt_spec_dst= spec_dst;
1871         rth->u.dst.input= ip_local_deliver;
1872         rth->rt_flags   = flags|RTCF_LOCAL;
1873         if (res.type == RTN_UNREACHABLE) {
1874                 rth->u.dst.input= ip_error;
1875                 rth->u.dst.error= -err;
1876                 rth->rt_flags   &= ~RTCF_LOCAL;
1877         }
1878         rth->rt_type    = res.type;
1879         hash = rt_hash(daddr, saddr, fl.iif);
1880         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1881         goto done;
1882
1883 no_route:
1884         RT_CACHE_STAT_INC(in_no_route);
1885         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1886         res.type = RTN_UNREACHABLE;
1887         if (err == -ESRCH)
1888                 err = -ENETUNREACH;
1889         goto local_input;
1890
1891         /*
1892          *      Do not cache martian addresses: they should be logged (RFC1812)
1893          */
1894 martian_destination:
1895         RT_CACHE_STAT_INC(in_martian_dst);
1896 #ifdef CONFIG_IP_ROUTE_VERBOSE
1897         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1898                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1899                         "%u.%u.%u.%u, dev %s\n",
1900                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1901 #endif
1902
1903 e_hostunreach:
1904         err = -EHOSTUNREACH;
1905         goto done;
1906
1907 e_inval:
1908         err = -EINVAL;
1909         goto done;
1910
1911 e_nobufs:
1912         err = -ENOBUFS;
1913         goto done;
1914
1915 martian_source:
1916         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1917         goto e_inval;
1918 }
1919
1920 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1921                    u8 tos, struct net_device *dev)
1922 {
1923         struct rtable * rth;
1924         unsigned        hash;
1925         int iif = dev->ifindex;
1926
1927         tos &= IPTOS_RT_MASK;
1928         hash = rt_hash(daddr, saddr, iif);
1929
1930         rcu_read_lock();
1931         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1932              rth = rcu_dereference(rth->u.dst.rt_next)) {
1933                 if (rth->fl.fl4_dst == daddr &&
1934                     rth->fl.fl4_src == saddr &&
1935                     rth->fl.iif == iif &&
1936                     rth->fl.oif == 0 &&
1937                     rth->fl.mark == skb->mark &&
1938                     rth->fl.fl4_tos == tos) {
1939                         dst_use(&rth->u.dst, jiffies);
1940                         RT_CACHE_STAT_INC(in_hit);
1941                         rcu_read_unlock();
1942                         skb->dst = (struct dst_entry*)rth;
1943                         return 0;
1944                 }
1945                 RT_CACHE_STAT_INC(in_hlist_search);
1946         }
1947         rcu_read_unlock();
1948
1949         /* Multicast recognition logic was moved from the route cache to here.
1950            The problem was that too many Ethernet cards have broken/missing
1951            hardware multicast filters :-( As a result, a host on a multicast
1952            network acquires a lot of useless route cache entries, e.g. for
1953            SDR messages from all over the world. Now we try to get rid of them.
1954            Really, provided the software IP multicast filter is organized
1955            reasonably (at least, hashed), it does not cause a slowdown
1956            compared with route cache reject entries.
1957            Note that multicast routers are not affected, because a
1958            route cache entry is created eventually.
1959          */
1960         if (MULTICAST(daddr)) {
1961                 struct in_device *in_dev;
1962
1963                 rcu_read_lock();
1964                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1965                         int our = ip_check_mc(in_dev, daddr, saddr,
1966                                 ip_hdr(skb)->protocol);
1967                         if (our
1968 #ifdef CONFIG_IP_MROUTE
1969                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1970 #endif
1971                             ) {
1972                                 rcu_read_unlock();
1973                                 return ip_route_input_mc(skb, daddr, saddr,
1974                                                          tos, dev, our);
1975                         }
1976                 }
1977                 rcu_read_unlock();
1978                 return -EINVAL;
1979         }
1980         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1981 }
1982
1983 static inline int __mkroute_output(struct rtable **result,
1984                                    struct fib_result* res,
1985                                    const struct flowi *fl,
1986                                    const struct flowi *oldflp,
1987                                    struct net_device *dev_out,
1988                                    unsigned flags)
1989 {
1990         struct rtable *rth;
1991         struct in_device *in_dev;
1992         u32 tos = RT_FL_TOS(oldflp);
1993         int err = 0;
1994
1995         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1996                 return -EINVAL;
1997
1998         if (fl->fl4_dst == htonl(0xFFFFFFFF))
1999                 res->type = RTN_BROADCAST;
2000         else if (MULTICAST(fl->fl4_dst))
2001                 res->type = RTN_MULTICAST;
2002         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2003                 return -EINVAL;
2004
2005         if (dev_out->flags & IFF_LOOPBACK)
2006                 flags |= RTCF_LOCAL;
2007
2008         /* get a working reference to the inet device */
2009         in_dev = in_dev_get(dev_out);
2010         if (!in_dev)
2011                 return -EINVAL;
2012
2013         if (res->type == RTN_BROADCAST) {
2014                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2015                 if (res->fi) {
2016                         fib_info_put(res->fi);
2017                         res->fi = NULL;
2018                 }
2019         } else if (res->type == RTN_MULTICAST) {
2020                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2021                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2022                                  oldflp->proto))
2023                         flags &= ~RTCF_LOCAL;
2024                 /* If a multicast route does not exist, use
2025                    the default one, but do not gateway in this case.
2026                    Yes, it is a hack.
2027                  */
2028                 if (res->fi && res->prefixlen < 4) {
2029                         fib_info_put(res->fi);
2030                         res->fi = NULL;
2031                 }
2032         }
2033
2034
2035         rth = dst_alloc(&ipv4_dst_ops);
2036         if (!rth) {
2037                 err = -ENOBUFS;
2038                 goto cleanup;
2039         }
2040
2041         atomic_set(&rth->u.dst.__refcnt, 1);
2042         rth->u.dst.flags= DST_HOST;
2043         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2044                 rth->u.dst.flags |= DST_NOXFRM;
2045         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2046                 rth->u.dst.flags |= DST_NOPOLICY;
2047
2048         rth->fl.fl4_dst = oldflp->fl4_dst;
2049         rth->fl.fl4_tos = tos;
2050         rth->fl.fl4_src = oldflp->fl4_src;
2051         rth->fl.oif     = oldflp->oif;
2052         rth->fl.mark    = oldflp->mark;
2053         rth->rt_dst     = fl->fl4_dst;
2054         rth->rt_src     = fl->fl4_src;
2055         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2056         /* get references to the devices that are to be held by the routing
2057            cache entry */
2058         rth->u.dst.dev  = dev_out;
2059         dev_hold(dev_out);
2060         rth->idev       = in_dev_get(dev_out);
2061         rth->rt_gateway = fl->fl4_dst;
2062         rth->rt_spec_dst= fl->fl4_src;
2063
2064         rth->u.dst.output=ip_output;
2065
2066         RT_CACHE_STAT_INC(out_slow_tot);
2067
2068         if (flags & RTCF_LOCAL) {
2069                 rth->u.dst.input = ip_local_deliver;
2070                 rth->rt_spec_dst = fl->fl4_dst;
2071         }
2072         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2073                 rth->rt_spec_dst = fl->fl4_src;
2074                 if (flags & RTCF_LOCAL &&
2075                     !(dev_out->flags & IFF_LOOPBACK)) {
2076                         rth->u.dst.output = ip_mc_output;
2077                         RT_CACHE_STAT_INC(out_slow_mc);
2078                 }
2079 #ifdef CONFIG_IP_MROUTE
2080                 if (res->type == RTN_MULTICAST) {
2081                         if (IN_DEV_MFORWARD(in_dev) &&
2082                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2083                                 rth->u.dst.input = ip_mr_input;
2084                                 rth->u.dst.output = ip_mc_output;
2085                         }
2086                 }
2087 #endif
2088         }
2089
2090         rt_set_nexthop(rth, res, 0);
2091
2092         rth->rt_flags = flags;
2093
2094         *result = rth;
2095  cleanup:
2096         /* release the working reference to the inet device */
2097         in_dev_put(in_dev);
2098
2099         return err;
2100 }
2101
2102 static inline int ip_mkroute_output(struct rtable **rp,
2103                                     struct fib_result* res,
2104                                     const struct flowi *fl,
2105                                     const struct flowi *oldflp,
2106                                     struct net_device *dev_out,
2107                                     unsigned flags)
2108 {
2109         struct rtable *rth = NULL;
2110         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2111         unsigned hash;
2112         if (err == 0) {
2113                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2114                 err = rt_intern_hash(hash, rth, rp);
2115         }
2116
2117         return err;
2118 }
2119
2120 /*
2121  * Major route resolver routine.
2122  */
2123
2124 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2125 {
2126         u32 tos = RT_FL_TOS(oldflp);
2127         struct flowi fl = { .nl_u = { .ip4_u =
2128                                       { .daddr = oldflp->fl4_dst,
2129                                         .saddr = oldflp->fl4_src,
2130                                         .tos = tos & IPTOS_RT_MASK,
2131                                         .scope = ((tos & RTO_ONLINK) ?
2132                                                   RT_SCOPE_LINK :
2133                                                   RT_SCOPE_UNIVERSE),
2134                                       } },
2135                             .mark = oldflp->mark,
2136                             .iif = init_net.loopback_dev->ifindex,
2137                             .oif = oldflp->oif };
2138         struct fib_result res;
2139         unsigned flags = 0;
2140         struct net_device *dev_out = NULL;
2141         int free_res = 0;
2142         int err;
2143
2144
2145         res.fi          = NULL;
2146 #ifdef CONFIG_IP_MULTIPLE_TABLES
2147         res.r           = NULL;
2148 #endif
2149
2150         if (oldflp->fl4_src) {
2151                 err = -EINVAL;
2152                 if (MULTICAST(oldflp->fl4_src) ||
2153                     BADCLASS(oldflp->fl4_src) ||
2154                     ZERONET(oldflp->fl4_src))
2155                         goto out;
2156
2157                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2158                 dev_out = ip_dev_find(oldflp->fl4_src);
2159                 if (dev_out == NULL)
2160                         goto out;
2161
2162                 /* I removed check for oif == dev_out->oif here.
2163                    It was wrong for two reasons:
2164                    1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2165                       assigned to multiple interfaces.
2166                    2. Moreover, we are allowed to send packets with saddr
2167                       of another iface. --ANK
2168                  */
2169
2170                 if (oldflp->oif == 0
2171                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2172                         /* Special hack: the user can direct multicasts
2173                            and limited broadcast via the necessary interface
2174                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2175                            This hack is not just for fun, it allows
2176                            vic, vat and friends to work.
2177                            They bind the socket to loopback, set ttl to zero
2178                            and expect that it will work.
2179                            From the viewpoint of the routing cache they are broken,
2180                            because we are not allowed to build a multicast path
2181                            with a loopback source addr (look, the routing cache
2182                            cannot know that ttl is zero, so the packet
2183                            will not leave this host and the route is valid).
2184                            Luckily, this hack is a good workaround.
2185                          */
2186
2187                         fl.oif = dev_out->ifindex;
2188                         goto make_route;
2189                 }
2190                 if (dev_out)
2191                         dev_put(dev_out);
2192                 dev_out = NULL;
2193         }
2194
2195
2196         if (oldflp->oif) {
2197                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2198                 err = -ENODEV;
2199                 if (dev_out == NULL)
2200                         goto out;
2201
2202                 /* RACE: Check return value of inet_select_addr instead. */
2203                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2204                         dev_put(dev_out);
2205                         goto out;       /* Wrong error code */
2206                 }
2207
2208                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2209                         if (!fl.fl4_src)
2210                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2211                                                               RT_SCOPE_LINK);
2212                         goto make_route;
2213                 }
2214                 if (!fl.fl4_src) {
2215                         if (MULTICAST(oldflp->fl4_dst))
2216                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2217                                                               fl.fl4_scope);
2218                         else if (!oldflp->fl4_dst)
2219                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2220                                                               RT_SCOPE_HOST);
2221                 }
2222         }
2223
2224         if (!fl.fl4_dst) {
2225                 fl.fl4_dst = fl.fl4_src;
2226                 if (!fl.fl4_dst)
2227                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2228                 if (dev_out)
2229                         dev_put(dev_out);
2230                 dev_out = init_net.loopback_dev;
2231                 dev_hold(dev_out);
2232                 fl.oif = init_net.loopback_dev->ifindex;
2233                 res.type = RTN_LOCAL;
2234                 flags |= RTCF_LOCAL;
2235                 goto make_route;
2236         }
2237
2238         if (fib_lookup(&fl, &res)) {
2239                 res.fi = NULL;
2240                 if (oldflp->oif) {
2241                         /* Apparently, the routing tables are wrong. Assume
2242                            that the destination is on-link.
2243
2244                            WHY? DW.
2245                            Because we are allowed to send to an iface
2246                            even if it has NO routes and NO assigned
2247                            addresses. When oif is specified, the routing
2248                            tables are looked up with only one purpose:
2249                            to catch whether the destination is gatewayed, rather
2250                            than direct. Moreover, if MSG_DONTROUTE is set,
2251                            we send the packet, ignoring both routing tables
2252                            and ifaddr state. --ANK
2253
2254
2255                            We could make it even if oif is unknown,
2256                            likely IPv6, but we do not.
2257                          */
2258
2259                         if (fl.fl4_src == 0)
2260                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2261                                                               RT_SCOPE_LINK);
2262                         res.type = RTN_UNICAST;
2263                         goto make_route;
2264                 }
2265                 if (dev_out)
2266                         dev_put(dev_out);
2267                 err = -ENETUNREACH;
2268                 goto out;
2269         }
2270         free_res = 1;
2271
2272         if (res.type == RTN_LOCAL) {
2273                 if (!fl.fl4_src)
2274                         fl.fl4_src = fl.fl4_dst;
2275                 if (dev_out)
2276                         dev_put(dev_out);
2277                 dev_out = init_net.loopback_dev;
2278                 dev_hold(dev_out);
2279                 fl.oif = dev_out->ifindex;
2280                 if (res.fi)
2281                         fib_info_put(res.fi);
2282                 res.fi = NULL;
2283                 flags |= RTCF_LOCAL;
2284                 goto make_route;
2285         }
2286
2287 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2288         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2289                 fib_select_multipath(&fl, &res);
2290         else
2291 #endif
2292         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2293                 fib_select_default(&fl, &res);
2294
2295         if (!fl.fl4_src)
2296                 fl.fl4_src = FIB_RES_PREFSRC(res);
2297
2298         if (dev_out)
2299                 dev_put(dev_out);
2300         dev_out = FIB_RES_DEV(res);
2301         dev_hold(dev_out);
2302         fl.oif = dev_out->ifindex;
2303
2304
2305 make_route:
2306         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2307
2308
2309         if (free_res)
2310                 fib_res_put(&res);
2311         if (dev_out)
2312                 dev_put(dev_out);
2313 out:    return err;
2314 }
2315
2316 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2317 {
2318         unsigned hash;
2319         struct rtable *rth;
2320
2321         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2322
2323         rcu_read_lock_bh();
2324         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2325                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2326                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2327                     rth->fl.fl4_src == flp->fl4_src &&
2328                     rth->fl.iif == 0 &&
2329                     rth->fl.oif == flp->oif &&
2330                     rth->fl.mark == flp->mark &&
2331                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2332                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2333                         dst_use(&rth->u.dst, jiffies);
2334                         RT_CACHE_STAT_INC(out_hit);
2335                         rcu_read_unlock_bh();
2336                         *rp = rth;
2337                         return 0;
2338                 }
2339                 RT_CACHE_STAT_INC(out_hlist_search);
2340         }
2341         rcu_read_unlock_bh();
2342
2343         return ip_route_output_slow(rp, flp);
2344 }
2345
2346 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2347
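/* PMTU updates are deliberately ignored for blackhole routes (see ipv4_dst_blackhole below). */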
2348 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2349 {
2350 }
2351
2352 static struct dst_ops ipv4_dst_blackhole_ops = {
2353         .family                 =       AF_INET,
2354         .protocol               =       __constant_htons(ETH_P_IP),
2355         .destroy                =       ipv4_dst_destroy,
2356         .check                  =       ipv4_dst_check,
2357         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2358         .entry_size             =       sizeof(struct rtable),
2359 };
2360
2361
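/*
 * Replace *rp with a "blackhole" copy of the route: same flow keys and
 * metrics, but input/output simply discard packets.  Used by
 * ip_route_output_flow() when __xfrm_lookup() returns -EREMOTE.
 */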
2362 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2363 {
2364         struct rtable *ort = *rp;
2365         struct rtable *rt = (struct rtable *)
2366                 dst_alloc(&ipv4_dst_blackhole_ops);
2367
2368         if (rt) {
2369                 struct dst_entry *new = &rt->u.dst;
2370
2371                 atomic_set(&new->__refcnt, 1);
2372                 new->__use = 1;
2373                 new->input = dst_discard;
2374                 new->output = dst_discard;
2375                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2376
2377                 new->dev = ort->u.dst.dev;
2378                 if (new->dev)
2379                         dev_hold(new->dev);
2380
2381                 rt->fl = ort->fl;
2382
2383                 rt->idev = ort->idev;
2384                 if (rt->idev)
2385                         in_dev_hold(rt->idev);
2386                 rt->rt_flags = ort->rt_flags;
2387                 rt->rt_type = ort->rt_type;
2388                 rt->rt_dst = ort->rt_dst;
2389                 rt->rt_src = ort->rt_src;
2390                 rt->rt_iif = ort->rt_iif;
2391                 rt->rt_gateway = ort->rt_gateway;
2392                 rt->rt_spec_dst = ort->rt_spec_dst;
2393                 rt->peer = ort->peer;
2394                 if (rt->peer)
2395                         atomic_inc(&rt->peer->refcnt);
2396
2397                 dst_free(new);
2398         }
2399
2400         dst_release(&(*rp)->u.dst);
2401         *rp = rt;
2402         return (rt ? 0 : -ENOMEM);
2403 }
2404
2405 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2406 {
2407         int err;
2408
2409         if ((err = __ip_route_output_key(rp, flp)) != 0)
2410                 return err;
2411
2412         if (flp->proto) {
2413                 if (!flp->fl4_src)
2414                         flp->fl4_src = (*rp)->rt_src;
2415                 if (!flp->fl4_dst)
2416                         flp->fl4_dst = (*rp)->rt_dst;
2417                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2418                 if (err == -EREMOTE)
2419                         err = ipv4_dst_blackhole(rp, flp, sk);
2420
2421                 return err;
2422         }
2423
2424         return 0;
2425 }
2426
2427 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2428
2429 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2430 {
2431         return ip_route_output_flow(rp, flp, NULL, 0);
2432 }
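
/*
 * Typical in-kernel usage of ip_route_output_key() above (an illustrative
 * sketch only; "daddr", the error value and the surrounding code are the
 * caller's):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);
 */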
2433
2434 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2435                         int nowait, unsigned int flags)
2436 {
2437         struct rtable *rt = (struct rtable*)skb->dst;
2438         struct rtmsg *r;
2439         struct nlmsghdr *nlh;
2440         long expires;
2441         u32 id = 0, ts = 0, tsage = 0, error;
2442
2443         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2444         if (nlh == NULL)
2445                 return -EMSGSIZE;
2446
2447         r = nlmsg_data(nlh);
2448         r->rtm_family    = AF_INET;
2449         r->rtm_dst_len  = 32;
2450         r->rtm_src_len  = 0;
2451         r->rtm_tos      = rt->fl.fl4_tos;
2452         r->rtm_table    = RT_TABLE_MAIN;
2453         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2454         r->rtm_type     = rt->rt_type;
2455         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2456         r->rtm_protocol = RTPROT_UNSPEC;
2457         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2458         if (rt->rt_flags & RTCF_NOTIFY)
2459                 r->rtm_flags |= RTM_F_NOTIFY;
2460
2461         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2462
2463         if (rt->fl.fl4_src) {
2464                 r->rtm_src_len = 32;
2465                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2466         }
2467         if (rt->u.dst.dev)
2468                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2469 #ifdef CONFIG_NET_CLS_ROUTE
2470         if (rt->u.dst.tclassid)
2471                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2472 #endif
2473         if (rt->fl.iif)
2474                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2475         else if (rt->rt_src != rt->fl.fl4_src)
2476                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2477
2478         if (rt->rt_dst != rt->rt_gateway)
2479                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2480
2481         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2482                 goto nla_put_failure;
2483
2484         error = rt->u.dst.error;
2485         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2486         if (rt->peer) {
2487                 id = rt->peer->ip_id_count;
2488                 if (rt->peer->tcp_ts_stamp) {
2489                         ts = rt->peer->tcp_ts;
2490                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2491                 }
2492         }
2493
2494         if (rt->fl.iif) {
2495 #ifdef CONFIG_IP_MROUTE
2496                 __be32 dst = rt->rt_dst;
2497
2498                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2499                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2500                         int err = ipmr_get_route(skb, r, nowait);
2501                         if (err <= 0) {
2502                                 if (!nowait) {
2503                                         if (err == 0)
2504                                                 return 0;
2505                                         goto nla_put_failure;
2506                                 } else {
2507                                         if (err == -EMSGSIZE)
2508                                                 goto nla_put_failure;
2509                                         error = err;
2510                                 }
2511                         }
2512                 } else
2513 #endif
2514                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2515         }
2516
2517         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2518                                expires, error) < 0)
2519                 goto nla_put_failure;
2520
2521         return nlmsg_end(skb, nlh);
2522
2523 nla_put_failure:
2524         nlmsg_cancel(skb, nlh);
2525         return -EMSGSIZE;
2526 }
2527
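/*
 * RTM_GETROUTE handler (e.g. what "ip route get <addr>" from iproute2
 * issues): build a dummy skb, resolve the route via ip_route_input() or
 * ip_route_output_key() and return the result through rt_fill_info().
 */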
2528 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2529 {
2530         struct net *net = in_skb->sk->sk_net;
2531         struct rtmsg *rtm;
2532         struct nlattr *tb[RTA_MAX+1];
2533         struct rtable *rt = NULL;
2534         __be32 dst = 0;
2535         __be32 src = 0;
2536         u32 iif;
2537         int err;
2538         struct sk_buff *skb;
2539
2540         if (net != &init_net)
2541                 return -EINVAL;
2542
2543         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2544         if (err < 0)
2545                 goto errout;
2546
2547         rtm = nlmsg_data(nlh);
2548
2549         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550         if (skb == NULL) {
2551                 err = -ENOBUFS;
2552                 goto errout;
2553         }
2554
2555         /* Reserve room for dummy headers; this skb can pass
2556            through a good chunk of the routing engine.
2557          */
2558         skb_reset_mac_header(skb);
2559         skb_reset_network_header(skb);
2560
2561         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2562         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2563         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2564
2565         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2566         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2567         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2568
2569         if (iif) {
2570                 struct net_device *dev;
2571
2572                 dev = __dev_get_by_index(&init_net, iif);
2573                 if (dev == NULL) {
2574                         err = -ENODEV;
2575                         goto errout_free;
2576                 }
2577
2578                 skb->protocol   = htons(ETH_P_IP);
2579                 skb->dev        = dev;
2580                 local_bh_disable();
2581                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2582                 local_bh_enable();
2583
2584                 rt = (struct rtable*) skb->dst;
2585                 if (err == 0 && rt->u.dst.error)
2586                         err = -rt->u.dst.error;
2587         } else {
2588                 struct flowi fl = {
2589                         .nl_u = {
2590                                 .ip4_u = {
2591                                         .daddr = dst,
2592                                         .saddr = src,
2593                                         .tos = rtm->rtm_tos,
2594                                 },
2595                         },
2596                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2597                 };
2598                 err = ip_route_output_key(&rt, &fl);
2599         }
2600
2601         if (err)
2602                 goto errout_free;
2603
2604         skb->dst = &rt->u.dst;
2605         if (rtm->rtm_flags & RTM_F_NOTIFY)
2606                 rt->rt_flags |= RTCF_NOTIFY;
2607
2608         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2609                                 RTM_NEWROUTE, 0, 0);
2610         if (err <= 0)
2611                 goto errout_free;
2612
2613         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2614 errout:
2615         return err;
2616
2617 errout_free:
2618         kfree_skb(skb);
2619         goto errout;
2620 }
2621
2622 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2623 {
2624         struct rtable *rt;
2625         int h, s_h;
2626         int idx, s_idx;
2627
2628         s_h = cb->args[0];
2629         if (s_h < 0)
2630                 s_h = 0;
2631         s_idx = idx = cb->args[1];
2632         for (h = s_h; h <= rt_hash_mask; h++) {
2633                 rcu_read_lock_bh();
2634                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2635                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2636                         if (idx < s_idx)
2637                                 continue;
2638                         skb->dst = dst_clone(&rt->u.dst);
2639                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2640                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2641                                          1, NLM_F_MULTI) <= 0) {
2642                                 dst_release(xchg(&skb->dst, NULL));
2643                                 rcu_read_unlock_bh();
2644                                 goto done;
2645                         }
2646                         dst_release(xchg(&skb->dst, NULL));
2647                 }
2648                 rcu_read_unlock_bh();
2649                 s_idx = 0;
2650         }
2651
2652 done:
2653         cb->args[0] = h;
2654         cb->args[1] = idx;
2655         return skb->len;
2656 }
2657
2658 void ip_rt_multicast_event(struct in_device *in_dev)
2659 {
2660         rt_cache_flush(0);
2661 }
2662
2663 #ifdef CONFIG_SYSCTL
2664 static int flush_delay;
2665
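/*
 * Write-only sysctl handler for /proc/sys/net/ipv4/route/flush.  The value
 * written is handed to rt_cache_flush() as the flush delay, so for example
 * (illustration only) "echo 0 > /proc/sys/net/ipv4/route/flush" flushes the
 * routing cache immediately.
 */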
2666 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2667                                         struct file *filp, void __user *buffer,
2668                                         size_t *lenp, loff_t *ppos)
2669 {
2670         if (write) {
2671                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2672                 rt_cache_flush(flush_delay);
2673                 return 0;
2674         }
2675
2676         return -EINVAL;
2677 }
2678
2679 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2680                                                 int __user *name,
2681                                                 int nlen,
2682                                                 void __user *oldval,
2683                                                 size_t __user *oldlenp,
2684                                                 void __user *newval,
2685                                                 size_t newlen)
2686 {
2687         int delay;
2688         if (newlen != sizeof(int))
2689                 return -EINVAL;
2690         if (get_user(delay, (int __user *)newval))
2691                 return -EFAULT;
2692         rt_cache_flush(delay);
2693         return 0;
2694 }
2695
2696 ctl_table ipv4_route_table[] = {
2697         {
2698                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2699                 .procname       = "flush",
2700                 .data           = &flush_delay,
2701                 .maxlen         = sizeof(int),
2702                 .mode           = 0200,
2703                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2704                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2705         },
2706         {
2707                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2708                 .procname       = "min_delay",
2709                 .data           = &ip_rt_min_delay,
2710                 .maxlen         = sizeof(int),
2711                 .mode           = 0644,
2712                 .proc_handler   = &proc_dointvec_jiffies,
2713                 .strategy       = &sysctl_jiffies,
2714         },
2715         {
2716                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2717                 .procname       = "max_delay",
2718                 .data           = &ip_rt_max_delay,
2719                 .maxlen         = sizeof(int),
2720                 .mode           = 0644,
2721                 .proc_handler   = &proc_dointvec_jiffies,
2722                 .strategy       = &sysctl_jiffies,
2723         },
2724         {
2725                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2726                 .procname       = "gc_thresh",
2727                 .data           = &ipv4_dst_ops.gc_thresh,
2728                 .maxlen         = sizeof(int),
2729                 .mode           = 0644,
2730                 .proc_handler   = &proc_dointvec,
2731         },
2732         {
2733                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2734                 .procname       = "max_size",
2735                 .data           = &ip_rt_max_size,
2736                 .maxlen         = sizeof(int),
2737                 .mode           = 0644,
2738                 .proc_handler   = &proc_dointvec,
2739         },
2740         {
2741                 /*  Deprecated. Use gc_min_interval_ms */
2742
2743                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2744                 .procname       = "gc_min_interval",
2745                 .data           = &ip_rt_gc_min_interval,
2746                 .maxlen         = sizeof(int),
2747                 .mode           = 0644,
2748                 .proc_handler   = &proc_dointvec_jiffies,
2749                 .strategy       = &sysctl_jiffies,
2750         },
2751         {
2752                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2753                 .procname       = "gc_min_interval_ms",
2754                 .data           = &ip_rt_gc_min_interval,
2755                 .maxlen         = sizeof(int),
2756                 .mode           = 0644,
2757                 .proc_handler   = &proc_dointvec_ms_jiffies,
2758                 .strategy       = &sysctl_ms_jiffies,
2759         },
2760         {
2761                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2762                 .procname       = "gc_timeout",
2763                 .data           = &ip_rt_gc_timeout,
2764                 .maxlen         = sizeof(int),
2765                 .mode           = 0644,
2766                 .proc_handler   = &proc_dointvec_jiffies,
2767                 .strategy       = &sysctl_jiffies,
2768         },
2769         {
2770                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
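
/*
 * The table above is plugged into the ipv4 sysctl tree elsewhere; with the
 * usual registration these knobs appear as files under
 * /proc/sys/net/ipv4/route/, e.g.:
 *
 *      cat /proc/sys/net/ipv4/route/min_pmtu
 *      echo 600 > /proc/sys/net/ipv4/route/mtu_expires
 *
 * Entries handled by proc_dointvec_jiffies/sysctl_jiffies are read and
 * written in seconds and converted to/from jiffies internally.
 */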
#endif

#ifdef CONFIG_NET_CLS_ROUTE
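/*
 * Per-cpu route accounting: each possible CPU holds an array of 256
 * struct ip_rt_acct counters, indexed by route classid (realm), allocated
 * in ip_rt_init() below.
 */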
struct ip_rt_acct *ip_rt_acct __read_mostly;

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(cpu) (per_cpu_ptr(ip_rt_acct, cpu))

#ifdef CONFIG_PROC_FS
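/*
 * /proc/net/rt_acct read handler: sum the per-cpu counters for the
 * requested window of the 256-entry table across all possible CPUs.
 * Offset and length must be 4-byte aligned since the data is summed
 * as an array of u32s.
 */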
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *dst = (u32 *) buffer;

                *start = buffer;
                memset(dst, 0, length);

                for_each_possible_cpu(i) {
                        unsigned int j;
                        u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

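/*
 * "rhash_entries=N" on the kernel command line overrides the memory-based
 * sizing of the route cache hash table performed by
 * alloc_large_system_hash() in ip_rt_init() below.
 */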
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
        int rc = 0;

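        /*
         * Seed the route cache hash with a value that differs from boot to
         * boot; it is re-randomized whenever the cache is flushed (see
         * rt_run_flush()).
         */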
        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

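        /*
         * Size the route cache hash table from available memory unless
         * "rhash_entries=" was given on the command line; rt_hash_log and
         * rt_hash_mask are filled in by alloc_large_system_hash().
         */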
        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

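        /*
         * rt_flush_timer drives deferred cache flushes; rt_secret_timer
         * periodically forces a flush (which also re-randomizes the hash
         * seed) so that cached routes do not live forever.
         */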
        setup_timer(&rt_flush_timer, rt_run_flush, 0);
        setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);

        /* All the timers, started at system startup, tend
           to synchronize.  Perturb them a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

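        /*
         * proc interface: /proc/net/rt_cache (cache dump),
         * /proc/net/stat/rt_cache (per-cpu statistics) and, with
         * CONFIG_NET_CLS_ROUTE, /proc/net/rt_acct; all created in the
         * initial network namespace only.
         */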
#ifdef CONFIG_PROC_FS
        {
        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
        if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
                                             init_net.proc_net_stat))) {
                return -ENOMEM;
        }
        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
        }
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
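        /*
         * Register the RTM_GETROUTE doit handler with the rtnetlink core.
         * The dumpit is NULL here because route dumps are registered
         * elsewhere (by the FIB code).
         */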
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

        return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);