net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD,
37  *                                      though our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109
110 #define RT_FL_TOS(oldflp) \
111     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112
113 #define IP_MAX_MTU      0xFFF0
114
115 #define RT_GC_TIMEOUT (300*HZ)
116
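/*
 * Route cache tuning knobs.  Defaults are set below; when CONFIG_SYSCTL is
 * enabled most of these are exported under /proc/sys/net/ipv4/route/ and
 * can be changed at run time.
 */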
117 static int ip_rt_min_delay              = 2 * HZ;
118 static int ip_rt_max_delay              = 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval            = 60 * HZ;
122 static int ip_rt_gc_min_interval        = HZ / 2;
123 static int ip_rt_redirect_number        = 9;
124 static int ip_rt_redirect_load          = HZ / 50;
125 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost             = HZ;
127 static int ip_rt_error_burst            = 5 * HZ;
128 static int ip_rt_gc_elasticity          = 8;
129 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu               = 512 + 20 + 20;
131 static int ip_rt_min_advmss             = 256;
132 static int ip_rt_secret_interval        = 10 * 60 * HZ;
133 static unsigned long rt_deadline;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .entry_size =           sizeof(struct rtable),
166 };
167
168 #define ECN_OR_COST(class)      TC_PRIO_##class
169
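/*
 * Map the IPv4 TOS bits to packet scheduler priority bands; the table is
 * indexed via rt_tos2priority(), i.e. the four TOS bits (IPTOS_TOS(tos) >> 1).
 */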
170 const __u8 ip_tos2prio[16] = {
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(FILLER),
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(BESTEFFORT),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK)
187 };
188
189
190 /*
191  * Route cache.
192  */
193
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries;
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203
204 struct rt_hash_bucket {
205         struct rtable   *chain;
206 };
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208         defined(CONFIG_PROVE_LOCKING)
209 /*
210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211  * The size of this table is a power of two and depends on the number of CPUs.
212  * (with lockdep enabled, spinlock_t is quite big, so keep the table small there)
213  */
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ        256
216 #else
217 # if NR_CPUS >= 32
218 #  define RT_HASH_LOCK_SZ       4096
219 # elif NR_CPUS >= 16
220 #  define RT_HASH_LOCK_SZ       2048
221 # elif NR_CPUS >= 8
222 #  define RT_HASH_LOCK_SZ       1024
223 # elif NR_CPUS >= 4
224 #  define RT_HASH_LOCK_SZ       512
225 # else
226 #  define RT_HASH_LOCK_SZ       256
227 # endif
228 #endif
229
230 static spinlock_t       *rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 # define rt_hash_lock_init()    { \
233                 int i; \
234                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
235                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
236                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
237                         spin_lock_init(&rt_hash_locks[i]); \
238                 }
239 #else
240 # define rt_hash_lock_addr(slot) NULL
241 # define rt_hash_lock_init()
242 #endif
243
244 static struct rt_hash_bucket    *rt_hash_table;
245 static unsigned                 rt_hash_mask;
246 static int                      rt_hash_log;
247 static unsigned int             rt_hash_rnd;
248
249 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
250 #define RT_CACHE_STAT_INC(field) \
251         (__raw_get_cpu_var(rt_cache_stat).field++)
252
253 static int rt_intern_hash(unsigned hash, struct rtable *rth,
254                                 struct rtable **res);
255
256 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
257 {
258         return (jhash_2words(daddr, saddr, rt_hash_rnd)
259                 & rt_hash_mask);
260 }
261
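/*
 * The bucket index mixes destination, source (xor'ed with the interface
 * index) and the random secret rt_hash_rnd, which is regenerated on every
 * cache flush, so remote hosts cannot easily predict chain placement.
 */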
262 #define rt_hash(daddr, saddr, idx) \
263         rt_hash_code((__force u32)(__be32)(daddr),\
264                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
265
266 #ifdef CONFIG_PROC_FS
267 struct rt_cache_iter_state {
268         int bucket;
269 };
270
271 static struct rtable *rt_cache_get_first(struct seq_file *seq)
272 {
273         struct rtable *r = NULL;
274         struct rt_cache_iter_state *st = seq->private;
275
276         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
277                 rcu_read_lock_bh();
278                 r = rt_hash_table[st->bucket].chain;
279                 if (r)
280                         break;
281                 rcu_read_unlock_bh();
282         }
283         return r;
284 }
285
286 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
287 {
288         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
289
290         r = r->u.dst.rt_next;
291         while (!r) {
292                 rcu_read_unlock_bh();
293                 if (--st->bucket < 0)
294                         break;
295                 rcu_read_lock_bh();
296                 r = rt_hash_table[st->bucket].chain;
297         }
298         return r;
299 }
300
301 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
302 {
303         struct rtable *r = rt_cache_get_first(seq);
304
305         if (r)
306                 while (pos && (r = rt_cache_get_next(seq, r)))
307                         --pos;
308         return pos ? NULL : r;
309 }
310
311 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
312 {
313         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
314 }
315
316 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
317 {
318         struct rtable *r = NULL;
319
320         if (v == SEQ_START_TOKEN)
321                 r = rt_cache_get_first(seq);
322         else
323                 r = rt_cache_get_next(seq, v);
324         ++*pos;
325         return r;
326 }
327
328 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
329 {
330         if (v && v != SEQ_START_TOKEN)
331                 rcu_read_unlock_bh();
332 }
333
334 static int rt_cache_seq_show(struct seq_file *seq, void *v)
335 {
336         if (v == SEQ_START_TOKEN)
337                 seq_printf(seq, "%-127s\n",
338                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
339                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
340                            "HHUptod\tSpecDst");
341         else {
342                 struct rtable *r = v;
343                 char temp[256];
344
345                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
346                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
347                         r->u.dst.dev ? r->u.dst.dev->name : "*",
348                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
349                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
350                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
351                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
352                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
353                         dst_metric(&r->u.dst, RTAX_WINDOW),
354                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
355                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
356                         r->fl.fl4_tos,
357                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
358                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
359                                        dev_queue_xmit) : 0,
360                         r->rt_spec_dst);
361                 seq_printf(seq, "%-127s\n", temp);
362         }
363         return 0;
364 }
365
366 static const struct seq_operations rt_cache_seq_ops = {
367         .start  = rt_cache_seq_start,
368         .next   = rt_cache_seq_next,
369         .stop   = rt_cache_seq_stop,
370         .show   = rt_cache_seq_show,
371 };
372
373 static int rt_cache_seq_open(struct inode *inode, struct file *file)
374 {
375         struct seq_file *seq;
376         int rc = -ENOMEM;
377         struct rt_cache_iter_state *s;
378
379         s = kzalloc(sizeof(*s), GFP_KERNEL);
380         if (!s)
381                 goto out;
382         rc = seq_open(file, &rt_cache_seq_ops);
383         if (rc)
384                 goto out_kfree;
385         seq          = file->private_data;
386         seq->private = s;
387 out:
388         return rc;
389 out_kfree:
390         kfree(s);
391         goto out;
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
433 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
434 {
435
436 }
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #endif /* CONFIG_PROC_FS */
494
495 static __inline__ void rt_free(struct rtable *rt)
496 {
497         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
498 }
499
500 static __inline__ void rt_drop(struct rtable *rt)
501 {
502         ip_rt_put(rt);
503         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
504 }
505
506 static __inline__ int rt_fast_clean(struct rtable *rth)
507 {
508         /* Kill broadcast/multicast entries very aggressively if they
509            collide in the hash table with more useful entries */
510         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
511                 rth->fl.iif && rth->u.dst.rt_next;
512 }
513
514 static __inline__ int rt_valuable(struct rtable *rth)
515 {
516         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
517                 rth->u.dst.expires;
518 }
519
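/*
 * Decide whether an entry may be evicted: entries still referenced never
 * expire; entries past their hard expiry always may.  Otherwise an entry
 * survives while it is younger than tmo1 (unless it is a broadcast/multicast
 * collision candidate, see rt_fast_clean()), or younger than tmo2 if it is
 * "valuable" (redirected, RTCF_NOTIFY, or carrying an expiry).
 */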
520 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
521 {
522         unsigned long age;
523         int ret = 0;
524
525         if (atomic_read(&rth->u.dst.__refcnt))
526                 goto out;
527
528         ret = 1;
529         if (rth->u.dst.expires &&
530             time_after_eq(jiffies, rth->u.dst.expires))
531                 goto out;
532
533         age = jiffies - rth->u.dst.lastuse;
534         ret = 0;
535         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
536             (age <= tmo2 && rt_valuable(rth)))
537                 goto out;
538         ret = 1;
539 out:    return ret;
540 }
541
542 /* Bits of score are:
543  * 31: very valuable
544  * 30: not quite useless
545  * 29..0: usage counter
546  */
547 static inline u32 rt_score(struct rtable *rt)
548 {
549         u32 score = jiffies - rt->u.dst.lastuse;
550
551         score = ~score & ~(3<<30);
552
553         if (rt_valuable(rt))
554                 score |= (1<<31);
555
556         if (!rt->fl.iif ||
557             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
558                 score |= (1<<30);
559
560         return score;
561 }
562
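/*
 * Branch-free flow key comparison: XOR corresponding fields of the two keys
 * and OR the results; the compared fields are identical iff the accumulated
 * value is zero (the 16-bit load at ->tos also covers the adjacent scope byte).
 */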
563 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
564 {
565         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
566                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
567                 (fl1->mark ^ fl2->mark) |
568                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
569                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
570                 (fl1->oif ^ fl2->oif) |
571                 (fl1->iif ^ fl2->iif)) == 0;
572 }
573
574 /* This runs via a timer and thus is always in BH context. */
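/* Each run walks "goal" buckets starting at a persistent rover; goal is
 * sized so that the whole hash table is covered roughly once every
 * ip_rt_gc_timeout jiffies.
 */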
575 static void rt_check_expire(unsigned long dummy)
576 {
577         static unsigned int rover;
578         unsigned int i = rover, goal;
579         struct rtable *rth, **rthp;
580         unsigned long now = jiffies;
581         u64 mult;
582
583         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
584         if (ip_rt_gc_timeout > 1)
585                 do_div(mult, ip_rt_gc_timeout);
586         goal = (unsigned int)mult;
587         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
588         for (; goal > 0; goal--) {
589                 unsigned long tmo = ip_rt_gc_timeout;
590
591                 i = (i + 1) & rt_hash_mask;
592                 rthp = &rt_hash_table[i].chain;
593
594                 if (*rthp == NULL)
595                         continue;
596                 spin_lock(rt_hash_lock_addr(i));
597                 while ((rth = *rthp) != NULL) {
598                         if (rth->u.dst.expires) {
599                                 /* Entry is expired even if it is in use */
600                                 if (time_before_eq(now, rth->u.dst.expires)) {
601                                         tmo >>= 1;
602                                         rthp = &rth->u.dst.rt_next;
603                                         continue;
604                                 }
605                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
606                                 tmo >>= 1;
607                                 rthp = &rth->u.dst.rt_next;
608                                 continue;
609                         }
610
611                         /* Clean up aged-off entries. */
612                         *rthp = rth->u.dst.rt_next;
613                         rt_free(rth);
614                 }
615                 spin_unlock(rt_hash_lock_addr(i));
616
617                 /* Fallback loop breaker. */
618                 if (time_after(jiffies, now))
619                         break;
620         }
621         rover = i;
622         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
623 }
624
625 /* This can run from both BH and non-BH contexts, the latter
626  * in the case of a forced flush event.
627  */
628 static void rt_run_flush(unsigned long dummy)
629 {
630         int i;
631         struct rtable *rth, *next;
632
633         rt_deadline = 0;
634
635         get_random_bytes(&rt_hash_rnd, 4);
636
637         for (i = rt_hash_mask; i >= 0; i--) {
638                 spin_lock_bh(rt_hash_lock_addr(i));
639                 rth = rt_hash_table[i].chain;
640                 if (rth)
641                         rt_hash_table[i].chain = NULL;
642                 spin_unlock_bh(rt_hash_lock_addr(i));
643
644                 for (; rth; rth = next) {
645                         next = rth->u.dst.rt_next;
646                         rt_free(rth);
647                 }
648         }
649 }
650
651 static DEFINE_SPINLOCK(rt_flush_lock);
652
653 void rt_cache_flush(int delay)
654 {
655         unsigned long now = jiffies;
656         int user_mode = !in_softirq();
657
658         if (delay < 0)
659                 delay = ip_rt_min_delay;
660
661         spin_lock_bh(&rt_flush_lock);
662
663         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
664                 long tmo = (long)(rt_deadline - now);
665
666                 /* If flush timer is already running
667                    and flush request is not immediate (delay > 0):
668
669                    if the deadline has not been reached, extend the timer to "delay",
670                    otherwise fire it at the deadline.
671                  */
672
673                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
674                         tmo = 0;
675
676                 if (delay > tmo)
677                         delay = tmo;
678         }
679
680         if (delay <= 0) {
681                 spin_unlock_bh(&rt_flush_lock);
682                 rt_run_flush(0);
683                 return;
684         }
685
686         if (rt_deadline == 0)
687                 rt_deadline = now + ip_rt_max_delay;
688
689         mod_timer(&rt_flush_timer, now+delay);
690         spin_unlock_bh(&rt_flush_lock);
691 }
692
693 static void rt_secret_rebuild(unsigned long dummy)
694 {
695         unsigned long now = jiffies;
696
697         rt_cache_flush(0);
698         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
699 }
700
701 /*
702    Short description of GC goals.
703
704    We want an algorithm that keeps the routing cache at an
705    equilibrium point, where the number of aged-off entries
706    is kept approximately equal to the number of newly generated ones.
707
708    The current expiration strength is the variable "expire".
709    We try to adjust it dynamically: when the network is idle,
710    "expire" is large enough to keep plenty of warm entries, and
711    when load increases it shrinks to limit the cache size.
712  */
713
714 static int rt_garbage_collect(void)
715 {
716         static unsigned long expire = RT_GC_TIMEOUT;
717         static unsigned long last_gc;
718         static int rover;
719         static int equilibrium;
720         struct rtable *rth, **rthp;
721         unsigned long now = jiffies;
722         int goal;
723
724         /*
725          * Garbage collection is pretty expensive,
726          * do not make it too frequently.
727          */
728
729         RT_CACHE_STAT_INC(gc_total);
730
731         if (now - last_gc < ip_rt_gc_min_interval &&
732             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
733                 RT_CACHE_STAT_INC(gc_ignored);
734                 goto out;
735         }
736
737         /* Calculate the number of entries we want to expire now. */
738         goal = atomic_read(&ipv4_dst_ops.entries) -
739                 (ip_rt_gc_elasticity << rt_hash_log);
740         if (goal <= 0) {
741                 if (equilibrium < ipv4_dst_ops.gc_thresh)
742                         equilibrium = ipv4_dst_ops.gc_thresh;
743                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
744                 if (goal > 0) {
745                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
746                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
747                 }
748         } else {
749                 /* We are in a dangerous area. Try to reduce the cache really
750                  * aggressively.
751                  */
752                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
753                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
754         }
755
756         if (now - last_gc >= ip_rt_gc_min_interval)
757                 last_gc = now;
758
759         if (goal <= 0) {
760                 equilibrium += goal;
761                 goto work_done;
762         }
763
764         do {
765                 int i, k;
766
767                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
768                         unsigned long tmo = expire;
769
770                         k = (k + 1) & rt_hash_mask;
771                         rthp = &rt_hash_table[k].chain;
772                         spin_lock_bh(rt_hash_lock_addr(k));
773                         while ((rth = *rthp) != NULL) {
774                                 if (!rt_may_expire(rth, tmo, expire)) {
775                                         tmo >>= 1;
776                                         rthp = &rth->u.dst.rt_next;
777                                         continue;
778                                 }
779                                 *rthp = rth->u.dst.rt_next;
780                                 rt_free(rth);
781                                 goal--;
782                         }
783                         spin_unlock_bh(rt_hash_lock_addr(k));
784                         if (goal <= 0)
785                                 break;
786                 }
787                 rover = k;
788
789                 if (goal <= 0)
790                         goto work_done;
791
792                 /* Goal was not achieved. We stop the process if:
793
794                    - "expire" has been reduced to zero (otherwise it is halved),
795                    - the table is not full,
796                    - we are called from interrupt context,
797                    - the jiffies check is just a fallback/debug loop breaker;
798                      we will not spin here for a long time in any case.
799                  */
800
801                 RT_CACHE_STAT_INC(gc_goal_miss);
802
803                 if (expire == 0)
804                         break;
805
806                 expire >>= 1;
807 #if RT_CACHE_DEBUG >= 2
808                 printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
809                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
810 #endif
811
812                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
813                         goto out;
814         } while (!in_softirq() && time_before_eq(jiffies, now));
815
816         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
817                 goto out;
818         if (net_ratelimit())
819                 printk(KERN_WARNING "dst cache overflow\n");
820         RT_CACHE_STAT_INC(gc_dst_overflow);
821         return 1;
822
823 work_done:
824         expire += ip_rt_gc_min_interval;
825         if (expire > ip_rt_gc_timeout ||
826             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
827                 expire = ip_rt_gc_timeout;
828 #if RT_CACHE_DEBUG >= 2
829         printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
830                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
831 #endif
832 out:    return 0;
833 }
834
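/*
 * Insert "rt" into bucket "hash".  If an equivalent route is already cached
 * it is moved to the head of the chain and returned via *rp instead.
 * Otherwise, once the chain exceeds ip_rt_gc_elasticity, the lowest-scoring
 * unreferenced entry is evicted; the new route is then bound to its ARP
 * neighbour (shrinking the cache and retrying on -ENOBUFS) and linked at
 * the head of the chain.
 */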
835 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
836 {
837         struct rtable   *rth, **rthp;
838         unsigned long   now;
839         struct rtable *cand, **candp;
840         u32             min_score;
841         int             chain_length;
842         int attempts = !in_softirq();
843
844 restart:
845         chain_length = 0;
846         min_score = ~(u32)0;
847         cand = NULL;
848         candp = NULL;
849         now = jiffies;
850
851         rthp = &rt_hash_table[hash].chain;
852
853         spin_lock_bh(rt_hash_lock_addr(hash));
854         while ((rth = *rthp) != NULL) {
855                 if (compare_keys(&rth->fl, &rt->fl)) {
856                         /* Put it first */
857                         *rthp = rth->u.dst.rt_next;
858                         /*
859                          * Since lookup is lockfree, the deletion
860                          * must be visible to another weakly ordered CPU before
861                          * the insertion at the start of the hash chain.
862                          */
863                         rcu_assign_pointer(rth->u.dst.rt_next,
864                                            rt_hash_table[hash].chain);
865                         /*
866                          * Since lookup is lockfree, the update writes
867                          * must be ordered for consistency on SMP.
868                          */
869                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
870
871                         rth->u.dst.__use++;
872                         dst_hold(&rth->u.dst);
873                         rth->u.dst.lastuse = now;
874                         spin_unlock_bh(rt_hash_lock_addr(hash));
875
876                         rt_drop(rt);
877                         *rp = rth;
878                         return 0;
879                 }
880
881                 if (!atomic_read(&rth->u.dst.__refcnt)) {
882                         u32 score = rt_score(rth);
883
884                         if (score <= min_score) {
885                                 cand = rth;
886                                 candp = rthp;
887                                 min_score = score;
888                         }
889                 }
890
891                 chain_length++;
892
893                 rthp = &rth->u.dst.rt_next;
894         }
895
896         if (cand) {
897                 /* ip_rt_gc_elasticity used to be the average chain length;
898                  * when it is exceeded, gc becomes really aggressive.
899                  *
900                  * The second limit is less certain. At the moment it allows
901                  * only 2 entries per bucket. We will see.
902                  */
903                 if (chain_length > ip_rt_gc_elasticity) {
904                         *candp = cand->u.dst.rt_next;
905                         rt_free(cand);
906                 }
907         }
908
909         /* Try to bind the route to an arp neighbour only if it is an
910            output route or a unicast forwarding path.
911          */
912         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
913                 int err = arp_bind_neighbour(&rt->u.dst);
914                 if (err) {
915                         spin_unlock_bh(rt_hash_lock_addr(hash));
916
917                         if (err != -ENOBUFS) {
918                                 rt_drop(rt);
919                                 return err;
920                         }
921
922                         /* The neighbour tables are full and nothing
923                            can be released. Try to shrink the route cache,
924                            as it most likely holds some neighbour records.
925                          */
926                         if (attempts-- > 0) {
927                                 int saved_elasticity = ip_rt_gc_elasticity;
928                                 int saved_int = ip_rt_gc_min_interval;
929                                 ip_rt_gc_elasticity     = 1;
930                                 ip_rt_gc_min_interval   = 0;
931                                 rt_garbage_collect();
932                                 ip_rt_gc_min_interval   = saved_int;
933                                 ip_rt_gc_elasticity     = saved_elasticity;
934                                 goto restart;
935                         }
936
937                         if (net_ratelimit())
938                                 printk(KERN_WARNING "Neighbour table overflow.\n");
939                         rt_drop(rt);
940                         return -ENOBUFS;
941                 }
942         }
943
944         rt->u.dst.rt_next = rt_hash_table[hash].chain;
945 #if RT_CACHE_DEBUG >= 2
946         if (rt->u.dst.rt_next) {
947                 struct rtable *trt;
948                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
949                        NIPQUAD(rt->rt_dst));
950                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
951                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
952                 printk("\n");
953         }
954 #endif
955         rt_hash_table[hash].chain = rt;
956         spin_unlock_bh(rt_hash_lock_addr(hash));
957         *rp = rt;
958         return 0;
959 }
960
961 void rt_bind_peer(struct rtable *rt, int create)
962 {
963         static DEFINE_SPINLOCK(rt_peer_lock);
964         struct inet_peer *peer;
965
966         peer = inet_getpeer(rt->rt_dst, create);
967
968         spin_lock_bh(&rt_peer_lock);
969         if (rt->peer == NULL) {
970                 rt->peer = peer;
971                 peer = NULL;
972         }
973         spin_unlock_bh(&rt_peer_lock);
974         if (peer)
975                 inet_putpeer(peer);
976 }
977
978 /*
979  * Peer allocation may fail only in serious out-of-memory conditions.  However
980  * we can still generate some output.
981  * Random ID selection looks a bit dangerous because we have no chance of
982  * selecting an ID that is unique over a reasonable period of time.
983  * But a broken packet identifier may be better than no packet at all.
984  */
985 static void ip_select_fb_ident(struct iphdr *iph)
986 {
987         static DEFINE_SPINLOCK(ip_fb_id_lock);
988         static u32 ip_fallback_id;
989         u32 salt;
990
991         spin_lock_bh(&ip_fb_id_lock);
992         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
993         iph->id = htons(salt & 0xFFFF);
994         ip_fallback_id = salt;
995         spin_unlock_bh(&ip_fb_id_lock);
996 }
997
998 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
999 {
1000         struct rtable *rt = (struct rtable *) dst;
1001
1002         if (rt) {
1003                 if (rt->peer == NULL)
1004                         rt_bind_peer(rt, 1);
1005
1006                 /* If a peer is attached to the destination, it is never detached,
1007                    so we do not need to grab a lock to dereference it.
1008                  */
1009                 if (rt->peer) {
1010                         iph->id = htons(inet_getid(rt->peer, more));
1011                         return;
1012                 }
1013         } else
1014                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1015                        __builtin_return_address(0));
1016
1017         ip_select_fb_ident(iph);
1018 }
1019
1020 static void rt_del(unsigned hash, struct rtable *rt)
1021 {
1022         struct rtable **rthp;
1023
1024         spin_lock_bh(rt_hash_lock_addr(hash));
1025         ip_rt_put(rt);
1026         for (rthp = &rt_hash_table[hash].chain; *rthp;
1027              rthp = &(*rthp)->u.dst.rt_next)
1028                 if (*rthp == rt) {
1029                         *rthp = rt->u.dst.rt_next;
1030                         rt_free(rt);
1031                         break;
1032                 }
1033         spin_unlock_bh(rt_hash_lock_addr(hash));
1034 }
1035
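/*
 * Handle an ICMP redirect from old_gw.  After sanity checking the advertised
 * gateway (redirects enabled on the device, new_gw on-link and unicast), each
 * matching cache entry is cloned with rt_gateway set to new_gw, bound to a
 * neighbour entry, and swapped in place of the old route.
 */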
1036 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1037                     __be32 saddr, struct net_device *dev)
1038 {
1039         int i, k;
1040         struct in_device *in_dev = in_dev_get(dev);
1041         struct rtable *rth, **rthp;
1042         __be32  skeys[2] = { saddr, 0 };
1043         int  ikeys[2] = { dev->ifindex, 0 };
1044         struct netevent_redirect netevent;
1045
1046         if (!in_dev)
1047                 return;
1048
1049         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1050             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1051                 goto reject_redirect;
1052
1053         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1054                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1055                         goto reject_redirect;
1056                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1057                         goto reject_redirect;
1058         } else {
1059                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1060                         goto reject_redirect;
1061         }
1062
1063         for (i = 0; i < 2; i++) {
1064                 for (k = 0; k < 2; k++) {
1065                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1066
1067                         rthp=&rt_hash_table[hash].chain;
1068
1069                         rcu_read_lock();
1070                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1071                                 struct rtable *rt;
1072
1073                                 if (rth->fl.fl4_dst != daddr ||
1074                                     rth->fl.fl4_src != skeys[i] ||
1075                                     rth->fl.oif != ikeys[k] ||
1076                                     rth->fl.iif != 0) {
1077                                         rthp = &rth->u.dst.rt_next;
1078                                         continue;
1079                                 }
1080
1081                                 if (rth->rt_dst != daddr ||
1082                                     rth->rt_src != saddr ||
1083                                     rth->u.dst.error ||
1084                                     rth->rt_gateway != old_gw ||
1085                                     rth->u.dst.dev != dev)
1086                                         break;
1087
1088                                 dst_hold(&rth->u.dst);
1089                                 rcu_read_unlock();
1090
1091                                 rt = dst_alloc(&ipv4_dst_ops);
1092                                 if (rt == NULL) {
1093                                         ip_rt_put(rth);
1094                                         in_dev_put(in_dev);
1095                                         return;
1096                                 }
1097
1098                                 /* Copy all the information. */
1099                                 *rt = *rth;
1100                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1101                                 rt->u.dst.__use         = 1;
1102                                 atomic_set(&rt->u.dst.__refcnt, 1);
1103                                 rt->u.dst.child         = NULL;
1104                                 if (rt->u.dst.dev)
1105                                         dev_hold(rt->u.dst.dev);
1106                                 if (rt->idev)
1107                                         in_dev_hold(rt->idev);
1108                                 rt->u.dst.obsolete      = 0;
1109                                 rt->u.dst.lastuse       = jiffies;
1110                                 rt->u.dst.path          = &rt->u.dst;
1111                                 rt->u.dst.neighbour     = NULL;
1112                                 rt->u.dst.hh            = NULL;
1113                                 rt->u.dst.xfrm          = NULL;
1114
1115                                 rt->rt_flags            |= RTCF_REDIRECTED;
1116
1117                                 /* Gateway is different ... */
1118                                 rt->rt_gateway          = new_gw;
1119
1120                                 /* Redirect received -> path was valid */
1121                                 dst_confirm(&rth->u.dst);
1122
1123                                 if (rt->peer)
1124                                         atomic_inc(&rt->peer->refcnt);
1125
1126                                 if (arp_bind_neighbour(&rt->u.dst) ||
1127                                     !(rt->u.dst.neighbour->nud_state &
1128                                             NUD_VALID)) {
1129                                         if (rt->u.dst.neighbour)
1130                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1131                                         ip_rt_put(rth);
1132                                         rt_drop(rt);
1133                                         goto do_next;
1134                                 }
1135
1136                                 netevent.old = &rth->u.dst;
1137                                 netevent.new = &rt->u.dst;
1138                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1139                                                         &netevent);
1140
1141                                 rt_del(hash, rth);
1142                                 if (!rt_intern_hash(hash, rt, &rt))
1143                                         ip_rt_put(rt);
1144                                 goto do_next;
1145                         }
1146                         rcu_read_unlock();
1147                 do_next:
1148                         ;
1149                 }
1150         }
1151         in_dev_put(in_dev);
1152         return;
1153
1154 reject_redirect:
1155 #ifdef CONFIG_IP_ROUTE_VERBOSE
1156         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1157                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1158                         "%u.%u.%u.%u ignored.\n"
1159                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1160                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1161                        NIPQUAD(saddr), NIPQUAD(daddr));
1162 #endif
1163         in_dev_put(in_dev);
1164 }
1165
1166 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1167 {
1168         struct rtable *rt = (struct rtable*)dst;
1169         struct dst_entry *ret = dst;
1170
1171         if (rt) {
1172                 if (dst->obsolete) {
1173                         ip_rt_put(rt);
1174                         ret = NULL;
1175                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1176                            rt->u.dst.expires) {
1177                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1178                                                 rt->fl.oif);
1179 #if RT_CACHE_DEBUG >= 1
1180                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1181                                           "%u.%u.%u.%u/%02x dropped\n",
1182                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1183 #endif
1184                         rt_del(hash, rt);
1185                         ret = NULL;
1186                 }
1187         }
1188         return ret;
1189 }
1190
1191 /*
1192  * Algorithm:
1193  *      1. The first ip_rt_redirect_number redirects are sent
1194  *         with exponential backoff, then we stop sending them at all,
1195  *         assuming that the host ignores our redirects.
1196  *      2. If we have not seen packets requiring redirects
1197  *         during ip_rt_redirect_silence, we assume that the host
1198  *         has forgotten the redirected route and start sending redirects again.
1199  *
1200  * This algorithm is much cheaper and more intelligent than dumb load limiting
1201  * in icmp.c.
1202  *
1203  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1204  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1205  */
1206
1207 void ip_rt_send_redirect(struct sk_buff *skb)
1208 {
1209         struct rtable *rt = (struct rtable*)skb->dst;
1210         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1211
1212         if (!in_dev)
1213                 return;
1214
1215         if (!IN_DEV_TX_REDIRECTS(in_dev))
1216                 goto out;
1217
1218         /* No redirected packets during ip_rt_redirect_silence;
1219          * reset the algorithm.
1220          */
1221         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1222                 rt->u.dst.rate_tokens = 0;
1223
1224         /* Too many ignored redirects; do not send anything,
1225          * just set u.dst.rate_last to the last seen redirected packet.
1226          */
1227         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1228                 rt->u.dst.rate_last = jiffies;
1229                 goto out;
1230         }
1231
1232         /* Check for load limit; set rate_last to the latest sent
1233          * redirect.
1234          */
1235         if (rt->u.dst.rate_tokens == 0 ||
1236             time_after(jiffies,
1237                        (rt->u.dst.rate_last +
1238                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1239                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1240                 rt->u.dst.rate_last = jiffies;
1241                 ++rt->u.dst.rate_tokens;
1242 #ifdef CONFIG_IP_ROUTE_VERBOSE
1243                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1244                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1245                     net_ratelimit())
1246                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1247                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1248                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1249                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1250 #endif
1251         }
1252 out:
1253         in_dev_put(in_dev);
1254 }
1255
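/*
 * Input handler for routes that carry an error: translate the dst error to
 * an ICMP destination-unreachable code and send it, rate limited by a token
 * bucket (tokens accrue with elapsed jiffies up to ip_rt_error_burst, each
 * ICMP sent costs ip_rt_error_cost).
 */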
1256 static int ip_error(struct sk_buff *skb)
1257 {
1258         struct rtable *rt = (struct rtable*)skb->dst;
1259         unsigned long now;
1260         int code;
1261
1262         switch (rt->u.dst.error) {
1263                 case EINVAL:
1264                 default:
1265                         goto out;
1266                 case EHOSTUNREACH:
1267                         code = ICMP_HOST_UNREACH;
1268                         break;
1269                 case ENETUNREACH:
1270                         code = ICMP_NET_UNREACH;
1271                         break;
1272                 case EACCES:
1273                         code = ICMP_PKT_FILTERED;
1274                         break;
1275         }
1276
1277         now = jiffies;
1278         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1279         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1280                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1281         rt->u.dst.rate_last = now;
1282         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1283                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1284                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1285         }
1286
1287 out:    kfree_skb(skb);
1288         return 0;
1289 }
1290
1291 /*
1292  *      The last two values are not from the RFC but
1293  *      are needed for AMPRnet AX.25 paths.
1294  */
1295
1296 static const unsigned short mtu_plateau[] =
1297 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1298
1299 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1300 {
1301         int i;
1302
1303         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1304                 if (old_mtu > mtu_plateau[i])
1305                         return mtu_plateau[i];
1306         return 68;
1307 }
1308
1309 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1310 {
1311         int i;
1312         unsigned short old_mtu = ntohs(iph->tot_len);
1313         struct rtable *rth;
1314         __be32  skeys[2] = { iph->saddr, 0, };
1315         __be32  daddr = iph->daddr;
1316         unsigned short est_mtu = 0;
1317
1318         if (ipv4_config.no_pmtu_disc)
1319                 return 0;
1320
1321         for (i = 0; i < 2; i++) {
1322                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1323
1324                 rcu_read_lock();
1325                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1326                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1327                         if (rth->fl.fl4_dst == daddr &&
1328                             rth->fl.fl4_src == skeys[i] &&
1329                             rth->rt_dst  == daddr &&
1330                             rth->rt_src  == iph->saddr &&
1331                             rth->fl.iif == 0 &&
1332                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1333                                 unsigned short mtu = new_mtu;
1334
1335                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1336
1337                                         /* BSD 4.2 compatibility hack :-( */
1338                                         if (mtu == 0 &&
1339                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1340                                             old_mtu >= 68 + (iph->ihl << 2))
1341                                                 old_mtu -= iph->ihl << 2;
1342
1343                                         mtu = guess_mtu(old_mtu);
1344                                 }
1345                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1346                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1347                                                 dst_confirm(&rth->u.dst);
1348                                                 if (mtu < ip_rt_min_pmtu) {
1349                                                         mtu = ip_rt_min_pmtu;
1350                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1351                                                                 (1 << RTAX_MTU);
1352                                                 }
1353                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1354                                                 dst_set_expires(&rth->u.dst,
1355                                                         ip_rt_mtu_expires);
1356                                         }
1357                                         est_mtu = mtu;
1358                                 }
1359                         }
1360                 }
1361                 rcu_read_unlock();
1362         }
1363         return est_mtu ? : new_mtu;
1364 }
1365
1366 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1367 {
1368         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1369             !(dst_metric_locked(dst, RTAX_MTU))) {
1370                 if (mtu < ip_rt_min_pmtu) {
1371                         mtu = ip_rt_min_pmtu;
1372                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1373                 }
1374                 dst->metrics[RTAX_MTU-1] = mtu;
1375                 dst_set_expires(dst, ip_rt_mtu_expires);
1376                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1377         }
1378 }
1379
1380 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1381 {
1382         return NULL;
1383 }
1384
1385 static void ipv4_dst_destroy(struct dst_entry *dst)
1386 {
1387         struct rtable *rt = (struct rtable *) dst;
1388         struct inet_peer *peer = rt->peer;
1389         struct in_device *idev = rt->idev;
1390
1391         if (peer) {
1392                 rt->peer = NULL;
1393                 inet_putpeer(peer);
1394         }
1395
1396         if (idev) {
1397                 rt->idev = NULL;
1398                 in_dev_put(idev);
1399         }
1400 }
1401
1402 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1403                             int how)
1404 {
1405         struct rtable *rt = (struct rtable *) dst;
1406         struct in_device *idev = rt->idev;
1407         if (dev != &loopback_dev && idev && idev->dev == dev) {
1408                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1409                 if (loopback_idev) {
1410                         rt->idev = loopback_idev;
1411                         in_dev_put(idev);
1412                 }
1413         }
1414 }
1415
1416 static void ipv4_link_failure(struct sk_buff *skb)
1417 {
1418         struct rtable *rt;
1419
1420         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1421
1422         rt = (struct rtable *) skb->dst;
1423         if (rt)
1424                 dst_set_expires(&rt->u.dst, 0);
1425 }
1426
1427 static int ip_rt_bug(struct sk_buff *skb)
1428 {
1429         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1430                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1431                 skb->dev ? skb->dev->name : "?");
1432         kfree_skb(skb);
1433         return 0;
1434 }
1435
1436 /*
1437    We do not cache the source address of the outgoing interface,
1438    because it is used only by the IP RR, TS and SRR options,
1439    so it is out of the fast path.
1440
1441    BTW remember: "addr" is allowed to be unaligned
1442    in IP options!
1443  */
1444
1445 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1446 {
1447         __be32 src;
1448         struct fib_result res;
1449
1450         if (rt->fl.iif == 0)
1451                 src = rt->rt_src;
1452         else if (fib_lookup(&rt->fl, &res) == 0) {
1453                 src = FIB_RES_PREFSRC(res);
1454                 fib_res_put(&res);
1455         } else
1456                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1457                                         RT_SCOPE_UNIVERSE);
1458         memcpy(addr, &src, 4);
1459 }
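/*
 * Example (editorial sketch): callers in the IP options code copy the
 * preferred source address straight into an option slot, which need not be
 * 4-byte aligned -- hence the memcpy() above instead of a direct __be32
 * store.  "optptr" below is a hypothetical pointer into the options area:
 *
 *	unsigned char *optptr;			 // record-route slot
 *	struct rtable *rt = (struct rtable *)skb->dst;
 *
 *	ip_rt_get_source(optptr, rt);		 // writes 4 bytes, alignment-safe
 */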
1460
1461 #ifdef CONFIG_NET_CLS_ROUTE
1462 static void set_class_tag(struct rtable *rt, u32 tag)
1463 {
1464         if (!(rt->u.dst.tclassid & 0xFFFF))
1465                 rt->u.dst.tclassid |= tag & 0xFFFF;
1466         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1467                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1468 }
1469 #endif
1470
1471 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1472 {
1473         struct fib_info *fi = res->fi;
1474
1475         if (fi) {
1476                 if (FIB_RES_GW(*res) &&
1477                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1478                         rt->rt_gateway = FIB_RES_GW(*res);
1479                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1480                        sizeof(rt->u.dst.metrics));
1481                 if (fi->fib_mtu == 0) {
1482                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1483                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1484                             rt->rt_gateway != rt->rt_dst &&
1485                             rt->u.dst.dev->mtu > 576)
1486                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1487                 }
1488 #ifdef CONFIG_NET_CLS_ROUTE
1489                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1490 #endif
1491         } else
1492                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1493
1494         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1495                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1496         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1497                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1498         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1499                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1500                                        ip_rt_min_advmss);
1501         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1502                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1503
1504 #ifdef CONFIG_NET_CLS_ROUTE
1505 #ifdef CONFIG_IP_MULTIPLE_TABLES
1506         set_class_tag(rt, fib_rules_tclass(res));
1507 #endif
1508         set_class_tag(rt, itag);
1509 #endif
1510         rt->rt_type = res->type;
1511 }
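/*
 * Worked example (editorial note): for a route over a plain 1500-byte-MTU
 * Ethernet device with no metrics configured in the FIB, the code above
 * leaves the cache entry with MTU = 1500, ADVMSS = max(1500 - 40,
 * ip_rt_min_advmss) = 1460 and HOPLIMIT = sysctl_ip_default_ttl (64 unless
 * the sysctl has been changed).
 */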
1512
1513 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1514                                 u8 tos, struct net_device *dev, int our)
1515 {
1516         unsigned hash;
1517         struct rtable *rth;
1518         __be32 spec_dst;
1519         struct in_device *in_dev = in_dev_get(dev);
1520         u32 itag = 0;
1521
1522         /* Primary sanity checks. */
1523
1524         if (in_dev == NULL)
1525                 return -EINVAL;
1526
1527         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1528             skb->protocol != htons(ETH_P_IP))
1529                 goto e_inval;
1530
1531         if (ZERONET(saddr)) {
1532                 if (!LOCAL_MCAST(daddr))
1533                         goto e_inval;
1534                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1535         } else if (fib_validate_source(saddr, 0, tos, 0,
1536                                         dev, &spec_dst, &itag) < 0)
1537                 goto e_inval;
1538
1539         rth = dst_alloc(&ipv4_dst_ops);
1540         if (!rth)
1541                 goto e_nobufs;
1542
1543         rth->u.dst.output= ip_rt_bug;
1544
1545         atomic_set(&rth->u.dst.__refcnt, 1);
1546         rth->u.dst.flags= DST_HOST;
1547         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1548                 rth->u.dst.flags |= DST_NOPOLICY;
1549         rth->fl.fl4_dst = daddr;
1550         rth->rt_dst     = daddr;
1551         rth->fl.fl4_tos = tos;
1552         rth->fl.mark    = skb->mark;
1553         rth->fl.fl4_src = saddr;
1554         rth->rt_src     = saddr;
1555 #ifdef CONFIG_NET_CLS_ROUTE
1556         rth->u.dst.tclassid = itag;
1557 #endif
1558         rth->rt_iif     =
1559         rth->fl.iif     = dev->ifindex;
1560         rth->u.dst.dev  = &loopback_dev;
1561         dev_hold(rth->u.dst.dev);
1562         rth->idev       = in_dev_get(rth->u.dst.dev);
1563         rth->fl.oif     = 0;
1564         rth->rt_gateway = daddr;
1565         rth->rt_spec_dst= spec_dst;
1566         rth->rt_type    = RTN_MULTICAST;
1567         rth->rt_flags   = RTCF_MULTICAST;
1568         if (our) {
1569                 rth->u.dst.input= ip_local_deliver;
1570                 rth->rt_flags |= RTCF_LOCAL;
1571         }
1572
1573 #ifdef CONFIG_IP_MROUTE
1574         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1575                 rth->u.dst.input = ip_mr_input;
1576 #endif
1577         RT_CACHE_STAT_INC(in_slow_mc);
1578
1579         in_dev_put(in_dev);
1580         hash = rt_hash(daddr, saddr, dev->ifindex);
1581         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1582
1583 e_nobufs:
1584         in_dev_put(in_dev);
1585         return -ENOBUFS;
1586
1587 e_inval:
1588         in_dev_put(in_dev);
1589         return -EINVAL;
1590 }
1591
1592
1593 static void ip_handle_martian_source(struct net_device *dev,
1594                                      struct in_device *in_dev,
1595                                      struct sk_buff *skb,
1596                                      __be32 daddr,
1597                                      __be32 saddr)
1598 {
1599         RT_CACHE_STAT_INC(in_martian_src);
1600 #ifdef CONFIG_IP_ROUTE_VERBOSE
1601         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1602                 /*
1603                  *      RFC1812 recommendation, if source is martian,
1604                  *      the only hint is MAC header.
1605                  */
1606                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1607                         "%u.%u.%u.%u, on dev %s\n",
1608                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1609                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1610                         int i;
1611                         const unsigned char *p = skb_mac_header(skb);
1612                         printk(KERN_WARNING "ll header: ");
1613                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1614                                 printk("%02x", *p);
1615                                 if (i < (dev->hard_header_len - 1))
1616                                         printk(":");
1617                         }
1618                         printk("\n");
1619                 }
1620         }
1621 #endif
1622 }
1623
1624 static inline int __mkroute_input(struct sk_buff *skb,
1625                                   struct fib_result* res,
1626                                   struct in_device *in_dev,
1627                                   __be32 daddr, __be32 saddr, u32 tos,
1628                                   struct rtable **result)
1629 {
1630
1631         struct rtable *rth;
1632         int err;
1633         struct in_device *out_dev;
1634         unsigned flags = 0;
1635         __be32 spec_dst;
1636         u32 itag;
1637
1638         /* get a working reference to the output device */
1639         out_dev = in_dev_get(FIB_RES_DEV(*res));
1640         if (out_dev == NULL) {
1641                 if (net_ratelimit())
1642                         printk(KERN_CRIT "Bug in ip_route_input" \
1643                                "_slow(). Please, report\n");
1644                 return -EINVAL;
1645         }
1646
1647
1648         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1649                                   in_dev->dev, &spec_dst, &itag);
1650         if (err < 0) {
1651                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1652                                          saddr);
1653
1654                 err = -EINVAL;
1655                 goto cleanup;
1656         }
1657
1658         if (err)
1659                 flags |= RTCF_DIRECTSRC;
1660
1661         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1662             (IN_DEV_SHARED_MEDIA(out_dev) ||
1663              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1664                 flags |= RTCF_DOREDIRECT;
1665
1666         if (skb->protocol != htons(ETH_P_IP)) {
1667                 /* Not IP (i.e. ARP). Do not create a route if it is
1668                  * invalid for proxy ARP. DNAT routes are always valid.
1669                  */
1670                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1671                         err = -EINVAL;
1672                         goto cleanup;
1673                 }
1674         }
1675
1676
1677         rth = dst_alloc(&ipv4_dst_ops);
1678         if (!rth) {
1679                 err = -ENOBUFS;
1680                 goto cleanup;
1681         }
1682
1683         atomic_set(&rth->u.dst.__refcnt, 1);
1684         rth->u.dst.flags= DST_HOST;
1685         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1686                 rth->u.dst.flags |= DST_NOPOLICY;
1687         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1688                 rth->u.dst.flags |= DST_NOXFRM;
1689         rth->fl.fl4_dst = daddr;
1690         rth->rt_dst     = daddr;
1691         rth->fl.fl4_tos = tos;
1692         rth->fl.mark    = skb->mark;
1693         rth->fl.fl4_src = saddr;
1694         rth->rt_src     = saddr;
1695         rth->rt_gateway = daddr;
1696         rth->rt_iif     =
1697                 rth->fl.iif     = in_dev->dev->ifindex;
1698         rth->u.dst.dev  = (out_dev)->dev;
1699         dev_hold(rth->u.dst.dev);
1700         rth->idev       = in_dev_get(rth->u.dst.dev);
1701         rth->fl.oif     = 0;
1702         rth->rt_spec_dst= spec_dst;
1703
1704         rth->u.dst.input = ip_forward;
1705         rth->u.dst.output = ip_output;
1706
1707         rt_set_nexthop(rth, res, itag);
1708
1709         rth->rt_flags = flags;
1710
1711         *result = rth;
1712         err = 0;
1713  cleanup:
1714         /* release the working reference to the output device */
1715         in_dev_put(out_dev);
1716         return err;
1717 }
1718
1719 static inline int ip_mkroute_input(struct sk_buff *skb,
1720                                    struct fib_result* res,
1721                                    const struct flowi *fl,
1722                                    struct in_device *in_dev,
1723                                    __be32 daddr, __be32 saddr, u32 tos)
1724 {
1725         struct rtable* rth = NULL;
1726         int err;
1727         unsigned hash;
1728
1729 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1730         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1731                 fib_select_multipath(fl, res);
1732 #endif
1733
1734         /* create a routing cache entry */
1735         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1736         if (err)
1737                 return err;
1738
1739         /* put it into the cache */
1740         hash = rt_hash(daddr, saddr, fl->iif);
1741         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1742 }
1743
1744 /*
1745  *      NOTE. We drop all packets that have local source
1746  *      addresses, because every properly looped-back packet
1747  *      must already have the correct destination attached by the output routine.
1748  *
1749  *      This approach solves two big problems:
1750  *      1. Non-simplex devices are handled properly.
1751  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1752  */
1753
1754 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1755                                u8 tos, struct net_device *dev)
1756 {
1757         struct fib_result res;
1758         struct in_device *in_dev = in_dev_get(dev);
1759         struct flowi fl = { .nl_u = { .ip4_u =
1760                                       { .daddr = daddr,
1761                                         .saddr = saddr,
1762                                         .tos = tos,
1763                                         .scope = RT_SCOPE_UNIVERSE,
1764                                       } },
1765                             .mark = skb->mark,
1766                             .iif = dev->ifindex };
1767         unsigned        flags = 0;
1768         u32             itag = 0;
1769         struct rtable * rth;
1770         unsigned        hash;
1771         __be32          spec_dst;
1772         int             err = -EINVAL;
1773         int             free_res = 0;
1774
1775         /* IP on this device is disabled. */
1776
1777         if (!in_dev)
1778                 goto out;
1779
1780         /* Check for the most weird martians, which cannot be detected
1781            by fib_lookup.
1782          */
1783
1784         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1785                 goto martian_source;
1786
1787         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1788                 goto brd_input;
1789
1790         /* Accept zero addresses only to limited broadcast;
1791          * I am not sure whether to fix this or not. Waiting for complaints :-)
1792          */
1793         if (ZERONET(saddr))
1794                 goto martian_source;
1795
1796         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1797                 goto martian_destination;
1798
1799         /*
1800          *      Now we are ready to route packet.
1801          */
1802         if ((err = fib_lookup(&fl, &res)) != 0) {
1803                 if (!IN_DEV_FORWARD(in_dev))
1804                         goto e_hostunreach;
1805                 goto no_route;
1806         }
1807         free_res = 1;
1808
1809         RT_CACHE_STAT_INC(in_slow_tot);
1810
1811         if (res.type == RTN_BROADCAST)
1812                 goto brd_input;
1813
1814         if (res.type == RTN_LOCAL) {
1815                 int result;
1816                 result = fib_validate_source(saddr, daddr, tos,
1817                                              loopback_dev.ifindex,
1818                                              dev, &spec_dst, &itag);
1819                 if (result < 0)
1820                         goto martian_source;
1821                 if (result)
1822                         flags |= RTCF_DIRECTSRC;
1823                 spec_dst = daddr;
1824                 goto local_input;
1825         }
1826
1827         if (!IN_DEV_FORWARD(in_dev))
1828                 goto e_hostunreach;
1829         if (res.type != RTN_UNICAST)
1830                 goto martian_destination;
1831
1832         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1833         if (err == -ENOBUFS)
1834                 goto e_nobufs;
1835         if (err == -EINVAL)
1836                 goto e_inval;
1837
1838 done:
1839         in_dev_put(in_dev);
1840         if (free_res)
1841                 fib_res_put(&res);
1842 out:    return err;
1843
1844 brd_input:
1845         if (skb->protocol != htons(ETH_P_IP))
1846                 goto e_inval;
1847
1848         if (ZERONET(saddr))
1849                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1850         else {
1851                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1852                                           &itag);
1853                 if (err < 0)
1854                         goto martian_source;
1855                 if (err)
1856                         flags |= RTCF_DIRECTSRC;
1857         }
1858         flags |= RTCF_BROADCAST;
1859         res.type = RTN_BROADCAST;
1860         RT_CACHE_STAT_INC(in_brd);
1861
1862 local_input:
1863         rth = dst_alloc(&ipv4_dst_ops);
1864         if (!rth)
1865                 goto e_nobufs;
1866
1867         rth->u.dst.output= ip_rt_bug;
1868
1869         atomic_set(&rth->u.dst.__refcnt, 1);
1870         rth->u.dst.flags= DST_HOST;
1871         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1872                 rth->u.dst.flags |= DST_NOPOLICY;
1873         rth->fl.fl4_dst = daddr;
1874         rth->rt_dst     = daddr;
1875         rth->fl.fl4_tos = tos;
1876         rth->fl.mark    = skb->mark;
1877         rth->fl.fl4_src = saddr;
1878         rth->rt_src     = saddr;
1879 #ifdef CONFIG_NET_CLS_ROUTE
1880         rth->u.dst.tclassid = itag;
1881 #endif
1882         rth->rt_iif     =
1883         rth->fl.iif     = dev->ifindex;
1884         rth->u.dst.dev  = &loopback_dev;
1885         dev_hold(rth->u.dst.dev);
1886         rth->idev       = in_dev_get(rth->u.dst.dev);
1887         rth->rt_gateway = daddr;
1888         rth->rt_spec_dst= spec_dst;
1889         rth->u.dst.input= ip_local_deliver;
1890         rth->rt_flags   = flags|RTCF_LOCAL;
1891         if (res.type == RTN_UNREACHABLE) {
1892                 rth->u.dst.input= ip_error;
1893                 rth->u.dst.error= -err;
1894                 rth->rt_flags   &= ~RTCF_LOCAL;
1895         }
1896         rth->rt_type    = res.type;
1897         hash = rt_hash(daddr, saddr, fl.iif);
1898         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1899         goto done;
1900
1901 no_route:
1902         RT_CACHE_STAT_INC(in_no_route);
1903         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1904         res.type = RTN_UNREACHABLE;
1905         goto local_input;
1906
1907         /*
1908          *      Do not cache martian addresses: they should be logged (RFC1812)
1909          */
1910 martian_destination:
1911         RT_CACHE_STAT_INC(in_martian_dst);
1912 #ifdef CONFIG_IP_ROUTE_VERBOSE
1913         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1914                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1915                         "%u.%u.%u.%u, dev %s\n",
1916                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1917 #endif
1918
1919 e_hostunreach:
1920         err = -EHOSTUNREACH;
1921         goto done;
1922
1923 e_inval:
1924         err = -EINVAL;
1925         goto done;
1926
1927 e_nobufs:
1928         err = -ENOBUFS;
1929         goto done;
1930
1931 martian_source:
1932         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1933         goto e_inval;
1934 }
1935
1936 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1937                    u8 tos, struct net_device *dev)
1938 {
1939         struct rtable * rth;
1940         unsigned        hash;
1941         int iif = dev->ifindex;
1942
1943         tos &= IPTOS_RT_MASK;
1944         hash = rt_hash(daddr, saddr, iif);
1945
1946         rcu_read_lock();
1947         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1948              rth = rcu_dereference(rth->u.dst.rt_next)) {
1949                 if (rth->fl.fl4_dst == daddr &&
1950                     rth->fl.fl4_src == saddr &&
1951                     rth->fl.iif == iif &&
1952                     rth->fl.oif == 0 &&
1953                     rth->fl.mark == skb->mark &&
1954                     rth->fl.fl4_tos == tos) {
1955                         rth->u.dst.lastuse = jiffies;
1956                         dst_hold(&rth->u.dst);
1957                         rth->u.dst.__use++;
1958                         RT_CACHE_STAT_INC(in_hit);
1959                         rcu_read_unlock();
1960                         skb->dst = (struct dst_entry*)rth;
1961                         return 0;
1962                 }
1963                 RT_CACHE_STAT_INC(in_hlist_search);
1964         }
1965         rcu_read_unlock();
1966
1967         /* Multicast recognition logic was moved from the route cache to here.
1968            The problem was that too many Ethernet cards have broken/missing
1969            hardware multicast filters :-( As a result, a host on a multicast
1970            network acquires a lot of useless route cache entries, e.g. from
1971            SDR messages from all over the world. Now we try to get rid of them.
1972            Really, provided the software IP multicast filter is organized
1973            reasonably (at least, hashed), it does not result in a slowdown
1974            compared with route cache reject entries.
1975            Note that multicast routers are not affected, because a
1976            route cache entry is created eventually.
1977          */
1978         if (MULTICAST(daddr)) {
1979                 struct in_device *in_dev;
1980
1981                 rcu_read_lock();
1982                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1983                         int our = ip_check_mc(in_dev, daddr, saddr,
1984                                 ip_hdr(skb)->protocol);
1985                         if (our
1986 #ifdef CONFIG_IP_MROUTE
1987                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1988 #endif
1989                             ) {
1990                                 rcu_read_unlock();
1991                                 return ip_route_input_mc(skb, daddr, saddr,
1992                                                          tos, dev, our);
1993                         }
1994                 }
1995                 rcu_read_unlock();
1996                 return -EINVAL;
1997         }
1998         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1999 }
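/*
 * Example (editorial sketch): the receive path (ip_rcv_finish() in
 * net/ipv4/ip_input.c) routes every incoming packet roughly like this,
 * where "iph" is the packet's IP header:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *			   skb->dev) != 0)
 *		goto drop;		// no route or martian source
 *
 *	// skb->dst now carries the cached route; dst_input(skb) then
 *	// delivers or forwards via rth->u.dst.input.
 */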
2000
2001 static inline int __mkroute_output(struct rtable **result,
2002                                    struct fib_result* res,
2003                                    const struct flowi *fl,
2004                                    const struct flowi *oldflp,
2005                                    struct net_device *dev_out,
2006                                    unsigned flags)
2007 {
2008         struct rtable *rth;
2009         struct in_device *in_dev;
2010         u32 tos = RT_FL_TOS(oldflp);
2011         int err = 0;
2012
2013         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2014                 return -EINVAL;
2015
2016         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2017                 res->type = RTN_BROADCAST;
2018         else if (MULTICAST(fl->fl4_dst))
2019                 res->type = RTN_MULTICAST;
2020         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2021                 return -EINVAL;
2022
2023         if (dev_out->flags & IFF_LOOPBACK)
2024                 flags |= RTCF_LOCAL;
2025
2026         /* get a working reference to the inet device */
2027         in_dev = in_dev_get(dev_out);
2028         if (!in_dev)
2029                 return -EINVAL;
2030
2031         if (res->type == RTN_BROADCAST) {
2032                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2033                 if (res->fi) {
2034                         fib_info_put(res->fi);
2035                         res->fi = NULL;
2036                 }
2037         } else if (res->type == RTN_MULTICAST) {
2038                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2039                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2040                                  oldflp->proto))
2041                         flags &= ~RTCF_LOCAL;
2042                 /* If a multicast route does not exist, use the
2043                    default one, but do not gateway in this case.
2044                    Yes, it is a hack.
2045                  */
2046                 if (res->fi && res->prefixlen < 4) {
2047                         fib_info_put(res->fi);
2048                         res->fi = NULL;
2049                 }
2050         }
2051
2052
2053         rth = dst_alloc(&ipv4_dst_ops);
2054         if (!rth) {
2055                 err = -ENOBUFS;
2056                 goto cleanup;
2057         }
2058
2059         atomic_set(&rth->u.dst.__refcnt, 1);
2060         rth->u.dst.flags= DST_HOST;
2061         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2062                 rth->u.dst.flags |= DST_NOXFRM;
2063         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2064                 rth->u.dst.flags |= DST_NOPOLICY;
2065
2066         rth->fl.fl4_dst = oldflp->fl4_dst;
2067         rth->fl.fl4_tos = tos;
2068         rth->fl.fl4_src = oldflp->fl4_src;
2069         rth->fl.oif     = oldflp->oif;
2070         rth->fl.mark    = oldflp->mark;
2071         rth->rt_dst     = fl->fl4_dst;
2072         rth->rt_src     = fl->fl4_src;
2073         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2074         /* get references to the devices that are to be held by the routing
2075            cache entry */
2076         rth->u.dst.dev  = dev_out;
2077         dev_hold(dev_out);
2078         rth->idev       = in_dev_get(dev_out);
2079         rth->rt_gateway = fl->fl4_dst;
2080         rth->rt_spec_dst= fl->fl4_src;
2081
2082         rth->u.dst.output=ip_output;
2083
2084         RT_CACHE_STAT_INC(out_slow_tot);
2085
2086         if (flags & RTCF_LOCAL) {
2087                 rth->u.dst.input = ip_local_deliver;
2088                 rth->rt_spec_dst = fl->fl4_dst;
2089         }
2090         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2091                 rth->rt_spec_dst = fl->fl4_src;
2092                 if (flags & RTCF_LOCAL &&
2093                     !(dev_out->flags & IFF_LOOPBACK)) {
2094                         rth->u.dst.output = ip_mc_output;
2095                         RT_CACHE_STAT_INC(out_slow_mc);
2096                 }
2097 #ifdef CONFIG_IP_MROUTE
2098                 if (res->type == RTN_MULTICAST) {
2099                         if (IN_DEV_MFORWARD(in_dev) &&
2100                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2101                                 rth->u.dst.input = ip_mr_input;
2102                                 rth->u.dst.output = ip_mc_output;
2103                         }
2104                 }
2105 #endif
2106         }
2107
2108         rt_set_nexthop(rth, res, 0);
2109
2110         rth->rt_flags = flags;
2111
2112         *result = rth;
2113  cleanup:
2114         /* release the working reference to the inet device */
2115         in_dev_put(in_dev);
2116
2117         return err;
2118 }
2119
2120 static inline int ip_mkroute_output(struct rtable **rp,
2121                                     struct fib_result* res,
2122                                     const struct flowi *fl,
2123                                     const struct flowi *oldflp,
2124                                     struct net_device *dev_out,
2125                                     unsigned flags)
2126 {
2127         struct rtable *rth = NULL;
2128         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2129         unsigned hash;
2130         if (err == 0) {
2131                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2132                 err = rt_intern_hash(hash, rth, rp);
2133         }
2134
2135         return err;
2136 }
2137
2138 /*
2139  * Major route resolver routine.
2140  */
2141
2142 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2143 {
2144         u32 tos = RT_FL_TOS(oldflp);
2145         struct flowi fl = { .nl_u = { .ip4_u =
2146                                       { .daddr = oldflp->fl4_dst,
2147                                         .saddr = oldflp->fl4_src,
2148                                         .tos = tos & IPTOS_RT_MASK,
2149                                         .scope = ((tos & RTO_ONLINK) ?
2150                                                   RT_SCOPE_LINK :
2151                                                   RT_SCOPE_UNIVERSE),
2152                                       } },
2153                             .mark = oldflp->mark,
2154                             .iif = loopback_dev.ifindex,
2155                             .oif = oldflp->oif };
2156         struct fib_result res;
2157         unsigned flags = 0;
2158         struct net_device *dev_out = NULL;
2159         int free_res = 0;
2160         int err;
2161
2162
2163         res.fi          = NULL;
2164 #ifdef CONFIG_IP_MULTIPLE_TABLES
2165         res.r           = NULL;
2166 #endif
2167
2168         if (oldflp->fl4_src) {
2169                 err = -EINVAL;
2170                 if (MULTICAST(oldflp->fl4_src) ||
2171                     BADCLASS(oldflp->fl4_src) ||
2172                     ZERONET(oldflp->fl4_src))
2173                         goto out;
2174
2175                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2176                 dev_out = ip_dev_find(oldflp->fl4_src);
2177                 if (dev_out == NULL)
2178                         goto out;
2179
2180                 /* I removed the check for oif == dev_out->oif here.
2181                    It was wrong for two reasons:
2182                    1. ip_dev_find(saddr) can return the wrong iface if saddr is
2183                       assigned to multiple interfaces.
2184                    2. Moreover, we are allowed to send packets with a saddr
2185                       of another iface. --ANK
2186                  */
2187
2188                 if (oldflp->oif == 0
2189                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2190                         /* Special hack: the user can direct multicasts
2191                            and limited broadcast via the necessary interface
2192                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2193                            This hack is not just for fun, it allows
2194                            vic, vat and friends to work.
2195                            They bind a socket to loopback, set ttl to zero
2196                            and expect that it will work.
2197                            From the viewpoint of the routing cache they are broken,
2198                            because we are not allowed to build a multicast path
2199                            with a loopback source addr (look, the routing cache
2200                            cannot know that ttl is zero, so the packet
2201                            will not leave this host and the route is valid).
2202                            Luckily, this hack is a good workaround.
2203                          */
2204
2205                         fl.oif = dev_out->ifindex;
2206                         goto make_route;
2207                 }
2208                 if (dev_out)
2209                         dev_put(dev_out);
2210                 dev_out = NULL;
2211         }
2212
2213
2214         if (oldflp->oif) {
2215                 dev_out = dev_get_by_index(oldflp->oif);
2216                 err = -ENODEV;
2217                 if (dev_out == NULL)
2218                         goto out;
2219
2220                 /* RACE: Check return value of inet_select_addr instead. */
2221                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2222                         dev_put(dev_out);
2223                         goto out;       /* Wrong error code */
2224                 }
2225
2226                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2227                         if (!fl.fl4_src)
2228                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2229                                                               RT_SCOPE_LINK);
2230                         goto make_route;
2231                 }
2232                 if (!fl.fl4_src) {
2233                         if (MULTICAST(oldflp->fl4_dst))
2234                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2235                                                               fl.fl4_scope);
2236                         else if (!oldflp->fl4_dst)
2237                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2238                                                               RT_SCOPE_HOST);
2239                 }
2240         }
2241
2242         if (!fl.fl4_dst) {
2243                 fl.fl4_dst = fl.fl4_src;
2244                 if (!fl.fl4_dst)
2245                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2246                 if (dev_out)
2247                         dev_put(dev_out);
2248                 dev_out = &loopback_dev;
2249                 dev_hold(dev_out);
2250                 fl.oif = loopback_dev.ifindex;
2251                 res.type = RTN_LOCAL;
2252                 flags |= RTCF_LOCAL;
2253                 goto make_route;
2254         }
2255
2256         if (fib_lookup(&fl, &res)) {
2257                 res.fi = NULL;
2258                 if (oldflp->oif) {
2259                         /* Apparently, the routing tables are wrong. Assume
2260                            that the destination is on-link.
2261
2262                            WHY? DW.
2263                            Because we are allowed to send to an iface
2264                            even if it has NO routes and NO assigned
2265                            addresses. When oif is specified, the routing
2266                            tables are looked up with only one purpose:
2267                            to catch if the destination is gatewayed, rather than
2268                            direct. Moreover, if MSG_DONTROUTE is set,
2269                            we send the packet, ignoring both routing tables
2270                            and ifaddr state. --ANK
2271
2272
2273                            We could do this even if oif is unknown
2274                            (as IPv6 likely does), but we do not.
2275                          */
2276
2277                         if (fl.fl4_src == 0)
2278                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2279                                                               RT_SCOPE_LINK);
2280                         res.type = RTN_UNICAST;
2281                         goto make_route;
2282                 }
2283                 if (dev_out)
2284                         dev_put(dev_out);
2285                 err = -ENETUNREACH;
2286                 goto out;
2287         }
2288         free_res = 1;
2289
2290         if (res.type == RTN_LOCAL) {
2291                 if (!fl.fl4_src)
2292                         fl.fl4_src = fl.fl4_dst;
2293                 if (dev_out)
2294                         dev_put(dev_out);
2295                 dev_out = &loopback_dev;
2296                 dev_hold(dev_out);
2297                 fl.oif = dev_out->ifindex;
2298                 if (res.fi)
2299                         fib_info_put(res.fi);
2300                 res.fi = NULL;
2301                 flags |= RTCF_LOCAL;
2302                 goto make_route;
2303         }
2304
2305 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2306         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2307                 fib_select_multipath(&fl, &res);
2308         else
2309 #endif
2310         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2311                 fib_select_default(&fl, &res);
2312
2313         if (!fl.fl4_src)
2314                 fl.fl4_src = FIB_RES_PREFSRC(res);
2315
2316         if (dev_out)
2317                 dev_put(dev_out);
2318         dev_out = FIB_RES_DEV(res);
2319         dev_hold(dev_out);
2320         fl.oif = dev_out->ifindex;
2321
2322
2323 make_route:
2324         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2325
2326
2327         if (free_res)
2328                 fib_res_put(&res);
2329         if (dev_out)
2330                 dev_put(dev_out);
2331 out:    return err;
2332 }
2333
2334 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2335 {
2336         unsigned hash;
2337         struct rtable *rth;
2338
2339         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2340
2341         rcu_read_lock_bh();
2342         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2343                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2344                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2345                     rth->fl.fl4_src == flp->fl4_src &&
2346                     rth->fl.iif == 0 &&
2347                     rth->fl.oif == flp->oif &&
2348                     rth->fl.mark == flp->mark &&
2349                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2350                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2351                         rth->u.dst.lastuse = jiffies;
2352                         dst_hold(&rth->u.dst);
2353                         rth->u.dst.__use++;
2354                         RT_CACHE_STAT_INC(out_hit);
2355                         rcu_read_unlock_bh();
2356                         *rp = rth;
2357                         return 0;
2358                 }
2359                 RT_CACHE_STAT_INC(out_hlist_search);
2360         }
2361         rcu_read_unlock_bh();
2362
2363         return ip_route_output_slow(rp, flp);
2364 }
2365
2366 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2367
2368 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2369 {
2370 }
2371
2372 static struct dst_ops ipv4_dst_blackhole_ops = {
2373         .family                 =       AF_INET,
2374         .protocol               =       __constant_htons(ETH_P_IP),
2375         .destroy                =       ipv4_dst_destroy,
2376         .check                  =       ipv4_dst_check,
2377         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2378         .entry_size             =       sizeof(struct rtable),
2379 };
2380
2381
2382 static int ipv4_blackhole_output(struct sk_buff *skb)
2383 {
2384         kfree_skb(skb);
2385         return 0;
2386 }
2387
2388 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2389 {
2390         struct rtable *ort = *rp;
2391         struct rtable *rt = (struct rtable *)
2392                 dst_alloc(&ipv4_dst_blackhole_ops);
2393
2394         if (rt) {
2395                 struct dst_entry *new = &rt->u.dst;
2396
2397                 atomic_set(&new->__refcnt, 1);
2398                 new->__use = 1;
2399                 new->input = ipv4_blackhole_output;
2400                 new->output = ipv4_blackhole_output;
2401                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2402
2403                 new->dev = ort->u.dst.dev;
2404                 if (new->dev)
2405                         dev_hold(new->dev);
2406
2407                 rt->fl = ort->fl;
2408
2409                 rt->idev = ort->idev;
2410                 if (rt->idev)
2411                         in_dev_hold(rt->idev);
2412                 rt->rt_flags = ort->rt_flags;
2413                 rt->rt_type = ort->rt_type;
2414                 rt->rt_dst = ort->rt_dst;
2415                 rt->rt_src = ort->rt_src;
2416                 rt->rt_iif = ort->rt_iif;
2417                 rt->rt_gateway = ort->rt_gateway;
2418                 rt->rt_spec_dst = ort->rt_spec_dst;
2419                 rt->peer = ort->peer;
2420                 if (rt->peer)
2421                         atomic_inc(&rt->peer->refcnt);
2422
2423                 dst_free(new);
2424         }
2425
2426         dst_release(&(*rp)->u.dst);
2427         *rp = rt;
2428         return (rt ? 0 : -ENOMEM);
2429 }
2430
2431 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2432 {
2433         int err;
2434
2435         if ((err = __ip_route_output_key(rp, flp)) != 0)
2436                 return err;
2437
2438         if (flp->proto) {
2439                 if (!flp->fl4_src)
2440                         flp->fl4_src = (*rp)->rt_src;
2441                 if (!flp->fl4_dst)
2442                         flp->fl4_dst = (*rp)->rt_dst;
2443                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2444                 if (err == -EREMOTE)
2445                         err = ipv4_dst_blackhole(rp, flp, sk);
2446
2447                 return err;
2448         }
2449
2450         return 0;
2451 }
2452
2453 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2454
2455 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2456 {
2457         return ip_route_output_flow(rp, flp, NULL, 0);
2458 }
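/*
 * Example (editorial sketch): a typical in-kernel user resolves an output
 * route by filling a flow key and calling ip_route_output_key(); the
 * destination address below is only a placeholder:
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = htonl(0x0a000001),
 *						 .saddr = 0,
 *						 .tos   = RT_TOS(0) } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);			// drop the reference when done
 */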
2459
2460 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2461                         int nowait, unsigned int flags)
2462 {
2463         struct rtable *rt = (struct rtable*)skb->dst;
2464         struct rtmsg *r;
2465         struct nlmsghdr *nlh;
2466         long expires;
2467         u32 id = 0, ts = 0, tsage = 0, error;
2468
2469         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2470         if (nlh == NULL)
2471                 return -EMSGSIZE;
2472
2473         r = nlmsg_data(nlh);
2474         r->rtm_family    = AF_INET;
2475         r->rtm_dst_len  = 32;
2476         r->rtm_src_len  = 0;
2477         r->rtm_tos      = rt->fl.fl4_tos;
2478         r->rtm_table    = RT_TABLE_MAIN;
2479         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2480         r->rtm_type     = rt->rt_type;
2481         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2482         r->rtm_protocol = RTPROT_UNSPEC;
2483         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2484         if (rt->rt_flags & RTCF_NOTIFY)
2485                 r->rtm_flags |= RTM_F_NOTIFY;
2486
2487         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2488
2489         if (rt->fl.fl4_src) {
2490                 r->rtm_src_len = 32;
2491                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2492         }
2493         if (rt->u.dst.dev)
2494                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2495 #ifdef CONFIG_NET_CLS_ROUTE
2496         if (rt->u.dst.tclassid)
2497                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2498 #endif
2499         if (rt->fl.iif)
2500                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2501         else if (rt->rt_src != rt->fl.fl4_src)
2502                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2503
2504         if (rt->rt_dst != rt->rt_gateway)
2505                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2506
2507         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2508                 goto nla_put_failure;
2509
2510         error = rt->u.dst.error;
2511         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2512         if (rt->peer) {
2513                 id = rt->peer->ip_id_count;
2514                 if (rt->peer->tcp_ts_stamp) {
2515                         ts = rt->peer->tcp_ts;
2516                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2517                 }
2518         }
2519
2520         if (rt->fl.iif) {
2521 #ifdef CONFIG_IP_MROUTE
2522                 __be32 dst = rt->rt_dst;
2523
2524                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2525                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2526                         int err = ipmr_get_route(skb, r, nowait);
2527                         if (err <= 0) {
2528                                 if (!nowait) {
2529                                         if (err == 0)
2530                                                 return 0;
2531                                         goto nla_put_failure;
2532                                 } else {
2533                                         if (err == -EMSGSIZE)
2534                                                 goto nla_put_failure;
2535                                         error = err;
2536                                 }
2537                         }
2538                 } else
2539 #endif
2540                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2541         }
2542
2543         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2544                                expires, error) < 0)
2545                 goto nla_put_failure;
2546
2547         return nlmsg_end(skb, nlh);
2548
2549 nla_put_failure:
2550         nlmsg_cancel(skb, nlh);
2551         return -EMSGSIZE;
2552 }
2553
2554 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2555 {
2556         struct rtmsg *rtm;
2557         struct nlattr *tb[RTA_MAX+1];
2558         struct rtable *rt = NULL;
2559         __be32 dst = 0;
2560         __be32 src = 0;
2561         u32 iif;
2562         int err;
2563         struct sk_buff *skb;
2564
2565         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2566         if (err < 0)
2567                 goto errout;
2568
2569         rtm = nlmsg_data(nlh);
2570
2571         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2572         if (skb == NULL) {
2573                 err = -ENOBUFS;
2574                 goto errout;
2575         }
2576
2577         /* Reserve room for dummy headers; this skb can pass
2578            through a good chunk of the routing engine.
2579          */
2580         skb_reset_mac_header(skb);
2581         skb_reset_network_header(skb);
2582
2583         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2584         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2585         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2586
2587         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2588         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2589         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2590
2591         if (iif) {
2592                 struct net_device *dev;
2593
2594                 dev = __dev_get_by_index(iif);
2595                 if (dev == NULL) {
2596                         err = -ENODEV;
2597                         goto errout_free;
2598                 }
2599
2600                 skb->protocol   = htons(ETH_P_IP);
2601                 skb->dev        = dev;
2602                 local_bh_disable();
2603                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2604                 local_bh_enable();
2605
2606                 rt = (struct rtable*) skb->dst;
2607                 if (err == 0 && rt->u.dst.error)
2608                         err = -rt->u.dst.error;
2609         } else {
2610                 struct flowi fl = {
2611                         .nl_u = {
2612                                 .ip4_u = {
2613                                         .daddr = dst,
2614                                         .saddr = src,
2615                                         .tos = rtm->rtm_tos,
2616                                 },
2617                         },
2618                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2619                 };
2620                 err = ip_route_output_key(&rt, &fl);
2621         }
2622
2623         if (err)
2624                 goto errout_free;
2625
2626         skb->dst = &rt->u.dst;
2627         if (rtm->rtm_flags & RTM_F_NOTIFY)
2628                 rt->rt_flags |= RTCF_NOTIFY;
2629
2630         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2631                                 RTM_NEWROUTE, 0, 0);
2632         if (err <= 0)
2633                 goto errout_free;
2634
2635         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2636 errout:
2637         return err;
2638
2639 errout_free:
2640         kfree_skb(skb);
2641         goto errout;
2642 }
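/*
 * Editorial note: this RTM_GETROUTE handler is what userspace exercises
 * with, for example:
 *
 *	$ ip route get 10.0.0.1
 *
 * iproute2 sends an RTM_GETROUTE request carrying RTA_DST (and optionally
 * RTA_SRC, RTA_IIF or RTA_OIF); the reply is built by rt_fill_info() above.
 */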
2643
2644 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2645 {
2646         struct rtable *rt;
2647         int h, s_h;
2648         int idx, s_idx;
2649
2650         s_h = cb->args[0];
2651         s_idx = idx = cb->args[1];
2652         for (h = 0; h <= rt_hash_mask; h++) {
2653                 if (h < s_h) continue;
2654                 if (h > s_h)
2655                         s_idx = 0;
2656                 rcu_read_lock_bh();
2657                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2658                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2659                         if (idx < s_idx)
2660                                 continue;
2661                         skb->dst = dst_clone(&rt->u.dst);
2662                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2663                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2664                                          1, NLM_F_MULTI) <= 0) {
2665                                 dst_release(xchg(&skb->dst, NULL));
2666                                 rcu_read_unlock_bh();
2667                                 goto done;
2668                         }
2669                         dst_release(xchg(&skb->dst, NULL));
2670                 }
2671                 rcu_read_unlock_bh();
2672         }
2673
2674 done:
2675         cb->args[0] = h;
2676         cb->args[1] = idx;
2677         return skb->len;
2678 }
2679
2680 void ip_rt_multicast_event(struct in_device *in_dev)
2681 {
2682         rt_cache_flush(0);
2683 }
2684
2685 #ifdef CONFIG_SYSCTL
2686 static int flush_delay;
2687
2688 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2689                                         struct file *filp, void __user *buffer,
2690                                         size_t *lenp, loff_t *ppos)
2691 {
2692         if (write) {
2693                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2694                 rt_cache_flush(flush_delay);
2695                 return 0;
2696         }
2697
2698         return -EINVAL;
2699 }
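/*
 * Editorial note: this handler backs the write-only "flush" sysctl; writing
 * an integer to it flushes the routing cache, with the written value used as
 * the delay handed to rt_cache_flush().  For example:
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 */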
2700
2701 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2702                                                 int __user *name,
2703                                                 int nlen,
2704                                                 void __user *oldval,
2705                                                 size_t __user *oldlenp,
2706                                                 void __user *newval,
2707                                                 size_t newlen)
2708 {
2709         int delay;
2710         if (newlen != sizeof(int))
2711                 return -EINVAL;
2712         if (get_user(delay, (int __user *)newval))
2713                 return -EFAULT;
2714         rt_cache_flush(delay);
2715         return 0;
2716 }
2717
2718 ctl_table ipv4_route_table[] = {
2719         {
2720                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2721                 .procname       = "flush",
2722                 .data           = &flush_delay,
2723                 .maxlen         = sizeof(int),
2724                 .mode           = 0200,
2725                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2726                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2727         },
2728         {
2729                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2730                 .procname       = "min_delay",
2731                 .data           = &ip_rt_min_delay,
2732                 .maxlen         = sizeof(int),
2733                 .mode           = 0644,
2734                 .proc_handler   = &proc_dointvec_jiffies,
2735                 .strategy       = &sysctl_jiffies,
2736         },
2737         {
2738                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2739                 .procname       = "max_delay",
2740                 .data           = &ip_rt_max_delay,
2741                 .maxlen         = sizeof(int),
2742                 .mode           = 0644,
2743                 .proc_handler   = &proc_dointvec_jiffies,
2744                 .strategy       = &sysctl_jiffies,
2745         },
2746         {
2747                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2748                 .procname       = "gc_thresh",
2749                 .data           = &ipv4_dst_ops.gc_thresh,
2750                 .maxlen         = sizeof(int),
2751                 .mode           = 0644,
2752                 .proc_handler   = &proc_dointvec,
2753         },
2754         {
2755                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2756                 .procname       = "max_size",
2757                 .data           = &ip_rt_max_size,
2758                 .maxlen         = sizeof(int),
2759                 .mode           = 0644,
2760                 .proc_handler   = &proc_dointvec,
2761         },
2762         {
2763                 /*  Deprecated. Use gc_min_interval_ms */
2764
2765                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2766                 .procname       = "gc_min_interval",
2767                 .data           = &ip_rt_gc_min_interval,
2768                 .maxlen         = sizeof(int),
2769                 .mode           = 0644,
2770                 .proc_handler   = &proc_dointvec_jiffies,
2771                 .strategy       = &sysctl_jiffies,
2772         },
2773         {
2774                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2775                 .procname       = "gc_min_interval_ms",
2776                 .data           = &ip_rt_gc_min_interval,
2777                 .maxlen         = sizeof(int),
2778                 .mode           = 0644,
2779                 .proc_handler   = &proc_dointvec_ms_jiffies,
2780                 .strategy       = &sysctl_ms_jiffies,
2781         },
2782         {
2783                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2784                 .procname       = "gc_timeout",
2785                 .data           = &ip_rt_gc_timeout,
2786                 .maxlen         = sizeof(int),
2787                 .mode           = 0644,
2788                 .proc_handler   = &proc_dointvec_jiffies,
2789                 .strategy       = &sysctl_jiffies,
2790         },
2791         {
2792                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2793                 .procname       = "gc_interval",
2794                 .data           = &ip_rt_gc_interval,
2795                 .maxlen         = sizeof(int),
2796                 .mode           = 0644,
2797                 .proc_handler   = &proc_dointvec_jiffies,
2798                 .strategy       = &sysctl_jiffies,
2799         },
2800         {
2801                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2802                 .procname       = "redirect_load",
2803                 .data           = &ip_rt_redirect_load,
2804                 .maxlen         = sizeof(int),
2805                 .mode           = 0644,
2806                 .proc_handler   = &proc_dointvec,
2807         },
2808         {
2809                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2810                 .procname       = "redirect_number",
2811                 .data           = &ip_rt_redirect_number,
2812                 .maxlen         = sizeof(int),
2813                 .mode           = 0644,
2814                 .proc_handler   = &proc_dointvec,
2815         },
2816         {
2817                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2818                 .procname       = "redirect_silence",
2819                 .data           = &ip_rt_redirect_silence,
2820                 .maxlen         = sizeof(int),
2821                 .mode           = 0644,
2822                 .proc_handler   = &proc_dointvec,
2823         },
2824         {
2825                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2826                 .procname       = "error_cost",
2827                 .data           = &ip_rt_error_cost,
2828                 .maxlen         = sizeof(int),
2829                 .mode           = 0644,
2830                 .proc_handler   = &proc_dointvec,
2831         },
2832         {
2833                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2834                 .procname       = "error_burst",
2835                 .data           = &ip_rt_error_burst,
2836                 .maxlen         = sizeof(int),
2837                 .mode           = 0644,
2838                 .proc_handler   = &proc_dointvec,
2839         },
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2842                 .procname       = "gc_elasticity",
2843                 .data           = &ip_rt_gc_elasticity,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0644,
2846                 .proc_handler   = &proc_dointvec,
2847         },
2848         {
2849                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2850                 .procname       = "mtu_expires",
2851                 .data           = &ip_rt_mtu_expires,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = &proc_dointvec_jiffies,
2855                 .strategy       = &sysctl_jiffies,
2856         },
2857         {
2858                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2859                 .procname       = "min_pmtu",
2860                 .data           = &ip_rt_min_pmtu,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = &proc_dointvec,
2864         },
2865         {
2866                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2867                 .procname       = "min_adv_mss",
2868                 .data           = &ip_rt_min_advmss,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = &proc_dointvec,
2872         },
2873         {
2874                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2875                 .procname       = "secret_interval",
2876                 .data           = &ip_rt_secret_interval,
2877                 .maxlen         = sizeof(int),
2878                 .mode           = 0644,
2879                 .proc_handler   = &proc_dointvec_jiffies,
2880                 .strategy       = &sysctl_jiffies,
2881         },
2882         { .ctl_name = 0 }
2883 };
2884 #endif
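/*
 * The table above is linked into the ipv4 sysctl tree (the registration
 * lives outside this section), so each entry shows up as a file under
 * /proc/sys/net/ipv4/route/.  Entries using proc_dointvec_jiffies and the
 * sysctl_jiffies strategy take their value in seconds from userspace and
 * store jiffies internally; the plain proc_dointvec entries are raw ints.
 *
 * Rough usage sketch (example values only, not recommendations):
 *
 *	echo 600 > /proc/sys/net/ipv4/route/gc_timeout       (seconds -> jiffies)
 *	echo 20  > /proc/sys/net/ipv4/route/redirect_number
 */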
2885
2886 #ifdef CONFIG_NET_CLS_ROUTE
2887 struct ip_rt_acct *ip_rt_acct;
2888
2889 /* This code sucks.  But you should have seen it before! --RR */
2890
2891 /* IP route accounting ptr for this logical cpu number. */
2892 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
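/*
 * ip_rt_acct is allocated below in ip_rt_init() as one contiguous block of
 * 256 * sizeof(struct ip_rt_acct) * NR_CPUS bytes: for each cpu there is an
 * array of 256 counters, one per routing realm (realm ids fit in a byte).
 * IP_RT_ACCT_CPU(i) simply returns the base of cpu i's slice of that block.
 */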
2893
2894 #ifdef CONFIG_PROC_FS
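/*
 * Old-style read_proc handler for /proc/net/rt_acct (registered from
 * ip_rt_init() below).  A read returns a raw binary dump of the 256
 * struct ip_rt_acct counters with the per-cpu slices summed together.
 * offset and length must be 32-bit aligned (otherwise -EIO), and a read
 * starting past the end of the table returns 0 with *eof set.
 */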
2895 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2896                            int length, int *eof, void *data)
2897 {
2898         unsigned int i;
2899
2900         if ((offset & 3) || (length & 3))
2901                 return -EIO;
2902
2903         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2904                 *eof = 1;
2905                 return 0;
2906         }
2907
2908         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2909                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2910                 *eof = 1;
2911         }
2912
2913         offset /= sizeof(u32);
2914
2915         if (length > 0) {
2916                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2917                 u32 *dst = (u32 *) buffer;
2918
2919                 /* Copy first cpu. */
2920                 *start = buffer;
2921                 memcpy(dst, src, length);
2922
2923                 /* Add the other cpus in (cpu 0 was copied above, so skip it) */
2924                 for_each_possible_cpu(i) {
2925                         unsigned int j;
2926                         if (i == 0)
2927                                 continue;
2928                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2929                         for (j = 0; j < length/4; j++)
2930                                 dst[j] += src[j];
2931                 }
2932         }
2933         return length;
2934 }
2935 #endif /* CONFIG_PROC_FS */
2936 #endif /* CONFIG_NET_CLS_ROUTE */
2937
2938 static __initdata unsigned long rhash_entries;
2939 static int __init set_rhash_entries(char *str)
2940 {
2941         if (!str)
2942                 return 0;
2943         rhash_entries = simple_strtoul(str, &str, 0);
2944         return 1;
2945 }
2946 __setup("rhash_entries=", set_rhash_entries);
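/*
 * "rhash_entries=" is a kernel boot parameter; the parsed value is handed
 * to alloc_large_system_hash() in ip_rt_init() below and, when non-zero,
 * overrides the memory-size based default for the number of route cache
 * hash buckets.  For example (value chosen arbitrarily):
 *
 *	linux ... rhash_entries=131072
 */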
2947
2948 int __init ip_rt_init(void)
2949 {
2950         int rc = 0;
2951
2952         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2953                              (jiffies ^ (jiffies >> 7)));
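        /*
         * rt_hash_rnd keys the route cache hash function; mixing
         * num_physpages with jiffies gives an initial value that differs
         * between boots.  It is re-randomized when the cache is rebuilt by
         * the secret_interval timer armed further down (the rebuild path
         * lives earlier in this file).
         */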
2954
2955 #ifdef CONFIG_NET_CLS_ROUTE
2956         {
2957         int order;
2958         for (order = 0;
2959              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2960                 /* NOTHING */;
2961         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2962         if (!ip_rt_acct)
2963                 panic("IP: failed to allocate ip_rt_acct\n");
2964         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2965         }
2966 #endif
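        /*
         * The loop above picks the smallest page order whose span
         * (PAGE_SIZE << order) covers 256 * sizeof(struct ip_rt_acct) for
         * all NR_CPUS, and the whole power-of-two-pages allocation is then
         * zeroed.  Note this is a single static allocation sized for
         * NR_CPUS rather than a per-cpu allocation.
         */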
2967
2968         ipv4_dst_ops.kmem_cachep =
2969                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2970                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2971
2972         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2973
2974         rt_hash_table = (struct rt_hash_bucket *)
2975                 alloc_large_system_hash("IP route cache",
2976                                         sizeof(struct rt_hash_bucket),
2977                                         rhash_entries,
2978                                         (num_physpages >= 128 * 1024) ?
2979                                         15 : 17,
2980                                         0,
2981                                         &rt_hash_log,
2982                                         &rt_hash_mask,
2983                                         0);
2984         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2985         rt_hash_lock_init();
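        /*
         * alloc_large_system_hash() sizes the bucket array: rhash_entries
         * (boot parameter) is used when given, otherwise the count is
         * derived from the amount of memory via the scale argument (15 or
         * 17 above), and the result is rounded to a power of two.  It
         * returns the bucket array and fills rt_hash_log (log2 of the
         * bucket count) and rt_hash_mask (bucket count - 1), which the
         * hash function uses to fold a hash value into a bucket index.
         */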
2986
2987         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2988         ip_rt_max_size = (rt_hash_mask + 1) * 16;
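        /*
         * Roughly speaking, garbage collection starts being considered once
         * the number of cached routes exceeds gc_thresh (one entry per hash
         * bucket on average); ip_rt_max_size (16 entries per bucket on
         * average) is the hard limit at which the garbage collector reports
         * a dst cache overflow and new cache entries fail to allocate.
         */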
2989
2990         devinet_init();
2991         ip_fib_init();
2992
2993         init_timer(&rt_flush_timer);
2994         rt_flush_timer.function = rt_run_flush;
2995         init_timer(&rt_periodic_timer);
2996         rt_periodic_timer.function = rt_check_expire;
2997         init_timer(&rt_secret_timer);
2998         rt_secret_timer.function = rt_secret_rebuild;
2999
3000         /* All the timers started at system startup tend to
3001            synchronize. Perturb them a bit.
3002          */
3003         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3004                                         ip_rt_gc_interval;
3005         add_timer(&rt_periodic_timer);
3006
3007         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3008                 ip_rt_secret_interval;
3009         add_timer(&rt_secret_timer);
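        /*
         * The perturbation arithmetic above is
         *	expires = jiffies + (net_random() % interval) + interval
         * i.e. the first firing lands somewhere between one and two
         * intervals from now, so periodic timers on different machines (or
         * in different subsystems) do not all expire in lockstep.
         */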
3010
3011 #ifdef CONFIG_PROC_FS
3012         {
3013         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3014         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3015             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3016                                              proc_net_stat))) {
3017                 return -ENOMEM;
3018         }
3019         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3020         }
3021 #ifdef CONFIG_NET_CLS_ROUTE
3022         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3023 #endif
3024 #endif
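        /*
         * Proc interface summary: /proc/net/rt_cache dumps the route cache
         * via rt_cache_seq_fops, /proc/net/stat/rt_cache exposes the
         * per-cpu cache statistics via rt_cpu_seq_fops, and (with
         * CONFIG_NET_CLS_ROUTE) /proc/net/rt_acct returns the binary realm
         * accounting table produced by ip_rt_acct_read() above.
         */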
3025 #ifdef CONFIG_XFRM
3026         xfrm_init();
3027         xfrm4_init();
3028 #endif
3029         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3030
3031         return rc;
3032 }
3033
3034 EXPORT_SYMBOL(__ip_select_ident);
3035 EXPORT_SYMBOL(ip_route_input);
3036 EXPORT_SYMBOL(ip_route_output_key);