Merge branch 'master' of ../net-2.6/
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, plus the RTO_ONLINK flag,
 * from the flow key pointed to by @oldflp.
 *
 * The argument is parenthesized so that any pointer expression
 * (e.g. "flp + 1") expands correctly.
 */
#define RT_FL_TOS(oldflp) \
        ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* NOTE(review): presumably the upper bound for a route MTU; its users are
 * outside this part of the file -- confirm. */
#define IP_MAX_MTU      0xFFF0

/* Default route cache GC timeout in jiffies (initial ip_rt_gc_timeout). */
#define RT_GC_TIMEOUT   (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
134
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
138
139 /*
140  *      Interface to generic destination cache.
141  */
142
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static void              ipv4_dst_ifdown(struct dst_entry *dst,
146                                          struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void              ipv4_link_failure(struct sk_buff *skb);
149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151
152
153 static struct dst_ops ipv4_dst_ops = {
154         .family =               AF_INET,
155         .protocol =             __constant_htons(ETH_P_IP),
156         .gc =                   rt_garbage_collect,
157         .check =                ipv4_dst_check,
158         .destroy =              ipv4_dst_destroy,
159         .ifdown =               ipv4_dst_ifdown,
160         .negative_advice =      ipv4_negative_advice,
161         .link_failure =         ipv4_link_failure,
162         .update_pmtu =          ip_rt_update_pmtu,
163         .local_out =            ip_local_out,
164         .entry_size =           sizeof(struct rtable),
165         .entries =              ATOMIC_INIT(0),
166 };
167
/*
 * ECN_OR_COST() marks the odd table slots, i.e. the ones whose low index
 * bit is set.  It simply expands to the matching TC_PRIO_* constant.
 */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * Map a 4-bit index to a TC_PRIO_* packet priority.
 * NOTE(review): presumably indexed by the IP TOS bits shifted right by one;
 * the lookup code is not in this file -- confirm.
 *
 * Slot 1 maps to BESTEFFORT like its even neighbour: it used to map to
 * TC_PRIO_FILLER, which demoted otherwise default traffic that happened to
 * have the low bit set.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
188
189
190 /*
191  * Route cache.
192  */
193
194 /* The locking scheme is rather straightforward:
195  *
196  * 1) Read-Copy Update protects the buckets of the central route hash.
197  * 2) Only writers remove entries, and they hold the lock
198  *    as they look at rtable reference counts.
199  * 3) Only readers acquire references to rtable entries,
200  *    they do so with atomic increments and with the
201  *    lock held.
202  */
203
/* One slot of the route cache hash table: the head of a chain of entries. */
struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs (with lockdep spinlock_t is quite big, so the table is kept
 * small there).
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;

/*
 * Address of the spinlock guarding hash slot @slot.  Several slots share one
 * lock because the index is reduced modulo RT_HASH_LOCK_SZ.  The expansion
 * is fully parenthesized so it can be used inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
        (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialise the lock table; panics if the allocation fails. */
static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
                                GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to a NULL lock. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
252
253 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
254 static unsigned                 rt_hash_mask __read_mostly;
255 static unsigned int             rt_hash_log  __read_mostly;
256 static atomic_t                 rt_genid __read_mostly;
257
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260         (__raw_get_cpu_var(rt_cache_stat).field++)
261
262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
263 {
264         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
265                 & rt_hash_mask;
266 }
267
268 #define rt_hash(daddr, saddr, idx) \
269         rt_hash_code((__force u32)(__be32)(daddr),\
270                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274         struct seq_net_private p;
275         int bucket;
276         int genid;
277 };
278
279 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
280 {
281         struct rtable *r = NULL;
282
283         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284                 rcu_read_lock_bh();
285                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
286                 while (r) {
287                         if (r->u.dst.dev->nd_net == st->p.net &&
288                             r->rt_genid == st->genid)
289                                 return r;
290                         r = rcu_dereference(r->u.dst.rt_next);
291                 }
292                 rcu_read_unlock_bh();
293         }
294         return r;
295 }
296
297 static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
298                                           struct rtable *r)
299 {
300         r = r->u.dst.rt_next;
301         while (!r) {
302                 rcu_read_unlock_bh();
303                 if (--st->bucket < 0)
304                         break;
305                 rcu_read_lock_bh();
306                 r = rt_hash_table[st->bucket].chain;
307         }
308         return rcu_dereference(r);
309 }
310
311 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
312                                         struct rtable *r)
313 {
314         while ((r = __rt_cache_get_next(st, r)) != NULL) {
315                 if (r->u.dst.dev->nd_net != st->p.net)
316                         continue;
317                 if (r->rt_genid == st->genid)
318                         break;
319         }
320         return r;
321 }
322
323 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
324 {
325         struct rtable *r = rt_cache_get_first(st);
326
327         if (r)
328                 while (pos && (r = rt_cache_get_next(st, r)))
329                         --pos;
330         return pos ? NULL : r;
331 }
332
333 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
334 {
335         struct rt_cache_iter_state *st = seq->private;
336
337         if (*pos)
338                 return rt_cache_get_idx(st, *pos - 1);
339         st->genid = atomic_read(&rt_genid);
340         return SEQ_START_TOKEN;
341 }
342
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
344 {
345         struct rtable *r;
346         struct rt_cache_iter_state *st = seq->private;
347
348         if (v == SEQ_START_TOKEN)
349                 r = rt_cache_get_first(st);
350         else
351                 r = rt_cache_get_next(st, v);
352         ++*pos;
353         return r;
354 }
355
356 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
357 {
358         if (v && v != SEQ_START_TOKEN)
359                 rcu_read_unlock_bh();
360 }
361
362 static int rt_cache_seq_show(struct seq_file *seq, void *v)
363 {
364         if (v == SEQ_START_TOKEN)
365                 seq_printf(seq, "%-127s\n",
366                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
367                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
368                            "HHUptod\tSpecDst");
369         else {
370                 struct rtable *r = v;
371                 char temp[256];
372
373                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
374                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
375                         r->u.dst.dev ? r->u.dst.dev->name : "*",
376                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
377                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
378                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
379                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
380                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
381                         dst_metric(&r->u.dst, RTAX_WINDOW),
382                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
383                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
384                         r->fl.fl4_tos,
385                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
386                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
387                                        dev_queue_xmit) : 0,
388                         r->rt_spec_dst);
389                 seq_printf(seq, "%-127s\n", temp);
390         }
391         return 0;
392 }
393
394 static const struct seq_operations rt_cache_seq_ops = {
395         .start  = rt_cache_seq_start,
396         .next   = rt_cache_seq_next,
397         .stop   = rt_cache_seq_stop,
398         .show   = rt_cache_seq_show,
399 };
400
401 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 {
403         return seq_open_net(inode, file, &rt_cache_seq_ops,
404                         sizeof(struct rt_cache_iter_state));
405 }
406
407 static const struct file_operations rt_cache_seq_fops = {
408         .owner   = THIS_MODULE,
409         .open    = rt_cache_seq_open,
410         .read    = seq_read,
411         .llseek  = seq_lseek,
412         .release = seq_release_net,
413 };
414
415
416 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
417 {
418         int cpu;
419
420         if (*pos == 0)
421                 return SEQ_START_TOKEN;
422
423         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430 }
431
432 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
433 {
434         int cpu;
435
436         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
437                 if (!cpu_possible(cpu))
438                         continue;
439                 *pos = cpu+1;
440                 return &per_cpu(rt_cache_stat, cpu);
441         }
442         return NULL;
443
444 }
445
446 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
447 {
448
449 }
450
451 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 {
453         struct rt_cache_stat *st = v;
454
455         if (v == SEQ_START_TOKEN) {
456                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
457                 return 0;
458         }
459
460         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
461                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
462                    atomic_read(&ipv4_dst_ops.entries),
463                    st->in_hit,
464                    st->in_slow_tot,
465                    st->in_slow_mc,
466                    st->in_no_route,
467                    st->in_brd,
468                    st->in_martian_dst,
469                    st->in_martian_src,
470
471                    st->out_hit,
472                    st->out_slow_tot,
473                    st->out_slow_mc,
474
475                    st->gc_total,
476                    st->gc_ignored,
477                    st->gc_goal_miss,
478                    st->gc_dst_overflow,
479                    st->in_hlist_search,
480                    st->out_hlist_search
481                 );
482         return 0;
483 }
484
485 static const struct seq_operations rt_cpu_seq_ops = {
486         .start  = rt_cpu_seq_start,
487         .next   = rt_cpu_seq_next,
488         .stop   = rt_cpu_seq_stop,
489         .show   = rt_cpu_seq_show,
490 };
491
492
493 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 {
495         return seq_open(file, &rt_cpu_seq_ops);
496 }
497
498 static const struct file_operations rt_cpu_seq_fops = {
499         .owner   = THIS_MODULE,
500         .open    = rt_cpu_seq_open,
501         .read    = seq_read,
502         .llseek  = seq_lseek,
503         .release = seq_release,
504 };
505
506 #ifdef CONFIG_NET_CLS_ROUTE
507 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
508                            int length, int *eof, void *data)
509 {
510         unsigned int i;
511
512         if ((offset & 3) || (length & 3))
513                 return -EIO;
514
515         if (offset >= sizeof(struct ip_rt_acct) * 256) {
516                 *eof = 1;
517                 return 0;
518         }
519
520         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
521                 length = sizeof(struct ip_rt_acct) * 256 - offset;
522                 *eof = 1;
523         }
524
525         offset /= sizeof(u32);
526
527         if (length > 0) {
528                 u32 *dst = (u32 *) buffer;
529
530                 *start = buffer;
531                 memset(dst, 0, length);
532
533                 for_each_possible_cpu(i) {
534                         unsigned int j;
535                         u32 *src;
536
537                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
538                         for (j = 0; j < length/4; j++)
539                                 dst[j] += src[j];
540                 }
541         }
542         return length;
543 }
544 #endif
545
546 static int __net_init ip_rt_do_proc_init(struct net *net)
547 {
548         struct proc_dir_entry *pde;
549
550         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
551                         &rt_cache_seq_fops);
552         if (!pde)
553                 goto err1;
554
555         pde = proc_create("rt_cache", S_IRUGO,
556                           net->proc_net_stat, &rt_cpu_seq_fops);
557         if (!pde)
558                 goto err2;
559
560 #ifdef CONFIG_NET_CLS_ROUTE
561         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
562                         ip_rt_acct_read, NULL);
563         if (!pde)
564                 goto err3;
565 #endif
566         return 0;
567
568 #ifdef CONFIG_NET_CLS_ROUTE
569 err3:
570         remove_proc_entry("rt_cache", net->proc_net_stat);
571 #endif
572 err2:
573         remove_proc_entry("rt_cache", net->proc_net);
574 err1:
575         return -ENOMEM;
576 }
577
578 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 {
580         remove_proc_entry("rt_cache", net->proc_net_stat);
581         remove_proc_entry("rt_cache", net->proc_net);
582         remove_proc_entry("rt_acct", net->proc_net);
583 }
584
585 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
586         .init = ip_rt_do_proc_init,
587         .exit = ip_rt_do_proc_exit,
588 };
589
590 static int __init ip_rt_proc_init(void)
591 {
592         return register_pernet_subsys(&ip_rt_proc_ops);
593 }
594
595 #else
/* Without CONFIG_PROC_FS there is nothing to register: always succeeds. */
static inline int ip_rt_proc_init(void) { return 0; }
600 #endif /* CONFIG_PROC_FS */
601
602 static __inline__ void rt_free(struct rtable *rt)
603 {
604         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
605 }
606
607 static __inline__ void rt_drop(struct rtable *rt)
608 {
609         ip_rt_put(rt);
610         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
611 }
612
613 static __inline__ int rt_fast_clean(struct rtable *rth)
614 {
615         /* Kill broadcast/multicast entries very aggresively, if they
616            collide in hash table with more useful entries */
617         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
618                 rth->fl.iif && rth->u.dst.rt_next;
619 }
620
621 static __inline__ int rt_valuable(struct rtable *rth)
622 {
623         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
624                 rth->u.dst.expires;
625 }
626
627 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
628 {
629         unsigned long age;
630         int ret = 0;
631
632         if (atomic_read(&rth->u.dst.__refcnt))
633                 goto out;
634
635         ret = 1;
636         if (rth->u.dst.expires &&
637             time_after_eq(jiffies, rth->u.dst.expires))
638                 goto out;
639
640         age = jiffies - rth->u.dst.lastuse;
641         ret = 0;
642         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
643             (age <= tmo2 && rt_valuable(rth)))
644                 goto out;
645         ret = 1;
646 out:    return ret;
647 }
648
649 /* Bits of score are:
650  * 31: very valuable
651  * 30: not quite useless
652  * 29..0: usage counter
653  */
654 static inline u32 rt_score(struct rtable *rt)
655 {
656         u32 score = jiffies - rt->u.dst.lastuse;
657
658         score = ~score & ~(3<<30);
659
660         if (rt_valuable(rt))
661                 score |= (1<<31);
662
663         if (!rt->fl.iif ||
664             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
665                 score |= (1<<30);
666
667         return score;
668 }
669
670 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 {
672         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
673                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
674                 (fl1->mark ^ fl2->mark) |
675                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
676                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
677                 (fl1->oif ^ fl2->oif) |
678                 (fl1->iif ^ fl2->iif)) == 0;
679 }
680
681 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 {
683         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
684 }
685
686 /*
687  * Perform a full scan of hash table and free all entries.
688  * Can be called by a softirq or a process.
689  * In the later case, we want to be reschedule if necessary
690  */
691 static void rt_do_flush(int process_context)
692 {
693         unsigned int i;
694         struct rtable *rth, *next;
695
696         for (i = 0; i <= rt_hash_mask; i++) {
697                 if (process_context && need_resched())
698                         cond_resched();
699                 rth = rt_hash_table[i].chain;
700                 if (!rth)
701                         continue;
702
703                 spin_lock_bh(rt_hash_lock_addr(i));
704                 rth = rt_hash_table[i].chain;
705                 rt_hash_table[i].chain = NULL;
706                 spin_unlock_bh(rt_hash_lock_addr(i));
707
708                 for (; rth; rth = next) {
709                         next = rth->u.dst.rt_next;
710                         rt_free(rth);
711                 }
712         }
713 }
714
715 static void rt_check_expire(void)
716 {
717         static unsigned int rover;
718         unsigned int i = rover, goal;
719         struct rtable *rth, **rthp;
720         u64 mult;
721
722         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
723         if (ip_rt_gc_timeout > 1)
724                 do_div(mult, ip_rt_gc_timeout);
725         goal = (unsigned int)mult;
726         if (goal > rt_hash_mask)
727                 goal = rt_hash_mask + 1;
728         for (; goal > 0; goal--) {
729                 unsigned long tmo = ip_rt_gc_timeout;
730
731                 i = (i + 1) & rt_hash_mask;
732                 rthp = &rt_hash_table[i].chain;
733
734                 if (need_resched())
735                         cond_resched();
736
737                 if (*rthp == NULL)
738                         continue;
739                 spin_lock_bh(rt_hash_lock_addr(i));
740                 while ((rth = *rthp) != NULL) {
741                         if (rth->rt_genid != atomic_read(&rt_genid)) {
742                                 *rthp = rth->u.dst.rt_next;
743                                 rt_free(rth);
744                                 continue;
745                         }
746                         if (rth->u.dst.expires) {
747                                 /* Entry is expired even if it is in use */
748                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
749                                         tmo >>= 1;
750                                         rthp = &rth->u.dst.rt_next;
751                                         continue;
752                                 }
753                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
754                                 tmo >>= 1;
755                                 rthp = &rth->u.dst.rt_next;
756                                 continue;
757                         }
758
759                         /* Cleanup aged off entries. */
760                         *rthp = rth->u.dst.rt_next;
761                         rt_free(rth);
762                 }
763                 spin_unlock_bh(rt_hash_lock_addr(i));
764         }
765         rover = i;
766 }
767
768 /*
769  * rt_worker_func() is run in process context.
770  * we call rt_check_expire() to scan part of the hash table
771  */
772 static void rt_worker_func(struct work_struct *work)
773 {
774         rt_check_expire();
775         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
776 }
777
778 /*
779  * Pertubation of rt_genid by a small quantity [1..256]
780  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
781  * many times (2^24) without giving recent rt_genid.
782  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
783  */
784 static void rt_cache_invalidate(void)
785 {
786         unsigned char shuffle;
787
788         get_random_bytes(&shuffle, sizeof(shuffle));
789         atomic_add(shuffle + 1U, &rt_genid);
790 }
791
/*
 * Invalidate the route cache.
 *
 * delay < 0  : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate and flush the whole table now (can be long)
 */
void rt_cache_flush(int delay)
{
        rt_cache_invalidate();

        if (delay < 0)
                return;

        /* Allow rescheduling during the flush unless we are in softirq. */
        rt_do_flush(!in_softirq());
}
802
803 /*
804  * We change rt_genid and let gc do the cleanup
805  */
806 static void rt_secret_rebuild(unsigned long dummy)
807 {
808         rt_cache_invalidate();
809         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
810 }
811
812 /*
813    Short description of GC goals.
814
815    We want an algorithm that keeps the routing cache at an
816    equilibrium point, where the number of aged-off entries
817    is approximately equal to the number of newly generated ones.
818
819    The current expiration strength is held in the variable "expire".
820    We try to adjust it dynamically, so that when networking is idle
821    "expire" is large enough to keep enough warm entries, and when
822    load increases it is reduced to limit the cache size.
823  */
824
825 static int rt_garbage_collect(struct dst_ops *ops)
826 {
827         static unsigned long expire = RT_GC_TIMEOUT;
828         static unsigned long last_gc;
829         static int rover;
830         static int equilibrium;
831         struct rtable *rth, **rthp;
832         unsigned long now = jiffies;
833         int goal;
834
835         /*
836          * Garbage collection is pretty expensive,
837          * do not make it too frequently.
838          */
839
840         RT_CACHE_STAT_INC(gc_total);
841
842         if (now - last_gc < ip_rt_gc_min_interval &&
843             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
844                 RT_CACHE_STAT_INC(gc_ignored);
845                 goto out;
846         }
847
848         /* Calculate number of entries, which we want to expire now. */
849         goal = atomic_read(&ipv4_dst_ops.entries) -
850                 (ip_rt_gc_elasticity << rt_hash_log);
851         if (goal <= 0) {
852                 if (equilibrium < ipv4_dst_ops.gc_thresh)
853                         equilibrium = ipv4_dst_ops.gc_thresh;
854                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855                 if (goal > 0) {
856                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
857                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858                 }
859         } else {
860                 /* We are in dangerous area. Try to reduce cache really
861                  * aggressively.
862                  */
863                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
864                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
865         }
866
867         if (now - last_gc >= ip_rt_gc_min_interval)
868                 last_gc = now;
869
870         if (goal <= 0) {
871                 equilibrium += goal;
872                 goto work_done;
873         }
874
875         do {
876                 int i, k;
877
878                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
879                         unsigned long tmo = expire;
880
881                         k = (k + 1) & rt_hash_mask;
882                         rthp = &rt_hash_table[k].chain;
883                         spin_lock_bh(rt_hash_lock_addr(k));
884                         while ((rth = *rthp) != NULL) {
885                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
886                                         !rt_may_expire(rth, tmo, expire)) {
887                                         tmo >>= 1;
888                                         rthp = &rth->u.dst.rt_next;
889                                         continue;
890                                 }
891                                 *rthp = rth->u.dst.rt_next;
892                                 rt_free(rth);
893                                 goal--;
894                         }
895                         spin_unlock_bh(rt_hash_lock_addr(k));
896                         if (goal <= 0)
897                                 break;
898                 }
899                 rover = k;
900
901                 if (goal <= 0)
902                         goto work_done;
903
904                 /* Goal is not achieved. We stop process if:
905
906                    - if expire reduced to zero. Otherwise, expire is halfed.
907                    - if table is not full.
908                    - if we are called from interrupt.
909                    - jiffies check is just fallback/debug loop breaker.
910                      We will not spin here for long time in any case.
911                  */
912
913                 RT_CACHE_STAT_INC(gc_goal_miss);
914
915                 if (expire == 0)
916                         break;
917
918                 expire >>= 1;
919 #if RT_CACHE_DEBUG >= 2
920                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
921                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
922 #endif
923
924                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925                         goto out;
926         } while (!in_softirq() && time_before_eq(jiffies, now));
927
928         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
929                 goto out;
930         if (net_ratelimit())
931                 printk(KERN_WARNING "dst cache overflow\n");
932         RT_CACHE_STAT_INC(gc_dst_overflow);
933         return 1;
934
935 work_done:
936         expire += ip_rt_gc_min_interval;
937         if (expire > ip_rt_gc_timeout ||
938             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
939                 expire = ip_rt_gc_timeout;
940 #if RT_CACHE_DEBUG >= 2
941         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
942                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
943 #endif
944 out:    return 0;
945 }
946
947 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 {
949         struct rtable   *rth, **rthp;
950         unsigned long   now;
951         struct rtable *cand, **candp;
952         u32             min_score;
953         int             chain_length;
954         int attempts = !in_softirq();
955
956 restart:
957         chain_length = 0;
958         min_score = ~(u32)0;
959         cand = NULL;
960         candp = NULL;
961         now = jiffies;
962
963         rthp = &rt_hash_table[hash].chain;
964
965         spin_lock_bh(rt_hash_lock_addr(hash));
966         while ((rth = *rthp) != NULL) {
967                 if (rth->rt_genid != atomic_read(&rt_genid)) {
968                         *rthp = rth->u.dst.rt_next;
969                         rt_free(rth);
970                         continue;
971                 }
972                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
973                         /* Put it first */
974                         *rthp = rth->u.dst.rt_next;
975                         /*
976                          * Since lookup is lockfree, the deletion
977                          * must be visible to another weakly ordered CPU before
978                          * the insertion at the start of the hash chain.
979                          */
980                         rcu_assign_pointer(rth->u.dst.rt_next,
981                                            rt_hash_table[hash].chain);
982                         /*
983                          * Since lookup is lockfree, the update writes
984                          * must be ordered for consistency on SMP.
985                          */
986                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987
988                         dst_use(&rth->u.dst, now);
989                         spin_unlock_bh(rt_hash_lock_addr(hash));
990
991                         rt_drop(rt);
992                         *rp = rth;
993                         return 0;
994                 }
995
996                 if (!atomic_read(&rth->u.dst.__refcnt)) {
997                         u32 score = rt_score(rth);
998
999                         if (score <= min_score) {
1000                                 cand = rth;
1001                                 candp = rthp;
1002                                 min_score = score;
1003                         }
1004                 }
1005
1006                 chain_length++;
1007
1008                 rthp = &rth->u.dst.rt_next;
1009         }
1010
1011         if (cand) {
1012                 /* ip_rt_gc_elasticity used to be average length of chain
1013                  * length, when exceeded gc becomes really aggressive.
1014                  *
1015                  * The second limit is less certain. At the moment it allows
1016                  * only 2 entries per bucket. We will see.
1017                  */
1018                 if (chain_length > ip_rt_gc_elasticity) {
1019                         *candp = cand->u.dst.rt_next;
1020                         rt_free(cand);
1021                 }
1022         }
1023
1024         /* Try to bind route to arp only if it is output
1025            route or unicast forwarding path.
1026          */
1027         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028                 int err = arp_bind_neighbour(&rt->u.dst);
1029                 if (err) {
1030                         spin_unlock_bh(rt_hash_lock_addr(hash));
1031
1032                         if (err != -ENOBUFS) {
1033                                 rt_drop(rt);
1034                                 return err;
1035                         }
1036
1037                         /* Neighbour tables are full and nothing
1038                            can be released. Try to shrink route cache,
1039                            it is most likely it holds some neighbour records.
1040                          */
1041                         if (attempts-- > 0) {
1042                                 int saved_elasticity = ip_rt_gc_elasticity;
1043                                 int saved_int = ip_rt_gc_min_interval;
1044                                 ip_rt_gc_elasticity     = 1;
1045                                 ip_rt_gc_min_interval   = 0;
1046                                 rt_garbage_collect(&ipv4_dst_ops);
1047                                 ip_rt_gc_min_interval   = saved_int;
1048                                 ip_rt_gc_elasticity     = saved_elasticity;
1049                                 goto restart;
1050                         }
1051
1052                         if (net_ratelimit())
1053                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1054                         rt_drop(rt);
1055                         return -ENOBUFS;
1056                 }
1057         }
1058
1059         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060 #if RT_CACHE_DEBUG >= 2
1061         if (rt->u.dst.rt_next) {
1062                 struct rtable *trt;
1063                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1064                        NIPQUAD(rt->rt_dst));
1065                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1067                 printk("\n");
1068         }
1069 #endif
1070         rt_hash_table[hash].chain = rt;
1071         spin_unlock_bh(rt_hash_lock_addr(hash));
1072         *rp = rt;
1073         return 0;
1074 }
1075
1076 void rt_bind_peer(struct rtable *rt, int create)
1077 {
1078         static DEFINE_SPINLOCK(rt_peer_lock);
1079         struct inet_peer *peer;
1080
1081         peer = inet_getpeer(rt->rt_dst, create);
1082
1083         spin_lock_bh(&rt_peer_lock);
1084         if (rt->peer == NULL) {
1085                 rt->peer = peer;
1086                 peer = NULL;
1087         }
1088         spin_unlock_bh(&rt_peer_lock);
1089         if (peer)
1090                 inet_putpeer(peer);
1091 }
1092
1093 /*
1094  * Peer allocation may fail only in serious out-of-memory conditions.  However
1095  * we still can generate some output.
1096  * Random ID selection looks a bit dangerous because we have no chances to
1097  * select ID being unique in a reasonable period of time.
1098  * But broken packet identifier may be better than no packet at all.
1099  */
1100 static void ip_select_fb_ident(struct iphdr *iph)
1101 {
1102         static DEFINE_SPINLOCK(ip_fb_id_lock);
1103         static u32 ip_fallback_id;
1104         u32 salt;
1105
1106         spin_lock_bh(&ip_fb_id_lock);
1107         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108         iph->id = htons(salt & 0xFFFF);
1109         ip_fallback_id = salt;
1110         spin_unlock_bh(&ip_fb_id_lock);
1111 }
1112
1113 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 {
1115         struct rtable *rt = (struct rtable *) dst;
1116
1117         if (rt) {
1118                 if (rt->peer == NULL)
1119                         rt_bind_peer(rt, 1);
1120
1121                 /* If peer is attached to destination, it is never detached,
1122                    so that we need not to grab a lock to dereference it.
1123                  */
1124                 if (rt->peer) {
1125                         iph->id = htons(inet_getid(rt->peer, more));
1126                         return;
1127                 }
1128         } else
1129                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130                        __builtin_return_address(0));
1131
1132         ip_select_fb_ident(iph);
1133 }
1134
1135 static void rt_del(unsigned hash, struct rtable *rt)
1136 {
1137         struct rtable **rthp, *aux;
1138
1139         rthp = &rt_hash_table[hash].chain;
1140         spin_lock_bh(rt_hash_lock_addr(hash));
1141         ip_rt_put(rt);
1142         while ((aux = *rthp) != NULL) {
1143                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144                         *rthp = aux->u.dst.rt_next;
1145                         rt_free(aux);
1146                         continue;
1147                 }
1148                 rthp = &aux->u.dst.rt_next;
1149         }
1150         spin_unlock_bh(rt_hash_lock_addr(hash));
1151 }
1152
/*
 * ip_rt_redirect - act on a redirect advising a new gateway.
 * @old_gw: gateway the redirect came from; cached routes must currently
 *          use it to be affected
 * @daddr:  destination address the redirect is about
 * @new_gw: gateway advised by the redirect
 * @saddr:  source address of the affected flow
 * @dev:    device the redirect arrived on
 *
 * The redirect is rejected (and, with CONFIG_IP_ROUTE_VERBOSE, possibly
 * logged) when new_gw equals old_gw, when IN_DEV_RX_REDIRECTS() is false
 * for the device, when new_gw is multicast, limited broadcast or zeronet,
 * or when the media-dependent checks below fail.
 *
 * Otherwise the cache buckets for every combination of {saddr, 0} and
 * {dev->ifindex, 0} are searched.  A matching output route that still
 * points at old_gw is cloned, the clone gets new_gw and RTCF_REDIRECTED,
 * and - if a valid neighbour can be bound - the old entry is removed with
 * rt_del() and the clone is inserted with rt_intern_hash().
 */
1153 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154                     __be32 saddr, struct net_device *dev)
1155 {
1156         int i, k;
1157         struct in_device *in_dev = in_dev_get(dev);
1158         struct rtable *rth, **rthp;
1159         __be32  skeys[2] = { saddr, 0 };
1160         int  ikeys[2] = { dev->ifindex, 0 };
1161         struct netevent_redirect netevent;
1162         struct net *net;
1163
1164         if (!in_dev)
1165                 return;
1166
1167         net = dev->nd_net;
1168         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170             || ipv4_is_zeronet(new_gw))
1171                 goto reject_redirect;
1172
1173         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175                         goto reject_redirect;
1176                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177                         goto reject_redirect;
1178         } else {
1179                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180                         goto reject_redirect;
1181         }
1182
             /* Try each source key (saddr, then 0) against each oif key
              * (dev->ifindex, then 0).
              */
1183         for (i = 0; i < 2; i++) {
1184                 for (k = 0; k < 2; k++) {
1185                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186
1187                         rthp=&rt_hash_table[hash].chain;
1188
1189                         rcu_read_lock();
1190                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1191                                 struct rtable *rt;
1192
                                     /* Skip entries whose lookup keys,
                                      * generation or namespace differ.
                                      */
1193                                 if (rth->fl.fl4_dst != daddr ||
1194                                     rth->fl.fl4_src != skeys[i] ||
1195                                     rth->fl.oif != ikeys[k] ||
1196                                     rth->fl.iif != 0 ||
1197                                     rth->rt_genid != atomic_read(&rt_genid) ||
1198                                     rth->u.dst.dev->nd_net != net) {
1199                                         rthp = &rth->u.dst.rt_next;
1200                                         continue;
1201                                 }
1202
                                     /* Keys match but the route is not the
                                      * one being redirected: stop scanning
                                      * this bucket.
                                      */
1203                                 if (rth->rt_dst != daddr ||
1204                                     rth->rt_src != saddr ||
1205                                     rth->u.dst.error ||
1206                                     rth->rt_gateway != old_gw ||
1207                                     rth->u.dst.dev != dev)
1208                                         break;
1209
                                     /* Take a reference on the entry before
                                      * leaving the RCU read-side section.
                                      */
1210                                 dst_hold(&rth->u.dst);
1211                                 rcu_read_unlock();
1212
1213                                 rt = dst_alloc(&ipv4_dst_ops);
1214                                 if (rt == NULL) {
1215                                         ip_rt_put(rth);
1216                                         in_dev_put(in_dev);
1217                                         return;
1218                                 }
1219
1220                                 /* Copy all the information. */
1221                                 *rt = *rth;
                                     /* Then reset the per-entry state that
                                      * must not be shared with the original.
                                      */
1222                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223                                 rt->u.dst.__use         = 1;
1224                                 atomic_set(&rt->u.dst.__refcnt, 1);
1225                                 rt->u.dst.child         = NULL;
1226                                 if (rt->u.dst.dev)
1227                                         dev_hold(rt->u.dst.dev);
1228                                 if (rt->idev)
1229                                         in_dev_hold(rt->idev);
1230                                 rt->u.dst.obsolete      = 0;
1231                                 rt->u.dst.lastuse       = jiffies;
1232                                 rt->u.dst.path          = &rt->u.dst;
1233                                 rt->u.dst.neighbour     = NULL;
1234                                 rt->u.dst.hh            = NULL;
1235                                 rt->u.dst.xfrm          = NULL;
1236                                 rt->rt_genid            = atomic_read(&rt_genid);
1237                                 rt->rt_flags            |= RTCF_REDIRECTED;
1238
1239                                 /* Gateway is different ... */
1240                                 rt->rt_gateway          = new_gw;
1241
1242                                 /* Redirect received -> path was valid */
1243                                 dst_confirm(&rth->u.dst);
1244
1245                                 if (rt->peer)
1246                                         atomic_inc(&rt->peer->refcnt);
1247
                                     /* Give up on this entry when no
                                      * neighbour can be bound or it is not
                                      * in a NUD_VALID state.
                                      */
1248                                 if (arp_bind_neighbour(&rt->u.dst) ||
1249                                     !(rt->u.dst.neighbour->nud_state &
1250                                             NUD_VALID)) {
1251                                         if (rt->u.dst.neighbour)
1252                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1253                                         ip_rt_put(rth);
1254                                         rt_drop(rt);
1255                                         goto do_next;
1256                                 }
1257
1258                                 netevent.old = &rth->u.dst;
1259                                 netevent.new = &rt->u.dst;
1260                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1261                                                         &netevent);
1262
                                     /* rt_del() calls ip_rt_put() on rth,
                                      * which pairs with the dst_hold() above.
                                      */
1263                                 rt_del(hash, rth);
1264                                 if (!rt_intern_hash(hash, rt, &rt))
1265                                         ip_rt_put(rt);
1266                                 goto do_next;
1267                         }
1268                         rcu_read_unlock();
                     /* Jumps to do_next come from paths that have already
                      * called rcu_read_unlock().
                      */
1269                 do_next:
1270                         ;
1271                 }
1272         }
1273         in_dev_put(in_dev);
1274         return;
1275
1276 reject_redirect:
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1280                         "%u.%u.%u.%u ignored.\n"
1281                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1282                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283                        NIPQUAD(saddr), NIPQUAD(daddr));
1284 #endif
1285         in_dev_put(in_dev);
1286 }
1287
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 {
1290         struct rtable *rt = (struct rtable *)dst;
1291         struct dst_entry *ret = dst;
1292
1293         if (rt) {
1294                 if (dst->obsolete) {
1295                         ip_rt_put(rt);
1296                         ret = NULL;
1297                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298                            rt->u.dst.expires) {
1299                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300                                                 rt->fl.oif);
1301 #if RT_CACHE_DEBUG >= 1
1302                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303                                           "%u.%u.%u.%u/%02x dropped\n",
1304                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305 #endif
1306                         rt_del(hash, rt);
1307                         ret = NULL;
1308                 }
1309         }
1310         return ret;
1311 }
1312
1313 /*
1314  * Algorithm:
1315  *      1. The first ip_rt_redirect_number redirects are sent
1316  *         with exponential backoff, then we stop sending them at all,
1317  *         assuming that the host ignores our redirects.
1318  *      2. If we did not see packets requiring redirects
1319  *         during ip_rt_redirect_silence, we assume that the host
1320  *         forgot redirected route and start to send redirects again.
1321  *
1322  * This algorithm is much cheaper and more intelligent than dumb load limiting
1323  * in icmp.c.
1324  *
1325  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327  */
1328
1329 void ip_rt_send_redirect(struct sk_buff *skb)
1330 {
1331         struct rtable *rt = skb->rtable;
1332         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333
1334         if (!in_dev)
1335                 return;
1336
1337         if (!IN_DEV_TX_REDIRECTS(in_dev))
1338                 goto out;
1339
1340         /* No redirected packets during ip_rt_redirect_silence;
1341          * reset the algorithm.
1342          */
1343         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344                 rt->u.dst.rate_tokens = 0;
1345
1346         /* Too many ignored redirects; do not send anything
1347          * set u.dst.rate_last to the last seen redirected packet.
1348          */
1349         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350                 rt->u.dst.rate_last = jiffies;
1351                 goto out;
1352         }
1353
1354         /* Check for load limit; set rate_last to the latest sent
1355          * redirect.
1356          */
1357         if (rt->u.dst.rate_tokens == 0 ||
1358             time_after(jiffies,
1359                        (rt->u.dst.rate_last +
1360                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362                 rt->u.dst.rate_last = jiffies;
1363                 ++rt->u.dst.rate_tokens;
1364 #ifdef CONFIG_IP_ROUTE_VERBOSE
1365                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367                     net_ratelimit())
1368                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1369                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1370                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1371                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372 #endif
1373         }
1374 out:
1375         in_dev_put(in_dev);
1376 }
1377
1378 static int ip_error(struct sk_buff *skb)
1379 {
1380         struct rtable *rt = skb->rtable;
1381         unsigned long now;
1382         int code;
1383
1384         switch (rt->u.dst.error) {
1385                 case EINVAL:
1386                 default:
1387                         goto out;
1388                 case EHOSTUNREACH:
1389                         code = ICMP_HOST_UNREACH;
1390                         break;
1391                 case ENETUNREACH:
1392                         code = ICMP_NET_UNREACH;
1393                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394                         break;
1395                 case EACCES:
1396                         code = ICMP_PKT_FILTERED;
1397                         break;
1398         }
1399
1400         now = jiffies;
1401         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1404         rt->u.dst.rate_last = now;
1405         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408         }
1409
1410 out:    kfree_skb(skb);
1411         return 0;
1412 }
1413
/*
 *      MTU plateau table, in descending order.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * guess_mtu - pick the next lower plateau for a packet size.
 * @old_mtu: size that turned out to be too large
 *
 * Returns the largest table entry strictly smaller than @old_mtu, or 68
 * when no entry is smaller.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *step = mtu_plateau;
        const unsigned short *end =
                mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

        while (step < end) {
                if (*step < old_mtu)
                        return *step;
                step++;
        }
        return 68;
}
1431
1432 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433                                  unsigned short new_mtu)
1434 {
1435         int i;
1436         unsigned short old_mtu = ntohs(iph->tot_len);
1437         struct rtable *rth;
1438         __be32  skeys[2] = { iph->saddr, 0, };
1439         __be32  daddr = iph->daddr;
1440         unsigned short est_mtu = 0;
1441
1442         if (ipv4_config.no_pmtu_disc)
1443                 return 0;
1444
1445         for (i = 0; i < 2; i++) {
1446                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1447
1448                 rcu_read_lock();
1449                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1450                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1451                         if (rth->fl.fl4_dst == daddr &&
1452                             rth->fl.fl4_src == skeys[i] &&
1453                             rth->rt_dst  == daddr &&
1454                             rth->rt_src  == iph->saddr &&
1455                             rth->fl.iif == 0 &&
1456                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1457                             rth->u.dst.dev->nd_net == net &&
1458                             rth->rt_genid == atomic_read(&rt_genid)) {
1459                                 unsigned short mtu = new_mtu;
1460
1461                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1462
1463                                         /* BSD 4.2 compatibility hack :-( */
1464                                         if (mtu == 0 &&
1465                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1466                                             old_mtu >= 68 + (iph->ihl << 2))
1467                                                 old_mtu -= iph->ihl << 2;
1468
1469                                         mtu = guess_mtu(old_mtu);
1470                                 }
1471                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1472                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1473                                                 dst_confirm(&rth->u.dst);
1474                                                 if (mtu < ip_rt_min_pmtu) {
1475                                                         mtu = ip_rt_min_pmtu;
1476                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1477                                                                 (1 << RTAX_MTU);
1478                                                 }
1479                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1480                                                 dst_set_expires(&rth->u.dst,
1481                                                         ip_rt_mtu_expires);
1482                                         }
1483                                         est_mtu = mtu;
1484                                 }
1485                         }
1486                 }
1487                 rcu_read_unlock();
1488         }
1489         return est_mtu ? : new_mtu;
1490 }
1491
1492 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1493 {
1494         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1495             !(dst_metric_locked(dst, RTAX_MTU))) {
1496                 if (mtu < ip_rt_min_pmtu) {
1497                         mtu = ip_rt_min_pmtu;
1498                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1499                 }
1500                 dst->metrics[RTAX_MTU-1] = mtu;
1501                 dst_set_expires(dst, ip_rt_mtu_expires);
1502                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1503         }
1504 }
1505
1506 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1507 {
1508         return NULL;
1509 }
1510
1511 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 {
1513         struct rtable *rt = (struct rtable *) dst;
1514         struct inet_peer *peer = rt->peer;
1515         struct in_device *idev = rt->idev;
1516
1517         if (peer) {
1518                 rt->peer = NULL;
1519                 inet_putpeer(peer);
1520         }
1521
1522         if (idev) {
1523                 rt->idev = NULL;
1524                 in_dev_put(idev);
1525         }
1526 }
1527
1528 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1529                             int how)
1530 {
1531         struct rtable *rt = (struct rtable *) dst;
1532         struct in_device *idev = rt->idev;
1533         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1534                 struct in_device *loopback_idev =
1535                         in_dev_get(dev->nd_net->loopback_dev);
1536                 if (loopback_idev) {
1537                         rt->idev = loopback_idev;
1538                         in_dev_put(idev);
1539                 }
1540         }
1541 }
1542
1543 static void ipv4_link_failure(struct sk_buff *skb)
1544 {
1545         struct rtable *rt;
1546
1547         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1548
1549         rt = skb->rtable;
1550         if (rt)
1551                 dst_set_expires(&rt->u.dst, 0);
1552 }
1553
1554 static int ip_rt_bug(struct sk_buff *skb)
1555 {
1556         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1557                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1558                 skb->dev ? skb->dev->name : "?");
1559         kfree_skb(skb);
1560         return 0;
1561 }
1562
1563 /*
1564    We do not cache source address of outgoing interface,
1565    because it is used only by IP RR, TS and SRR options,
1566    so that it out of fast path.
1567
1568    BTW remember: "addr" is allowed to be not aligned
1569    in IP options!
1570  */
1571
1572 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1573 {
1574         __be32 src;
1575         struct fib_result res;
1576
1577         if (rt->fl.iif == 0)
1578                 src = rt->rt_src;
1579         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1580                 src = FIB_RES_PREFSRC(res);
1581                 fib_res_put(&res);
1582         } else
1583                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1584                                         RT_SCOPE_UNIVERSE);
1585         memcpy(addr, &src, 4);
1586 }
1587
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in the unset halves of a route's tclassid.
 * @rt:  route to tag
 * @tag: source of the bits
 *
 * The low and the high 16 bits are handled independently: a half that is
 * currently zero takes the corresponding bits of @tag, a half that is
 * already non-zero is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[2] = { 0xFFFF, 0xFFFF0000 };
        int n;

        for (n = 0; n < 2; n++) {
                if (!(rt->u.dst.tclassid & halves[n]))
                        rt->u.dst.tclassid |= tag & halves[n];
        }
}
#endif
1597
1598 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1599 {
1600         struct fib_info *fi = res->fi;
1601
1602         if (fi) {
1603                 if (FIB_RES_GW(*res) &&
1604                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1605                         rt->rt_gateway = FIB_RES_GW(*res);
1606                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1607                        sizeof(rt->u.dst.metrics));
1608                 if (fi->fib_mtu == 0) {
1609                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1610                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1611                             rt->rt_gateway != rt->rt_dst &&
1612                             rt->u.dst.dev->mtu > 576)
1613                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1614                 }
1615 #ifdef CONFIG_NET_CLS_ROUTE
1616                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1617 #endif
1618         } else
1619                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1620
1621         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1622                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1623         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1624                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1625         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1626                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1627                                        ip_rt_min_advmss);
1628         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1629                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1630
1631 #ifdef CONFIG_NET_CLS_ROUTE
1632 #ifdef CONFIG_IP_MULTIPLE_TABLES
1633         set_class_tag(rt, fib_rules_tclass(res));
1634 #endif
1635         set_class_tag(rt, itag);
1636 #endif
1637         rt->rt_type = res->type;
1638 }
1639
1640 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1641                                 u8 tos, struct net_device *dev, int our)
1642 {
1643         unsigned hash;
1644         struct rtable *rth;
1645         __be32 spec_dst;
1646         struct in_device *in_dev = in_dev_get(dev);
1647         u32 itag = 0;
1648
1649         /* Primary sanity checks. */
1650
1651         if (in_dev == NULL)
1652                 return -EINVAL;
1653
1654         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1655             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1656                 goto e_inval;
1657
1658         if (ipv4_is_zeronet(saddr)) {
1659                 if (!ipv4_is_local_multicast(daddr))
1660                         goto e_inval;
1661                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1662         } else if (fib_validate_source(saddr, 0, tos, 0,
1663                                         dev, &spec_dst, &itag) < 0)
1664                 goto e_inval;
1665
1666         rth = dst_alloc(&ipv4_dst_ops);
1667         if (!rth)
1668                 goto e_nobufs;
1669
1670         rth->u.dst.output= ip_rt_bug;
1671
1672         atomic_set(&rth->u.dst.__refcnt, 1);
1673         rth->u.dst.flags= DST_HOST;
1674         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1675                 rth->u.dst.flags |= DST_NOPOLICY;
1676         rth->fl.fl4_dst = daddr;
1677         rth->rt_dst     = daddr;
1678         rth->fl.fl4_tos = tos;
1679         rth->fl.mark    = skb->mark;
1680         rth->fl.fl4_src = saddr;
1681         rth->rt_src     = saddr;
1682 #ifdef CONFIG_NET_CLS_ROUTE
1683         rth->u.dst.tclassid = itag;
1684 #endif
1685         rth->rt_iif     =
1686         rth->fl.iif     = dev->ifindex;
1687         rth->u.dst.dev  = init_net.loopback_dev;
1688         dev_hold(rth->u.dst.dev);
1689         rth->idev       = in_dev_get(rth->u.dst.dev);
1690         rth->fl.oif     = 0;
1691         rth->rt_gateway = daddr;
1692         rth->rt_spec_dst= spec_dst;
1693         rth->rt_genid   = atomic_read(&rt_genid);
1694         rth->rt_flags   = RTCF_MULTICAST;
1695         rth->rt_type    = RTN_MULTICAST;
1696         if (our) {
1697                 rth->u.dst.input= ip_local_deliver;
1698                 rth->rt_flags |= RTCF_LOCAL;
1699         }
1700
1701 #ifdef CONFIG_IP_MROUTE
1702         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1703                 rth->u.dst.input = ip_mr_input;
1704 #endif
1705         RT_CACHE_STAT_INC(in_slow_mc);
1706
1707         in_dev_put(in_dev);
1708         hash = rt_hash(daddr, saddr, dev->ifindex);
1709         return rt_intern_hash(hash, rth, &skb->rtable);
1710
1711 e_nobufs:
1712         in_dev_put(in_dev);
1713         return -ENOBUFS;
1714
1715 e_inval:
1716         in_dev_put(in_dev);
1717         return -EINVAL;
1718 }
1719
1720
1721 static void ip_handle_martian_source(struct net_device *dev,
1722                                      struct in_device *in_dev,
1723                                      struct sk_buff *skb,
1724                                      __be32 daddr,
1725                                      __be32 saddr)
1726 {
1727         RT_CACHE_STAT_INC(in_martian_src);
1728 #ifdef CONFIG_IP_ROUTE_VERBOSE
1729         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1730                 /*
1731                  *      RFC1812 recommendation, if source is martian,
1732                  *      the only hint is MAC header.
1733                  */
1734                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1735                         "%u.%u.%u.%u, on dev %s\n",
1736                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1737                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1738                         int i;
1739                         const unsigned char *p = skb_mac_header(skb);
1740                         printk(KERN_WARNING "ll header: ");
1741                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1742                                 printk("%02x", *p);
1743                                 if (i < (dev->hard_header_len - 1))
1744                                         printk(":");
1745                         }
1746                         printk("\n");
1747                 }
1748         }
1749 #endif
1750 }
1751
1752 static inline int __mkroute_input(struct sk_buff *skb,
1753                                   struct fib_result* res,
1754                                   struct in_device *in_dev,
1755                                   __be32 daddr, __be32 saddr, u32 tos,
1756                                   struct rtable **result)
1757 {
1758
1759         struct rtable *rth;
1760         int err;
1761         struct in_device *out_dev;
1762         unsigned flags = 0;
1763         __be32 spec_dst;
1764         u32 itag;
1765
1766         /* get a working reference to the output device */
1767         out_dev = in_dev_get(FIB_RES_DEV(*res));
1768         if (out_dev == NULL) {
1769                 if (net_ratelimit())
1770                         printk(KERN_CRIT "Bug in ip_route_input" \
1771                                "_slow(). Please, report\n");
1772                 return -EINVAL;
1773         }
1774
1775
1776         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1777                                   in_dev->dev, &spec_dst, &itag);
1778         if (err < 0) {
1779                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1780                                          saddr);
1781
1782                 err = -EINVAL;
1783                 goto cleanup;
1784         }
1785
1786         if (err)
1787                 flags |= RTCF_DIRECTSRC;
1788
1789         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1790             (IN_DEV_SHARED_MEDIA(out_dev) ||
1791              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1792                 flags |= RTCF_DOREDIRECT;
1793
1794         if (skb->protocol != htons(ETH_P_IP)) {
1795                 /* Not IP (i.e. ARP). Do not create route, if it is
1796                  * invalid for proxy arp. DNAT routes are always valid.
1797                  */
1798                 if (out_dev == in_dev) {
1799                         err = -EINVAL;
1800                         goto cleanup;
1801                 }
1802         }
1803
1804
1805         rth = dst_alloc(&ipv4_dst_ops);
1806         if (!rth) {
1807                 err = -ENOBUFS;
1808                 goto cleanup;
1809         }
1810
1811         atomic_set(&rth->u.dst.__refcnt, 1);
1812         rth->u.dst.flags= DST_HOST;
1813         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1814                 rth->u.dst.flags |= DST_NOPOLICY;
1815         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1816                 rth->u.dst.flags |= DST_NOXFRM;
1817         rth->fl.fl4_dst = daddr;
1818         rth->rt_dst     = daddr;
1819         rth->fl.fl4_tos = tos;
1820         rth->fl.mark    = skb->mark;
1821         rth->fl.fl4_src = saddr;
1822         rth->rt_src     = saddr;
1823         rth->rt_gateway = daddr;
1824         rth->rt_iif     =
1825                 rth->fl.iif     = in_dev->dev->ifindex;
1826         rth->u.dst.dev  = (out_dev)->dev;
1827         dev_hold(rth->u.dst.dev);
1828         rth->idev       = in_dev_get(rth->u.dst.dev);
1829         rth->fl.oif     = 0;
1830         rth->rt_spec_dst= spec_dst;
1831
1832         rth->u.dst.input = ip_forward;
1833         rth->u.dst.output = ip_output;
1834         rth->rt_genid = atomic_read(&rt_genid);
1835
1836         rt_set_nexthop(rth, res, itag);
1837
1838         rth->rt_flags = flags;
1839
1840         *result = rth;
1841         err = 0;
1842  cleanup:
1843         /* release the working reference to the output device */
1844         in_dev_put(out_dev);
1845         return err;
1846 }
1847
1848 static inline int ip_mkroute_input(struct sk_buff *skb,
1849                                    struct fib_result* res,
1850                                    const struct flowi *fl,
1851                                    struct in_device *in_dev,
1852                                    __be32 daddr, __be32 saddr, u32 tos)
1853 {
1854         struct rtable* rth = NULL;
1855         int err;
1856         unsigned hash;
1857
1858 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1859         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1860                 fib_select_multipath(fl, res);
1861 #endif
1862
1863         /* create a routing cache entry */
1864         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1865         if (err)
1866                 return err;
1867
1868         /* put it into the cache */
1869         hash = rt_hash(daddr, saddr, fl->iif);
1870         return rt_intern_hash(hash, rth, &skb->rtable);
1871 }
1872
1873 /*
1874  *      NOTE. We drop all the packets that has local source
1875  *      addresses, because every properly looped back packet
1876  *      must have correct destination already attached by output routine.
1877  *
1878  *      Such approach solves two big problems:
1879  *      1. Not simplex devices are handled properly.
1880  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1881  */
1882
1883 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1884                                u8 tos, struct net_device *dev)
1885 {
1886         struct fib_result res;
1887         struct in_device *in_dev = in_dev_get(dev);
1888         struct flowi fl = { .nl_u = { .ip4_u =
1889                                       { .daddr = daddr,
1890                                         .saddr = saddr,
1891                                         .tos = tos,
1892                                         .scope = RT_SCOPE_UNIVERSE,
1893                                       } },
1894                             .mark = skb->mark,
1895                             .iif = dev->ifindex };
1896         unsigned        flags = 0;
1897         u32             itag = 0;
1898         struct rtable * rth;
1899         unsigned        hash;
1900         __be32          spec_dst;
1901         int             err = -EINVAL;
1902         int             free_res = 0;
1903         struct net    * net = dev->nd_net;
1904
1905         /* IP on this device is disabled. */
1906
1907         if (!in_dev)
1908                 goto out;
1909
1910         /* Check for the most weird martians, which can be not detected
1911            by fib_lookup.
1912          */
1913
1914         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1915             ipv4_is_loopback(saddr))
1916                 goto martian_source;
1917
1918         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1919                 goto brd_input;
1920
1921         /* Accept zero addresses only to limited broadcast;
1922          * I even do not know to fix it or not. Waiting for complains :-)
1923          */
1924         if (ipv4_is_zeronet(saddr))
1925                 goto martian_source;
1926
1927         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1928             ipv4_is_loopback(daddr))
1929                 goto martian_destination;
1930
1931         /*
1932          *      Now we are ready to route packet.
1933          */
1934         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1935                 if (!IN_DEV_FORWARD(in_dev))
1936                         goto e_hostunreach;
1937                 goto no_route;
1938         }
1939         free_res = 1;
1940
1941         RT_CACHE_STAT_INC(in_slow_tot);
1942
1943         if (res.type == RTN_BROADCAST)
1944                 goto brd_input;
1945
1946         if (res.type == RTN_LOCAL) {
1947                 int result;
1948                 result = fib_validate_source(saddr, daddr, tos,
1949                                              net->loopback_dev->ifindex,
1950                                              dev, &spec_dst, &itag);
1951                 if (result < 0)
1952                         goto martian_source;
1953                 if (result)
1954                         flags |= RTCF_DIRECTSRC;
1955                 spec_dst = daddr;
1956                 goto local_input;
1957         }
1958
1959         if (!IN_DEV_FORWARD(in_dev))
1960                 goto e_hostunreach;
1961         if (res.type != RTN_UNICAST)
1962                 goto martian_destination;
1963
1964         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1965 done:
1966         in_dev_put(in_dev);
1967         if (free_res)
1968                 fib_res_put(&res);
1969 out:    return err;
1970
1971 brd_input:
1972         if (skb->protocol != htons(ETH_P_IP))
1973                 goto e_inval;
1974
1975         if (ipv4_is_zeronet(saddr))
1976                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1977         else {
1978                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1979                                           &itag);
1980                 if (err < 0)
1981                         goto martian_source;
1982                 if (err)
1983                         flags |= RTCF_DIRECTSRC;
1984         }
1985         flags |= RTCF_BROADCAST;
1986         res.type = RTN_BROADCAST;
1987         RT_CACHE_STAT_INC(in_brd);
1988
1989 local_input:
1990         rth = dst_alloc(&ipv4_dst_ops);
1991         if (!rth)
1992                 goto e_nobufs;
1993
1994         rth->u.dst.output= ip_rt_bug;
1995         rth->rt_genid = atomic_read(&rt_genid);
1996
1997         atomic_set(&rth->u.dst.__refcnt, 1);
1998         rth->u.dst.flags= DST_HOST;
1999         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2000                 rth->u.dst.flags |= DST_NOPOLICY;
2001         rth->fl.fl4_dst = daddr;
2002         rth->rt_dst     = daddr;
2003         rth->fl.fl4_tos = tos;
2004         rth->fl.mark    = skb->mark;
2005         rth->fl.fl4_src = saddr;
2006         rth->rt_src     = saddr;
2007 #ifdef CONFIG_NET_CLS_ROUTE
2008         rth->u.dst.tclassid = itag;
2009 #endif
2010         rth->rt_iif     =
2011         rth->fl.iif     = dev->ifindex;
2012         rth->u.dst.dev  = net->loopback_dev;
2013         dev_hold(rth->u.dst.dev);
2014         rth->idev       = in_dev_get(rth->u.dst.dev);
2015         rth->rt_gateway = daddr;
2016         rth->rt_spec_dst= spec_dst;
2017         rth->u.dst.input= ip_local_deliver;
2018         rth->rt_flags   = flags|RTCF_LOCAL;
2019         if (res.type == RTN_UNREACHABLE) {
2020                 rth->u.dst.input= ip_error;
2021                 rth->u.dst.error= -err;
2022                 rth->rt_flags   &= ~RTCF_LOCAL;
2023         }
2024         rth->rt_type    = res.type;
2025         hash = rt_hash(daddr, saddr, fl.iif);
2026         err = rt_intern_hash(hash, rth, &skb->rtable);
2027         goto done;
2028
2029 no_route:
2030         RT_CACHE_STAT_INC(in_no_route);
2031         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2032         res.type = RTN_UNREACHABLE;
2033         if (err == -ESRCH)
2034                 err = -ENETUNREACH;
2035         goto local_input;
2036
2037         /*
2038          *      Do not cache martian addresses: they should be logged (RFC1812)
2039          */
2040 martian_destination:
2041         RT_CACHE_STAT_INC(in_martian_dst);
2042 #ifdef CONFIG_IP_ROUTE_VERBOSE
2043         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2044                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2045                         "%u.%u.%u.%u, dev %s\n",
2046                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2047 #endif
2048
2049 e_hostunreach:
2050         err = -EHOSTUNREACH;
2051         goto done;
2052
2053 e_inval:
2054         err = -EINVAL;
2055         goto done;
2056
2057 e_nobufs:
2058         err = -ENOBUFS;
2059         goto done;
2060
2061 martian_source:
2062         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2063         goto e_inval;
2064 }
2065
2066 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2067                    u8 tos, struct net_device *dev)
2068 {
2069         struct rtable * rth;
2070         unsigned        hash;
2071         int iif = dev->ifindex;
2072         struct net *net;
2073
2074         net = dev->nd_net;
2075         tos &= IPTOS_RT_MASK;
2076         hash = rt_hash(daddr, saddr, iif);
2077
2078         rcu_read_lock();
2079         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2080              rth = rcu_dereference(rth->u.dst.rt_next)) {
2081                 if (rth->fl.fl4_dst == daddr &&
2082                     rth->fl.fl4_src == saddr &&
2083                     rth->fl.iif == iif &&
2084                     rth->fl.oif == 0 &&
2085                     rth->fl.mark == skb->mark &&
2086                     rth->fl.fl4_tos == tos &&
2087                     rth->u.dst.dev->nd_net == net &&
2088                     rth->rt_genid == atomic_read(&rt_genid)) {
2089                         dst_use(&rth->u.dst, jiffies);
2090                         RT_CACHE_STAT_INC(in_hit);
2091                         rcu_read_unlock();
2092                         skb->rtable = rth;
2093                         return 0;
2094                 }
2095                 RT_CACHE_STAT_INC(in_hlist_search);
2096         }
2097         rcu_read_unlock();
2098
2099         /* Multicast recognition logic is moved from route cache to here.
2100            The problem was that too many Ethernet cards have broken/missing
2101            hardware multicast filters :-( As result the host on multicasting
2102            network acquires a lot of useless route cache entries, sort of
2103            SDR messages from all the world. Now we try to get rid of them.
2104            Really, provided software IP multicast filter is organized
2105            reasonably (at least, hashed), it does not result in a slowdown
2106            comparing with route cache reject entries.
2107            Note, that multicast routers are not affected, because
2108            route cache entry is created eventually.
2109          */
2110         if (ipv4_is_multicast(daddr)) {
2111                 struct in_device *in_dev;
2112
2113                 rcu_read_lock();
2114                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2115                         int our = ip_check_mc(in_dev, daddr, saddr,
2116                                 ip_hdr(skb)->protocol);
2117                         if (our
2118 #ifdef CONFIG_IP_MROUTE
2119                             || (!ipv4_is_local_multicast(daddr) &&
2120                                 IN_DEV_MFORWARD(in_dev))
2121 #endif
2122                             ) {
2123                                 rcu_read_unlock();
2124                                 return ip_route_input_mc(skb, daddr, saddr,
2125                                                          tos, dev, our);
2126                         }
2127                 }
2128                 rcu_read_unlock();
2129                 return -EINVAL;
2130         }
2131         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2132 }
2133
2134 static inline int __mkroute_output(struct rtable **result,
2135                                    struct fib_result* res,
2136                                    const struct flowi *fl,
2137                                    const struct flowi *oldflp,
2138                                    struct net_device *dev_out,
2139                                    unsigned flags)
2140 {
2141         struct rtable *rth;
2142         struct in_device *in_dev;
2143         u32 tos = RT_FL_TOS(oldflp);
2144         int err = 0;
2145
2146         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2147                 return -EINVAL;
2148
2149         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2150                 res->type = RTN_BROADCAST;
2151         else if (ipv4_is_multicast(fl->fl4_dst))
2152                 res->type = RTN_MULTICAST;
2153         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2154                 return -EINVAL;
2155
2156         if (dev_out->flags & IFF_LOOPBACK)
2157                 flags |= RTCF_LOCAL;
2158
2159         /* get work reference to inet device */
2160         in_dev = in_dev_get(dev_out);
2161         if (!in_dev)
2162                 return -EINVAL;
2163
2164         if (res->type == RTN_BROADCAST) {
2165                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2166                 if (res->fi) {
2167                         fib_info_put(res->fi);
2168                         res->fi = NULL;
2169                 }
2170         } else if (res->type == RTN_MULTICAST) {
2171                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2172                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2173                                  oldflp->proto))
2174                         flags &= ~RTCF_LOCAL;
2175                 /* If multicast route do not exist use
2176                    default one, but do not gateway in this case.
2177                    Yes, it is hack.
2178                  */
2179                 if (res->fi && res->prefixlen < 4) {
2180                         fib_info_put(res->fi);
2181                         res->fi = NULL;
2182                 }
2183         }
2184
2185
2186         rth = dst_alloc(&ipv4_dst_ops);
2187         if (!rth) {
2188                 err = -ENOBUFS;
2189                 goto cleanup;
2190         }
2191
2192         atomic_set(&rth->u.dst.__refcnt, 1);
2193         rth->u.dst.flags= DST_HOST;
2194         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2195                 rth->u.dst.flags |= DST_NOXFRM;
2196         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2197                 rth->u.dst.flags |= DST_NOPOLICY;
2198
2199         rth->fl.fl4_dst = oldflp->fl4_dst;
2200         rth->fl.fl4_tos = tos;
2201         rth->fl.fl4_src = oldflp->fl4_src;
2202         rth->fl.oif     = oldflp->oif;
2203         rth->fl.mark    = oldflp->mark;
2204         rth->rt_dst     = fl->fl4_dst;
2205         rth->rt_src     = fl->fl4_src;
2206         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2207         /* get references to the devices that are to be hold by the routing
2208            cache entry */
2209         rth->u.dst.dev  = dev_out;
2210         dev_hold(dev_out);
2211         rth->idev       = in_dev_get(dev_out);
2212         rth->rt_gateway = fl->fl4_dst;
2213         rth->rt_spec_dst= fl->fl4_src;
2214
2215         rth->u.dst.output=ip_output;
2216         rth->rt_genid = atomic_read(&rt_genid);
2217
2218         RT_CACHE_STAT_INC(out_slow_tot);
2219
2220         if (flags & RTCF_LOCAL) {
2221                 rth->u.dst.input = ip_local_deliver;
2222                 rth->rt_spec_dst = fl->fl4_dst;
2223         }
2224         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2225                 rth->rt_spec_dst = fl->fl4_src;
2226                 if (flags & RTCF_LOCAL &&
2227                     !(dev_out->flags & IFF_LOOPBACK)) {
2228                         rth->u.dst.output = ip_mc_output;
2229                         RT_CACHE_STAT_INC(out_slow_mc);
2230                 }
2231 #ifdef CONFIG_IP_MROUTE
2232                 if (res->type == RTN_MULTICAST) {
2233                         if (IN_DEV_MFORWARD(in_dev) &&
2234                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2235                                 rth->u.dst.input = ip_mr_input;
2236                                 rth->u.dst.output = ip_mc_output;
2237                         }
2238                 }
2239 #endif
2240         }
2241
2242         rt_set_nexthop(rth, res, 0);
2243
2244         rth->rt_flags = flags;
2245
2246         *result = rth;
2247  cleanup:
2248         /* release work reference to inet device */
2249         in_dev_put(in_dev);
2250
2251         return err;
2252 }
2253
2254 static inline int ip_mkroute_output(struct rtable **rp,
2255                                     struct fib_result* res,
2256                                     const struct flowi *fl,
2257                                     const struct flowi *oldflp,
2258                                     struct net_device *dev_out,
2259                                     unsigned flags)
2260 {
2261         struct rtable *rth = NULL;
2262         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2263         unsigned hash;
2264         if (err == 0) {
2265                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2266                 err = rt_intern_hash(hash, rth, rp);
2267         }
2268
2269         return err;
2270 }
2271
2272 /*
2273  * Major route resolver routine.
2274  */
2275
2276 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2277                                 const struct flowi *oldflp)
2278 {
2279         u32 tos = RT_FL_TOS(oldflp);
2280         struct flowi fl = { .nl_u = { .ip4_u =
2281                                       { .daddr = oldflp->fl4_dst,
2282                                         .saddr = oldflp->fl4_src,
2283                                         .tos = tos & IPTOS_RT_MASK,
2284                                         .scope = ((tos & RTO_ONLINK) ?
2285                                                   RT_SCOPE_LINK :
2286                                                   RT_SCOPE_UNIVERSE),
2287                                       } },
2288                             .mark = oldflp->mark,
2289                             .iif = net->loopback_dev->ifindex,
2290                             .oif = oldflp->oif };
2291         struct fib_result res;
2292         unsigned flags = 0;
2293         struct net_device *dev_out = NULL;
2294         int free_res = 0;
2295         int err;
2296
2297
2298         res.fi          = NULL;
2299 #ifdef CONFIG_IP_MULTIPLE_TABLES
2300         res.r           = NULL;
2301 #endif
2302
2303         if (oldflp->fl4_src) {
2304                 err = -EINVAL;
2305                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2306                     ipv4_is_lbcast(oldflp->fl4_src) ||
2307                     ipv4_is_zeronet(oldflp->fl4_src))
2308                         goto out;
2309
2310                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2311                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2312                 if (dev_out == NULL)
2313                         goto out;
2314
2315                 /* I removed check for oif == dev_out->oif here.
2316                    It was wrong for two reasons:
2317                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2318                       is assigned to multiple interfaces.
2319                    2. Moreover, we are allowed to send packets with saddr
2320                       of another iface. --ANK
2321                  */
2322
2323                 if (oldflp->oif == 0
2324                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2325                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2326                         /* Special hack: user can direct multicasts
2327                            and limited broadcast via necessary interface
2328                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2329                            This hack is not just for fun, it allows
2330                            vic,vat and friends to work.
2331                            They bind socket to loopback, set ttl to zero
2332                            and expect that it will work.
2333                            From the viewpoint of routing cache they are broken,
2334                            because we are not allowed to build multicast path
2335                            with loopback source addr (look, routing cache
2336                            cannot know, that ttl is zero, so that packet
2337                            will not leave this host and route is valid).
2338                            Luckily, this hack is good workaround.
2339                          */
2340
2341                         fl.oif = dev_out->ifindex;
2342                         goto make_route;
2343                 }
2344                 if (dev_out)
2345                         dev_put(dev_out);
2346                 dev_out = NULL;
2347         }
2348
2349
2350         if (oldflp->oif) {
2351                 dev_out = dev_get_by_index(net, oldflp->oif);
2352                 err = -ENODEV;
2353                 if (dev_out == NULL)
2354                         goto out;
2355
2356                 /* RACE: Check return value of inet_select_addr instead. */
2357                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2358                         dev_put(dev_out);
2359                         goto out;       /* Wrong error code */
2360                 }
2361
2362                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2363                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2364                         if (!fl.fl4_src)
2365                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2366                                                               RT_SCOPE_LINK);
2367                         goto make_route;
2368                 }
2369                 if (!fl.fl4_src) {
2370                         if (ipv4_is_multicast(oldflp->fl4_dst))
2371                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2372                                                               fl.fl4_scope);
2373                         else if (!oldflp->fl4_dst)
2374                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2375                                                               RT_SCOPE_HOST);
2376                 }
2377         }
2378
2379         if (!fl.fl4_dst) {
2380                 fl.fl4_dst = fl.fl4_src;
2381                 if (!fl.fl4_dst)
2382                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2383                 if (dev_out)
2384                         dev_put(dev_out);
2385                 dev_out = net->loopback_dev;
2386                 dev_hold(dev_out);
2387                 fl.oif = net->loopback_dev->ifindex;
2388                 res.type = RTN_LOCAL;
2389                 flags |= RTCF_LOCAL;
2390                 goto make_route;
2391         }
2392
2393         if (fib_lookup(net, &fl, &res)) {
2394                 res.fi = NULL;
2395                 if (oldflp->oif) {
2396                         /* Apparently, routing tables are wrong. Assume,
2397                            that the destination is on link.
2398
2399                            WHY? DW.
2400                            Because we are allowed to send to iface
2401                            even if it has NO routes and NO assigned
2402                            addresses. When oif is specified, routing
2403                            tables are looked up with only one purpose:
2404                            to catch if destination is gatewayed, rather than
2405                            direct. Moreover, if MSG_DONTROUTE is set,
2406                            we send packet, ignoring both routing tables
2407                            and ifaddr state. --ANK
2408
2409
2410                            We could make it even if oif is unknown,
2411                            likely IPv6, but we do not.
2412                          */
2413
2414                         if (fl.fl4_src == 0)
2415                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2416                                                               RT_SCOPE_LINK);
2417                         res.type = RTN_UNICAST;
2418                         goto make_route;
2419                 }
2420                 if (dev_out)
2421                         dev_put(dev_out);
2422                 err = -ENETUNREACH;
2423                 goto out;
2424         }
2425         free_res = 1;
2426
2427         if (res.type == RTN_LOCAL) {
2428                 if (!fl.fl4_src)
2429                         fl.fl4_src = fl.fl4_dst;
2430                 if (dev_out)
2431                         dev_put(dev_out);
2432                 dev_out = net->loopback_dev;
2433                 dev_hold(dev_out);
2434                 fl.oif = dev_out->ifindex;
2435                 if (res.fi)
2436                         fib_info_put(res.fi);
2437                 res.fi = NULL;
2438                 flags |= RTCF_LOCAL;
2439                 goto make_route;
2440         }
2441
2442 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2443         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2444                 fib_select_multipath(&fl, &res);
2445         else
2446 #endif
2447         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2448                 fib_select_default(net, &fl, &res);
2449
2450         if (!fl.fl4_src)
2451                 fl.fl4_src = FIB_RES_PREFSRC(res);
2452
2453         if (dev_out)
2454                 dev_put(dev_out);
2455         dev_out = FIB_RES_DEV(res);
2456         dev_hold(dev_out);
2457         fl.oif = dev_out->ifindex;
2458
2459
2460 make_route:
2461         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2462
2463
2464         if (free_res)
2465                 fib_res_put(&res);
2466         if (dev_out)
2467                 dev_put(dev_out);
2468 out:    return err;
2469 }
2470
2471 int __ip_route_output_key(struct net *net, struct rtable **rp,
2472                           const struct flowi *flp)
2473 {
2474         unsigned hash;
2475         struct rtable *rth;
2476
2477         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2478
2479         rcu_read_lock_bh();
2480         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2481                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2482                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2483                     rth->fl.fl4_src == flp->fl4_src &&
2484                     rth->fl.iif == 0 &&
2485                     rth->fl.oif == flp->oif &&
2486                     rth->fl.mark == flp->mark &&
2487                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2488                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2489                     rth->u.dst.dev->nd_net == net &&
2490                     rth->rt_genid == atomic_read(&rt_genid)) {
2491                         dst_use(&rth->u.dst, jiffies);
2492                         RT_CACHE_STAT_INC(out_hit);
2493                         rcu_read_unlock_bh();
2494                         *rp = rth;
2495                         return 0;
2496                 }
2497                 RT_CACHE_STAT_INC(out_hlist_search);
2498         }
2499         rcu_read_unlock_bh();
2500
2501         return ip_route_output_slow(net, rp, flp);
2502 }
2503
2504 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2505
2506 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2507 {
2508 }
2509
2510 static struct dst_ops ipv4_dst_blackhole_ops = {
2511         .family                 =       AF_INET,
2512         .protocol               =       __constant_htons(ETH_P_IP),
2513         .destroy                =       ipv4_dst_destroy,
2514         .check                  =       ipv4_dst_check,
2515         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2516         .entry_size             =       sizeof(struct rtable),
2517         .entries                =       ATOMIC_INIT(0),
2518 };
2519
2520
2521 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2522 {
2523         struct rtable *ort = *rp;
2524         struct rtable *rt = (struct rtable *)
2525                 dst_alloc(&ipv4_dst_blackhole_ops);
2526
2527         if (rt) {
2528                 struct dst_entry *new = &rt->u.dst;
2529
2530                 atomic_set(&new->__refcnt, 1);
2531                 new->__use = 1;
2532                 new->input = dst_discard;
2533                 new->output = dst_discard;
2534                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2535
2536                 new->dev = ort->u.dst.dev;
2537                 if (new->dev)
2538                         dev_hold(new->dev);
2539
2540                 rt->fl = ort->fl;
2541
2542                 rt->idev = ort->idev;
2543                 if (rt->idev)
2544                         in_dev_hold(rt->idev);
2545                 rt->rt_genid = atomic_read(&rt_genid);
2546                 rt->rt_flags = ort->rt_flags;
2547                 rt->rt_type = ort->rt_type;
2548                 rt->rt_dst = ort->rt_dst;
2549                 rt->rt_src = ort->rt_src;
2550                 rt->rt_iif = ort->rt_iif;
2551                 rt->rt_gateway = ort->rt_gateway;
2552                 rt->rt_spec_dst = ort->rt_spec_dst;
2553                 rt->peer = ort->peer;
2554                 if (rt->peer)
2555                         atomic_inc(&rt->peer->refcnt);
2556
2557                 dst_free(new);
2558         }
2559
2560         dst_release(&(*rp)->u.dst);
2561         *rp = rt;
2562         return (rt ? 0 : -ENOMEM);
2563 }
2564
2565 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2566                          struct sock *sk, int flags)
2567 {
2568         int err;
2569
2570         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2571                 return err;
2572
2573         if (flp->proto) {
2574                 if (!flp->fl4_src)
2575                         flp->fl4_src = (*rp)->rt_src;
2576                 if (!flp->fl4_dst)
2577                         flp->fl4_dst = (*rp)->rt_dst;
2578                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2579                                     flags ? XFRM_LOOKUP_WAIT : 0);
2580                 if (err == -EREMOTE)
2581                         err = ipv4_dst_blackhole(rp, flp);
2582
2583                 return err;
2584         }
2585
2586         return 0;
2587 }
2588
2589 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2590
2591 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2592 {
2593         return ip_route_output_flow(net, rp, flp, NULL, 0);
2594 }
2595
/*
 * rt_fill_info - append a netlink route message describing skb->rtable.
 *
 * @skb:    message buffer; the route reported is the one in skb->rtable.
 * @pid, @seq, @event, @flags: passed through to nlmsg_put().
 * @nowait: forwarded to ipmr_get_route() and selects how its failure is
 *          handled (see below).
 *
 * Returns the value of nlmsg_end() on success, 0 when !nowait and
 * ipmr_get_route() returned 0, and -EMSGSIZE when the header cannot be
 * added or the message is cancelled at nla_put_failure.
 *
 * NOTE(review): the NLA_PUT*() macros are expected to jump to
 * nla_put_failure when an attribute does not fit - confirm against the
 * netlink headers.
 */
2596 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2597                         int nowait, unsigned int flags)
2598 {
2599         struct rtable *rt = skb->rtable;
2600         struct rtmsg *r;
2601         struct nlmsghdr *nlh;
2602         long expires;
2603         u32 id = 0, ts = 0, tsage = 0, error;
2604
2605         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2606         if (nlh == NULL)
2607                 return -EMSGSIZE;
2608
             /* Fixed rtmsg header fields. */
2609         r = nlmsg_data(nlh);
2610         r->rtm_family    = AF_INET;
2611         r->rtm_dst_len  = 32;
2612         r->rtm_src_len  = 0;
2613         r->rtm_tos      = rt->fl.fl4_tos;
2614         r->rtm_table    = RT_TABLE_MAIN;
2615         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2616         r->rtm_type     = rt->rt_type;
2617         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2618         r->rtm_protocol = RTPROT_UNSPEC;
             /* The low 16 bits of rt_flags are masked off before export. */
2619         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2620         if (rt->rt_flags & RTCF_NOTIFY)
2621                 r->rtm_flags |= RTM_F_NOTIFY;
2622
2623         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2624
             /* A source is only reported when the flow key carries one. */
2625         if (rt->fl.fl4_src) {
2626                 r->rtm_src_len = 32;
2627                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2628         }
2629         if (rt->u.dst.dev)
2630                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2631 #ifdef CONFIG_NET_CLS_ROUTE
2632         if (rt->u.dst.tclassid)
2633                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2634 #endif
             /*
              * Preferred source: rt_spec_dst when the flow has an input
              * interface, otherwise rt_src if it differs from the flow's
              * source address.
              */
2635         if (rt->fl.iif)
2636                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2637         else if (rt->rt_src != rt->fl.fl4_src)
2638                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2639
2640         if (rt->rt_dst != rt->rt_gateway)
2641                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2642
2643         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2644                 goto nla_put_failure;
2645
             /* Values handed to rtnl_put_cacheinfo() further down. */
2646         error = rt->u.dst.error;
             /* Expiry is reported relative to jiffies; 0 when none is set. */
2647         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2648         if (rt->peer) {
2649                 id = rt->peer->ip_id_count;
2650                 if (rt->peer->tcp_ts_stamp) {
2651                         ts = rt->peer->tcp_ts;
2652                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2653                 }
2654         }
2655
2656         if (rt->fl.iif) {
2657 #ifdef CONFIG_IP_MROUTE
2658                 __be32 dst = rt->rt_dst;
2659
2660                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2661                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2662                         int err = ipmr_get_route(skb, r, nowait);
                             /*
                              * !nowait: a 0 result returns 0 without ending
                              * the message here; a negative result cancels it.
                              * nowait: only -EMSGSIZE cancels; any other
                              * result <= 0 is stored in 'error' and reported
                              * through the cacheinfo below.
                              */
2663                         if (err <= 0) {
2664                                 if (!nowait) {
2665                                         if (err == 0)
2666                                                 return 0;
2667                                         goto nla_put_failure;
2668                                 } else {
2669                                         if (err == -EMSGSIZE)
2670                                                 goto nla_put_failure;
2671                                         error = err;
2672                                 }
2673                         }
2674                 } else
2675 #endif
2676                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2677         }
2678
2679         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2680                                expires, error) < 0)
2681                 goto nla_put_failure;
2682
2683         return nlmsg_end(skb, nlh);
2684
     /* Undo the partially built message. */
2685 nla_put_failure:
2686         nlmsg_cancel(skb, nlh);
2687         return -EMSGSIZE;
2688 }
2689
2690 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2691 {
2692         struct net *net = in_skb->sk->sk_net;
2693         struct rtmsg *rtm;
2694         struct nlattr *tb[RTA_MAX+1];
2695         struct rtable *rt = NULL;
2696         __be32 dst = 0;
2697         __be32 src = 0;
2698         u32 iif;
2699         int err;
2700         struct sk_buff *skb;
2701
2702         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2703         if (err < 0)
2704                 goto errout;
2705
2706         rtm = nlmsg_data(nlh);
2707
2708         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2709         if (skb == NULL) {
2710                 err = -ENOBUFS;
2711                 goto errout;
2712         }
2713
2714         /* Reserve room for dummy headers, this skb can pass
2715            through good chunk of routing engine.
2716          */
2717         skb_reset_mac_header(skb);
2718         skb_reset_network_header(skb);
2719
2720         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2721         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2722         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2723
2724         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2725         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2726         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2727
2728         if (iif) {
2729                 struct net_device *dev;
2730
2731                 dev = __dev_get_by_index(net, iif);
2732                 if (dev == NULL) {
2733                         err = -ENODEV;
2734                         goto errout_free;
2735                 }
2736
2737                 skb->protocol   = htons(ETH_P_IP);
2738                 skb->dev        = dev;
2739                 local_bh_disable();
2740                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2741                 local_bh_enable();
2742
2743                 rt = skb->rtable;
2744                 if (err == 0 && rt->u.dst.error)
2745                         err = -rt->u.dst.error;
2746         } else {
2747                 struct flowi fl = {
2748                         .nl_u = {
2749                                 .ip4_u = {
2750                                         .daddr = dst,
2751                                         .saddr = src,
2752                                         .tos = rtm->rtm_tos,
2753                                 },
2754                         },
2755                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2756                 };
2757                 err = ip_route_output_key(net, &rt, &fl);
2758         }
2759
2760         if (err)
2761                 goto errout_free;
2762
2763         skb->rtable = rt;
2764         if (rtm->rtm_flags & RTM_F_NOTIFY)
2765                 rt->rt_flags |= RTCF_NOTIFY;
2766
2767         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2768                            RTM_NEWROUTE, 0, 0);
2769         if (err <= 0)
2770                 goto errout_free;
2771
2772         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2773 errout:
2774         return err;
2775
2776 errout_free:
2777         kfree_skb(skb);
2778         goto errout;
2779 }
2780
2781 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2782 {
2783         struct rtable *rt;
2784         int h, s_h;
2785         int idx, s_idx;
2786         struct net *net;
2787
2788         net = skb->sk->sk_net;
2789
2790         s_h = cb->args[0];
2791         if (s_h < 0)
2792                 s_h = 0;
2793         s_idx = idx = cb->args[1];
2794         for (h = s_h; h <= rt_hash_mask; h++) {
2795                 rcu_read_lock_bh();
2796                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2797                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2798                         if (rt->u.dst.dev->nd_net != net || idx < s_idx)
2799                                 continue;
2800                         if (rt->rt_genid != atomic_read(&rt_genid))
2801                                 continue;
2802                         skb->dst = dst_clone(&rt->u.dst);
2803                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2804                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2805                                          1, NLM_F_MULTI) <= 0) {
2806                                 dst_release(xchg(&skb->dst, NULL));
2807                                 rcu_read_unlock_bh();
2808                                 goto done;
2809                         }
2810                         dst_release(xchg(&skb->dst, NULL));
2811                 }
2812                 rcu_read_unlock_bh();
2813                 s_idx = 0;
2814         }
2815
2816 done:
2817         cb->args[0] = h;
2818         cb->args[1] = idx;
2819         return skb->len;
2820 }
2821
/*
 * Multicast event hook: flushes the route cache with a delay argument of
 * 0.  @in_dev is not consulted.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2826
2827 #ifdef CONFIG_SYSCTL
2828 static int flush_delay;
2829
2830 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2831                                         struct file *filp, void __user *buffer,
2832                                         size_t *lenp, loff_t *ppos)
2833 {
2834         if (write) {
2835                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2836                 rt_cache_flush(flush_delay);
2837                 return 0;
2838         }
2839
2840         return -EINVAL;
2841 }
2842
2843 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2844                                                 int __user *name,
2845                                                 int nlen,
2846                                                 void __user *oldval,
2847                                                 size_t __user *oldlenp,
2848                                                 void __user *newval,
2849                                                 size_t newlen)
2850 {
2851         int delay;
2852         if (newlen != sizeof(int))
2853                 return -EINVAL;
2854         if (get_user(delay, (int __user *)newval))
2855                 return -EFAULT;
2856         rt_cache_flush(delay);
2857         return 0;
2858 }
2859
/*
 * sysctl table for the IPv4 route cache tunables.
 *
 * Every entry binds a procname to one int-sized variable and a proc
 * handler; entries with jiffies-based handlers also set a matching
 * strategy routine.  The table ends with an entry whose ctl_name is 0.
 */
2860 ctl_table ipv4_route_table[] = {
                 /* "flush": mode 0200, handled by the custom flush routines. */
2861         {
2862                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2863                 .procname       = "flush",
2864                 .data           = &flush_delay,
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0200,
2867                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2868                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2869         },
2870         {
2871                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2872                 .procname       = "gc_thresh",
2873                 .data           = &ipv4_dst_ops.gc_thresh,
2874                 .maxlen         = sizeof(int),
2875                 .mode           = 0644,
2876                 .proc_handler   = &proc_dointvec,
2877         },
2878         {
2879                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2880                 .procname       = "max_size",
2881                 .data           = &ip_rt_max_size,
2882                 .maxlen         = sizeof(int),
2883                 .mode           = 0644,
2884                 .proc_handler   = &proc_dointvec,
2885         },
                 /*
                  * "gc_min_interval" and "gc_min_interval_ms" both operate
                  * on ip_rt_gc_min_interval; they differ only in the
                  * handler/strategy pair used to convert the value.
                  */
2886         {
2887                 /*  Deprecated. Use gc_min_interval_ms */
2888
2889                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2890                 .procname       = "gc_min_interval",
2891                 .data           = &ip_rt_gc_min_interval,
2892                 .maxlen         = sizeof(int),
2893                 .mode           = 0644,
2894                 .proc_handler   = &proc_dointvec_jiffies,
2895                 .strategy       = &sysctl_jiffies,
2896         },
2897         {
2898                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2899                 .procname       = "gc_min_interval_ms",
2900                 .data           = &ip_rt_gc_min_interval,
2901                 .maxlen         = sizeof(int),
2902                 .mode           = 0644,
2903                 .proc_handler   = &proc_dointvec_ms_jiffies,
2904                 .strategy       = &sysctl_ms_jiffies,
2905         },
2906         {
2907                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2908                 .procname       = "gc_timeout",
2909                 .data           = &ip_rt_gc_timeout,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = &proc_dointvec_jiffies,
2913                 .strategy       = &sysctl_jiffies,
2914         },
2915         {
2916                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2917                 .procname       = "gc_interval",
2918                 .data           = &ip_rt_gc_interval,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = &proc_dointvec_jiffies,
2922                 .strategy       = &sysctl_jiffies,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2926                 .procname       = "redirect_load",
2927                 .data           = &ip_rt_redirect_load,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2934                 .procname       = "redirect_number",
2935                 .data           = &ip_rt_redirect_number,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = &proc_dointvec,
2939         },
2940         {
2941                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2942                 .procname       = "redirect_silence",
2943                 .data           = &ip_rt_redirect_silence,
2944                 .maxlen         = sizeof(int),
2945                 .mode           = 0644,
2946                 .proc_handler   = &proc_dointvec,
2947         },
2948         {
2949                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2950                 .procname       = "error_cost",
2951                 .data           = &ip_rt_error_cost,
2952                 .maxlen         = sizeof(int),
2953                 .mode           = 0644,
2954                 .proc_handler   = &proc_dointvec,
2955         },
2956         {
2957                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2958                 .procname       = "error_burst",
2959                 .data           = &ip_rt_error_burst,
2960                 .maxlen         = sizeof(int),
2961                 .mode           = 0644,
2962                 .proc_handler   = &proc_dointvec,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2966                 .procname       = "gc_elasticity",
2967                 .data           = &ip_rt_gc_elasticity,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec,
2971         },
2972         {
2973                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2974                 .procname       = "mtu_expires",
2975                 .data           = &ip_rt_mtu_expires,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = &proc_dointvec_jiffies,
2979                 .strategy       = &sysctl_jiffies,
2980         },
2981         {
2982                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2983                 .procname       = "min_pmtu",
2984                 .data           = &ip_rt_min_pmtu,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = &proc_dointvec,
2988         },
2989         {
2990                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2991                 .procname       = "min_adv_mss",
2992                 .data           = &ip_rt_min_advmss,
2993                 .maxlen         = sizeof(int),
2994                 .mode           = 0644,
2995                 .proc_handler   = &proc_dointvec,
2996         },
2997         {
2998                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2999                 .procname       = "secret_interval",
3000                 .data           = &ip_rt_secret_interval,
3001                 .maxlen         = sizeof(int),
3002                 .mode           = 0644,
3003                 .proc_handler   = &proc_dointvec_jiffies,
3004                 .strategy       = &sysctl_jiffies,
3005         },
             /* Terminating entry. */
3006         { .ctl_name = 0 }
3007 };
3008 #endif
3009
3010 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-CPU route accounting area; ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
3011 struct ip_rt_acct *ip_rt_acct __read_mostly;
3012 #endif /* CONFIG_NET_CLS_ROUTE */
3013
3014 static __initdata unsigned long rhash_entries;
3015 static int __init set_rhash_entries(char *str)
3016 {
3017         if (!str)
3018                 return 0;
3019         rhash_entries = simple_strtoul(str, &str, 0);
3020         return 1;
3021 }
3022 __setup("rhash_entries=", set_rhash_entries);
3023
/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing cache.
 *
 * Seeds the cache generation id, creates the dst slab cache and the route
 * hash table, derives the default GC limits from the table size, brings up
 * the devinet and FIB layers, arms the periodic expiry work and the secret
 * rebuild timer, creates the proc files and registers the RTM_GETROUTE
 * handler.
 *
 * Returns rc, which is never changed from its initial value of 0.
 */
3024 int __init ip_rt_init(void)
3025 {
3026         int rc = 0;
3027
             /* Initial generation id mixed from memory size and jiffies. */
3028         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3029                              (jiffies ^ (jiffies >> 7))));
3030
3031 #ifdef CONFIG_NET_CLS_ROUTE
3032         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3033         if (!ip_rt_acct)
3034                 panic("IP: failed to allocate ip_rt_acct\n");
3035 #endif
3036
3037         ipv4_dst_ops.kmem_cachep =
3038                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3039                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3040
             /* Blackhole entries are allocated from the same slab cache. */
3041         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3042
             /*
              * Route cache hash table; rhash_entries carries the optional
              * "rhash_entries=" boot parameter.
              */
3043         rt_hash_table = (struct rt_hash_bucket *)
3044                 alloc_large_system_hash("IP route cache",
3045                                         sizeof(struct rt_hash_bucket),
3046                                         rhash_entries,
3047                                         (num_physpages >= 128 * 1024) ?
3048                                         15 : 17,
3049                                         0,
3050                                         &rt_hash_log,
3051                                         &rt_hash_mask,
3052                                         0);
3053         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3054         rt_hash_lock_init();
3055
             /* Defaults: GC threshold = bucket count, hard limit = 16x that. */
3056         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3057         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3058
3059         devinet_init();
3060         ip_fib_init();
3061
3062         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3063
3064         /* All the timers, started at system startup tend
3065            to synchronize. Perturb it a bit.
3066          */
3067         schedule_delayed_work(&expires_work,
3068                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3069
3070         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3071                 ip_rt_secret_interval;
3072         add_timer(&rt_secret_timer);
3073
             /* A proc setup failure is only logged; init continues. */
3074         if (ip_rt_proc_init())
3075                 printk(KERN_ERR "Unable to create route proc files\n");
3076 #ifdef CONFIG_XFRM
3077         xfrm_init();
3078         xfrm4_init();
3079 #endif
3080         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3081
3082         return rc;
3083 }
3084
/* Routing entry points exported via EXPORT_SYMBOL(). */
3085 EXPORT_SYMBOL(__ip_select_ident);
3086 EXPORT_SYMBOL(ip_route_input);
3087 EXPORT_SYMBOL(ip_route_output_key);