netns: register net.ipv4.route.flush in each namespace
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, plus the RTO_ONLINK
 * flag, from the object pointed to by @oldflp.  The argument is
 * parenthesized so that any pointer-valued expression (for example
 * "p + 1") expands correctly.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114 #define IP_MAX_MTU      0xFFF0
115
116 #define RT_GC_TIMEOUT (300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly  = 9;
123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly       = HZ;
126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly    = 8;
128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly       = 256;
131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
132
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
135 static struct timer_list rt_secret_timer;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
144                                          struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150
151 static struct dst_ops ipv4_dst_ops = {
152         .family =               AF_INET,
153         .protocol =             __constant_htons(ETH_P_IP),
154         .gc =                   rt_garbage_collect,
155         .check =                ipv4_dst_check,
156         .destroy =              ipv4_dst_destroy,
157         .ifdown =               ipv4_dst_ifdown,
158         .negative_advice =      ipv4_negative_advice,
159         .link_failure =         ipv4_link_failure,
160         .update_pmtu =          ip_rt_update_pmtu,
161         .local_out =            __ip_local_out,
162         .entry_size =           sizeof(struct rtable),
163         .entries =              ATOMIC_INIT(0),
164 };
165
166 #define ECN_OR_COST(class)      TC_PRIO_##class
167
168 const __u8 ip_tos2prio[16] = {
169         TC_PRIO_BESTEFFORT,
170         ECN_OR_COST(FILLER),
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BULK,
174         ECN_OR_COST(BULK),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_INTERACTIVE,
178         ECN_OR_COST(INTERACTIVE),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE_BULK,
182         ECN_OR_COST(INTERACTIVE_BULK),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK)
185 };
186
187
188 /*
189  * Route cache.
190  */
191
192 /* The locking scheme is rather straight forward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
201
/* One slot of the route hash table: head of a singly linked chain. */
struct rt_hash_bucket {
        struct rtable *chain;   /* first entry, linked via u.dst.rt_next */
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.  (With lockdep spinlock_t is quite big, so the
 * table is kept small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;

/*
 * Map a hash slot to the lock protecting it.  Several slots share a lock;
 * the whole expansion is parenthesized so it is safe in any expression.
 */
# define rt_hash_lock_addr(slot) \
        (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialize the shared lock table.  Panics when the
 * allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
                        GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
250
251 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
252 static unsigned                 rt_hash_mask __read_mostly;
253 static unsigned int             rt_hash_log  __read_mostly;
254 static atomic_t                 rt_genid __read_mostly;
255
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258         (__raw_get_cpu_var(rt_cache_stat).field++)
259
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
261 {
262         return jhash_3words((__force u32)(__be32)(daddr),
263                             (__force u32)(__be32)(saddr),
264                             idx, atomic_read(&rt_genid))
265                 & rt_hash_mask;
266 }
267
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270         struct seq_net_private p;
271         int bucket;
272         int genid;
273 };
274
275 static struct rtable *rt_cache_get_first(struct seq_file *seq)
276 {
277         struct rt_cache_iter_state *st = seq->private;
278         struct rtable *r = NULL;
279
280         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
281                 rcu_read_lock_bh();
282                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
283                 while (r) {
284                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
285                             r->rt_genid == st->genid)
286                                 return r;
287                         r = rcu_dereference(r->u.dst.rt_next);
288                 }
289                 rcu_read_unlock_bh();
290         }
291         return r;
292 }
293
294 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
295                                           struct rtable *r)
296 {
297         struct rt_cache_iter_state *st = seq->private;
298         r = r->u.dst.rt_next;
299         while (!r) {
300                 rcu_read_unlock_bh();
301                 if (--st->bucket < 0)
302                         break;
303                 rcu_read_lock_bh();
304                 r = rt_hash_table[st->bucket].chain;
305         }
306         return rcu_dereference(r);
307 }
308
309 static struct rtable *rt_cache_get_next(struct seq_file *seq,
310                                         struct rtable *r)
311 {
312         struct rt_cache_iter_state *st = seq->private;
313         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
314                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
315                         continue;
316                 if (r->rt_genid == st->genid)
317                         break;
318         }
319         return r;
320 }
321
322 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
323 {
324         struct rtable *r = rt_cache_get_first(seq);
325
326         if (r)
327                 while (pos && (r = rt_cache_get_next(seq, r)))
328                         --pos;
329         return pos ? NULL : r;
330 }
331
332 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
333 {
334         struct rt_cache_iter_state *st = seq->private;
335         if (*pos)
336                 return rt_cache_get_idx(seq, *pos - 1);
337         st->genid = atomic_read(&rt_genid);
338         return SEQ_START_TOKEN;
339 }
340
341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
342 {
343         struct rtable *r;
344
345         if (v == SEQ_START_TOKEN)
346                 r = rt_cache_get_first(seq);
347         else
348                 r = rt_cache_get_next(seq, v);
349         ++*pos;
350         return r;
351 }
352
353 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
354 {
355         if (v && v != SEQ_START_TOKEN)
356                 rcu_read_unlock_bh();
357 }
358
359 static int rt_cache_seq_show(struct seq_file *seq, void *v)
360 {
361         if (v == SEQ_START_TOKEN)
362                 seq_printf(seq, "%-127s\n",
363                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
364                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
365                            "HHUptod\tSpecDst");
366         else {
367                 struct rtable *r = v;
368                 int len;
369
370                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
371                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
372                         r->u.dst.dev ? r->u.dst.dev->name : "*",
373                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
374                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
375                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
376                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
377                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
378                         dst_metric(&r->u.dst, RTAX_WINDOW),
379                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
380                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
381                         r->fl.fl4_tos,
382                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
383                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
384                                        dev_queue_xmit) : 0,
385                         r->rt_spec_dst, &len);
386
387                 seq_printf(seq, "%*s\n", 127 - len, "");
388         }
389         return 0;
390 }
391
392 static const struct seq_operations rt_cache_seq_ops = {
393         .start  = rt_cache_seq_start,
394         .next   = rt_cache_seq_next,
395         .stop   = rt_cache_seq_stop,
396         .show   = rt_cache_seq_show,
397 };
398
399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
400 {
401         return seq_open_net(inode, file, &rt_cache_seq_ops,
402                         sizeof(struct rt_cache_iter_state));
403 }
404
405 static const struct file_operations rt_cache_seq_fops = {
406         .owner   = THIS_MODULE,
407         .open    = rt_cache_seq_open,
408         .read    = seq_read,
409         .llseek  = seq_lseek,
410         .release = seq_release_net,
411 };
412
413
414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
415 {
416         int cpu;
417
418         if (*pos == 0)
419                 return SEQ_START_TOKEN;
420
421         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
422                 if (!cpu_possible(cpu))
423                         continue;
424                 *pos = cpu+1;
425                 return &per_cpu(rt_cache_stat, cpu);
426         }
427         return NULL;
428 }
429
430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
431 {
432         int cpu;
433
434         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
435                 if (!cpu_possible(cpu))
436                         continue;
437                 *pos = cpu+1;
438                 return &per_cpu(rt_cache_stat, cpu);
439         }
440         return NULL;
441
442 }
443
/* seq_file ->stop of the per-CPU listing: intentionally does nothing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
448
449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
450 {
451         struct rt_cache_stat *st = v;
452
453         if (v == SEQ_START_TOKEN) {
454                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
455                 return 0;
456         }
457
458         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
459                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
460                    atomic_read(&ipv4_dst_ops.entries),
461                    st->in_hit,
462                    st->in_slow_tot,
463                    st->in_slow_mc,
464                    st->in_no_route,
465                    st->in_brd,
466                    st->in_martian_dst,
467                    st->in_martian_src,
468
469                    st->out_hit,
470                    st->out_slow_tot,
471                    st->out_slow_mc,
472
473                    st->gc_total,
474                    st->gc_ignored,
475                    st->gc_goal_miss,
476                    st->gc_dst_overflow,
477                    st->in_hlist_search,
478                    st->out_hlist_search
479                 );
480         return 0;
481 }
482
483 static const struct seq_operations rt_cpu_seq_ops = {
484         .start  = rt_cpu_seq_start,
485         .next   = rt_cpu_seq_next,
486         .stop   = rt_cpu_seq_stop,
487         .show   = rt_cpu_seq_show,
488 };
489
490
491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
492 {
493         return seq_open(file, &rt_cpu_seq_ops);
494 }
495
496 static const struct file_operations rt_cpu_seq_fops = {
497         .owner   = THIS_MODULE,
498         .open    = rt_cpu_seq_open,
499         .read    = seq_read,
500         .llseek  = seq_lseek,
501         .release = seq_release,
502 };
503
504 #ifdef CONFIG_NET_CLS_ROUTE
505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
506                            int length, int *eof, void *data)
507 {
508         unsigned int i;
509
510         if ((offset & 3) || (length & 3))
511                 return -EIO;
512
513         if (offset >= sizeof(struct ip_rt_acct) * 256) {
514                 *eof = 1;
515                 return 0;
516         }
517
518         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
519                 length = sizeof(struct ip_rt_acct) * 256 - offset;
520                 *eof = 1;
521         }
522
523         offset /= sizeof(u32);
524
525         if (length > 0) {
526                 u32 *dst = (u32 *) buffer;
527
528                 *start = buffer;
529                 memset(dst, 0, length);
530
531                 for_each_possible_cpu(i) {
532                         unsigned int j;
533                         u32 *src;
534
535                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
536                         for (j = 0; j < length/4; j++)
537                                 dst[j] += src[j];
538                 }
539         }
540         return length;
541 }
542 #endif
543
544 static int __net_init ip_rt_do_proc_init(struct net *net)
545 {
546         struct proc_dir_entry *pde;
547
548         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
549                         &rt_cache_seq_fops);
550         if (!pde)
551                 goto err1;
552
553         pde = proc_create("rt_cache", S_IRUGO,
554                           net->proc_net_stat, &rt_cpu_seq_fops);
555         if (!pde)
556                 goto err2;
557
558 #ifdef CONFIG_NET_CLS_ROUTE
559         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
560                         ip_rt_acct_read, NULL);
561         if (!pde)
562                 goto err3;
563 #endif
564         return 0;
565
566 #ifdef CONFIG_NET_CLS_ROUTE
567 err3:
568         remove_proc_entry("rt_cache", net->proc_net_stat);
569 #endif
570 err2:
571         remove_proc_entry("rt_cache", net->proc_net);
572 err1:
573         return -ENOMEM;
574 }
575
576 static void __net_exit ip_rt_do_proc_exit(struct net *net)
577 {
578         remove_proc_entry("rt_cache", net->proc_net_stat);
579         remove_proc_entry("rt_cache", net->proc_net);
580         remove_proc_entry("rt_acct", net->proc_net);
581 }
582
583 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
584         .init = ip_rt_do_proc_init,
585         .exit = ip_rt_do_proc_exit,
586 };
587
588 static int __init ip_rt_proc_init(void)
589 {
590         return register_pernet_subsys(&ip_rt_proc_ops);
591 }
592
593 #else
/* Without CONFIG_PROC_FS there is nothing to register: report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
598 #endif /* CONFIG_PROC_FS */
599
600 static inline void rt_free(struct rtable *rt)
601 {
602         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
603 }
604
/* Call ip_rt_put() on @rt, then queue it for freeing like rt_free(). */
static inline void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        rt_free(rt);
}
610
611 static inline int rt_fast_clean(struct rtable *rth)
612 {
613         /* Kill broadcast/multicast entries very aggresively, if they
614            collide in hash table with more useful entries */
615         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
616                 rth->fl.iif && rth->u.dst.rt_next;
617 }
618
619 static inline int rt_valuable(struct rtable *rth)
620 {
621         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
622                 rth->u.dst.expires;
623 }
624
625 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
626 {
627         unsigned long age;
628         int ret = 0;
629
630         if (atomic_read(&rth->u.dst.__refcnt))
631                 goto out;
632
633         ret = 1;
634         if (rth->u.dst.expires &&
635             time_after_eq(jiffies, rth->u.dst.expires))
636                 goto out;
637
638         age = jiffies - rth->u.dst.lastuse;
639         ret = 0;
640         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
641             (age <= tmo2 && rt_valuable(rth)))
642                 goto out;
643         ret = 1;
644 out:    return ret;
645 }
646
647 /* Bits of score are:
648  * 31: very valuable
649  * 30: not quite useless
650  * 29..0: usage counter
651  */
652 static inline u32 rt_score(struct rtable *rt)
653 {
654         u32 score = jiffies - rt->u.dst.lastuse;
655
656         score = ~score & ~(3<<30);
657
658         if (rt_valuable(rt))
659                 score |= (1<<31);
660
661         if (!rt->fl.iif ||
662             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
663                 score |= (1<<30);
664
665         return score;
666 }
667
668 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
669 {
670         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
671                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
672                 (fl1->mark ^ fl2->mark) |
673                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
674                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
675                 (fl1->oif ^ fl2->oif) |
676                 (fl1->iif ^ fl2->iif)) == 0;
677 }
678
679 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
680 {
681         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
682 }
683
684 /*
685  * Perform a full scan of hash table and free all entries.
686  * Can be called by a softirq or a process.
687  * In the later case, we want to be reschedule if necessary
688  */
689 static void rt_do_flush(int process_context)
690 {
691         unsigned int i;
692         struct rtable *rth, *next;
693
694         for (i = 0; i <= rt_hash_mask; i++) {
695                 if (process_context && need_resched())
696                         cond_resched();
697                 rth = rt_hash_table[i].chain;
698                 if (!rth)
699                         continue;
700
701                 spin_lock_bh(rt_hash_lock_addr(i));
702                 rth = rt_hash_table[i].chain;
703                 rt_hash_table[i].chain = NULL;
704                 spin_unlock_bh(rt_hash_lock_addr(i));
705
706                 for (; rth; rth = next) {
707                         next = rth->u.dst.rt_next;
708                         rt_free(rth);
709                 }
710         }
711 }
712
/*
 * Scan a part of the hash table and free entries that are stale or have
 * aged out.  The scan resumes after the slot reached by the previous
 * call (kept in the static "rover") and wraps around the table.
 */
static void rt_check_expire(void)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        u64 mult;

        /*
         * Number of slots to visit in this run:
         * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout,
         * capped at the table size.  The division is skipped when the
         * timeout is not greater than 1.
         */
        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                /* Age limit; halved for every entry kept in this chain. */
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (need_resched())
                        cond_resched();

                /* Unlocked peek: skip empty slots without locking. */
                if (*rthp == NULL)
                        continue;
                spin_lock_bh(rt_hash_lock_addr(i));
                /* rthp always points at the link that refers to rth. */
                while ((rth = *rthp) != NULL) {
                        /* Entries of another generation are always freed. */
                        if (rth->rt_genid != atomic_read(&rt_genid)) {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                continue;
                        }
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                        /* Not yet expired: keep it. */
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                /* Still in use or young enough: keep it. */
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
        }
        /* Remember where to resume on the next call. */
        rover = i;
}
765
766 /*
767  * rt_worker_func() is run in process context.
768  * we call rt_check_expire() to scan part of the hash table
769  */
770 static void rt_worker_func(struct work_struct *work)
771 {
772         rt_check_expire();
773         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
774 }
775
776 /*
777  * Pertubation of rt_genid by a small quantity [1..256]
778  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
779  * many times (2^24) without giving recent rt_genid.
780  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
781  */
782 static void rt_cache_invalidate(void)
783 {
784         unsigned char shuffle;
785
786         get_random_bytes(&shuffle, sizeof(shuffle));
787         atomic_add(shuffle + 1U, &rt_genid);
788 }
789
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * @net is not used by this implementation.
 */
void rt_cache_flush(struct net *net, int delay)
{
        rt_cache_invalidate();

        if (delay < 0)
                return;

        /* Let the flush reschedule only when not running in softirq. */
        rt_do_flush(!in_softirq());
}
800
801 /*
802  * We change rt_genid and let gc do the cleanup
803  */
804 static void rt_secret_rebuild(unsigned long dummy)
805 {
806         rt_cache_invalidate();
807         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
808 }
809
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an equilibrium
   point, where the number of aged-off entries is approximately equal to
   the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it is reduced to limit the cache size.
 */
822
823 static int rt_garbage_collect(struct dst_ops *ops)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in dangerous area. Try to reduce cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
884                                         !rt_may_expire(rth, tmo, expire)) {
885                                         tmo >>= 1;
886                                         rthp = &rth->u.dst.rt_next;
887                                         continue;
888                                 }
889                                 *rthp = rth->u.dst.rt_next;
890                                 rt_free(rth);
891                                 goal--;
892                         }
893                         spin_unlock_bh(rt_hash_lock_addr(k));
894                         if (goal <= 0)
895                                 break;
896                 }
897                 rover = k;
898
899                 if (goal <= 0)
900                         goto work_done;
901
902                 /* Goal is not achieved. We stop process if:
903
904                    - if expire reduced to zero. Otherwise, expire is halfed.
905                    - if table is not full.
906                    - if we are called from interrupt.
907                    - jiffies check is just fallback/debug loop breaker.
908                      We will not spin here for long time in any case.
909                  */
910
911                 RT_CACHE_STAT_INC(gc_goal_miss);
912
913                 if (expire == 0)
914                         break;
915
916                 expire >>= 1;
917 #if RT_CACHE_DEBUG >= 2
918                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
919                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
920 #endif
921
922                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
923                         goto out;
924         } while (!in_softirq() && time_before_eq(jiffies, now));
925
926         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
927                 goto out;
928         if (net_ratelimit())
929                 printk(KERN_WARNING "dst cache overflow\n");
930         RT_CACHE_STAT_INC(gc_dst_overflow);
931         return 1;
932
933 work_done:
934         expire += ip_rt_gc_min_interval;
935         if (expire > ip_rt_gc_timeout ||
936             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
937                 expire = ip_rt_gc_timeout;
938 #if RT_CACHE_DEBUG >= 2
939         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
940                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
941 #endif
942 out:    return 0;
943 }
944
945 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
946 {
947         struct rtable   *rth, **rthp;
948         unsigned long   now;
949         struct rtable *cand, **candp;
950         u32             min_score;
951         int             chain_length;
952         int attempts = !in_softirq();
953
954 restart:
955         chain_length = 0;
956         min_score = ~(u32)0;
957         cand = NULL;
958         candp = NULL;
959         now = jiffies;
960
961         rthp = &rt_hash_table[hash].chain;
962
963         spin_lock_bh(rt_hash_lock_addr(hash));
964         while ((rth = *rthp) != NULL) {
965                 if (rth->rt_genid != atomic_read(&rt_genid)) {
966                         *rthp = rth->u.dst.rt_next;
967                         rt_free(rth);
968                         continue;
969                 }
970                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
971                         /* Put it first */
972                         *rthp = rth->u.dst.rt_next;
973                         /*
974                          * Since lookup is lockfree, the deletion
975                          * must be visible to another weakly ordered CPU before
976                          * the insertion at the start of the hash chain.
977                          */
978                         rcu_assign_pointer(rth->u.dst.rt_next,
979                                            rt_hash_table[hash].chain);
980                         /*
981                          * Since lookup is lockfree, the update writes
982                          * must be ordered for consistency on SMP.
983                          */
984                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
985
986                         dst_use(&rth->u.dst, now);
987                         spin_unlock_bh(rt_hash_lock_addr(hash));
988
989                         rt_drop(rt);
990                         *rp = rth;
991                         return 0;
992                 }
993
994                 if (!atomic_read(&rth->u.dst.__refcnt)) {
995                         u32 score = rt_score(rth);
996
997                         if (score <= min_score) {
998                                 cand = rth;
999                                 candp = rthp;
1000                                 min_score = score;
1001                         }
1002                 }
1003
1004                 chain_length++;
1005
1006                 rthp = &rth->u.dst.rt_next;
1007         }
1008
1009         if (cand) {
1010                 /* ip_rt_gc_elasticity used to be average length of chain
1011                  * length, when exceeded gc becomes really aggressive.
1012                  *
1013                  * The second limit is less certain. At the moment it allows
1014                  * only 2 entries per bucket. We will see.
1015                  */
1016                 if (chain_length > ip_rt_gc_elasticity) {
1017                         *candp = cand->u.dst.rt_next;
1018                         rt_free(cand);
1019                 }
1020         }
1021
1022         /* Try to bind route to arp only if it is output
1023            route or unicast forwarding path.
1024          */
1025         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1026                 int err = arp_bind_neighbour(&rt->u.dst);
1027                 if (err) {
1028                         spin_unlock_bh(rt_hash_lock_addr(hash));
1029
1030                         if (err != -ENOBUFS) {
1031                                 rt_drop(rt);
1032                                 return err;
1033                         }
1034
1035                         /* Neighbour tables are full and nothing
1036                            can be released. Try to shrink route cache,
1037                            it is most likely it holds some neighbour records.
1038                          */
1039                         if (attempts-- > 0) {
1040                                 int saved_elasticity = ip_rt_gc_elasticity;
1041                                 int saved_int = ip_rt_gc_min_interval;
1042                                 ip_rt_gc_elasticity     = 1;
1043                                 ip_rt_gc_min_interval   = 0;
1044                                 rt_garbage_collect(&ipv4_dst_ops);
1045                                 ip_rt_gc_min_interval   = saved_int;
1046                                 ip_rt_gc_elasticity     = saved_elasticity;
1047                                 goto restart;
1048                         }
1049
1050                         if (net_ratelimit())
1051                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1052                         rt_drop(rt);
1053                         return -ENOBUFS;
1054                 }
1055         }
1056
1057         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1058 #if RT_CACHE_DEBUG >= 2
1059         if (rt->u.dst.rt_next) {
1060                 struct rtable *trt;
1061                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1062                        NIPQUAD(rt->rt_dst));
1063                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1064                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1065                 printk("\n");
1066         }
1067 #endif
1068         rt_hash_table[hash].chain = rt;
1069         spin_unlock_bh(rt_hash_lock_addr(hash));
1070         *rp = rt;
1071         return 0;
1072 }
1073
1074 void rt_bind_peer(struct rtable *rt, int create)
1075 {
1076         static DEFINE_SPINLOCK(rt_peer_lock);
1077         struct inet_peer *peer;
1078
1079         peer = inet_getpeer(rt->rt_dst, create);
1080
1081         spin_lock_bh(&rt_peer_lock);
1082         if (rt->peer == NULL) {
1083                 rt->peer = peer;
1084                 peer = NULL;
1085         }
1086         spin_unlock_bh(&rt_peer_lock);
1087         if (peer)
1088                 inet_putpeer(peer);
1089 }
1090
1091 /*
1092  * Peer allocation may fail only in serious out-of-memory conditions.  However
1093  * we still can generate some output.
1094  * Random ID selection looks a bit dangerous because we have no chances to
1095  * select ID being unique in a reasonable period of time.
1096  * But broken packet identifier may be better than no packet at all.
1097  */
1098 static void ip_select_fb_ident(struct iphdr *iph)
1099 {
1100         static DEFINE_SPINLOCK(ip_fb_id_lock);
1101         static u32 ip_fallback_id;
1102         u32 salt;
1103
1104         spin_lock_bh(&ip_fb_id_lock);
1105         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1106         iph->id = htons(salt & 0xFFFF);
1107         ip_fallback_id = salt;
1108         spin_unlock_bh(&ip_fb_id_lock);
1109 }
1110
1111 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1112 {
1113         struct rtable *rt = (struct rtable *) dst;
1114
1115         if (rt) {
1116                 if (rt->peer == NULL)
1117                         rt_bind_peer(rt, 1);
1118
1119                 /* If peer is attached to destination, it is never detached,
1120                    so that we need not to grab a lock to dereference it.
1121                  */
1122                 if (rt->peer) {
1123                         iph->id = htons(inet_getid(rt->peer, more));
1124                         return;
1125                 }
1126         } else
1127                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1128                        __builtin_return_address(0));
1129
1130         ip_select_fb_ident(iph);
1131 }
1132
1133 static void rt_del(unsigned hash, struct rtable *rt)
1134 {
1135         struct rtable **rthp, *aux;
1136
1137         rthp = &rt_hash_table[hash].chain;
1138         spin_lock_bh(rt_hash_lock_addr(hash));
1139         ip_rt_put(rt);
1140         while ((aux = *rthp) != NULL) {
1141                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1142                         *rthp = aux->u.dst.rt_next;
1143                         rt_free(aux);
1144                         continue;
1145                 }
1146                 rthp = &aux->u.dst.rt_next;
1147         }
1148         spin_unlock_bh(rt_hash_lock_addr(hash));
1149 }
1150
1151 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1152                     __be32 saddr, struct net_device *dev)
1153 {
1154         int i, k;
1155         struct in_device *in_dev = in_dev_get(dev);
1156         struct rtable *rth, **rthp;
1157         __be32  skeys[2] = { saddr, 0 };
1158         int  ikeys[2] = { dev->ifindex, 0 };
1159         struct netevent_redirect netevent;
1160         struct net *net;
1161
1162         if (!in_dev)
1163                 return;
1164
1165         net = dev_net(dev);
1166         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1167             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1168             || ipv4_is_zeronet(new_gw))
1169                 goto reject_redirect;
1170
1171         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1172                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1173                         goto reject_redirect;
1174                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1175                         goto reject_redirect;
1176         } else {
1177                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1178                         goto reject_redirect;
1179         }
1180
1181         for (i = 0; i < 2; i++) {
1182                 for (k = 0; k < 2; k++) {
1183                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1184
1185                         rthp=&rt_hash_table[hash].chain;
1186
1187                         rcu_read_lock();
1188                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1189                                 struct rtable *rt;
1190
1191                                 if (rth->fl.fl4_dst != daddr ||
1192                                     rth->fl.fl4_src != skeys[i] ||
1193                                     rth->fl.oif != ikeys[k] ||
1194                                     rth->fl.iif != 0 ||
1195                                     rth->rt_genid != atomic_read(&rt_genid) ||
1196                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1197                                         rthp = &rth->u.dst.rt_next;
1198                                         continue;
1199                                 }
1200
1201                                 if (rth->rt_dst != daddr ||
1202                                     rth->rt_src != saddr ||
1203                                     rth->u.dst.error ||
1204                                     rth->rt_gateway != old_gw ||
1205                                     rth->u.dst.dev != dev)
1206                                         break;
1207
1208                                 dst_hold(&rth->u.dst);
1209                                 rcu_read_unlock();
1210
1211                                 rt = dst_alloc(&ipv4_dst_ops);
1212                                 if (rt == NULL) {
1213                                         ip_rt_put(rth);
1214                                         in_dev_put(in_dev);
1215                                         return;
1216                                 }
1217
1218                                 /* Copy all the information. */
1219                                 *rt = *rth;
1220                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1221                                 rt->u.dst.__use         = 1;
1222                                 atomic_set(&rt->u.dst.__refcnt, 1);
1223                                 rt->u.dst.child         = NULL;
1224                                 if (rt->u.dst.dev)
1225                                         dev_hold(rt->u.dst.dev);
1226                                 if (rt->idev)
1227                                         in_dev_hold(rt->idev);
1228                                 rt->u.dst.obsolete      = 0;
1229                                 rt->u.dst.lastuse       = jiffies;
1230                                 rt->u.dst.path          = &rt->u.dst;
1231                                 rt->u.dst.neighbour     = NULL;
1232                                 rt->u.dst.hh            = NULL;
1233                                 rt->u.dst.xfrm          = NULL;
1234                                 rt->rt_genid            = atomic_read(&rt_genid);
1235                                 rt->rt_flags            |= RTCF_REDIRECTED;
1236
1237                                 /* Gateway is different ... */
1238                                 rt->rt_gateway          = new_gw;
1239
1240                                 /* Redirect received -> path was valid */
1241                                 dst_confirm(&rth->u.dst);
1242
1243                                 if (rt->peer)
1244                                         atomic_inc(&rt->peer->refcnt);
1245
1246                                 if (arp_bind_neighbour(&rt->u.dst) ||
1247                                     !(rt->u.dst.neighbour->nud_state &
1248                                             NUD_VALID)) {
1249                                         if (rt->u.dst.neighbour)
1250                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1251                                         ip_rt_put(rth);
1252                                         rt_drop(rt);
1253                                         goto do_next;
1254                                 }
1255
1256                                 netevent.old = &rth->u.dst;
1257                                 netevent.new = &rt->u.dst;
1258                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1259                                                         &netevent);
1260
1261                                 rt_del(hash, rth);
1262                                 if (!rt_intern_hash(hash, rt, &rt))
1263                                         ip_rt_put(rt);
1264                                 goto do_next;
1265                         }
1266                         rcu_read_unlock();
1267                 do_next:
1268                         ;
1269                 }
1270         }
1271         in_dev_put(in_dev);
1272         return;
1273
1274 reject_redirect:
1275 #ifdef CONFIG_IP_ROUTE_VERBOSE
1276         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1277                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1278                         NIPQUAD_FMT " ignored.\n"
1279                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1280                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1281                        NIPQUAD(saddr), NIPQUAD(daddr));
1282 #endif
1283         in_dev_put(in_dev);
1284 }
1285
1286 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1287 {
1288         struct rtable *rt = (struct rtable *)dst;
1289         struct dst_entry *ret = dst;
1290
1291         if (rt) {
1292                 if (dst->obsolete) {
1293                         ip_rt_put(rt);
1294                         ret = NULL;
1295                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1296                            rt->u.dst.expires) {
1297                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1298                                                 rt->fl.oif);
1299 #if RT_CACHE_DEBUG >= 1
1300                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1301                                           NIPQUAD_FMT "/%02x dropped\n",
1302                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1303 #endif
1304                         rt_del(hash, rt);
1305                         ret = NULL;
1306                 }
1307         }
1308         return ret;
1309 }
1310
1311 /*
1312  * Algorithm:
1313  *      1. The first ip_rt_redirect_number redirects are sent
1314  *         with exponential backoff, then we stop sending them at all,
1315  *         assuming that the host ignores our redirects.
1316  *      2. If we did not see packets requiring redirects
1317  *         during ip_rt_redirect_silence, we assume that the host
1318  *         forgot redirected route and start to send redirects again.
1319  *
1320  * This algorithm is much cheaper and more intelligent than dumb load limiting
1321  * in icmp.c.
1322  *
1323  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1324  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1325  */
1326
1327 void ip_rt_send_redirect(struct sk_buff *skb)
1328 {
1329         struct rtable *rt = skb->rtable;
1330         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1331
1332         if (!in_dev)
1333                 return;
1334
1335         if (!IN_DEV_TX_REDIRECTS(in_dev))
1336                 goto out;
1337
1338         /* No redirected packets during ip_rt_redirect_silence;
1339          * reset the algorithm.
1340          */
1341         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1342                 rt->u.dst.rate_tokens = 0;
1343
1344         /* Too many ignored redirects; do not send anything
1345          * set u.dst.rate_last to the last seen redirected packet.
1346          */
1347         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1348                 rt->u.dst.rate_last = jiffies;
1349                 goto out;
1350         }
1351
1352         /* Check for load limit; set rate_last to the latest sent
1353          * redirect.
1354          */
1355         if (rt->u.dst.rate_tokens == 0 ||
1356             time_after(jiffies,
1357                        (rt->u.dst.rate_last +
1358                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1359                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1360                 rt->u.dst.rate_last = jiffies;
1361                 ++rt->u.dst.rate_tokens;
1362 #ifdef CONFIG_IP_ROUTE_VERBOSE
1363                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1364                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1365                     net_ratelimit())
1366                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1367                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1368                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1369                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1370 #endif
1371         }
1372 out:
1373         in_dev_put(in_dev);
1374 }
1375
1376 static int ip_error(struct sk_buff *skb)
1377 {
1378         struct rtable *rt = skb->rtable;
1379         unsigned long now;
1380         int code;
1381
1382         switch (rt->u.dst.error) {
1383                 case EINVAL:
1384                 default:
1385                         goto out;
1386                 case EHOSTUNREACH:
1387                         code = ICMP_HOST_UNREACH;
1388                         break;
1389                 case ENETUNREACH:
1390                         code = ICMP_NET_UNREACH;
1391                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1392                         break;
1393                 case EACCES:
1394                         code = ICMP_PKT_FILTERED;
1395                         break;
1396         }
1397
1398         now = jiffies;
1399         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1400         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1401                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1402         rt->u.dst.rate_last = now;
1403         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1404                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1405                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1406         }
1407
1408 out:    kfree_skb(skb);
1409         return 0;
1410 }
1411
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1429
1430 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1431                                  unsigned short new_mtu,
1432                                  struct net_device *dev)
1433 {
1434         int i, k;
1435         unsigned short old_mtu = ntohs(iph->tot_len);
1436         struct rtable *rth;
1437         int  ikeys[2] = { dev->ifindex, 0 };
1438         __be32  skeys[2] = { iph->saddr, 0, };
1439         __be32  daddr = iph->daddr;
1440         unsigned short est_mtu = 0;
1441
1442         if (ipv4_config.no_pmtu_disc)
1443                 return 0;
1444
1445         for (k = 0; k < 2; k++) {
1446                 for (i = 0; i < 2; i++) {
1447                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1448
1449                         rcu_read_lock();
1450                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1451                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1452                                 unsigned short mtu = new_mtu;
1453
1454                                 if (rth->fl.fl4_dst != daddr ||
1455                                     rth->fl.fl4_src != skeys[i] ||
1456                                     rth->rt_dst != daddr ||
1457                                     rth->rt_src != iph->saddr ||
1458                                     rth->fl.oif != ikeys[k] ||
1459                                     rth->fl.iif != 0 ||
1460                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1461                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1462                                     rth->rt_genid != atomic_read(&rt_genid))
1463                                         continue;
1464
1465                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1466
1467                                         /* BSD 4.2 compatibility hack :-( */
1468                                         if (mtu == 0 &&
1469                                             old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1470                                             old_mtu >= 68 + (iph->ihl << 2))
1471                                                 old_mtu -= iph->ihl << 2;
1472
1473                                         mtu = guess_mtu(old_mtu);
1474                                 }
1475                                 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1476                                         if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1477                                                 dst_confirm(&rth->u.dst);
1478                                                 if (mtu < ip_rt_min_pmtu) {
1479                                                         mtu = ip_rt_min_pmtu;
1480                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1481                                                                 (1 << RTAX_MTU);
1482                                                 }
1483                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1484                                                 dst_set_expires(&rth->u.dst,
1485                                                         ip_rt_mtu_expires);
1486                                         }
1487                                         est_mtu = mtu;
1488                                 }
1489                         }
1490                         rcu_read_unlock();
1491                 }
1492         }
1493         return est_mtu ? : new_mtu;
1494 }
1495
1496 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1497 {
1498         if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1499             !(dst_metric_locked(dst, RTAX_MTU))) {
1500                 if (mtu < ip_rt_min_pmtu) {
1501                         mtu = ip_rt_min_pmtu;
1502                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1503                 }
1504                 dst->metrics[RTAX_MTU-1] = mtu;
1505                 dst_set_expires(dst, ip_rt_mtu_expires);
1506                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1507         }
1508 }
1509
1510 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1511 {
1512         return NULL;
1513 }
1514
1515 static void ipv4_dst_destroy(struct dst_entry *dst)
1516 {
1517         struct rtable *rt = (struct rtable *) dst;
1518         struct inet_peer *peer = rt->peer;
1519         struct in_device *idev = rt->idev;
1520
1521         if (peer) {
1522                 rt->peer = NULL;
1523                 inet_putpeer(peer);
1524         }
1525
1526         if (idev) {
1527                 rt->idev = NULL;
1528                 in_dev_put(idev);
1529         }
1530 }
1531
1532 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1533                             int how)
1534 {
1535         struct rtable *rt = (struct rtable *) dst;
1536         struct in_device *idev = rt->idev;
1537         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1538                 struct in_device *loopback_idev =
1539                         in_dev_get(dev_net(dev)->loopback_dev);
1540                 if (loopback_idev) {
1541                         rt->idev = loopback_idev;
1542                         in_dev_put(idev);
1543                 }
1544         }
1545 }
1546
1547 static void ipv4_link_failure(struct sk_buff *skb)
1548 {
1549         struct rtable *rt;
1550
1551         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1552
1553         rt = skb->rtable;
1554         if (rt)
1555                 dst_set_expires(&rt->u.dst, 0);
1556 }
1557
1558 static int ip_rt_bug(struct sk_buff *skb)
1559 {
1560         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1561                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1562                 skb->dev ? skb->dev->name : "?");
1563         kfree_skb(skb);
1564         return 0;
1565 }
1566
1567 /*
1568    We do not cache source address of outgoing interface,
1569    because it is used only by IP RR, TS and SRR options,
1570    so that it out of fast path.
1571
1572    BTW remember: "addr" is allowed to be not aligned
1573    in IP options!
1574  */
1575
1576 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1577 {
1578         __be32 src;
1579         struct fib_result res;
1580
1581         if (rt->fl.iif == 0)
1582                 src = rt->rt_src;
1583         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1584                 src = FIB_RES_PREFSRC(res);
1585                 fib_res_put(&res);
1586         } else
1587                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1588                                         RT_SCOPE_UNIVERSE);
1589         memcpy(addr, &src, 4);
1590 }
1591
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
	unsigned int n;

	for (n = 0; n < ARRAY_SIZE(halves); n++) {
		if (!(rt->u.dst.tclassid & halves[n]))
			rt->u.dst.tclassid |= tag & halves[n];
	}
}
#endif
1601
1602 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1603 {
1604         struct fib_info *fi = res->fi;
1605
1606         if (fi) {
1607                 if (FIB_RES_GW(*res) &&
1608                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1609                         rt->rt_gateway = FIB_RES_GW(*res);
1610                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1611                        sizeof(rt->u.dst.metrics));
1612                 if (fi->fib_mtu == 0) {
1613                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1614                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1615                             rt->rt_gateway != rt->rt_dst &&
1616                             rt->u.dst.dev->mtu > 576)
1617                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1618                 }
1619 #ifdef CONFIG_NET_CLS_ROUTE
1620                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1621 #endif
1622         } else
1623                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1624
1625         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1626                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1627         if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1628                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1629         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1630                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1631                                        ip_rt_min_advmss);
1632         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1633                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1634
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 #ifdef CONFIG_IP_MULTIPLE_TABLES
1637         set_class_tag(rt, fib_rules_tclass(res));
1638 #endif
1639         set_class_tag(rt, itag);
1640 #endif
1641         rt->rt_type = res->type;
1642 }
1643
1644 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1645                                 u8 tos, struct net_device *dev, int our)
1646 {
1647         unsigned hash;
1648         struct rtable *rth;
1649         __be32 spec_dst;
1650         struct in_device *in_dev = in_dev_get(dev);
1651         u32 itag = 0;
1652
1653         /* Primary sanity checks. */
1654
1655         if (in_dev == NULL)
1656                 return -EINVAL;
1657
1658         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1659             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1660                 goto e_inval;
1661
1662         if (ipv4_is_zeronet(saddr)) {
1663                 if (!ipv4_is_local_multicast(daddr))
1664                         goto e_inval;
1665                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1666         } else if (fib_validate_source(saddr, 0, tos, 0,
1667                                         dev, &spec_dst, &itag) < 0)
1668                 goto e_inval;
1669
1670         rth = dst_alloc(&ipv4_dst_ops);
1671         if (!rth)
1672                 goto e_nobufs;
1673
1674         rth->u.dst.output= ip_rt_bug;
1675
1676         atomic_set(&rth->u.dst.__refcnt, 1);
1677         rth->u.dst.flags= DST_HOST;
1678         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1679                 rth->u.dst.flags |= DST_NOPOLICY;
1680         rth->fl.fl4_dst = daddr;
1681         rth->rt_dst     = daddr;
1682         rth->fl.fl4_tos = tos;
1683         rth->fl.mark    = skb->mark;
1684         rth->fl.fl4_src = saddr;
1685         rth->rt_src     = saddr;
1686 #ifdef CONFIG_NET_CLS_ROUTE
1687         rth->u.dst.tclassid = itag;
1688 #endif
1689         rth->rt_iif     =
1690         rth->fl.iif     = dev->ifindex;
1691         rth->u.dst.dev  = init_net.loopback_dev;
1692         dev_hold(rth->u.dst.dev);
1693         rth->idev       = in_dev_get(rth->u.dst.dev);
1694         rth->fl.oif     = 0;
1695         rth->rt_gateway = daddr;
1696         rth->rt_spec_dst= spec_dst;
1697         rth->rt_genid   = atomic_read(&rt_genid);
1698         rth->rt_flags   = RTCF_MULTICAST;
1699         rth->rt_type    = RTN_MULTICAST;
1700         if (our) {
1701                 rth->u.dst.input= ip_local_deliver;
1702                 rth->rt_flags |= RTCF_LOCAL;
1703         }
1704
1705 #ifdef CONFIG_IP_MROUTE
1706         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1707                 rth->u.dst.input = ip_mr_input;
1708 #endif
1709         RT_CACHE_STAT_INC(in_slow_mc);
1710
1711         in_dev_put(in_dev);
1712         hash = rt_hash(daddr, saddr, dev->ifindex);
1713         return rt_intern_hash(hash, rth, &skb->rtable);
1714
1715 e_nobufs:
1716         in_dev_put(in_dev);
1717         return -ENOBUFS;
1718
1719 e_inval:
1720         in_dev_put(in_dev);
1721         return -EINVAL;
1722 }
1723
1724
1725 static void ip_handle_martian_source(struct net_device *dev,
1726                                      struct in_device *in_dev,
1727                                      struct sk_buff *skb,
1728                                      __be32 daddr,
1729                                      __be32 saddr)
1730 {
1731         RT_CACHE_STAT_INC(in_martian_src);
1732 #ifdef CONFIG_IP_ROUTE_VERBOSE
1733         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1734                 /*
1735                  *      RFC1812 recommendation, if source is martian,
1736                  *      the only hint is MAC header.
1737                  */
1738                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1739                         NIPQUAD_FMT", on dev %s\n",
1740                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1741                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1742                         int i;
1743                         const unsigned char *p = skb_mac_header(skb);
1744                         printk(KERN_WARNING "ll header: ");
1745                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1746                                 printk("%02x", *p);
1747                                 if (i < (dev->hard_header_len - 1))
1748                                         printk(":");
1749                         }
1750                         printk("\n");
1751                 }
1752         }
1753 #endif
1754 }
1755
1756 static int __mkroute_input(struct sk_buff *skb,
1757                            struct fib_result *res,
1758                            struct in_device *in_dev,
1759                            __be32 daddr, __be32 saddr, u32 tos,
1760                            struct rtable **result)
1761 {
1762
1763         struct rtable *rth;
1764         int err;
1765         struct in_device *out_dev;
1766         unsigned flags = 0;
1767         __be32 spec_dst;
1768         u32 itag;
1769
1770         /* get a working reference to the output device */
1771         out_dev = in_dev_get(FIB_RES_DEV(*res));
1772         if (out_dev == NULL) {
1773                 if (net_ratelimit())
1774                         printk(KERN_CRIT "Bug in ip_route_input" \
1775                                "_slow(). Please, report\n");
1776                 return -EINVAL;
1777         }
1778
1779
1780         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1781                                   in_dev->dev, &spec_dst, &itag);
1782         if (err < 0) {
1783                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1784                                          saddr);
1785
1786                 err = -EINVAL;
1787                 goto cleanup;
1788         }
1789
1790         if (err)
1791                 flags |= RTCF_DIRECTSRC;
1792
1793         if (out_dev == in_dev && err &&
1794             (IN_DEV_SHARED_MEDIA(out_dev) ||
1795              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796                 flags |= RTCF_DOREDIRECT;
1797
1798         if (skb->protocol != htons(ETH_P_IP)) {
1799                 /* Not IP (i.e. ARP). Do not create route, if it is
1800                  * invalid for proxy arp. DNAT routes are always valid.
1801                  */
1802                 if (out_dev == in_dev) {
1803                         err = -EINVAL;
1804                         goto cleanup;
1805                 }
1806         }
1807
1808
1809         rth = dst_alloc(&ipv4_dst_ops);
1810         if (!rth) {
1811                 err = -ENOBUFS;
1812                 goto cleanup;
1813         }
1814
1815         atomic_set(&rth->u.dst.__refcnt, 1);
1816         rth->u.dst.flags= DST_HOST;
1817         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1818                 rth->u.dst.flags |= DST_NOPOLICY;
1819         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1820                 rth->u.dst.flags |= DST_NOXFRM;
1821         rth->fl.fl4_dst = daddr;
1822         rth->rt_dst     = daddr;
1823         rth->fl.fl4_tos = tos;
1824         rth->fl.mark    = skb->mark;
1825         rth->fl.fl4_src = saddr;
1826         rth->rt_src     = saddr;
1827         rth->rt_gateway = daddr;
1828         rth->rt_iif     =
1829                 rth->fl.iif     = in_dev->dev->ifindex;
1830         rth->u.dst.dev  = (out_dev)->dev;
1831         dev_hold(rth->u.dst.dev);
1832         rth->idev       = in_dev_get(rth->u.dst.dev);
1833         rth->fl.oif     = 0;
1834         rth->rt_spec_dst= spec_dst;
1835
1836         rth->u.dst.input = ip_forward;
1837         rth->u.dst.output = ip_output;
1838         rth->rt_genid = atomic_read(&rt_genid);
1839
1840         rt_set_nexthop(rth, res, itag);
1841
1842         rth->rt_flags = flags;
1843
1844         *result = rth;
1845         err = 0;
1846  cleanup:
1847         /* release the working reference to the output device */
1848         in_dev_put(out_dev);
1849         return err;
1850 }
1851
1852 static int ip_mkroute_input(struct sk_buff *skb,
1853                             struct fib_result *res,
1854                             const struct flowi *fl,
1855                             struct in_device *in_dev,
1856                             __be32 daddr, __be32 saddr, u32 tos)
1857 {
1858         struct rtable* rth = NULL;
1859         int err;
1860         unsigned hash;
1861
1862 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1863         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1864                 fib_select_multipath(fl, res);
1865 #endif
1866
1867         /* create a routing cache entry */
1868         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1869         if (err)
1870                 return err;
1871
1872         /* put it into the cache */
1873         hash = rt_hash(daddr, saddr, fl->iif);
1874         return rt_intern_hash(hash, rth, &skb->rtable);
1875 }
1876
1877 /*
1878  *      NOTE. We drop all the packets that has local source
1879  *      addresses, because every properly looped back packet
1880  *      must have correct destination already attached by output routine.
1881  *
1882  *      Such approach solves two big problems:
1883  *      1. Not simplex devices are handled properly.
1884  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1885  */
1886
1887 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1888                                u8 tos, struct net_device *dev)
1889 {
1890         struct fib_result res;
1891         struct in_device *in_dev = in_dev_get(dev);
1892         struct flowi fl = { .nl_u = { .ip4_u =
1893                                       { .daddr = daddr,
1894                                         .saddr = saddr,
1895                                         .tos = tos,
1896                                         .scope = RT_SCOPE_UNIVERSE,
1897                                       } },
1898                             .mark = skb->mark,
1899                             .iif = dev->ifindex };
1900         unsigned        flags = 0;
1901         u32             itag = 0;
1902         struct rtable * rth;
1903         unsigned        hash;
1904         __be32          spec_dst;
1905         int             err = -EINVAL;
1906         int             free_res = 0;
1907         struct net    * net = dev_net(dev);
1908
1909         /* IP on this device is disabled. */
1910
1911         if (!in_dev)
1912                 goto out;
1913
1914         /* Check for the most weird martians, which can be not detected
1915            by fib_lookup.
1916          */
1917
1918         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1919             ipv4_is_loopback(saddr))
1920                 goto martian_source;
1921
1922         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1923                 goto brd_input;
1924
1925         /* Accept zero addresses only to limited broadcast;
1926          * I even do not know to fix it or not. Waiting for complains :-)
1927          */
1928         if (ipv4_is_zeronet(saddr))
1929                 goto martian_source;
1930
1931         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1932             ipv4_is_loopback(daddr))
1933                 goto martian_destination;
1934
1935         /*
1936          *      Now we are ready to route packet.
1937          */
1938         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1939                 if (!IN_DEV_FORWARD(in_dev))
1940                         goto e_hostunreach;
1941                 goto no_route;
1942         }
1943         free_res = 1;
1944
1945         RT_CACHE_STAT_INC(in_slow_tot);
1946
1947         if (res.type == RTN_BROADCAST)
1948                 goto brd_input;
1949
1950         if (res.type == RTN_LOCAL) {
1951                 int result;
1952                 result = fib_validate_source(saddr, daddr, tos,
1953                                              net->loopback_dev->ifindex,
1954                                              dev, &spec_dst, &itag);
1955                 if (result < 0)
1956                         goto martian_source;
1957                 if (result)
1958                         flags |= RTCF_DIRECTSRC;
1959                 spec_dst = daddr;
1960                 goto local_input;
1961         }
1962
1963         if (!IN_DEV_FORWARD(in_dev))
1964                 goto e_hostunreach;
1965         if (res.type != RTN_UNICAST)
1966                 goto martian_destination;
1967
1968         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1969 done:
1970         in_dev_put(in_dev);
1971         if (free_res)
1972                 fib_res_put(&res);
1973 out:    return err;
1974
1975 brd_input:
1976         if (skb->protocol != htons(ETH_P_IP))
1977                 goto e_inval;
1978
1979         if (ipv4_is_zeronet(saddr))
1980                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1981         else {
1982                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1983                                           &itag);
1984                 if (err < 0)
1985                         goto martian_source;
1986                 if (err)
1987                         flags |= RTCF_DIRECTSRC;
1988         }
1989         flags |= RTCF_BROADCAST;
1990         res.type = RTN_BROADCAST;
1991         RT_CACHE_STAT_INC(in_brd);
1992
1993 local_input:
1994         rth = dst_alloc(&ipv4_dst_ops);
1995         if (!rth)
1996                 goto e_nobufs;
1997
1998         rth->u.dst.output= ip_rt_bug;
1999         rth->rt_genid = atomic_read(&rt_genid);
2000
2001         atomic_set(&rth->u.dst.__refcnt, 1);
2002         rth->u.dst.flags= DST_HOST;
2003         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2004                 rth->u.dst.flags |= DST_NOPOLICY;
2005         rth->fl.fl4_dst = daddr;
2006         rth->rt_dst     = daddr;
2007         rth->fl.fl4_tos = tos;
2008         rth->fl.mark    = skb->mark;
2009         rth->fl.fl4_src = saddr;
2010         rth->rt_src     = saddr;
2011 #ifdef CONFIG_NET_CLS_ROUTE
2012         rth->u.dst.tclassid = itag;
2013 #endif
2014         rth->rt_iif     =
2015         rth->fl.iif     = dev->ifindex;
2016         rth->u.dst.dev  = net->loopback_dev;
2017         dev_hold(rth->u.dst.dev);
2018         rth->idev       = in_dev_get(rth->u.dst.dev);
2019         rth->rt_gateway = daddr;
2020         rth->rt_spec_dst= spec_dst;
2021         rth->u.dst.input= ip_local_deliver;
2022         rth->rt_flags   = flags|RTCF_LOCAL;
2023         if (res.type == RTN_UNREACHABLE) {
2024                 rth->u.dst.input= ip_error;
2025                 rth->u.dst.error= -err;
2026                 rth->rt_flags   &= ~RTCF_LOCAL;
2027         }
2028         rth->rt_type    = res.type;
2029         hash = rt_hash(daddr, saddr, fl.iif);
2030         err = rt_intern_hash(hash, rth, &skb->rtable);
2031         goto done;
2032
2033 no_route:
2034         RT_CACHE_STAT_INC(in_no_route);
2035         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2036         res.type = RTN_UNREACHABLE;
2037         if (err == -ESRCH)
2038                 err = -ENETUNREACH;
2039         goto local_input;
2040
2041         /*
2042          *      Do not cache martian addresses: they should be logged (RFC1812)
2043          */
2044 martian_destination:
2045         RT_CACHE_STAT_INC(in_martian_dst);
2046 #ifdef CONFIG_IP_ROUTE_VERBOSE
2047         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2048                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2049                         NIPQUAD_FMT ", dev %s\n",
2050                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2051 #endif
2052
2053 e_hostunreach:
2054         err = -EHOSTUNREACH;
2055         goto done;
2056
2057 e_inval:
2058         err = -EINVAL;
2059         goto done;
2060
2061 e_nobufs:
2062         err = -ENOBUFS;
2063         goto done;
2064
2065 martian_source:
2066         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2067         goto e_inval;
2068 }
2069
2070 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2071                    u8 tos, struct net_device *dev)
2072 {
2073         struct rtable * rth;
2074         unsigned        hash;
2075         int iif = dev->ifindex;
2076         struct net *net;
2077
2078         net = dev_net(dev);
2079         tos &= IPTOS_RT_MASK;
2080         hash = rt_hash(daddr, saddr, iif);
2081
2082         rcu_read_lock();
2083         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2084              rth = rcu_dereference(rth->u.dst.rt_next)) {
2085                 if (((rth->fl.fl4_dst ^ daddr) |
2086                      (rth->fl.fl4_src ^ saddr) |
2087                      (rth->fl.iif ^ iif) |
2088                      rth->fl.oif |
2089                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2090                     rth->fl.mark == skb->mark &&
2091                     net_eq(dev_net(rth->u.dst.dev), net) &&
2092                     rth->rt_genid == atomic_read(&rt_genid)) {
2093                         dst_use(&rth->u.dst, jiffies);
2094                         RT_CACHE_STAT_INC(in_hit);
2095                         rcu_read_unlock();
2096                         skb->rtable = rth;
2097                         return 0;
2098                 }
2099                 RT_CACHE_STAT_INC(in_hlist_search);
2100         }
2101         rcu_read_unlock();
2102
2103         /* Multicast recognition logic is moved from route cache to here.
2104            The problem was that too many Ethernet cards have broken/missing
2105            hardware multicast filters :-( As result the host on multicasting
2106            network acquires a lot of useless route cache entries, sort of
2107            SDR messages from all the world. Now we try to get rid of them.
2108            Really, provided software IP multicast filter is organized
2109            reasonably (at least, hashed), it does not result in a slowdown
2110            comparing with route cache reject entries.
2111            Note, that multicast routers are not affected, because
2112            route cache entry is created eventually.
2113          */
2114         if (ipv4_is_multicast(daddr)) {
2115                 struct in_device *in_dev;
2116
2117                 rcu_read_lock();
2118                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2119                         int our = ip_check_mc(in_dev, daddr, saddr,
2120                                 ip_hdr(skb)->protocol);
2121                         if (our
2122 #ifdef CONFIG_IP_MROUTE
2123                             || (!ipv4_is_local_multicast(daddr) &&
2124                                 IN_DEV_MFORWARD(in_dev))
2125 #endif
2126                             ) {
2127                                 rcu_read_unlock();
2128                                 return ip_route_input_mc(skb, daddr, saddr,
2129                                                          tos, dev, our);
2130                         }
2131                 }
2132                 rcu_read_unlock();
2133                 return -EINVAL;
2134         }
2135         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2136 }
2137
2138 static int __mkroute_output(struct rtable **result,
2139                             struct fib_result *res,
2140                             const struct flowi *fl,
2141                             const struct flowi *oldflp,
2142                             struct net_device *dev_out,
2143                             unsigned flags)
2144 {
2145         struct rtable *rth;
2146         struct in_device *in_dev;
2147         u32 tos = RT_FL_TOS(oldflp);
2148         int err = 0;
2149
2150         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2151                 return -EINVAL;
2152
2153         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2154                 res->type = RTN_BROADCAST;
2155         else if (ipv4_is_multicast(fl->fl4_dst))
2156                 res->type = RTN_MULTICAST;
2157         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2158                 return -EINVAL;
2159
2160         if (dev_out->flags & IFF_LOOPBACK)
2161                 flags |= RTCF_LOCAL;
2162
2163         /* get work reference to inet device */
2164         in_dev = in_dev_get(dev_out);
2165         if (!in_dev)
2166                 return -EINVAL;
2167
2168         if (res->type == RTN_BROADCAST) {
2169                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2170                 if (res->fi) {
2171                         fib_info_put(res->fi);
2172                         res->fi = NULL;
2173                 }
2174         } else if (res->type == RTN_MULTICAST) {
2175                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2176                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2177                                  oldflp->proto))
2178                         flags &= ~RTCF_LOCAL;
2179                 /* If multicast route do not exist use
2180                    default one, but do not gateway in this case.
2181                    Yes, it is hack.
2182                  */
2183                 if (res->fi && res->prefixlen < 4) {
2184                         fib_info_put(res->fi);
2185                         res->fi = NULL;
2186                 }
2187         }
2188
2189
2190         rth = dst_alloc(&ipv4_dst_ops);
2191         if (!rth) {
2192                 err = -ENOBUFS;
2193                 goto cleanup;
2194         }
2195
2196         atomic_set(&rth->u.dst.__refcnt, 1);
2197         rth->u.dst.flags= DST_HOST;
2198         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2199                 rth->u.dst.flags |= DST_NOXFRM;
2200         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2201                 rth->u.dst.flags |= DST_NOPOLICY;
2202
2203         rth->fl.fl4_dst = oldflp->fl4_dst;
2204         rth->fl.fl4_tos = tos;
2205         rth->fl.fl4_src = oldflp->fl4_src;
2206         rth->fl.oif     = oldflp->oif;
2207         rth->fl.mark    = oldflp->mark;
2208         rth->rt_dst     = fl->fl4_dst;
2209         rth->rt_src     = fl->fl4_src;
2210         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2211         /* get references to the devices that are to be hold by the routing
2212            cache entry */
2213         rth->u.dst.dev  = dev_out;
2214         dev_hold(dev_out);
2215         rth->idev       = in_dev_get(dev_out);
2216         rth->rt_gateway = fl->fl4_dst;
2217         rth->rt_spec_dst= fl->fl4_src;
2218
2219         rth->u.dst.output=ip_output;
2220         rth->rt_genid = atomic_read(&rt_genid);
2221
2222         RT_CACHE_STAT_INC(out_slow_tot);
2223
2224         if (flags & RTCF_LOCAL) {
2225                 rth->u.dst.input = ip_local_deliver;
2226                 rth->rt_spec_dst = fl->fl4_dst;
2227         }
2228         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2229                 rth->rt_spec_dst = fl->fl4_src;
2230                 if (flags & RTCF_LOCAL &&
2231                     !(dev_out->flags & IFF_LOOPBACK)) {
2232                         rth->u.dst.output = ip_mc_output;
2233                         RT_CACHE_STAT_INC(out_slow_mc);
2234                 }
2235 #ifdef CONFIG_IP_MROUTE
2236                 if (res->type == RTN_MULTICAST) {
2237                         if (IN_DEV_MFORWARD(in_dev) &&
2238                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2239                                 rth->u.dst.input = ip_mr_input;
2240                                 rth->u.dst.output = ip_mc_output;
2241                         }
2242                 }
2243 #endif
2244         }
2245
2246         rt_set_nexthop(rth, res, 0);
2247
2248         rth->rt_flags = flags;
2249
2250         *result = rth;
2251  cleanup:
2252         /* release work reference to inet device */
2253         in_dev_put(in_dev);
2254
2255         return err;
2256 }
2257
2258 static int ip_mkroute_output(struct rtable **rp,
2259                              struct fib_result *res,
2260                              const struct flowi *fl,
2261                              const struct flowi *oldflp,
2262                              struct net_device *dev_out,
2263                              unsigned flags)
2264 {
2265         struct rtable *rth = NULL;
2266         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2267         unsigned hash;
2268         if (err == 0) {
2269                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2270                 err = rt_intern_hash(hash, rth, rp);
2271         }
2272
2273         return err;
2274 }
2275
2276 /*
2277  * Major route resolver routine.
2278  */
2279
2280 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2281                                 const struct flowi *oldflp)
2282 {
2283         u32 tos = RT_FL_TOS(oldflp);
2284         struct flowi fl = { .nl_u = { .ip4_u =
2285                                       { .daddr = oldflp->fl4_dst,
2286                                         .saddr = oldflp->fl4_src,
2287                                         .tos = tos & IPTOS_RT_MASK,
2288                                         .scope = ((tos & RTO_ONLINK) ?
2289                                                   RT_SCOPE_LINK :
2290                                                   RT_SCOPE_UNIVERSE),
2291                                       } },
2292                             .mark = oldflp->mark,
2293                             .iif = net->loopback_dev->ifindex,
2294                             .oif = oldflp->oif };
2295         struct fib_result res;
2296         unsigned flags = 0;
2297         struct net_device *dev_out = NULL;
2298         int free_res = 0;
2299         int err;
2300
2301
2302         res.fi          = NULL;
2303 #ifdef CONFIG_IP_MULTIPLE_TABLES
2304         res.r           = NULL;
2305 #endif
2306
2307         if (oldflp->fl4_src) {
2308                 err = -EINVAL;
2309                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2310                     ipv4_is_lbcast(oldflp->fl4_src) ||
2311                     ipv4_is_zeronet(oldflp->fl4_src))
2312                         goto out;
2313
2314                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2315                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2316                 if (dev_out == NULL)
2317                         goto out;
2318
2319                 /* I removed check for oif == dev_out->oif here.
2320                    It was wrong for two reasons:
2321                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2322                       is assigned to multiple interfaces.
2323                    2. Moreover, we are allowed to send packets with saddr
2324                       of another iface. --ANK
2325                  */
2326
2327                 if (oldflp->oif == 0
2328                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2329                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2330                         /* Special hack: user can direct multicasts
2331                            and limited broadcast via necessary interface
2332                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2333                            This hack is not just for fun, it allows
2334                            vic,vat and friends to work.
2335                            They bind socket to loopback, set ttl to zero
2336                            and expect that it will work.
2337                            From the viewpoint of routing cache they are broken,
2338                            because we are not allowed to build multicast path
2339                            with loopback source addr (look, routing cache
2340                            cannot know, that ttl is zero, so that packet
2341                            will not leave this host and route is valid).
2342                            Luckily, this hack is good workaround.
2343                          */
2344
2345                         fl.oif = dev_out->ifindex;
2346                         goto make_route;
2347                 }
2348                 if (dev_out)
2349                         dev_put(dev_out);
2350                 dev_out = NULL;
2351         }
2352
2353
2354         if (oldflp->oif) {
2355                 dev_out = dev_get_by_index(net, oldflp->oif);
2356                 err = -ENODEV;
2357                 if (dev_out == NULL)
2358                         goto out;
2359
2360                 /* RACE: Check return value of inet_select_addr instead. */
2361                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2362                         dev_put(dev_out);
2363                         goto out;       /* Wrong error code */
2364                 }
2365
2366                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2367                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2368                         if (!fl.fl4_src)
2369                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2370                                                               RT_SCOPE_LINK);
2371                         goto make_route;
2372                 }
2373                 if (!fl.fl4_src) {
2374                         if (ipv4_is_multicast(oldflp->fl4_dst))
2375                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2376                                                               fl.fl4_scope);
2377                         else if (!oldflp->fl4_dst)
2378                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2379                                                               RT_SCOPE_HOST);
2380                 }
2381         }
2382
2383         if (!fl.fl4_dst) {
2384                 fl.fl4_dst = fl.fl4_src;
2385                 if (!fl.fl4_dst)
2386                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2387                 if (dev_out)
2388                         dev_put(dev_out);
2389                 dev_out = net->loopback_dev;
2390                 dev_hold(dev_out);
2391                 fl.oif = net->loopback_dev->ifindex;
2392                 res.type = RTN_LOCAL;
2393                 flags |= RTCF_LOCAL;
2394                 goto make_route;
2395         }
2396
2397         if (fib_lookup(net, &fl, &res)) {
2398                 res.fi = NULL;
2399                 if (oldflp->oif) {
2400                         /* Apparently, routing tables are wrong. Assume,
2401                            that the destination is on link.
2402
2403                            WHY? DW.
2404                            Because we are allowed to send to iface
2405                            even if it has NO routes and NO assigned
2406                            addresses. When oif is specified, routing
2407                            tables are looked up with only one purpose:
2408                            to catch if destination is gatewayed, rather than
2409                            direct. Moreover, if MSG_DONTROUTE is set,
2410                            we send packet, ignoring both routing tables
2411                            and ifaddr state. --ANK
2412
2413
2414                            We could make it even if oif is unknown,
2415                            likely IPv6, but we do not.
2416                          */
2417
2418                         if (fl.fl4_src == 0)
2419                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2420                                                               RT_SCOPE_LINK);
2421                         res.type = RTN_UNICAST;
2422                         goto make_route;
2423                 }
2424                 if (dev_out)
2425                         dev_put(dev_out);
2426                 err = -ENETUNREACH;
2427                 goto out;
2428         }
2429         free_res = 1;
2430
2431         if (res.type == RTN_LOCAL) {
2432                 if (!fl.fl4_src)
2433                         fl.fl4_src = fl.fl4_dst;
2434                 if (dev_out)
2435                         dev_put(dev_out);
2436                 dev_out = net->loopback_dev;
2437                 dev_hold(dev_out);
2438                 fl.oif = dev_out->ifindex;
2439                 if (res.fi)
2440                         fib_info_put(res.fi);
2441                 res.fi = NULL;
2442                 flags |= RTCF_LOCAL;
2443                 goto make_route;
2444         }
2445
2446 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2447         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2448                 fib_select_multipath(&fl, &res);
2449         else
2450 #endif
2451         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2452                 fib_select_default(net, &fl, &res);
2453
2454         if (!fl.fl4_src)
2455                 fl.fl4_src = FIB_RES_PREFSRC(res);
2456
2457         if (dev_out)
2458                 dev_put(dev_out);
2459         dev_out = FIB_RES_DEV(res);
2460         dev_hold(dev_out);
2461         fl.oif = dev_out->ifindex;
2462
2463
2464 make_route:
2465         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2466
2467
2468         if (free_res)
2469                 fib_res_put(&res);
2470         if (dev_out)
2471                 dev_put(dev_out);
2472 out:    return err;
2473 }
2474
2475 int __ip_route_output_key(struct net *net, struct rtable **rp,
2476                           const struct flowi *flp)
2477 {
2478         unsigned hash;
2479         struct rtable *rth;
2480
2481         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2482
2483         rcu_read_lock_bh();
2484         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2485                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2486                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2487                     rth->fl.fl4_src == flp->fl4_src &&
2488                     rth->fl.iif == 0 &&
2489                     rth->fl.oif == flp->oif &&
2490                     rth->fl.mark == flp->mark &&
2491                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2492                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2493                     net_eq(dev_net(rth->u.dst.dev), net) &&
2494                     rth->rt_genid == atomic_read(&rt_genid)) {
2495                         dst_use(&rth->u.dst, jiffies);
2496                         RT_CACHE_STAT_INC(out_hit);
2497                         rcu_read_unlock_bh();
2498                         *rp = rth;
2499                         return 0;
2500                 }
2501                 RT_CACHE_STAT_INC(out_hlist_search);
2502         }
2503         rcu_read_unlock_bh();
2504
2505         return ip_route_output_slow(net, rp, flp);
2506 }
2507
2508 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2509
2510 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2511 {
2512 }
2513
2514 static struct dst_ops ipv4_dst_blackhole_ops = {
2515         .family                 =       AF_INET,
2516         .protocol               =       __constant_htons(ETH_P_IP),
2517         .destroy                =       ipv4_dst_destroy,
2518         .check                  =       ipv4_dst_check,
2519         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2520         .entry_size             =       sizeof(struct rtable),
2521         .entries                =       ATOMIC_INIT(0),
2522 };
2523
2524
2525 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2526 {
2527         struct rtable *ort = *rp;
2528         struct rtable *rt = (struct rtable *)
2529                 dst_alloc(&ipv4_dst_blackhole_ops);
2530
2531         if (rt) {
2532                 struct dst_entry *new = &rt->u.dst;
2533
2534                 atomic_set(&new->__refcnt, 1);
2535                 new->__use = 1;
2536                 new->input = dst_discard;
2537                 new->output = dst_discard;
2538                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2539
2540                 new->dev = ort->u.dst.dev;
2541                 if (new->dev)
2542                         dev_hold(new->dev);
2543
2544                 rt->fl = ort->fl;
2545
2546                 rt->idev = ort->idev;
2547                 if (rt->idev)
2548                         in_dev_hold(rt->idev);
2549                 rt->rt_genid = atomic_read(&rt_genid);
2550                 rt->rt_flags = ort->rt_flags;
2551                 rt->rt_type = ort->rt_type;
2552                 rt->rt_dst = ort->rt_dst;
2553                 rt->rt_src = ort->rt_src;
2554                 rt->rt_iif = ort->rt_iif;
2555                 rt->rt_gateway = ort->rt_gateway;
2556                 rt->rt_spec_dst = ort->rt_spec_dst;
2557                 rt->peer = ort->peer;
2558                 if (rt->peer)
2559                         atomic_inc(&rt->peer->refcnt);
2560
2561                 dst_free(new);
2562         }
2563
2564         dst_release(&(*rp)->u.dst);
2565         *rp = rt;
2566         return (rt ? 0 : -ENOMEM);
2567 }
2568
2569 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2570                          struct sock *sk, int flags)
2571 {
2572         int err;
2573
2574         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2575                 return err;
2576
2577         if (flp->proto) {
2578                 if (!flp->fl4_src)
2579                         flp->fl4_src = (*rp)->rt_src;
2580                 if (!flp->fl4_dst)
2581                         flp->fl4_dst = (*rp)->rt_dst;
2582                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2583                                     flags ? XFRM_LOOKUP_WAIT : 0);
2584                 if (err == -EREMOTE)
2585                         err = ipv4_dst_blackhole(rp, flp);
2586
2587                 return err;
2588         }
2589
2590         return 0;
2591 }
2592
2593 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2594
2595 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2596 {
2597         return ip_route_output_flow(net, rp, flp, NULL, 0);
2598 }
2599
2600 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2601                         int nowait, unsigned int flags)
2602 {
2603         struct rtable *rt = skb->rtable;
2604         struct rtmsg *r;
2605         struct nlmsghdr *nlh;
2606         long expires;
2607         u32 id = 0, ts = 0, tsage = 0, error;
2608
2609         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2610         if (nlh == NULL)
2611                 return -EMSGSIZE;
2612
2613         r = nlmsg_data(nlh);
2614         r->rtm_family    = AF_INET;
2615         r->rtm_dst_len  = 32;
2616         r->rtm_src_len  = 0;
2617         r->rtm_tos      = rt->fl.fl4_tos;
2618         r->rtm_table    = RT_TABLE_MAIN;
2619         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2620         r->rtm_type     = rt->rt_type;
2621         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2622         r->rtm_protocol = RTPROT_UNSPEC;
2623         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2624         if (rt->rt_flags & RTCF_NOTIFY)
2625                 r->rtm_flags |= RTM_F_NOTIFY;
2626
2627         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2628
2629         if (rt->fl.fl4_src) {
2630                 r->rtm_src_len = 32;
2631                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2632         }
2633         if (rt->u.dst.dev)
2634                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2635 #ifdef CONFIG_NET_CLS_ROUTE
2636         if (rt->u.dst.tclassid)
2637                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2638 #endif
2639         if (rt->fl.iif)
2640                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2641         else if (rt->rt_src != rt->fl.fl4_src)
2642                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2643
2644         if (rt->rt_dst != rt->rt_gateway)
2645                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2646
2647         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2648                 goto nla_put_failure;
2649
2650         error = rt->u.dst.error;
2651         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2652         if (rt->peer) {
2653                 id = rt->peer->ip_id_count;
2654                 if (rt->peer->tcp_ts_stamp) {
2655                         ts = rt->peer->tcp_ts;
2656                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2657                 }
2658         }
2659
2660         if (rt->fl.iif) {
2661 #ifdef CONFIG_IP_MROUTE
2662                 __be32 dst = rt->rt_dst;
2663
2664                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2666                         int err = ipmr_get_route(skb, r, nowait);
2667                         if (err <= 0) {
2668                                 if (!nowait) {
2669                                         if (err == 0)
2670                                                 return 0;
2671                                         goto nla_put_failure;
2672                                 } else {
2673                                         if (err == -EMSGSIZE)
2674                                                 goto nla_put_failure;
2675                                         error = err;
2676                                 }
2677                         }
2678                 } else
2679 #endif
2680                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2681         }
2682
2683         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2684                                expires, error) < 0)
2685                 goto nla_put_failure;
2686
2687         return nlmsg_end(skb, nlh);
2688
2689 nla_put_failure:
2690         nlmsg_cancel(skb, nlh);
2691         return -EMSGSIZE;
2692 }
2693
2694 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2695 {
2696         struct net *net = sock_net(in_skb->sk);
2697         struct rtmsg *rtm;
2698         struct nlattr *tb[RTA_MAX+1];
2699         struct rtable *rt = NULL;
2700         __be32 dst = 0;
2701         __be32 src = 0;
2702         u32 iif;
2703         int err;
2704         struct sk_buff *skb;
2705
2706         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2707         if (err < 0)
2708                 goto errout;
2709
2710         rtm = nlmsg_data(nlh);
2711
2712         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2713         if (skb == NULL) {
2714                 err = -ENOBUFS;
2715                 goto errout;
2716         }
2717
2718         /* Reserve room for dummy headers, this skb can pass
2719            through good chunk of routing engine.
2720          */
2721         skb_reset_mac_header(skb);
2722         skb_reset_network_header(skb);
2723
2724         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2725         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2726         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2727
2728         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2729         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2730         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2731
2732         if (iif) {
2733                 struct net_device *dev;
2734
2735                 dev = __dev_get_by_index(net, iif);
2736                 if (dev == NULL) {
2737                         err = -ENODEV;
2738                         goto errout_free;
2739                 }
2740
2741                 skb->protocol   = htons(ETH_P_IP);
2742                 skb->dev        = dev;
2743                 local_bh_disable();
2744                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2745                 local_bh_enable();
2746
2747                 rt = skb->rtable;
2748                 if (err == 0 && rt->u.dst.error)
2749                         err = -rt->u.dst.error;
2750         } else {
2751                 struct flowi fl = {
2752                         .nl_u = {
2753                                 .ip4_u = {
2754                                         .daddr = dst,
2755                                         .saddr = src,
2756                                         .tos = rtm->rtm_tos,
2757                                 },
2758                         },
2759                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2760                 };
2761                 err = ip_route_output_key(net, &rt, &fl);
2762         }
2763
2764         if (err)
2765                 goto errout_free;
2766
2767         skb->rtable = rt;
2768         if (rtm->rtm_flags & RTM_F_NOTIFY)
2769                 rt->rt_flags |= RTCF_NOTIFY;
2770
2771         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2772                            RTM_NEWROUTE, 0, 0);
2773         if (err <= 0)
2774                 goto errout_free;
2775
2776         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2777 errout:
2778         return err;
2779
2780 errout_free:
2781         kfree_skb(skb);
2782         goto errout;
2783 }
2784
2785 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2786 {
2787         struct rtable *rt;
2788         int h, s_h;
2789         int idx, s_idx;
2790         struct net *net;
2791
2792         net = sock_net(skb->sk);
2793
2794         s_h = cb->args[0];
2795         if (s_h < 0)
2796                 s_h = 0;
2797         s_idx = idx = cb->args[1];
2798         for (h = s_h; h <= rt_hash_mask; h++) {
2799                 rcu_read_lock_bh();
2800                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2801                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2802                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2803                                 continue;
2804                         if (rt->rt_genid != atomic_read(&rt_genid))
2805                                 continue;
2806                         skb->dst = dst_clone(&rt->u.dst);
2807                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2808                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2809                                          1, NLM_F_MULTI) <= 0) {
2810                                 dst_release(xchg(&skb->dst, NULL));
2811                                 rcu_read_unlock_bh();
2812                                 goto done;
2813                         }
2814                         dst_release(xchg(&skb->dst, NULL));
2815                 }
2816                 rcu_read_unlock_bh();
2817                 s_idx = 0;
2818         }
2819
2820 done:
2821         cb->args[0] = h;
2822         cb->args[1] = idx;
2823         return skb->len;
2824 }
2825
2826 void ip_rt_multicast_event(struct in_device *in_dev)
2827 {
2828         rt_cache_flush(dev_net(in_dev->dev), 0);
2829 }
2830
2831 #ifdef CONFIG_SYSCTL
2832 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833                                         struct file *filp, void __user *buffer,
2834                                         size_t *lenp, loff_t *ppos)
2835 {
2836         if (write) {
2837                 int flush_delay;
2838                 struct net *net;
2839                 static DEFINE_MUTEX(flush_mutex);
2840
2841                 mutex_lock(&flush_mutex);
2842                 ctl->data = &flush_delay;
2843                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2844                 ctl->data = NULL;
2845                 mutex_unlock(&flush_mutex);
2846
2847                 net = (struct net *)ctl->extra1;
2848                 rt_cache_flush(net, flush_delay);
2849                 return 0;
2850         }
2851
2852         return -EINVAL;
2853 }
2854
2855 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2856                                                 int __user *name,
2857                                                 int nlen,
2858                                                 void __user *oldval,
2859                                                 size_t __user *oldlenp,
2860                                                 void __user *newval,
2861                                                 size_t newlen)
2862 {
2863         int delay;
2864         struct net *net;
2865         if (newlen != sizeof(int))
2866                 return -EINVAL;
2867         if (get_user(delay, (int __user *)newval))
2868                 return -EFAULT;
2869         net = (struct net *)table->extra1;
2870         rt_cache_flush(net, delay);
2871         return 0;
2872 }
2873
2874 ctl_table ipv4_route_table[] = {
2875         {
2876                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2877                 .procname       = "gc_thresh",
2878                 .data           = &ipv4_dst_ops.gc_thresh,
2879                 .maxlen         = sizeof(int),
2880                 .mode           = 0644,
2881                 .proc_handler   = &proc_dointvec,
2882         },
2883         {
2884                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2885                 .procname       = "max_size",
2886                 .data           = &ip_rt_max_size,
2887                 .maxlen         = sizeof(int),
2888                 .mode           = 0644,
2889                 .proc_handler   = &proc_dointvec,
2890         },
2891         {
2892                 /*  Deprecated. Use gc_min_interval_ms */
2893
2894                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2895                 .procname       = "gc_min_interval",
2896                 .data           = &ip_rt_gc_min_interval,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = &proc_dointvec_jiffies,
2900                 .strategy       = &sysctl_jiffies,
2901         },
2902         {
2903                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2904                 .procname       = "gc_min_interval_ms",
2905                 .data           = &ip_rt_gc_min_interval,
2906                 .maxlen         = sizeof(int),
2907                 .mode           = 0644,
2908                 .proc_handler   = &proc_dointvec_ms_jiffies,
2909                 .strategy       = &sysctl_ms_jiffies,
2910         },
2911         {
2912                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2913                 .procname       = "gc_timeout",
2914                 .data           = &ip_rt_gc_timeout,
2915                 .maxlen         = sizeof(int),
2916                 .mode           = 0644,
2917                 .proc_handler   = &proc_dointvec_jiffies,
2918                 .strategy       = &sysctl_jiffies,
2919         },
2920         {
2921                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2922                 .procname       = "gc_interval",
2923                 .data           = &ip_rt_gc_interval,
2924                 .maxlen         = sizeof(int),
2925                 .mode           = 0644,
2926                 .proc_handler   = &proc_dointvec_jiffies,
2927                 .strategy       = &sysctl_jiffies,
2928         },
2929         {
2930                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2931                 .procname       = "redirect_load",
2932                 .data           = &ip_rt_redirect_load,
2933                 .maxlen         = sizeof(int),
2934                 .mode           = 0644,
2935                 .proc_handler   = &proc_dointvec,
2936         },
2937         {
2938                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2939                 .procname       = "redirect_number",
2940                 .data           = &ip_rt_redirect_number,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec,
2944         },
2945         {
2946                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2947                 .procname       = "redirect_silence",
2948                 .data           = &ip_rt_redirect_silence,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = &proc_dointvec,
2952         },
2953         {
2954                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2955                 .procname       = "error_cost",
2956                 .data           = &ip_rt_error_cost,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = &proc_dointvec,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2963                 .procname       = "error_burst",
2964                 .data           = &ip_rt_error_burst,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2971                 .procname       = "gc_elasticity",
2972                 .data           = &ip_rt_gc_elasticity,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec,
2976         },
2977         {
2978                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2979                 .procname       = "mtu_expires",
2980                 .data           = &ip_rt_mtu_expires,
2981                 .maxlen         = sizeof(int),
2982                 .mode           = 0644,
2983                 .proc_handler   = &proc_dointvec_jiffies,
2984                 .strategy       = &sysctl_jiffies,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2988                 .procname       = "min_pmtu",
2989                 .data           = &ip_rt_min_pmtu,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2996                 .procname       = "min_adv_mss",
2997                 .data           = &ip_rt_min_advmss,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec,
3001         },
3002         {
3003                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3004                 .procname       = "secret_interval",
3005                 .data           = &ip_rt_secret_interval,
3006                 .maxlen         = sizeof(int),
3007                 .mode           = 0644,
3008                 .proc_handler   = &proc_dointvec_jiffies,
3009                 .strategy       = &sysctl_jiffies,
3010         },
3011         { .ctl_name = 0 }
3012 };
3013
3014 static __net_initdata struct ctl_path ipv4_route_path[] = {
3015         { .procname = "net", .ctl_name = CTL_NET, },
3016         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3017         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3018         { },
3019 };
3020
3021
3022 static struct ctl_table ipv4_route_flush_table[] = {
3023         {
3024                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3025                 .procname       = "flush",
3026                 .maxlen         = sizeof(int),
3027                 .mode           = 0200,
3028                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
3029                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
3030         },
3031         { .ctl_name = 0 },
3032 };
3033
3034 static __net_init int sysctl_route_net_init(struct net *net)
3035 {
3036         struct ctl_table *tbl;
3037
3038         tbl = ipv4_route_flush_table;
3039         if (net != &init_net) {
3040                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3041                 if (tbl == NULL)
3042                         goto err_dup;
3043         }
3044         tbl[0].extra1 = net;
3045
3046         net->ipv4.route_hdr =
3047                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3048         if (net->ipv4.route_hdr == NULL)
3049                 goto err_reg;
3050         return 0;
3051
3052 err_reg:
3053         if (tbl != ipv4_route_flush_table)
3054                 kfree(tbl);
3055 err_dup:
3056         return -ENOMEM;
3057 }
3058
3059 static __net_exit void sysctl_route_net_exit(struct net *net)
3060 {
3061         struct ctl_table *tbl;
3062
3063         tbl = net->ipv4.route_hdr->ctl_table_arg;
3064         unregister_net_sysctl_table(net->ipv4.route_hdr);
3065         BUG_ON(tbl == ipv4_route_flush_table);
3066         kfree(tbl);
3067 }
3068
3069 static __net_initdata struct pernet_operations sysctl_route_ops = {
3070         .init = sysctl_route_net_init,
3071         .exit = sysctl_route_net_exit,
3072 };
3073 #endif
3074
#ifdef CONFIG_NET_CLS_ROUTE
/* per-cpu accounting area, allocated in ip_rt_init() */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3078
3079 static __initdata unsigned long rhash_entries;
3080 static int __init set_rhash_entries(char *str)
3081 {
3082         if (!str)
3083                 return 0;
3084         rhash_entries = simple_strtoul(str, &str, 0);
3085         return 1;
3086 }
3087 __setup("rhash_entries=", set_rhash_entries);
3088
3089 int __init ip_rt_init(void)
3090 {
3091         int rc = 0;
3092
3093         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3094                              (jiffies ^ (jiffies >> 7))));
3095
3096 #ifdef CONFIG_NET_CLS_ROUTE
3097         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3098         if (!ip_rt_acct)
3099                 panic("IP: failed to allocate ip_rt_acct\n");
3100 #endif
3101
3102         ipv4_dst_ops.kmem_cachep =
3103                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3104                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3105
3106         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3107
3108         rt_hash_table = (struct rt_hash_bucket *)
3109                 alloc_large_system_hash("IP route cache",
3110                                         sizeof(struct rt_hash_bucket),
3111                                         rhash_entries,
3112                                         (num_physpages >= 128 * 1024) ?
3113                                         15 : 17,
3114                                         0,
3115                                         &rt_hash_log,
3116                                         &rt_hash_mask,
3117                                         0);
3118         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3119         rt_hash_lock_init();
3120
3121         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3122         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3123
3124         devinet_init();
3125         ip_fib_init();
3126
3127         rt_secret_timer.function = rt_secret_rebuild;
3128         rt_secret_timer.data = 0;
3129         init_timer_deferrable(&rt_secret_timer);
3130
3131         /* All the timers, started at system startup tend
3132            to synchronize. Perturb it a bit.
3133          */
3134         schedule_delayed_work(&expires_work,
3135                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3136
3137         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3138                 ip_rt_secret_interval;
3139         add_timer(&rt_secret_timer);
3140
3141         if (ip_rt_proc_init())
3142                 printk(KERN_ERR "Unable to create route proc files\n");
3143 #ifdef CONFIG_XFRM
3144         xfrm_init();
3145         xfrm4_init();
3146 #endif
3147         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3148
3149 #ifdef CONFIG_SYSCTL
3150         register_pernet_subsys(&sysctl_route_ops);
3151 #endif
3152         return rc;
3153 }
3154
3155 EXPORT_SYMBOL(__ip_select_ident);
3156 EXPORT_SYMBOL(ip_route_input);
3157 EXPORT_SYMBOL(ip_route_output_key);