Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110
/*
 * Extract the routing-relevant TOS bits of the flow key pointed to by
 * @oldflp: the fl4_tos field masked with IPTOS_RT_MASK | RTO_ONLINK,
 * widened to u32.  The argument is parenthesized so that any pointer
 * expression (e.g. "base + 1" or "&key") can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
/* NOTE(review): presumably the largest MTU handled here; not used in this part of the file. */
#define IP_MAX_MTU      0xFFF0

/* Default cache-entry garbage-collection timeout: 300 seconds, in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Route cache tunables.  Values built from HZ are expressed in jiffies.
 *
 * Usage visible in this file:
 *  ip_rt_max_size        - rt_garbage_collect() may skip a too-frequent run
 *                          only while the entry count is below this value.
 *  ip_rt_gc_timeout      - base timeout handed to rt_may_expire() by
 *                          rt_check_expire(); also scales how many buckets
 *                          each periodic scan covers.
 *  ip_rt_gc_interval     - delay between two runs of rt_worker_func().
 *  ip_rt_gc_min_interval - minimum spacing between two garbage collections.
 *  ip_rt_gc_elasticity   - shifted left by rt_hash_log to form the entry
 *                          count above which garbage collection turns
 *                          aggressive.
 *  ip_rt_secret_interval - period at which rt_secret_rebuild() re-arms its
 *                          timer.
 *
 * The redirect, error, mtu, pmtu and advmss settings are consumed outside
 * the part of the file shown here.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
132
/* Delayed work item whose handler, rt_worker_func(), ages out cache entries. */
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);

/*
 *      Interface to generic destination cache.
 */

/* Forward declarations of the callbacks installed in ipv4_dst_ops below. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


/*
 * IPv4 operations table for the destination cache.  Every entry is
 * sizeof(struct rtable) bytes; .entries starts at zero and is the counter
 * read by the garbage collector and by the /proc statistics output.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .local_out =            __ip_local_out,
        .entry_size =           sizeof(struct rtable),
        .entries =              ATOMIC_INIT(0),
};
164
/* Expands to the TC_PRIO_<class> constant named by @class. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* packet priorities.
 * NOTE(review): presumably indexed by the 4-bit IP TOS value - confirm
 * against the users of this table, which are outside this file section.
 * Odd-numbered slots are written through ECN_OR_COST().
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
185
186
187 /*
188  * Route cache.
189  */
190
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries,
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200
/* One slot of the route cache hash table: head of a singly linked rtable chain. */
struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks shared between buckets.  The size of this table is a power of
 * two and depends on the number of CPUs.
 * (With lockdep spinlock_t is quite big, so keep the table small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock guarding hash slot @slot.  The slot number is
 * folded into the lock table with a power-of-two mask.  The whole
 * expansion is parenthesized so the macro behaves as a single operand in
 * any surrounding expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the shared lock table.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
249
/*
 * The route cache hash table itself.  rt_hash_mask is ANDed with hash
 * values to produce a bucket index and is also the highest valid index;
 * rt_hash_log is used as a shift count when sizing garbage-collection work.
 */
static struct rt_hash_bucket    *rt_hash_table __read_mostly;
static unsigned                 rt_hash_mask __read_mostly;
static unsigned int             rt_hash_log  __read_mostly;

/* Per-CPU route cache statistics; RT_CACHE_STAT_INC bumps one counter of the current CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)
257
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259                 int genid)
260 {
261         return jhash_3words((__force u32)(__be32)(daddr),
262                             (__force u32)(__be32)(saddr),
263                             idx, genid)
264                 & rt_hash_mask;
265 }
266
267 static inline int rt_genid(struct net *net)
268 {
269         return atomic_read(&net->ipv4.rt_genid);
270 }
271
272 #ifdef CONFIG_PROC_FS
/* Iterator state kept in seq->private while the rt_cache proc file is read. */
struct rt_cache_iter_state {
        struct seq_net_private p;
        int bucket;     /* hash bucket currently being walked (counts down) */
        int genid;      /* generation id sampled when the walk started */
};
278
279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
280 {
281         struct rt_cache_iter_state *st = seq->private;
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 rcu_read_lock_bh();
286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
289                             r->rt_genid == st->genid)
290                                 return r;
291                         r = rcu_dereference(r->u.dst.rt_next);
292                 }
293                 rcu_read_unlock_bh();
294         }
295         return r;
296 }
297
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299                                           struct rtable *r)
300 {
301         struct rt_cache_iter_state *st = seq->private;
302         r = r->u.dst.rt_next;
303         while (!r) {
304                 rcu_read_unlock_bh();
305                 if (--st->bucket < 0)
306                         break;
307                 rcu_read_lock_bh();
308                 r = rt_hash_table[st->bucket].chain;
309         }
310         return rcu_dereference(r);
311 }
312
313 static struct rtable *rt_cache_get_next(struct seq_file *seq,
314                                         struct rtable *r)
315 {
316         struct rt_cache_iter_state *st = seq->private;
317         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
318                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
319                         continue;
320                 if (r->rt_genid == st->genid)
321                         break;
322         }
323         return r;
324 }
325
326 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
327 {
328         struct rtable *r = rt_cache_get_first(seq);
329
330         if (r)
331                 while (pos && (r = rt_cache_get_next(seq, r)))
332                         --pos;
333         return pos ? NULL : r;
334 }
335
336 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
337 {
338         struct rt_cache_iter_state *st = seq->private;
339         if (*pos)
340                 return rt_cache_get_idx(seq, *pos - 1);
341         st->genid = rt_genid(seq_file_net(seq));
342         return SEQ_START_TOKEN;
343 }
344
345 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346 {
347         struct rtable *r;
348
349         if (v == SEQ_START_TOKEN)
350                 r = rt_cache_get_first(seq);
351         else
352                 r = rt_cache_get_next(seq, v);
353         ++*pos;
354         return r;
355 }
356
357 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
358 {
359         if (v && v != SEQ_START_TOKEN)
360                 rcu_read_unlock_bh();
361 }
362
363 static int rt_cache_seq_show(struct seq_file *seq, void *v)
364 {
365         if (v == SEQ_START_TOKEN)
366                 seq_printf(seq, "%-127s\n",
367                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
368                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369                            "HHUptod\tSpecDst");
370         else {
371                 struct rtable *r = v;
372                 int len;
373
374                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
375                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
376                         r->u.dst.dev ? r->u.dst.dev->name : "*",
377                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
378                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
379                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
380                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
381                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
382                         dst_metric(&r->u.dst, RTAX_WINDOW),
383                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
384                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
385                         r->fl.fl4_tos,
386                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
387                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388                                        dev_queue_xmit) : 0,
389                         r->rt_spec_dst, &len);
390
391                 seq_printf(seq, "%*s\n", 127 - len, "");
392         }
393         return 0;
394 }
395
/* seq_file iterator callbacks for the per-entry rt_cache listing. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
402
403 static int rt_cache_seq_open(struct inode *inode, struct file *file)
404 {
405         return seq_open_net(inode, file, &rt_cache_seq_ops,
406                         sizeof(struct rt_cache_iter_state));
407 }
408
/* File operations of the rt_cache listing; released with seq_release_net to match seq_open_net. */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
416
417
418 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 {
420         int cpu;
421
422         if (*pos == 0)
423                 return SEQ_START_TOKEN;
424
425         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
426                 if (!cpu_possible(cpu))
427                         continue;
428                 *pos = cpu+1;
429                 return &per_cpu(rt_cache_stat, cpu);
430         }
431         return NULL;
432 }
433
434 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 {
436         int cpu;
437
438         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
439                 if (!cpu_possible(cpu))
440                         continue;
441                 *pos = cpu+1;
442                 return &per_cpu(rt_cache_stat, cpu);
443         }
444         return NULL;
445
446 }
447
/* seq_file ->stop for the per-CPU statistics: the start/next callbacks take no lock, so nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
452
453 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
454 {
455         struct rt_cache_stat *st = v;
456
457         if (v == SEQ_START_TOKEN) {
458                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459                 return 0;
460         }
461
462         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
463                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
464                    atomic_read(&ipv4_dst_ops.entries),
465                    st->in_hit,
466                    st->in_slow_tot,
467                    st->in_slow_mc,
468                    st->in_no_route,
469                    st->in_brd,
470                    st->in_martian_dst,
471                    st->in_martian_src,
472
473                    st->out_hit,
474                    st->out_slow_tot,
475                    st->out_slow_mc,
476
477                    st->gc_total,
478                    st->gc_ignored,
479                    st->gc_goal_miss,
480                    st->gc_dst_overflow,
481                    st->in_hlist_search,
482                    st->out_hlist_search
483                 );
484         return 0;
485 }
486
/* seq_file iterator callbacks for the per-CPU route cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
493
494
495 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
496 {
497         return seq_open(file, &rt_cpu_seq_ops);
498 }
499
/* File operations of the per-CPU route cache statistics file. */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
507
508 #ifdef CONFIG_NET_CLS_ROUTE
509 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
510                            int length, int *eof, void *data)
511 {
512         unsigned int i;
513
514         if ((offset & 3) || (length & 3))
515                 return -EIO;
516
517         if (offset >= sizeof(struct ip_rt_acct) * 256) {
518                 *eof = 1;
519                 return 0;
520         }
521
522         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
523                 length = sizeof(struct ip_rt_acct) * 256 - offset;
524                 *eof = 1;
525         }
526
527         offset /= sizeof(u32);
528
529         if (length > 0) {
530                 u32 *dst = (u32 *) buffer;
531
532                 *start = buffer;
533                 memset(dst, 0, length);
534
535                 for_each_possible_cpu(i) {
536                         unsigned int j;
537                         u32 *src;
538
539                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
540                         for (j = 0; j < length/4; j++)
541                                 dst[j] += src[j];
542                 }
543         }
544         return length;
545 }
546 #endif
547
548 static int __net_init ip_rt_do_proc_init(struct net *net)
549 {
550         struct proc_dir_entry *pde;
551
552         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553                         &rt_cache_seq_fops);
554         if (!pde)
555                 goto err1;
556
557         pde = proc_create("rt_cache", S_IRUGO,
558                           net->proc_net_stat, &rt_cpu_seq_fops);
559         if (!pde)
560                 goto err2;
561
562 #ifdef CONFIG_NET_CLS_ROUTE
563         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
564                         ip_rt_acct_read, NULL);
565         if (!pde)
566                 goto err3;
567 #endif
568         return 0;
569
570 #ifdef CONFIG_NET_CLS_ROUTE
571 err3:
572         remove_proc_entry("rt_cache", net->proc_net_stat);
573 #endif
574 err2:
575         remove_proc_entry("rt_cache", net->proc_net);
576 err1:
577         return -ENOMEM;
578 }
579
580 static void __net_exit ip_rt_do_proc_exit(struct net *net)
581 {
582         remove_proc_entry("rt_cache", net->proc_net_stat);
583         remove_proc_entry("rt_cache", net->proc_net);
584         remove_proc_entry("rt_acct", net->proc_net);
585 }
586
/* Per-network-namespace hooks that create and remove the route cache proc files. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};
591
592 static int __init ip_rt_proc_init(void)
593 {
594         return register_pernet_subsys(&ip_rt_proc_ops);
595 }
596
597 #else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
602 #endif /* CONFIG_PROC_FS */
603
604 static inline void rt_free(struct rtable *rt)
605 {
606         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
607 }
608
609 static inline void rt_drop(struct rtable *rt)
610 {
611         ip_rt_put(rt);
612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
613 }
614
615 static inline int rt_fast_clean(struct rtable *rth)
616 {
617         /* Kill broadcast/multicast entries very aggresively, if they
618            collide in hash table with more useful entries */
619         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
620                 rth->fl.iif && rth->u.dst.rt_next;
621 }
622
623 static inline int rt_valuable(struct rtable *rth)
624 {
625         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
626                 rth->u.dst.expires;
627 }
628
629 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
630 {
631         unsigned long age;
632         int ret = 0;
633
634         if (atomic_read(&rth->u.dst.__refcnt))
635                 goto out;
636
637         ret = 1;
638         if (rth->u.dst.expires &&
639             time_after_eq(jiffies, rth->u.dst.expires))
640                 goto out;
641
642         age = jiffies - rth->u.dst.lastuse;
643         ret = 0;
644         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
645             (age <= tmo2 && rt_valuable(rth)))
646                 goto out;
647         ret = 1;
648 out:    return ret;
649 }
650
651 /* Bits of score are:
652  * 31: very valuable
653  * 30: not quite useless
654  * 29..0: usage counter
655  */
656 static inline u32 rt_score(struct rtable *rt)
657 {
658         u32 score = jiffies - rt->u.dst.lastuse;
659
660         score = ~score & ~(3<<30);
661
662         if (rt_valuable(rt))
663                 score |= (1<<31);
664
665         if (!rt->fl.iif ||
666             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
667                 score |= (1<<30);
668
669         return score;
670 }
671
672 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
673 {
674         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
675                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
676                 (fl1->mark ^ fl2->mark) |
677                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
678                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
679                 (fl1->oif ^ fl2->oif) |
680                 (fl1->iif ^ fl2->iif)) == 0;
681 }
682
683 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
684 {
685         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
686 }
687
688 static inline int rt_is_expired(struct rtable *rth)
689 {
690         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
691 }
692
/*
 * Perform a full scan of the hash table and free entries:
 * with CONFIG_NET_NS only the entries for which rt_is_expired() is true,
 * otherwise every entry.
 * Can be called by a softirq or a process.
 * In the latter case (@process_context non-zero) we reschedule between
 * buckets if necessary.
 */
static void rt_do_flush(int process_context)
{
        unsigned int i;
        struct rtable *rth, *next;
        struct rtable * tail;

        for (i = 0; i <= rt_hash_mask; i++) {
                if (process_context && need_resched())
                        cond_resched();
                /* Unlocked peek: skip empty buckets without taking the lock. */
                rth = rt_hash_table[i].chain;
                if (!rth)
                        continue;

                spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
                {
                struct rtable ** prev, * p;

                /* Re-read the head now that the bucket lock is held. */
                rth = rt_hash_table[i].chain;

                /* defer releasing the head of the list after spin_unlock */
                for (tail = rth; tail; tail = tail->u.dst.rt_next)
                        if (!rt_is_expired(tail))
                                break;
                /* [rth, tail) is the expired prefix; unhook it from the bucket. */
                if (rth != tail)
                        rt_hash_table[i].chain = tail;

                /* call rt_free on entries after the tail requiring flush */
                prev = &rt_hash_table[i].chain;
                for (p = *prev; p; p = next) {
                        next = p->u.dst.rt_next;
                        if (!rt_is_expired(p)) {
                                prev = &p->u.dst.rt_next;
                        } else {
                                *prev = next;
                                rt_free(p);
                        }
                }
                }
#else
                /* Detach the whole chain; it is freed after the unlock. */
                rth = rt_hash_table[i].chain;
                rt_hash_table[i].chain = NULL;
                tail = NULL;
#endif
                spin_unlock_bh(rt_hash_lock_addr(i));

                /* Free the detached entries outside the bucket lock. */
                for (; rth != tail; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}
750
/*
 * Periodic ageing pass over part of the hash table.
 *
 * The number of buckets visited per call is
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, capped at the
 * table size.  The scan resumes after the bucket where the previous call
 * stopped (static 'rover').  In each bucket, entries of an old generation,
 * entries whose expiry time has passed and entries that rt_may_expire()
 * releases are unlinked and freed.
 */
static void rt_check_expire(void)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        /* Divide only for timeouts above 1; this also avoids a division by zero. */
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (need_resched())
                        cond_resched();

                /* Unlocked peek: empty buckets are skipped without locking. */
                if (*rthp == NULL)
                        continue;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        /* Entries of an old generation are always removed. */
                        if (rt_is_expired(rth)) {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                continue;
                        }
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                        /*
                                         * Not yet due: keep it, and halve the
                                         * timeout applied to the rest of the chain.
                                         */
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                /* Kept: later entries in this chain get half the timeout. */
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
        }
        /* Remember where to resume on the next call. */
        rover = i;
}
803
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table, then re-queue
 * the delayed work so that it runs again after ip_rt_gc_interval jiffies.
 */
static void rt_worker_func(struct work_struct *work)
{
        rt_check_expire();
        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
813
814 /*
815  * Pertubation of rt_genid by a small quantity [1..256]
816  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
817  * many times (2^24) without giving recent rt_genid.
818  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
819  */
820 static void rt_cache_invalidate(struct net *net)
821 {
822         unsigned char shuffle;
823
824         get_random_bytes(&shuffle, sizeof(shuffle));
825         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
826 }
827
/*
 * Invalidate the route cache of @net by changing its generation id.
 *
 * delay < 0  : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate, then scan and flush the table right away
 *              (can take long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;

	/* Let the flush reschedule only when not running in softirq context. */
	rt_do_flush(!in_softirq());
}
838
839 /*
840  * We change rt_genid and let gc do the cleanup
841  */
842 static void rt_secret_rebuild(unsigned long __net)
843 {
844         struct net *net = (struct net *)__net;
845         rt_cache_invalidate(net);
846         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
847 }
848
849 /*
850    Short description of GC goals.
851
852    We want to build an algorithm that keeps the routing cache
853    at some equilibrium point, where the number of aged-off entries
854    is kept approximately equal to the number of newly generated ones.
855
856    The current expiration strength is the variable "expire".
857    We try to adjust it dynamically, so that when networking
858    is idle "expire" is large enough to keep enough warm entries,
859    and when load increases it shrinks to limit the cache size.
860  */
861
862 static int rt_garbage_collect(struct dst_ops *ops)
863 {
864         static unsigned long expire = RT_GC_TIMEOUT;
865         static unsigned long last_gc;
866         static int rover;
867         static int equilibrium;
868         struct rtable *rth, **rthp;
869         unsigned long now = jiffies;
870         int goal;
871
872         /*
873          * Garbage collection is pretty expensive,
874          * do not make it too frequently.
875          */
876
877         RT_CACHE_STAT_INC(gc_total);
878
879         if (now - last_gc < ip_rt_gc_min_interval &&
880             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
881                 RT_CACHE_STAT_INC(gc_ignored);
882                 goto out;
883         }
884
885         /* Calculate number of entries, which we want to expire now. */
886         goal = atomic_read(&ipv4_dst_ops.entries) -
887                 (ip_rt_gc_elasticity << rt_hash_log);
888         if (goal <= 0) {
889                 if (equilibrium < ipv4_dst_ops.gc_thresh)
890                         equilibrium = ipv4_dst_ops.gc_thresh;
891                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
892                 if (goal > 0) {
893                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
894                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
895                 }
896         } else {
897                 /* We are in dangerous area. Try to reduce cache really
898                  * aggressively.
899                  */
900                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
901                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
902         }
903
904         if (now - last_gc >= ip_rt_gc_min_interval)
905                 last_gc = now;
906
907         if (goal <= 0) {
908                 equilibrium += goal;
909                 goto work_done;
910         }
911
912         do {
913                 int i, k;
914
915                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
916                         unsigned long tmo = expire;
917
918                         k = (k + 1) & rt_hash_mask;
919                         rthp = &rt_hash_table[k].chain;
920                         spin_lock_bh(rt_hash_lock_addr(k));
921                         while ((rth = *rthp) != NULL) {
922                                 if (!rt_is_expired(rth) &&
923                                         !rt_may_expire(rth, tmo, expire)) {
924                                         tmo >>= 1;
925                                         rthp = &rth->u.dst.rt_next;
926                                         continue;
927                                 }
928                                 *rthp = rth->u.dst.rt_next;
929                                 rt_free(rth);
930                                 goal--;
931                         }
932                         spin_unlock_bh(rt_hash_lock_addr(k));
933                         if (goal <= 0)
934                                 break;
935                 }
936                 rover = k;
937
938                 if (goal <= 0)
939                         goto work_done;
940
941                 /* Goal is not achieved. We stop process if:
942
943                    - if expire reduced to zero. Otherwise, expire is halfed.
944                    - if table is not full.
945                    - if we are called from interrupt.
946                    - jiffies check is just fallback/debug loop breaker.
947                      We will not spin here for long time in any case.
948                  */
949
950                 RT_CACHE_STAT_INC(gc_goal_miss);
951
952                 if (expire == 0)
953                         break;
954
955                 expire >>= 1;
956 #if RT_CACHE_DEBUG >= 2
957                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
958                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
959 #endif
960
961                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
962                         goto out;
963         } while (!in_softirq() && time_before_eq(jiffies, now));
964
965         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
966                 goto out;
967         if (net_ratelimit())
968                 printk(KERN_WARNING "dst cache overflow\n");
969         RT_CACHE_STAT_INC(gc_dst_overflow);
970         return 1;
971
972 work_done:
973         expire += ip_rt_gc_min_interval;
974         if (expire > ip_rt_gc_timeout ||
975             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
976                 expire = ip_rt_gc_timeout;
977 #if RT_CACHE_DEBUG >= 2
978         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
979                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
980 #endif
981 out:    return 0;
982 }
983
984 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
985 {
986         struct rtable   *rth, **rthp;
987         unsigned long   now;
988         struct rtable *cand, **candp;
989         u32             min_score;
990         int             chain_length;
991         int attempts = !in_softirq();
992
993 restart:
994         chain_length = 0;
995         min_score = ~(u32)0;
996         cand = NULL;
997         candp = NULL;
998         now = jiffies;
999
1000         rthp = &rt_hash_table[hash].chain;
1001
1002         spin_lock_bh(rt_hash_lock_addr(hash));
1003         while ((rth = *rthp) != NULL) {
1004                 if (rt_is_expired(rth)) {
1005                         *rthp = rth->u.dst.rt_next;
1006                         rt_free(rth);
1007                         continue;
1008                 }
1009                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1010                         /* Put it first */
1011                         *rthp = rth->u.dst.rt_next;
1012                         /*
1013                          * Since lookup is lockfree, the deletion
1014                          * must be visible to another weakly ordered CPU before
1015                          * the insertion at the start of the hash chain.
1016                          */
1017                         rcu_assign_pointer(rth->u.dst.rt_next,
1018                                            rt_hash_table[hash].chain);
1019                         /*
1020                          * Since lookup is lockfree, the update writes
1021                          * must be ordered for consistency on SMP.
1022                          */
1023                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1024
1025                         dst_use(&rth->u.dst, now);
1026                         spin_unlock_bh(rt_hash_lock_addr(hash));
1027
1028                         rt_drop(rt);
1029                         *rp = rth;
1030                         return 0;
1031                 }
1032
1033                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1034                         u32 score = rt_score(rth);
1035
1036                         if (score <= min_score) {
1037                                 cand = rth;
1038                                 candp = rthp;
1039                                 min_score = score;
1040                         }
1041                 }
1042
1043                 chain_length++;
1044
1045                 rthp = &rth->u.dst.rt_next;
1046         }
1047
1048         if (cand) {
1049                 /* ip_rt_gc_elasticity used to be average length of chain
1050                  * length, when exceeded gc becomes really aggressive.
1051                  *
1052                  * The second limit is less certain. At the moment it allows
1053                  * only 2 entries per bucket. We will see.
1054                  */
1055                 if (chain_length > ip_rt_gc_elasticity) {
1056                         *candp = cand->u.dst.rt_next;
1057                         rt_free(cand);
1058                 }
1059         }
1060
1061         /* Try to bind route to arp only if it is output
1062            route or unicast forwarding path.
1063          */
1064         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1065                 int err = arp_bind_neighbour(&rt->u.dst);
1066                 if (err) {
1067                         spin_unlock_bh(rt_hash_lock_addr(hash));
1068
1069                         if (err != -ENOBUFS) {
1070                                 rt_drop(rt);
1071                                 return err;
1072                         }
1073
1074                         /* Neighbour tables are full and nothing
1075                            can be released. Try to shrink route cache,
1076                            it is most likely it holds some neighbour records.
1077                          */
1078                         if (attempts-- > 0) {
1079                                 int saved_elasticity = ip_rt_gc_elasticity;
1080                                 int saved_int = ip_rt_gc_min_interval;
1081                                 ip_rt_gc_elasticity     = 1;
1082                                 ip_rt_gc_min_interval   = 0;
1083                                 rt_garbage_collect(&ipv4_dst_ops);
1084                                 ip_rt_gc_min_interval   = saved_int;
1085                                 ip_rt_gc_elasticity     = saved_elasticity;
1086                                 goto restart;
1087                         }
1088
1089                         if (net_ratelimit())
1090                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1091                         rt_drop(rt);
1092                         return -ENOBUFS;
1093                 }
1094         }
1095
1096         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1097 #if RT_CACHE_DEBUG >= 2
1098         if (rt->u.dst.rt_next) {
1099                 struct rtable *trt;
1100                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1101                        NIPQUAD(rt->rt_dst));
1102                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1103                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1104                 printk("\n");
1105         }
1106 #endif
1107         rt_hash_table[hash].chain = rt;
1108         spin_unlock_bh(rt_hash_lock_addr(hash));
1109         *rp = rt;
1110         return 0;
1111 }
1112
1113 void rt_bind_peer(struct rtable *rt, int create)
1114 {
1115         static DEFINE_SPINLOCK(rt_peer_lock);
1116         struct inet_peer *peer;
1117
1118         peer = inet_getpeer(rt->rt_dst, create);
1119
1120         spin_lock_bh(&rt_peer_lock);
1121         if (rt->peer == NULL) {
1122                 rt->peer = peer;
1123                 peer = NULL;
1124         }
1125         spin_unlock_bh(&rt_peer_lock);
1126         if (peer)
1127                 inet_putpeer(peer);
1128 }
1129
1130 /*
1131  * Peer allocation may fail only in serious out-of-memory conditions.  However
1132  * we still can generate some output.
1133  * Random ID selection looks a bit dangerous because we have no chances to
1134  * select ID being unique in a reasonable period of time.
1135  * But broken packet identifier may be better than no packet at all.
1136  */
1137 static void ip_select_fb_ident(struct iphdr *iph)
1138 {
1139         static DEFINE_SPINLOCK(ip_fb_id_lock);
1140         static u32 ip_fallback_id;
1141         u32 salt;
1142
1143         spin_lock_bh(&ip_fb_id_lock);
1144         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1145         iph->id = htons(salt & 0xFFFF);
1146         ip_fallback_id = salt;
1147         spin_unlock_bh(&ip_fb_id_lock);
1148 }
1149
1150 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1151 {
1152         struct rtable *rt = (struct rtable *) dst;
1153
1154         if (rt) {
1155                 if (rt->peer == NULL)
1156                         rt_bind_peer(rt, 1);
1157
1158                 /* If peer is attached to destination, it is never detached,
1159                    so that we need not to grab a lock to dereference it.
1160                  */
1161                 if (rt->peer) {
1162                         iph->id = htons(inet_getid(rt->peer, more));
1163                         return;
1164                 }
1165         } else
1166                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1167                        __builtin_return_address(0));
1168
1169         ip_select_fb_ident(iph);
1170 }
1171
1172 static void rt_del(unsigned hash, struct rtable *rt)
1173 {
1174         struct rtable **rthp, *aux;
1175
1176         rthp = &rt_hash_table[hash].chain;
1177         spin_lock_bh(rt_hash_lock_addr(hash));
1178         ip_rt_put(rt);
1179         while ((aux = *rthp) != NULL) {
1180                 if (aux == rt || rt_is_expired(aux)) {
1181                         *rthp = aux->u.dst.rt_next;
1182                         rt_free(aux);
1183                         continue;
1184                 }
1185                 rthp = &aux->u.dst.rt_next;
1186         }
1187         spin_unlock_bh(rt_hash_lock_addr(hash));
1188 }
1189
1190 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1191                     __be32 saddr, struct net_device *dev)
1192 {
1193         int i, k;
1194         struct in_device *in_dev = in_dev_get(dev);
1195         struct rtable *rth, **rthp;
1196         __be32  skeys[2] = { saddr, 0 };
1197         int  ikeys[2] = { dev->ifindex, 0 };
1198         struct netevent_redirect netevent;
1199         struct net *net;
1200
1201         if (!in_dev)
1202                 return;
1203
1204         net = dev_net(dev);
1205         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1206             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1207             || ipv4_is_zeronet(new_gw))
1208                 goto reject_redirect;
1209
1210         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1211                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1212                         goto reject_redirect;
1213                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1214                         goto reject_redirect;
1215         } else {
1216                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1217                         goto reject_redirect;
1218         }
1219
1220         for (i = 0; i < 2; i++) {
1221                 for (k = 0; k < 2; k++) {
1222                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1223                                                 rt_genid(net));
1224
1225                         rthp=&rt_hash_table[hash].chain;
1226
1227                         rcu_read_lock();
1228                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1229                                 struct rtable *rt;
1230
1231                                 if (rth->fl.fl4_dst != daddr ||
1232                                     rth->fl.fl4_src != skeys[i] ||
1233                                     rth->fl.oif != ikeys[k] ||
1234                                     rth->fl.iif != 0 ||
1235                                     rt_is_expired(rth) ||
1236                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1237                                         rthp = &rth->u.dst.rt_next;
1238                                         continue;
1239                                 }
1240
1241                                 if (rth->rt_dst != daddr ||
1242                                     rth->rt_src != saddr ||
1243                                     rth->u.dst.error ||
1244                                     rth->rt_gateway != old_gw ||
1245                                     rth->u.dst.dev != dev)
1246                                         break;
1247
1248                                 dst_hold(&rth->u.dst);
1249                                 rcu_read_unlock();
1250
1251                                 rt = dst_alloc(&ipv4_dst_ops);
1252                                 if (rt == NULL) {
1253                                         ip_rt_put(rth);
1254                                         in_dev_put(in_dev);
1255                                         return;
1256                                 }
1257
1258                                 /* Copy all the information. */
1259                                 *rt = *rth;
1260                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1261                                 rt->u.dst.__use         = 1;
1262                                 atomic_set(&rt->u.dst.__refcnt, 1);
1263                                 rt->u.dst.child         = NULL;
1264                                 if (rt->u.dst.dev)
1265                                         dev_hold(rt->u.dst.dev);
1266                                 if (rt->idev)
1267                                         in_dev_hold(rt->idev);
1268                                 rt->u.dst.obsolete      = 0;
1269                                 rt->u.dst.lastuse       = jiffies;
1270                                 rt->u.dst.path          = &rt->u.dst;
1271                                 rt->u.dst.neighbour     = NULL;
1272                                 rt->u.dst.hh            = NULL;
1273                                 rt->u.dst.xfrm          = NULL;
1274                                 rt->rt_genid            = rt_genid(net);
1275                                 rt->rt_flags            |= RTCF_REDIRECTED;
1276
1277                                 /* Gateway is different ... */
1278                                 rt->rt_gateway          = new_gw;
1279
1280                                 /* Redirect received -> path was valid */
1281                                 dst_confirm(&rth->u.dst);
1282
1283                                 if (rt->peer)
1284                                         atomic_inc(&rt->peer->refcnt);
1285
1286                                 if (arp_bind_neighbour(&rt->u.dst) ||
1287                                     !(rt->u.dst.neighbour->nud_state &
1288                                             NUD_VALID)) {
1289                                         if (rt->u.dst.neighbour)
1290                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1291                                         ip_rt_put(rth);
1292                                         rt_drop(rt);
1293                                         goto do_next;
1294                                 }
1295
1296                                 netevent.old = &rth->u.dst;
1297                                 netevent.new = &rt->u.dst;
1298                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1299                                                         &netevent);
1300
1301                                 rt_del(hash, rth);
1302                                 if (!rt_intern_hash(hash, rt, &rt))
1303                                         ip_rt_put(rt);
1304                                 goto do_next;
1305                         }
1306                         rcu_read_unlock();
1307                 do_next:
1308                         ;
1309                 }
1310         }
1311         in_dev_put(in_dev);
1312         return;
1313
1314 reject_redirect:
1315 #ifdef CONFIG_IP_ROUTE_VERBOSE
1316         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1317                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1318                         NIPQUAD_FMT " ignored.\n"
1319                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1320                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1321                        NIPQUAD(saddr), NIPQUAD(daddr));
1322 #endif
1323         in_dev_put(in_dev);
1324 }
1325
1326 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1327 {
1328         struct rtable *rt = (struct rtable *)dst;
1329         struct dst_entry *ret = dst;
1330
1331         if (rt) {
1332                 if (dst->obsolete) {
1333                         ip_rt_put(rt);
1334                         ret = NULL;
1335                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1336                            rt->u.dst.expires) {
1337                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1338                                                 rt->fl.oif,
1339                                                 rt_genid(dev_net(dst->dev)));
1340 #if RT_CACHE_DEBUG >= 1
1341                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1342                                           NIPQUAD_FMT "/%02x dropped\n",
1343                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1344 #endif
1345                         rt_del(hash, rt);
1346                         ret = NULL;
1347                 }
1348         }
1349         return ret;
1350 }
1351
1352 /*
1353  * Algorithm:
1354  *      1. The first ip_rt_redirect_number redirects are sent
1355  *         with exponential backoff, then we stop sending them at all,
1356  *         assuming that the host ignores our redirects.
1357  *      2. If we did not see packets requiring redirects
1358  *         during ip_rt_redirect_silence, we assume that the host
1359  *         forgot redirected route and start to send redirects again.
1360  *
1361  * This algorithm is much cheaper and more intelligent than dumb load limiting
1362  * in icmp.c.
1363  *
1364  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1365  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1366  */
1367
1368 void ip_rt_send_redirect(struct sk_buff *skb)
1369 {
1370         struct rtable *rt = skb->rtable;
1371         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1372
1373         if (!in_dev)
1374                 return;
1375
1376         if (!IN_DEV_TX_REDIRECTS(in_dev))
1377                 goto out;
1378
1379         /* No redirected packets during ip_rt_redirect_silence;
1380          * reset the algorithm.
1381          */
1382         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1383                 rt->u.dst.rate_tokens = 0;
1384
1385         /* Too many ignored redirects; do not send anything
1386          * set u.dst.rate_last to the last seen redirected packet.
1387          */
1388         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1389                 rt->u.dst.rate_last = jiffies;
1390                 goto out;
1391         }
1392
1393         /* Check for load limit; set rate_last to the latest sent
1394          * redirect.
1395          */
1396         if (rt->u.dst.rate_tokens == 0 ||
1397             time_after(jiffies,
1398                        (rt->u.dst.rate_last +
1399                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1400                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1401                 rt->u.dst.rate_last = jiffies;
1402                 ++rt->u.dst.rate_tokens;
1403 #ifdef CONFIG_IP_ROUTE_VERBOSE
1404                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1405                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1406                     net_ratelimit())
1407                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1408                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1409                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1410                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1411 #endif
1412         }
1413 out:
1414         in_dev_put(in_dev);
1415 }
1416
1417 static int ip_error(struct sk_buff *skb)
1418 {
1419         struct rtable *rt = skb->rtable;
1420         unsigned long now;
1421         int code;
1422
1423         switch (rt->u.dst.error) {
1424                 case EINVAL:
1425                 default:
1426                         goto out;
1427                 case EHOSTUNREACH:
1428                         code = ICMP_HOST_UNREACH;
1429                         break;
1430                 case ENETUNREACH:
1431                         code = ICMP_NET_UNREACH;
1432                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1433                                         IPSTATS_MIB_INNOROUTES);
1434                         break;
1435                 case EACCES:
1436                         code = ICMP_PKT_FILTERED;
1437                         break;
1438         }
1439
1440         now = jiffies;
1441         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1442         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1443                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1444         rt->u.dst.rate_last = now;
1445         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1446                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1447                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1448         }
1449
1450 out:    kfree_skb(skb);
1451         return 0;
1452 }
1453
/*
 *	MTU plateau table, in descending order.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first plateau value strictly below @old_mtu, or 68 if
 * @old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}
	return 68;
}
1471
1472 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1473                                  unsigned short new_mtu,
1474                                  struct net_device *dev)
1475 {
1476         int i, k;
1477         unsigned short old_mtu = ntohs(iph->tot_len);
1478         struct rtable *rth;
1479         int  ikeys[2] = { dev->ifindex, 0 };
1480         __be32  skeys[2] = { iph->saddr, 0, };
1481         __be32  daddr = iph->daddr;
1482         unsigned short est_mtu = 0;
1483
1484         if (ipv4_config.no_pmtu_disc)
1485                 return 0;
1486
1487         for (k = 0; k < 2; k++) {
1488                 for (i = 0; i < 2; i++) {
1489                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1490                                                 rt_genid(net));
1491
1492                         rcu_read_lock();
1493                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1494                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1495                                 unsigned short mtu = new_mtu;
1496
1497                                 if (rth->fl.fl4_dst != daddr ||
1498                                     rth->fl.fl4_src != skeys[i] ||
1499                                     rth->rt_dst != daddr ||
1500                                     rth->rt_src != iph->saddr ||
1501                                     rth->fl.oif != ikeys[k] ||
1502                                     rth->fl.iif != 0 ||
1503                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1504                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1505                                     rt_is_expired(rth))
1506                                         continue;
1507
1508                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1509
1510                                         /* BSD 4.2 compatibility hack :-( */
1511                                         if (mtu == 0 &&
1512                                             old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1513                                             old_mtu >= 68 + (iph->ihl << 2))
1514                                                 old_mtu -= iph->ihl << 2;
1515
1516                                         mtu = guess_mtu(old_mtu);
1517                                 }
1518                                 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1519                                         if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1520                                                 dst_confirm(&rth->u.dst);
1521                                                 if (mtu < ip_rt_min_pmtu) {
1522                                                         mtu = ip_rt_min_pmtu;
1523                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1524                                                                 (1 << RTAX_MTU);
1525                                                 }
1526                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1527                                                 dst_set_expires(&rth->u.dst,
1528                                                         ip_rt_mtu_expires);
1529                                         }
1530                                         est_mtu = mtu;
1531                                 }
1532                         }
1533                         rcu_read_unlock();
1534                 }
1535         }
1536         return est_mtu ? : new_mtu;
1537 }
1538
1539 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1540 {
1541         if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1542             !(dst_metric_locked(dst, RTAX_MTU))) {
1543                 if (mtu < ip_rt_min_pmtu) {
1544                         mtu = ip_rt_min_pmtu;
1545                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1546                 }
1547                 dst->metrics[RTAX_MTU-1] = mtu;
1548                 dst_set_expires(dst, ip_rt_mtu_expires);
1549                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1550         }
1551 }
1552
1553 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1554 {
1555         return NULL;
1556 }
1557
1558 static void ipv4_dst_destroy(struct dst_entry *dst)
1559 {
1560         struct rtable *rt = (struct rtable *) dst;
1561         struct inet_peer *peer = rt->peer;
1562         struct in_device *idev = rt->idev;
1563
1564         if (peer) {
1565                 rt->peer = NULL;
1566                 inet_putpeer(peer);
1567         }
1568
1569         if (idev) {
1570                 rt->idev = NULL;
1571                 in_dev_put(idev);
1572         }
1573 }
1574
1575 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1576                             int how)
1577 {
1578         struct rtable *rt = (struct rtable *) dst;
1579         struct in_device *idev = rt->idev;
1580         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1581                 struct in_device *loopback_idev =
1582                         in_dev_get(dev_net(dev)->loopback_dev);
1583                 if (loopback_idev) {
1584                         rt->idev = loopback_idev;
1585                         in_dev_put(idev);
1586                 }
1587         }
1588 }
1589
1590 static void ipv4_link_failure(struct sk_buff *skb)
1591 {
1592         struct rtable *rt;
1593
1594         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1595
1596         rt = skb->rtable;
1597         if (rt)
1598                 dst_set_expires(&rt->u.dst, 0);
1599 }
1600
1601 static int ip_rt_bug(struct sk_buff *skb)
1602 {
1603         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1604                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1605                 skb->dev ? skb->dev->name : "?");
1606         kfree_skb(skb);
1607         return 0;
1608 }
1609
1610 /*
1611    We do not cache source address of outgoing interface,
1612    because it is used only by IP RR, TS and SRR options,
1613    so that it out of fast path.
1614
1615    BTW remember: "addr" is allowed to be not aligned
1616    in IP options!
1617  */
1618
1619 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1620 {
1621         __be32 src;
1622         struct fib_result res;
1623
1624         if (rt->fl.iif == 0)
1625                 src = rt->rt_src;
1626         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1627                 src = FIB_RES_PREFSRC(res);
1628                 fib_res_put(&res);
1629         } else
1630                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1631                                         RT_SCOPE_UNIVERSE);
1632         memcpy(addr, &src, 4);
1633 }
1634
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time:
 * a half of tclassid is filled from @tag only while that half is
 * still zero, so values already present are never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->u.dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1644
1645 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1646 {
1647         struct fib_info *fi = res->fi;
1648
1649         if (fi) {
1650                 if (FIB_RES_GW(*res) &&
1651                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1652                         rt->rt_gateway = FIB_RES_GW(*res);
1653                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1654                        sizeof(rt->u.dst.metrics));
1655                 if (fi->fib_mtu == 0) {
1656                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1657                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1658                             rt->rt_gateway != rt->rt_dst &&
1659                             rt->u.dst.dev->mtu > 576)
1660                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1661                 }
1662 #ifdef CONFIG_NET_CLS_ROUTE
1663                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1664 #endif
1665         } else
1666                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1667
1668         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1669                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1670         if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1671                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1672         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1673                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1674                                        ip_rt_min_advmss);
1675         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1676                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1677
1678 #ifdef CONFIG_NET_CLS_ROUTE
1679 #ifdef CONFIG_IP_MULTIPLE_TABLES
1680         set_class_tag(rt, fib_rules_tclass(res));
1681 #endif
1682         set_class_tag(rt, itag);
1683 #endif
1684         rt->rt_type = res->type;
1685 }
1686
1687 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1688                                 u8 tos, struct net_device *dev, int our)
1689 {
1690         unsigned hash;
1691         struct rtable *rth;
1692         __be32 spec_dst;
1693         struct in_device *in_dev = in_dev_get(dev);
1694         u32 itag = 0;
1695
1696         /* Primary sanity checks. */
1697
1698         if (in_dev == NULL)
1699                 return -EINVAL;
1700
1701         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1703                 goto e_inval;
1704
1705         if (ipv4_is_zeronet(saddr)) {
1706                 if (!ipv4_is_local_multicast(daddr))
1707                         goto e_inval;
1708                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1709         } else if (fib_validate_source(saddr, 0, tos, 0,
1710                                         dev, &spec_dst, &itag) < 0)
1711                 goto e_inval;
1712
1713         rth = dst_alloc(&ipv4_dst_ops);
1714         if (!rth)
1715                 goto e_nobufs;
1716
1717         rth->u.dst.output= ip_rt_bug;
1718
1719         atomic_set(&rth->u.dst.__refcnt, 1);
1720         rth->u.dst.flags= DST_HOST;
1721         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1722                 rth->u.dst.flags |= DST_NOPOLICY;
1723         rth->fl.fl4_dst = daddr;
1724         rth->rt_dst     = daddr;
1725         rth->fl.fl4_tos = tos;
1726         rth->fl.mark    = skb->mark;
1727         rth->fl.fl4_src = saddr;
1728         rth->rt_src     = saddr;
1729 #ifdef CONFIG_NET_CLS_ROUTE
1730         rth->u.dst.tclassid = itag;
1731 #endif
1732         rth->rt_iif     =
1733         rth->fl.iif     = dev->ifindex;
1734         rth->u.dst.dev  = init_net.loopback_dev;
1735         dev_hold(rth->u.dst.dev);
1736         rth->idev       = in_dev_get(rth->u.dst.dev);
1737         rth->fl.oif     = 0;
1738         rth->rt_gateway = daddr;
1739         rth->rt_spec_dst= spec_dst;
1740         rth->rt_genid   = rt_genid(dev_net(dev));
1741         rth->rt_flags   = RTCF_MULTICAST;
1742         rth->rt_type    = RTN_MULTICAST;
1743         if (our) {
1744                 rth->u.dst.input= ip_local_deliver;
1745                 rth->rt_flags |= RTCF_LOCAL;
1746         }
1747
1748 #ifdef CONFIG_IP_MROUTE
1749         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1750                 rth->u.dst.input = ip_mr_input;
1751 #endif
1752         RT_CACHE_STAT_INC(in_slow_mc);
1753
1754         in_dev_put(in_dev);
1755         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1756         return rt_intern_hash(hash, rth, &skb->rtable);
1757
1758 e_nobufs:
1759         in_dev_put(in_dev);
1760         return -ENOBUFS;
1761
1762 e_inval:
1763         in_dev_put(in_dev);
1764         return -EINVAL;
1765 }
1766
1767
1768 static void ip_handle_martian_source(struct net_device *dev,
1769                                      struct in_device *in_dev,
1770                                      struct sk_buff *skb,
1771                                      __be32 daddr,
1772                                      __be32 saddr)
1773 {
1774         RT_CACHE_STAT_INC(in_martian_src);
1775 #ifdef CONFIG_IP_ROUTE_VERBOSE
1776         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1777                 /*
1778                  *      RFC1812 recommendation, if source is martian,
1779                  *      the only hint is MAC header.
1780                  */
1781                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1782                         NIPQUAD_FMT", on dev %s\n",
1783                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1784                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1785                         int i;
1786                         const unsigned char *p = skb_mac_header(skb);
1787                         printk(KERN_WARNING "ll header: ");
1788                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1789                                 printk("%02x", *p);
1790                                 if (i < (dev->hard_header_len - 1))
1791                                         printk(":");
1792                         }
1793                         printk("\n");
1794                 }
1795         }
1796 #endif
1797 }
1798
1799 static int __mkroute_input(struct sk_buff *skb,
1800                            struct fib_result *res,
1801                            struct in_device *in_dev,
1802                            __be32 daddr, __be32 saddr, u32 tos,
1803                            struct rtable **result)
1804 {
1805
1806         struct rtable *rth;
1807         int err;
1808         struct in_device *out_dev;
1809         unsigned flags = 0;
1810         __be32 spec_dst;
1811         u32 itag;
1812
1813         /* get a working reference to the output device */
1814         out_dev = in_dev_get(FIB_RES_DEV(*res));
1815         if (out_dev == NULL) {
1816                 if (net_ratelimit())
1817                         printk(KERN_CRIT "Bug in ip_route_input" \
1818                                "_slow(). Please, report\n");
1819                 return -EINVAL;
1820         }
1821
1822
1823         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1824                                   in_dev->dev, &spec_dst, &itag);
1825         if (err < 0) {
1826                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1827                                          saddr);
1828
1829                 err = -EINVAL;
1830                 goto cleanup;
1831         }
1832
1833         if (err)
1834                 flags |= RTCF_DIRECTSRC;
1835
1836         if (out_dev == in_dev && err &&
1837             (IN_DEV_SHARED_MEDIA(out_dev) ||
1838              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1839                 flags |= RTCF_DOREDIRECT;
1840
1841         if (skb->protocol != htons(ETH_P_IP)) {
1842                 /* Not IP (i.e. ARP). Do not create route, if it is
1843                  * invalid for proxy arp. DNAT routes are always valid.
1844                  */
1845                 if (out_dev == in_dev) {
1846                         err = -EINVAL;
1847                         goto cleanup;
1848                 }
1849         }
1850
1851
1852         rth = dst_alloc(&ipv4_dst_ops);
1853         if (!rth) {
1854                 err = -ENOBUFS;
1855                 goto cleanup;
1856         }
1857
1858         atomic_set(&rth->u.dst.__refcnt, 1);
1859         rth->u.dst.flags= DST_HOST;
1860         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1861                 rth->u.dst.flags |= DST_NOPOLICY;
1862         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1863                 rth->u.dst.flags |= DST_NOXFRM;
1864         rth->fl.fl4_dst = daddr;
1865         rth->rt_dst     = daddr;
1866         rth->fl.fl4_tos = tos;
1867         rth->fl.mark    = skb->mark;
1868         rth->fl.fl4_src = saddr;
1869         rth->rt_src     = saddr;
1870         rth->rt_gateway = daddr;
1871         rth->rt_iif     =
1872                 rth->fl.iif     = in_dev->dev->ifindex;
1873         rth->u.dst.dev  = (out_dev)->dev;
1874         dev_hold(rth->u.dst.dev);
1875         rth->idev       = in_dev_get(rth->u.dst.dev);
1876         rth->fl.oif     = 0;
1877         rth->rt_spec_dst= spec_dst;
1878
1879         rth->u.dst.input = ip_forward;
1880         rth->u.dst.output = ip_output;
1881         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1882
1883         rt_set_nexthop(rth, res, itag);
1884
1885         rth->rt_flags = flags;
1886
1887         *result = rth;
1888         err = 0;
1889  cleanup:
1890         /* release the working reference to the output device */
1891         in_dev_put(out_dev);
1892         return err;
1893 }
1894
1895 static int ip_mkroute_input(struct sk_buff *skb,
1896                             struct fib_result *res,
1897                             const struct flowi *fl,
1898                             struct in_device *in_dev,
1899                             __be32 daddr, __be32 saddr, u32 tos)
1900 {
1901         struct rtable* rth = NULL;
1902         int err;
1903         unsigned hash;
1904
1905 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1906         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1907                 fib_select_multipath(fl, res);
1908 #endif
1909
1910         /* create a routing cache entry */
1911         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1912         if (err)
1913                 return err;
1914
1915         /* put it into the cache */
1916         hash = rt_hash(daddr, saddr, fl->iif,
1917                        rt_genid(dev_net(rth->u.dst.dev)));
1918         return rt_intern_hash(hash, rth, &skb->rtable);
1919 }
1920
1921 /*
1922  *      NOTE. We drop all the packets that has local source
1923  *      addresses, because every properly looped back packet
1924  *      must have correct destination already attached by output routine.
1925  *
1926  *      Such approach solves two big problems:
1927  *      1. Not simplex devices are handled properly.
1928  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1929  */
1930
1931 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1932                                u8 tos, struct net_device *dev)
1933 {
1934         struct fib_result res;
1935         struct in_device *in_dev = in_dev_get(dev);
1936         struct flowi fl = { .nl_u = { .ip4_u =
1937                                       { .daddr = daddr,
1938                                         .saddr = saddr,
1939                                         .tos = tos,
1940                                         .scope = RT_SCOPE_UNIVERSE,
1941                                       } },
1942                             .mark = skb->mark,
1943                             .iif = dev->ifindex };
1944         unsigned        flags = 0;
1945         u32             itag = 0;
1946         struct rtable * rth;
1947         unsigned        hash;
1948         __be32          spec_dst;
1949         int             err = -EINVAL;
1950         int             free_res = 0;
1951         struct net    * net = dev_net(dev);
1952
1953         /* IP on this device is disabled. */
1954
1955         if (!in_dev)
1956                 goto out;
1957
1958         /* Check for the most weird martians, which can be not detected
1959            by fib_lookup.
1960          */
1961
1962         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1963             ipv4_is_loopback(saddr))
1964                 goto martian_source;
1965
1966         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1967                 goto brd_input;
1968
1969         /* Accept zero addresses only to limited broadcast;
1970          * I even do not know to fix it or not. Waiting for complains :-)
1971          */
1972         if (ipv4_is_zeronet(saddr))
1973                 goto martian_source;
1974
1975         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1976             ipv4_is_loopback(daddr))
1977                 goto martian_destination;
1978
1979         /*
1980          *      Now we are ready to route packet.
1981          */
1982         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1983                 if (!IN_DEV_FORWARD(in_dev))
1984                         goto e_hostunreach;
1985                 goto no_route;
1986         }
1987         free_res = 1;
1988
1989         RT_CACHE_STAT_INC(in_slow_tot);
1990
1991         if (res.type == RTN_BROADCAST)
1992                 goto brd_input;
1993
1994         if (res.type == RTN_LOCAL) {
1995                 int result;
1996                 result = fib_validate_source(saddr, daddr, tos,
1997                                              net->loopback_dev->ifindex,
1998                                              dev, &spec_dst, &itag);
1999                 if (result < 0)
2000                         goto martian_source;
2001                 if (result)
2002                         flags |= RTCF_DIRECTSRC;
2003                 spec_dst = daddr;
2004                 goto local_input;
2005         }
2006
2007         if (!IN_DEV_FORWARD(in_dev))
2008                 goto e_hostunreach;
2009         if (res.type != RTN_UNICAST)
2010                 goto martian_destination;
2011
2012         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2013 done:
2014         in_dev_put(in_dev);
2015         if (free_res)
2016                 fib_res_put(&res);
2017 out:    return err;
2018
2019 brd_input:
2020         if (skb->protocol != htons(ETH_P_IP))
2021                 goto e_inval;
2022
2023         if (ipv4_is_zeronet(saddr))
2024                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2025         else {
2026                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2027                                           &itag);
2028                 if (err < 0)
2029                         goto martian_source;
2030                 if (err)
2031                         flags |= RTCF_DIRECTSRC;
2032         }
2033         flags |= RTCF_BROADCAST;
2034         res.type = RTN_BROADCAST;
2035         RT_CACHE_STAT_INC(in_brd);
2036
2037 local_input:
2038         rth = dst_alloc(&ipv4_dst_ops);
2039         if (!rth)
2040                 goto e_nobufs;
2041
2042         rth->u.dst.output= ip_rt_bug;
2043         rth->rt_genid = rt_genid(net);
2044
2045         atomic_set(&rth->u.dst.__refcnt, 1);
2046         rth->u.dst.flags= DST_HOST;
2047         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2048                 rth->u.dst.flags |= DST_NOPOLICY;
2049         rth->fl.fl4_dst = daddr;
2050         rth->rt_dst     = daddr;
2051         rth->fl.fl4_tos = tos;
2052         rth->fl.mark    = skb->mark;
2053         rth->fl.fl4_src = saddr;
2054         rth->rt_src     = saddr;
2055 #ifdef CONFIG_NET_CLS_ROUTE
2056         rth->u.dst.tclassid = itag;
2057 #endif
2058         rth->rt_iif     =
2059         rth->fl.iif     = dev->ifindex;
2060         rth->u.dst.dev  = net->loopback_dev;
2061         dev_hold(rth->u.dst.dev);
2062         rth->idev       = in_dev_get(rth->u.dst.dev);
2063         rth->rt_gateway = daddr;
2064         rth->rt_spec_dst= spec_dst;
2065         rth->u.dst.input= ip_local_deliver;
2066         rth->rt_flags   = flags|RTCF_LOCAL;
2067         if (res.type == RTN_UNREACHABLE) {
2068                 rth->u.dst.input= ip_error;
2069                 rth->u.dst.error= -err;
2070                 rth->rt_flags   &= ~RTCF_LOCAL;
2071         }
2072         rth->rt_type    = res.type;
2073         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2074         err = rt_intern_hash(hash, rth, &skb->rtable);
2075         goto done;
2076
2077 no_route:
2078         RT_CACHE_STAT_INC(in_no_route);
2079         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2080         res.type = RTN_UNREACHABLE;
2081         if (err == -ESRCH)
2082                 err = -ENETUNREACH;
2083         goto local_input;
2084
2085         /*
2086          *      Do not cache martian addresses: they should be logged (RFC1812)
2087          */
2088 martian_destination:
2089         RT_CACHE_STAT_INC(in_martian_dst);
2090 #ifdef CONFIG_IP_ROUTE_VERBOSE
2091         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2092                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2093                         NIPQUAD_FMT ", dev %s\n",
2094                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2095 #endif
2096
2097 e_hostunreach:
2098         err = -EHOSTUNREACH;
2099         goto done;
2100
2101 e_inval:
2102         err = -EINVAL;
2103         goto done;
2104
2105 e_nobufs:
2106         err = -ENOBUFS;
2107         goto done;
2108
2109 martian_source:
2110         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2111         goto e_inval;
2112 }
2113
2114 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2115                    u8 tos, struct net_device *dev)
2116 {
2117         struct rtable * rth;
2118         unsigned        hash;
2119         int iif = dev->ifindex;
2120         struct net *net;
2121
2122         net = dev_net(dev);
2123         tos &= IPTOS_RT_MASK;
2124         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2125
2126         rcu_read_lock();
2127         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2128              rth = rcu_dereference(rth->u.dst.rt_next)) {
2129                 if (((rth->fl.fl4_dst ^ daddr) |
2130                      (rth->fl.fl4_src ^ saddr) |
2131                      (rth->fl.iif ^ iif) |
2132                      rth->fl.oif |
2133                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2134                     rth->fl.mark == skb->mark &&
2135                     net_eq(dev_net(rth->u.dst.dev), net) &&
2136                     !rt_is_expired(rth)) {
2137                         dst_use(&rth->u.dst, jiffies);
2138                         RT_CACHE_STAT_INC(in_hit);
2139                         rcu_read_unlock();
2140                         skb->rtable = rth;
2141                         return 0;
2142                 }
2143                 RT_CACHE_STAT_INC(in_hlist_search);
2144         }
2145         rcu_read_unlock();
2146
2147         /* Multicast recognition logic is moved from route cache to here.
2148            The problem was that too many Ethernet cards have broken/missing
2149            hardware multicast filters :-( As result the host on multicasting
2150            network acquires a lot of useless route cache entries, sort of
2151            SDR messages from all the world. Now we try to get rid of them.
2152            Really, provided software IP multicast filter is organized
2153            reasonably (at least, hashed), it does not result in a slowdown
2154            comparing with route cache reject entries.
2155            Note, that multicast routers are not affected, because
2156            route cache entry is created eventually.
2157          */
2158         if (ipv4_is_multicast(daddr)) {
2159                 struct in_device *in_dev;
2160
2161                 rcu_read_lock();
2162                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2163                         int our = ip_check_mc(in_dev, daddr, saddr,
2164                                 ip_hdr(skb)->protocol);
2165                         if (our
2166 #ifdef CONFIG_IP_MROUTE
2167                             || (!ipv4_is_local_multicast(daddr) &&
2168                                 IN_DEV_MFORWARD(in_dev))
2169 #endif
2170                             ) {
2171                                 rcu_read_unlock();
2172                                 return ip_route_input_mc(skb, daddr, saddr,
2173                                                          tos, dev, our);
2174                         }
2175                 }
2176                 rcu_read_unlock();
2177                 return -EINVAL;
2178         }
2179         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2180 }
2181
2182 static int __mkroute_output(struct rtable **result,
2183                             struct fib_result *res,
2184                             const struct flowi *fl,
2185                             const struct flowi *oldflp,
2186                             struct net_device *dev_out,
2187                             unsigned flags)
2188 {
2189         struct rtable *rth;
2190         struct in_device *in_dev;
2191         u32 tos = RT_FL_TOS(oldflp);
2192         int err = 0;
2193
2194         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2195                 return -EINVAL;
2196
2197         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2198                 res->type = RTN_BROADCAST;
2199         else if (ipv4_is_multicast(fl->fl4_dst))
2200                 res->type = RTN_MULTICAST;
2201         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2202                 return -EINVAL;
2203
2204         if (dev_out->flags & IFF_LOOPBACK)
2205                 flags |= RTCF_LOCAL;
2206
2207         /* get work reference to inet device */
2208         in_dev = in_dev_get(dev_out);
2209         if (!in_dev)
2210                 return -EINVAL;
2211
2212         if (res->type == RTN_BROADCAST) {
2213                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2214                 if (res->fi) {
2215                         fib_info_put(res->fi);
2216                         res->fi = NULL;
2217                 }
2218         } else if (res->type == RTN_MULTICAST) {
2219                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2220                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2221                                  oldflp->proto))
2222                         flags &= ~RTCF_LOCAL;
2223                 /* If multicast route do not exist use
2224                    default one, but do not gateway in this case.
2225                    Yes, it is hack.
2226                  */
2227                 if (res->fi && res->prefixlen < 4) {
2228                         fib_info_put(res->fi);
2229                         res->fi = NULL;
2230                 }
2231         }
2232
2233
2234         rth = dst_alloc(&ipv4_dst_ops);
2235         if (!rth) {
2236                 err = -ENOBUFS;
2237                 goto cleanup;
2238         }
2239
2240         atomic_set(&rth->u.dst.__refcnt, 1);
2241         rth->u.dst.flags= DST_HOST;
2242         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2243                 rth->u.dst.flags |= DST_NOXFRM;
2244         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2245                 rth->u.dst.flags |= DST_NOPOLICY;
2246
2247         rth->fl.fl4_dst = oldflp->fl4_dst;
2248         rth->fl.fl4_tos = tos;
2249         rth->fl.fl4_src = oldflp->fl4_src;
2250         rth->fl.oif     = oldflp->oif;
2251         rth->fl.mark    = oldflp->mark;
2252         rth->rt_dst     = fl->fl4_dst;
2253         rth->rt_src     = fl->fl4_src;
2254         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2255         /* get references to the devices that are to be hold by the routing
2256            cache entry */
2257         rth->u.dst.dev  = dev_out;
2258         dev_hold(dev_out);
2259         rth->idev       = in_dev_get(dev_out);
2260         rth->rt_gateway = fl->fl4_dst;
2261         rth->rt_spec_dst= fl->fl4_src;
2262
2263         rth->u.dst.output=ip_output;
2264         rth->rt_genid = rt_genid(dev_net(dev_out));
2265
2266         RT_CACHE_STAT_INC(out_slow_tot);
2267
2268         if (flags & RTCF_LOCAL) {
2269                 rth->u.dst.input = ip_local_deliver;
2270                 rth->rt_spec_dst = fl->fl4_dst;
2271         }
2272         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2273                 rth->rt_spec_dst = fl->fl4_src;
2274                 if (flags & RTCF_LOCAL &&
2275                     !(dev_out->flags & IFF_LOOPBACK)) {
2276                         rth->u.dst.output = ip_mc_output;
2277                         RT_CACHE_STAT_INC(out_slow_mc);
2278                 }
2279 #ifdef CONFIG_IP_MROUTE
2280                 if (res->type == RTN_MULTICAST) {
2281                         if (IN_DEV_MFORWARD(in_dev) &&
2282                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2283                                 rth->u.dst.input = ip_mr_input;
2284                                 rth->u.dst.output = ip_mc_output;
2285                         }
2286                 }
2287 #endif
2288         }
2289
2290         rt_set_nexthop(rth, res, 0);
2291
2292         rth->rt_flags = flags;
2293
2294         *result = rth;
2295  cleanup:
2296         /* release work reference to inet device */
2297         in_dev_put(in_dev);
2298
2299         return err;
2300 }
2301
2302 static int ip_mkroute_output(struct rtable **rp,
2303                              struct fib_result *res,
2304                              const struct flowi *fl,
2305                              const struct flowi *oldflp,
2306                              struct net_device *dev_out,
2307                              unsigned flags)
2308 {
2309         struct rtable *rth = NULL;
2310         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2311         unsigned hash;
2312         if (err == 0) {
2313                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2314                                rt_genid(dev_net(dev_out)));
2315                 err = rt_intern_hash(hash, rth, rp);
2316         }
2317
2318         return err;
2319 }
2320
2321 /*
2322  * Major route resolver routine.
2323  */
2324
2325 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2326                                 const struct flowi *oldflp)
2327 {
2328         u32 tos = RT_FL_TOS(oldflp);
2329         struct flowi fl = { .nl_u = { .ip4_u =
2330                                       { .daddr = oldflp->fl4_dst,
2331                                         .saddr = oldflp->fl4_src,
2332                                         .tos = tos & IPTOS_RT_MASK,
2333                                         .scope = ((tos & RTO_ONLINK) ?
2334                                                   RT_SCOPE_LINK :
2335                                                   RT_SCOPE_UNIVERSE),
2336                                       } },
2337                             .mark = oldflp->mark,
2338                             .iif = net->loopback_dev->ifindex,
2339                             .oif = oldflp->oif };
2340         struct fib_result res;
2341         unsigned flags = 0;
2342         struct net_device *dev_out = NULL;
2343         int free_res = 0;
2344         int err;
2345
2346
2347         res.fi          = NULL;
2348 #ifdef CONFIG_IP_MULTIPLE_TABLES
2349         res.r           = NULL;
2350 #endif
2351
2352         if (oldflp->fl4_src) {
2353                 err = -EINVAL;
2354                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2355                     ipv4_is_lbcast(oldflp->fl4_src) ||
2356                     ipv4_is_zeronet(oldflp->fl4_src))
2357                         goto out;
2358
2359                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2360                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2361                 if (dev_out == NULL)
2362                         goto out;
2363
2364                 /* I removed check for oif == dev_out->oif here.
2365                    It was wrong for two reasons:
2366                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2367                       is assigned to multiple interfaces.
2368                    2. Moreover, we are allowed to send packets with saddr
2369                       of another iface. --ANK
2370                  */
2371
2372                 if (oldflp->oif == 0
2373                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2374                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2375                         /* Special hack: user can direct multicasts
2376                            and limited broadcast via necessary interface
2377                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2378                            This hack is not just for fun, it allows
2379                            vic,vat and friends to work.
2380                            They bind socket to loopback, set ttl to zero
2381                            and expect that it will work.
2382                            From the viewpoint of routing cache they are broken,
2383                            because we are not allowed to build multicast path
2384                            with loopback source addr (look, routing cache
2385                            cannot know, that ttl is zero, so that packet
2386                            will not leave this host and route is valid).
2387                            Luckily, this hack is good workaround.
2388                          */
2389
2390                         fl.oif = dev_out->ifindex;
2391                         goto make_route;
2392                 }
2393                 if (dev_out)
2394                         dev_put(dev_out);
2395                 dev_out = NULL;
2396         }
2397
2398
2399         if (oldflp->oif) {
2400                 dev_out = dev_get_by_index(net, oldflp->oif);
2401                 err = -ENODEV;
2402                 if (dev_out == NULL)
2403                         goto out;
2404
2405                 /* RACE: Check return value of inet_select_addr instead. */
2406                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2407                         dev_put(dev_out);
2408                         goto out;       /* Wrong error code */
2409                 }
2410
2411                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2412                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2413                         if (!fl.fl4_src)
2414                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2415                                                               RT_SCOPE_LINK);
2416                         goto make_route;
2417                 }
2418                 if (!fl.fl4_src) {
2419                         if (ipv4_is_multicast(oldflp->fl4_dst))
2420                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2421                                                               fl.fl4_scope);
2422                         else if (!oldflp->fl4_dst)
2423                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2424                                                               RT_SCOPE_HOST);
2425                 }
2426         }
2427
2428         if (!fl.fl4_dst) {
2429                 fl.fl4_dst = fl.fl4_src;
2430                 if (!fl.fl4_dst)
2431                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2432                 if (dev_out)
2433                         dev_put(dev_out);
2434                 dev_out = net->loopback_dev;
2435                 dev_hold(dev_out);
2436                 fl.oif = net->loopback_dev->ifindex;
2437                 res.type = RTN_LOCAL;
2438                 flags |= RTCF_LOCAL;
2439                 goto make_route;
2440         }
2441
2442         if (fib_lookup(net, &fl, &res)) {
2443                 res.fi = NULL;
2444                 if (oldflp->oif) {
2445                         /* Apparently, routing tables are wrong. Assume,
2446                            that the destination is on link.
2447
2448                            WHY? DW.
2449                            Because we are allowed to send to iface
2450                            even if it has NO routes and NO assigned
2451                            addresses. When oif is specified, routing
2452                            tables are looked up with only one purpose:
2453                            to catch if destination is gatewayed, rather than
2454                            direct. Moreover, if MSG_DONTROUTE is set,
2455                            we send packet, ignoring both routing tables
2456                            and ifaddr state. --ANK
2457
2458
2459                            We could make it even if oif is unknown,
2460                            likely IPv6, but we do not.
2461                          */
2462
2463                         if (fl.fl4_src == 0)
2464                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2465                                                               RT_SCOPE_LINK);
2466                         res.type = RTN_UNICAST;
2467                         goto make_route;
2468                 }
2469                 if (dev_out)
2470                         dev_put(dev_out);
2471                 err = -ENETUNREACH;
2472                 goto out;
2473         }
2474         free_res = 1;
2475
2476         if (res.type == RTN_LOCAL) {
2477                 if (!fl.fl4_src)
2478                         fl.fl4_src = fl.fl4_dst;
2479                 if (dev_out)
2480                         dev_put(dev_out);
2481                 dev_out = net->loopback_dev;
2482                 dev_hold(dev_out);
2483                 fl.oif = dev_out->ifindex;
2484                 if (res.fi)
2485                         fib_info_put(res.fi);
2486                 res.fi = NULL;
2487                 flags |= RTCF_LOCAL;
2488                 goto make_route;
2489         }
2490
2491 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2492         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2493                 fib_select_multipath(&fl, &res);
2494         else
2495 #endif
2496         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2497                 fib_select_default(net, &fl, &res);
2498
2499         if (!fl.fl4_src)
2500                 fl.fl4_src = FIB_RES_PREFSRC(res);
2501
2502         if (dev_out)
2503                 dev_put(dev_out);
2504         dev_out = FIB_RES_DEV(res);
2505         dev_hold(dev_out);
2506         fl.oif = dev_out->ifindex;
2507
2508
2509 make_route:
2510         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2511
2512
2513         if (free_res)
2514                 fib_res_put(&res);
2515         if (dev_out)
2516                 dev_put(dev_out);
2517 out:    return err;
2518 }
2519
2520 int __ip_route_output_key(struct net *net, struct rtable **rp,
2521                           const struct flowi *flp)
2522 {
2523         unsigned hash;
2524         struct rtable *rth;
2525
2526         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2527
2528         rcu_read_lock_bh();
2529         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2530                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2531                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2532                     rth->fl.fl4_src == flp->fl4_src &&
2533                     rth->fl.iif == 0 &&
2534                     rth->fl.oif == flp->oif &&
2535                     rth->fl.mark == flp->mark &&
2536                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2537                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2538                     net_eq(dev_net(rth->u.dst.dev), net) &&
2539                     !rt_is_expired(rth)) {
2540                         dst_use(&rth->u.dst, jiffies);
2541                         RT_CACHE_STAT_INC(out_hit);
2542                         rcu_read_unlock_bh();
2543                         *rp = rth;
2544                         return 0;
2545                 }
2546                 RT_CACHE_STAT_INC(out_hlist_search);
2547         }
2548         rcu_read_unlock_bh();
2549
2550         return ip_route_output_slow(net, rp, flp);
2551 }
2552
2553 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2554
2555 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2556 {
2557 }
2558
2559 static struct dst_ops ipv4_dst_blackhole_ops = {
2560         .family                 =       AF_INET,
2561         .protocol               =       __constant_htons(ETH_P_IP),
2562         .destroy                =       ipv4_dst_destroy,
2563         .check                  =       ipv4_dst_check,
2564         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2565         .entry_size             =       sizeof(struct rtable),
2566         .entries                =       ATOMIC_INIT(0),
2567 };
2568
2569
2570 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2571 {
2572         struct rtable *ort = *rp;
2573         struct rtable *rt = (struct rtable *)
2574                 dst_alloc(&ipv4_dst_blackhole_ops);
2575
2576         if (rt) {
2577                 struct dst_entry *new = &rt->u.dst;
2578
2579                 atomic_set(&new->__refcnt, 1);
2580                 new->__use = 1;
2581                 new->input = dst_discard;
2582                 new->output = dst_discard;
2583                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2584
2585                 new->dev = ort->u.dst.dev;
2586                 if (new->dev)
2587                         dev_hold(new->dev);
2588
2589                 rt->fl = ort->fl;
2590
2591                 rt->idev = ort->idev;
2592                 if (rt->idev)
2593                         in_dev_hold(rt->idev);
2594                 rt->rt_genid = rt_genid(net);
2595                 rt->rt_flags = ort->rt_flags;
2596                 rt->rt_type = ort->rt_type;
2597                 rt->rt_dst = ort->rt_dst;
2598                 rt->rt_src = ort->rt_src;
2599                 rt->rt_iif = ort->rt_iif;
2600                 rt->rt_gateway = ort->rt_gateway;
2601                 rt->rt_spec_dst = ort->rt_spec_dst;
2602                 rt->peer = ort->peer;
2603                 if (rt->peer)
2604                         atomic_inc(&rt->peer->refcnt);
2605
2606                 dst_free(new);
2607         }
2608
2609         dst_release(&(*rp)->u.dst);
2610         *rp = rt;
2611         return (rt ? 0 : -ENOMEM);
2612 }
2613
2614 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2615                          struct sock *sk, int flags)
2616 {
2617         int err;
2618
2619         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2620                 return err;
2621
2622         if (flp->proto) {
2623                 if (!flp->fl4_src)
2624                         flp->fl4_src = (*rp)->rt_src;
2625                 if (!flp->fl4_dst)
2626                         flp->fl4_dst = (*rp)->rt_dst;
2627                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2628                                     flags ? XFRM_LOOKUP_WAIT : 0);
2629                 if (err == -EREMOTE)
2630                         err = ipv4_dst_blackhole(net, rp, flp);
2631
2632                 return err;
2633         }
2634
2635         return 0;
2636 }
2637
2638 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2639
2640 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2641 {
2642         return ip_route_output_flow(net, rp, flp, NULL, 0);
2643 }
2644
2645 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2646                         int nowait, unsigned int flags)
2647 {
2648         struct rtable *rt = skb->rtable;
2649         struct rtmsg *r;
2650         struct nlmsghdr *nlh;
2651         long expires;
2652         u32 id = 0, ts = 0, tsage = 0, error;
2653
2654         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2655         if (nlh == NULL)
2656                 return -EMSGSIZE;
2657
2658         r = nlmsg_data(nlh);
2659         r->rtm_family    = AF_INET;
2660         r->rtm_dst_len  = 32;
2661         r->rtm_src_len  = 0;
2662         r->rtm_tos      = rt->fl.fl4_tos;
2663         r->rtm_table    = RT_TABLE_MAIN;
2664         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2665         r->rtm_type     = rt->rt_type;
2666         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2667         r->rtm_protocol = RTPROT_UNSPEC;
2668         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2669         if (rt->rt_flags & RTCF_NOTIFY)
2670                 r->rtm_flags |= RTM_F_NOTIFY;
2671
2672         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2673
2674         if (rt->fl.fl4_src) {
2675                 r->rtm_src_len = 32;
2676                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2677         }
2678         if (rt->u.dst.dev)
2679                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2680 #ifdef CONFIG_NET_CLS_ROUTE
2681         if (rt->u.dst.tclassid)
2682                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2683 #endif
2684         if (rt->fl.iif)
2685                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2686         else if (rt->rt_src != rt->fl.fl4_src)
2687                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2688
2689         if (rt->rt_dst != rt->rt_gateway)
2690                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2691
2692         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2693                 goto nla_put_failure;
2694
2695         error = rt->u.dst.error;
2696         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2697         if (rt->peer) {
2698                 id = rt->peer->ip_id_count;
2699                 if (rt->peer->tcp_ts_stamp) {
2700                         ts = rt->peer->tcp_ts;
2701                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2702                 }
2703         }
2704
2705         if (rt->fl.iif) {
2706 #ifdef CONFIG_IP_MROUTE
2707                 __be32 dst = rt->rt_dst;
2708
2709                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2710                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2711                         int err = ipmr_get_route(skb, r, nowait);
2712                         if (err <= 0) {
2713                                 if (!nowait) {
2714                                         if (err == 0)
2715                                                 return 0;
2716                                         goto nla_put_failure;
2717                                 } else {
2718                                         if (err == -EMSGSIZE)
2719                                                 goto nla_put_failure;
2720                                         error = err;
2721                                 }
2722                         }
2723                 } else
2724 #endif
2725                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2726         }
2727
2728         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2729                                expires, error) < 0)
2730                 goto nla_put_failure;
2731
2732         return nlmsg_end(skb, nlh);
2733
2734 nla_put_failure:
2735         nlmsg_cancel(skb, nlh);
2736         return -EMSGSIZE;
2737 }
2738
2739 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2740 {
2741         struct net *net = sock_net(in_skb->sk);
2742         struct rtmsg *rtm;
2743         struct nlattr *tb[RTA_MAX+1];
2744         struct rtable *rt = NULL;
2745         __be32 dst = 0;
2746         __be32 src = 0;
2747         u32 iif;
2748         int err;
2749         struct sk_buff *skb;
2750
2751         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2752         if (err < 0)
2753                 goto errout;
2754
2755         rtm = nlmsg_data(nlh);
2756
2757         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2758         if (skb == NULL) {
2759                 err = -ENOBUFS;
2760                 goto errout;
2761         }
2762
2763         /* Reserve room for dummy headers, this skb can pass
2764            through good chunk of routing engine.
2765          */
2766         skb_reset_mac_header(skb);
2767         skb_reset_network_header(skb);
2768
2769         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2770         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2771         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2772
2773         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2774         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2775         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2776
2777         if (iif) {
2778                 struct net_device *dev;
2779
2780                 dev = __dev_get_by_index(net, iif);
2781                 if (dev == NULL) {
2782                         err = -ENODEV;
2783                         goto errout_free;
2784                 }
2785
2786                 skb->protocol   = htons(ETH_P_IP);
2787                 skb->dev        = dev;
2788                 local_bh_disable();
2789                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2790                 local_bh_enable();
2791
2792                 rt = skb->rtable;
2793                 if (err == 0 && rt->u.dst.error)
2794                         err = -rt->u.dst.error;
2795         } else {
2796                 struct flowi fl = {
2797                         .nl_u = {
2798                                 .ip4_u = {
2799                                         .daddr = dst,
2800                                         .saddr = src,
2801                                         .tos = rtm->rtm_tos,
2802                                 },
2803                         },
2804                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2805                 };
2806                 err = ip_route_output_key(net, &rt, &fl);
2807         }
2808
2809         if (err)
2810                 goto errout_free;
2811
2812         skb->rtable = rt;
2813         if (rtm->rtm_flags & RTM_F_NOTIFY)
2814                 rt->rt_flags |= RTCF_NOTIFY;
2815
2816         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2817                            RTM_NEWROUTE, 0, 0);
2818         if (err <= 0)
2819                 goto errout_free;
2820
2821         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2822 errout:
2823         return err;
2824
2825 errout_free:
2826         kfree_skb(skb);
2827         goto errout;
2828 }
2829
2830 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2831 {
2832         struct rtable *rt;
2833         int h, s_h;
2834         int idx, s_idx;
2835         struct net *net;
2836
2837         net = sock_net(skb->sk);
2838
2839         s_h = cb->args[0];
2840         if (s_h < 0)
2841                 s_h = 0;
2842         s_idx = idx = cb->args[1];
2843         for (h = s_h; h <= rt_hash_mask; h++) {
2844                 rcu_read_lock_bh();
2845                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2846                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2847                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2848                                 continue;
2849                         if (rt_is_expired(rt))
2850                                 continue;
2851                         skb->dst = dst_clone(&rt->u.dst);
2852                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2853                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2854                                          1, NLM_F_MULTI) <= 0) {
2855                                 dst_release(xchg(&skb->dst, NULL));
2856                                 rcu_read_unlock_bh();
2857                                 goto done;
2858                         }
2859                         dst_release(xchg(&skb->dst, NULL));
2860                 }
2861                 rcu_read_unlock_bh();
2862                 s_idx = 0;
2863         }
2864
2865 done:
2866         cb->args[0] = h;
2867         cb->args[1] = idx;
2868         return skb->len;
2869 }
2870
2871 void ip_rt_multicast_event(struct in_device *in_dev)
2872 {
2873         rt_cache_flush(dev_net(in_dev->dev), 0);
2874 }
2875
2876 #ifdef CONFIG_SYSCTL
2877 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2878                                         struct file *filp, void __user *buffer,
2879                                         size_t *lenp, loff_t *ppos)
2880 {
2881         if (write) {
2882                 int flush_delay;
2883                 ctl_table ctl;
2884                 struct net *net;
2885
2886                 memcpy(&ctl, __ctl, sizeof(ctl));
2887                 ctl.data = &flush_delay;
2888                 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
2889
2890                 net = (struct net *)__ctl->extra1;
2891                 rt_cache_flush(net, flush_delay);
2892                 return 0;
2893         }
2894
2895         return -EINVAL;
2896 }
2897
2898 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2899                                                 int __user *name,
2900                                                 int nlen,
2901                                                 void __user *oldval,
2902                                                 size_t __user *oldlenp,
2903                                                 void __user *newval,
2904                                                 size_t newlen)
2905 {
2906         int delay;
2907         struct net *net;
2908         if (newlen != sizeof(int))
2909                 return -EINVAL;
2910         if (get_user(delay, (int __user *)newval))
2911                 return -EFAULT;
2912         net = (struct net *)table->extra1;
2913         rt_cache_flush(net, delay);
2914         return 0;
2915 }
2916
2917 static ctl_table ipv4_route_table[] = {
2918         {
2919                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2920                 .procname       = "gc_thresh",
2921                 .data           = &ipv4_dst_ops.gc_thresh,
2922                 .maxlen         = sizeof(int),
2923                 .mode           = 0644,
2924                 .proc_handler   = &proc_dointvec,
2925         },
2926         {
2927                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2928                 .procname       = "max_size",
2929                 .data           = &ip_rt_max_size,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = &proc_dointvec,
2933         },
2934         {
2935                 /*  Deprecated. Use gc_min_interval_ms */
2936
2937                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2938                 .procname       = "gc_min_interval",
2939                 .data           = &ip_rt_gc_min_interval,
2940                 .maxlen         = sizeof(int),
2941                 .mode           = 0644,
2942                 .proc_handler   = &proc_dointvec_jiffies,
2943                 .strategy       = &sysctl_jiffies,
2944         },
2945         {
2946                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2947                 .procname       = "gc_min_interval_ms",
2948                 .data           = &ip_rt_gc_min_interval,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = &proc_dointvec_ms_jiffies,
2952                 .strategy       = &sysctl_ms_jiffies,
2953         },
2954         {
2955                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2956                 .procname       = "gc_timeout",
2957                 .data           = &ip_rt_gc_timeout,
2958                 .maxlen         = sizeof(int),
2959                 .mode           = 0644,
2960                 .proc_handler   = &proc_dointvec_jiffies,
2961                 .strategy       = &sysctl_jiffies,
2962         },
2963         {
2964                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2965                 .procname       = "gc_interval",
2966                 .data           = &ip_rt_gc_interval,
2967                 .maxlen         = sizeof(int),
2968                 .mode           = 0644,
2969                 .proc_handler   = &proc_dointvec_jiffies,
2970                 .strategy       = &sysctl_jiffies,
2971         },
2972         {
2973                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2974                 .procname       = "redirect_load",
2975                 .data           = &ip_rt_redirect_load,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = &proc_dointvec,
2979         },
2980         {
2981                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2982                 .procname       = "redirect_number",
2983                 .data           = &ip_rt_redirect_number,
2984                 .maxlen         = sizeof(int),
2985                 .mode           = 0644,
2986                 .proc_handler   = &proc_dointvec,
2987         },
2988         {
2989                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2990                 .procname       = "redirect_silence",
2991                 .data           = &ip_rt_redirect_silence,
2992                 .maxlen         = sizeof(int),
2993                 .mode           = 0644,
2994                 .proc_handler   = &proc_dointvec,
2995         },
2996         {
2997                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2998                 .procname       = "error_cost",
2999                 .data           = &ip_rt_error_cost,
3000                 .maxlen         = sizeof(int),
3001                 .mode           = 0644,
3002                 .proc_handler   = &proc_dointvec,
3003         },
3004         {
3005                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3006                 .procname       = "error_burst",
3007                 .data           = &ip_rt_error_burst,
3008                 .maxlen         = sizeof(int),
3009                 .mode           = 0644,
3010                 .proc_handler   = &proc_dointvec,
3011         },
3012         {
3013                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3014                 .procname       = "gc_elasticity",
3015                 .data           = &ip_rt_gc_elasticity,
3016                 .maxlen         = sizeof(int),
3017                 .mode           = 0644,
3018                 .proc_handler   = &proc_dointvec,
3019         },
3020         {
3021                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3022                 .procname       = "mtu_expires",
3023                 .data           = &ip_rt_mtu_expires,
3024                 .maxlen         = sizeof(int),
3025                 .mode           = 0644,
3026                 .proc_handler   = &proc_dointvec_jiffies,
3027                 .strategy       = &sysctl_jiffies,
3028         },
3029         {
3030                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3031                 .procname       = "min_pmtu",
3032                 .data           = &ip_rt_min_pmtu,
3033                 .maxlen         = sizeof(int),
3034                 .mode           = 0644,
3035                 .proc_handler   = &proc_dointvec,
3036         },
3037         {
3038                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3039                 .procname       = "min_adv_mss",
3040                 .data           = &ip_rt_min_advmss,
3041                 .maxlen         = sizeof(int),
3042                 .mode           = 0644,
3043                 .proc_handler   = &proc_dointvec,
3044         },
3045         {
3046                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3047                 .procname       = "secret_interval",
3048                 .data           = &ip_rt_secret_interval,
3049                 .maxlen         = sizeof(int),
3050                 .mode           = 0644,
3051                 .proc_handler   = &proc_dointvec_jiffies,
3052                 .strategy       = &sysctl_jiffies,
3053         },
3054         { .ctl_name = 0 }
3055 };
3056
3057 static __net_initdata struct ctl_path ipv4_route_path[] = {
3058         { .procname = "net", .ctl_name = CTL_NET, },
3059         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3060         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3061         { },
3062 };
3063
3064
3065 static struct ctl_table ipv4_route_flush_table[] = {
3066         {
3067                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3068                 .procname       = "flush",
3069                 .maxlen         = sizeof(int),
3070                 .mode           = 0200,
3071                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
3072                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
3073         },
3074         { .ctl_name = 0 },
3075 };
3076
3077 static __net_init int sysctl_route_net_init(struct net *net)
3078 {
3079         struct ctl_table *tbl;
3080
3081         tbl = ipv4_route_flush_table;
3082         if (net != &init_net) {
3083                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3084                 if (tbl == NULL)
3085                         goto err_dup;
3086         }
3087         tbl[0].extra1 = net;
3088
3089         net->ipv4.route_hdr =
3090                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3091         if (net->ipv4.route_hdr == NULL)
3092                 goto err_reg;
3093         return 0;
3094
3095 err_reg:
3096         if (tbl != ipv4_route_flush_table)
3097                 kfree(tbl);
3098 err_dup:
3099         return -ENOMEM;
3100 }
3101
3102 static __net_exit void sysctl_route_net_exit(struct net *net)
3103 {
3104         struct ctl_table *tbl;
3105
3106         tbl = net->ipv4.route_hdr->ctl_table_arg;
3107         unregister_net_sysctl_table(net->ipv4.route_hdr);
3108         BUG_ON(tbl == ipv4_route_flush_table);
3109         kfree(tbl);
3110 }
3111
3112 static __net_initdata struct pernet_operations sysctl_route_ops = {
3113         .init = sysctl_route_net_init,
3114         .exit = sysctl_route_net_exit,
3115 };
3116 #endif
3117
3118
3119 static __net_init int rt_secret_timer_init(struct net *net)
3120 {
3121         atomic_set(&net->ipv4.rt_genid,
3122                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3123                         (jiffies ^ (jiffies >> 7))));
3124
3125         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3126         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3127         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3128
3129         net->ipv4.rt_secret_timer.expires =
3130                 jiffies + net_random() % ip_rt_secret_interval +
3131                 ip_rt_secret_interval;
3132         add_timer(&net->ipv4.rt_secret_timer);
3133         return 0;
3134 }
3135
3136 static __net_exit void rt_secret_timer_exit(struct net *net)
3137 {
3138         del_timer_sync(&net->ipv4.rt_secret_timer);
3139 }
3140
3141 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3142         .init = rt_secret_timer_init,
3143         .exit = rt_secret_timer_exit,
3144 };
3145
3146
#ifdef CONFIG_NET_CLS_ROUTE
/* Accounting area, allocated per-cpu with 256 entries in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif
3150
3151 static __initdata unsigned long rhash_entries;
3152 static int __init set_rhash_entries(char *str)
3153 {
3154         if (!str)
3155                 return 0;
3156         rhash_entries = simple_strtoul(str, &str, 0);
3157         return 1;
3158 }
3159 __setup("rhash_entries=", set_rhash_entries);
3160
3161 int __init ip_rt_init(void)
3162 {
3163         int rc = 0;
3164
3165 #ifdef CONFIG_NET_CLS_ROUTE
3166         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3167         if (!ip_rt_acct)
3168                 panic("IP: failed to allocate ip_rt_acct\n");
3169 #endif
3170
3171         ipv4_dst_ops.kmem_cachep =
3172                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3173                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3174
3175         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3176
3177         rt_hash_table = (struct rt_hash_bucket *)
3178                 alloc_large_system_hash("IP route cache",
3179                                         sizeof(struct rt_hash_bucket),
3180                                         rhash_entries,
3181                                         (num_physpages >= 128 * 1024) ?
3182                                         15 : 17,
3183                                         0,
3184                                         &rt_hash_log,
3185                                         &rt_hash_mask,
3186                                         0);
3187         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3188         rt_hash_lock_init();
3189
3190         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3191         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3192
3193         devinet_init();
3194         ip_fib_init();
3195
3196         /* All the timers, started at system startup tend
3197            to synchronize. Perturb it a bit.
3198          */
3199         schedule_delayed_work(&expires_work,
3200                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3201
3202         if (register_pernet_subsys(&rt_secret_timer_ops))
3203                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3204
3205         if (ip_rt_proc_init())
3206                 printk(KERN_ERR "Unable to create route proc files\n");
3207 #ifdef CONFIG_XFRM
3208         xfrm_init();
3209         xfrm4_init();
3210 #endif
3211         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3212
3213 #ifdef CONFIG_SYSCTL
3214         register_pernet_subsys(&sysctl_route_ops);
3215 #endif
3216         return rc;
3217 }
3218
3219 /*
3220  * We really need to sanitize the damn ipv4 init order, then all
3221  * this nonsense will go away.
3222  */
3223 void __init ip_static_sysctl_init(void)
3224 {
3225         register_sysctl_paths(ipv4_route_path, ipv4_route_table);
3226 }
3227
3228 EXPORT_SYMBOL(__ip_select_ident);
3229 EXPORT_SYMBOL(ip_route_input);
3230 EXPORT_SYMBOL(ip_route_output_key);