2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
100 #include <net/ip_fib.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
108 #include <linux/sysctl.h>
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly = 8;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
135 static struct timer_list rt_secret_timer;
138 * Interface to generic destination cache.
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void ipv4_dst_destroy(struct dst_entry *dst);
143 static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
151 static struct dst_ops ipv4_dst_ops = {
153 .protocol = __constant_htons(ETH_P_IP),
154 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check,
156 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice,
159 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out,
162 .entry_size = sizeof(struct rtable),
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
172 ECN_OR_COST(BESTEFFORT),
178 ECN_OR_COST(INTERACTIVE),
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
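/*
 * Illustrative sketch (not part of the original source) of the reader and
 * writer patterns described above, as they appear later in this file in
 * ip_route_input() and rt_check_expire().  keys_match() stands in for the
 * real key comparison and is hypothetical.  Readers walk a bucket under
 * rcu_read_lock_bh() and take a reference on a match; writers unlink under
 * the per-bucket spinlock, and the actual free is deferred past the RCU
 * grace period by rt_free():
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (keys_match(rth))
 *			dst_use(&rth->u.dst, jiffies);
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */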
202 struct rt_hash_bucket {
203 struct rtable *chain;
205 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
206 defined(CONFIG_PROVE_LOCKING)
208 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
209 * The size of this table is a power of two and depends on the number of CPUs.
210 * (with lockdep we have a quite big spinlock_t, so keep the size down there)
212 #ifdef CONFIG_LOCKDEP
213 # define RT_HASH_LOCK_SZ 256
216 # define RT_HASH_LOCK_SZ 4096
218 # define RT_HASH_LOCK_SZ 2048
220 # define RT_HASH_LOCK_SZ 1024
222 # define RT_HASH_LOCK_SZ 512
224 # define RT_HASH_LOCK_SZ 256
228 static spinlock_t *rt_hash_locks;
229 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231 static __init void rt_hash_lock_init(void)
235 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 panic("IP: failed to allocate rt_hash_locks\n");
240 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
241 spin_lock_init(&rt_hash_locks[i]);
244 # define rt_hash_lock_addr(slot) NULL
246 static inline void rt_hash_lock_init(void)
251 static struct rt_hash_bucket *rt_hash_table __read_mostly;
252 static unsigned rt_hash_mask __read_mostly;
253 static unsigned int rt_hash_log __read_mostly;
254 static atomic_t rt_genid __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
262 return jhash_3words((__force u32)(__be32)(daddr),
263 (__force u32)(__be32)(saddr),
264 idx, atomic_read(&rt_genid))
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270 struct seq_net_private p;
275 static struct rtable *rt_cache_get_first(struct seq_file *seq)
277 struct rt_cache_iter_state *st = seq->private;
278 struct rtable *r = NULL;
280 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
282 r = rcu_dereference(rt_hash_table[st->bucket].chain);
284 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
285 r->rt_genid == st->genid)
287 r = rcu_dereference(r->u.dst.rt_next);
289 rcu_read_unlock_bh();
294 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
297 struct rt_cache_iter_state *st = seq->private;
298 r = r->u.dst.rt_next;
300 rcu_read_unlock_bh();
301 if (--st->bucket < 0)
304 r = rt_hash_table[st->bucket].chain;
306 return rcu_dereference(r);
309 static struct rtable *rt_cache_get_next(struct seq_file *seq,
312 struct rt_cache_iter_state *st = seq->private;
313 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
314 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
316 if (r->rt_genid == st->genid)
322 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
324 struct rtable *r = rt_cache_get_first(seq);
327 while (pos && (r = rt_cache_get_next(seq, r)))
329 return pos ? NULL : r;
332 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
334 struct rt_cache_iter_state *st = seq->private;
336 return rt_cache_get_idx(seq, *pos - 1);
337 st->genid = atomic_read(&rt_genid);
338 return SEQ_START_TOKEN;
341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
345 if (v == SEQ_START_TOKEN)
346 r = rt_cache_get_first(seq);
348 r = rt_cache_get_next(seq, v);
353 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
355 if (v && v != SEQ_START_TOKEN)
356 rcu_read_unlock_bh();
359 static int rt_cache_seq_show(struct seq_file *seq, void *v)
361 if (v == SEQ_START_TOKEN)
362 seq_printf(seq, "%-127s\n",
363 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
364 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 struct rtable *r = v;
370 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
371 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
372 r->u.dst.dev ? r->u.dst.dev->name : "*",
373 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
374 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
375 r->u.dst.__use, 0, (unsigned long)r->rt_src,
376 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
377 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
378 dst_metric(&r->u.dst, RTAX_WINDOW),
379 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
380 dst_metric(&r->u.dst, RTAX_RTTVAR)),
382 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
383 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
385 r->rt_spec_dst, &len);
387 seq_printf(seq, "%*s\n", 127 - len, "");
392 static const struct seq_operations rt_cache_seq_ops = {
393 .start = rt_cache_seq_start,
394 .next = rt_cache_seq_next,
395 .stop = rt_cache_seq_stop,
396 .show = rt_cache_seq_show,
399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
401 return seq_open_net(inode, file, &rt_cache_seq_ops,
402 sizeof(struct rt_cache_iter_state));
405 static const struct file_operations rt_cache_seq_fops = {
406 .owner = THIS_MODULE,
407 .open = rt_cache_seq_open,
410 .release = seq_release_net,
414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 return SEQ_START_TOKEN;
421 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
422 if (!cpu_possible(cpu))
425 return &per_cpu(rt_cache_stat, cpu);
430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
434 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
435 if (!cpu_possible(cpu))
438 return &per_cpu(rt_cache_stat, cpu);
444 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
451 struct rt_cache_stat *st = v;
453 if (v == SEQ_START_TOKEN) {
454 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
458 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
459 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
460 atomic_read(&ipv4_dst_ops.entries),
483 static const struct seq_operations rt_cpu_seq_ops = {
484 .start = rt_cpu_seq_start,
485 .next = rt_cpu_seq_next,
486 .stop = rt_cpu_seq_stop,
487 .show = rt_cpu_seq_show,
491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
493 return seq_open(file, &rt_cpu_seq_ops);
496 static const struct file_operations rt_cpu_seq_fops = {
497 .owner = THIS_MODULE,
498 .open = rt_cpu_seq_open,
501 .release = seq_release,
504 #ifdef CONFIG_NET_CLS_ROUTE
505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
506 int length, int *eof, void *data)
510 if ((offset & 3) || (length & 3))
513 if (offset >= sizeof(struct ip_rt_acct) * 256) {
518 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
519 length = sizeof(struct ip_rt_acct) * 256 - offset;
523 offset /= sizeof(u32);
526 u32 *dst = (u32 *) buffer;
529 memset(dst, 0, length);
531 for_each_possible_cpu(i) {
535 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
536 for (j = 0; j < length/4; j++)
544 static int __net_init ip_rt_do_proc_init(struct net *net)
546 struct proc_dir_entry *pde;
548 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553 pde = proc_create("rt_cache", S_IRUGO,
554 net->proc_net_stat, &rt_cpu_seq_fops);
558 #ifdef CONFIG_NET_CLS_ROUTE
559 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
560 ip_rt_acct_read, NULL);
566 #ifdef CONFIG_NET_CLS_ROUTE
568 remove_proc_entry("rt_cache", net->proc_net_stat);
571 remove_proc_entry("rt_cache", net->proc_net);
576 static void __net_exit ip_rt_do_proc_exit(struct net *net)
578 remove_proc_entry("rt_cache", net->proc_net_stat);
579 remove_proc_entry("rt_cache", net->proc_net);
580 remove_proc_entry("rt_acct", net->proc_net);
583 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
584 .init = ip_rt_do_proc_init,
585 .exit = ip_rt_do_proc_exit,
588 static int __init ip_rt_proc_init(void)
590 return register_pernet_subsys(&ip_rt_proc_ops);
594 static inline int ip_rt_proc_init(void)
598 #endif /* CONFIG_PROC_FS */
600 static inline void rt_free(struct rtable *rt)
602 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
605 static inline void rt_drop(struct rtable *rt)
608 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
611 static inline int rt_fast_clean(struct rtable *rth)
613 /* Kill broadcast/multicast entries very aggressively if they
614 collide in the hash table with more useful entries */
615 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
616 rth->fl.iif && rth->u.dst.rt_next;
619 static inline int rt_valuable(struct rtable *rth)
621 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
625 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
630 if (atomic_read(&rth->u.dst.__refcnt))
634 if (rth->u.dst.expires &&
635 time_after_eq(jiffies, rth->u.dst.expires))
638 age = jiffies - rth->u.dst.lastuse;
640 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
641 (age <= tmo2 && rt_valuable(rth)))
647 /* Bits of score are:
649 * 30: not quite useless
650 * 29..0: usage counter
652 static inline u32 rt_score(struct rtable *rt)
654 u32 score = jiffies - rt->u.dst.lastuse;
656 score = ~score & ~(3<<30);
662 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
668 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
670 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
671 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
672 (fl1->mark ^ fl2->mark) |
673 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
674 *(u16 *)&fl2->nl_u.ip4_u.tos) |
675 (fl1->oif ^ fl2->oif) |
676 (fl1->iif ^ fl2->iif)) == 0;
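/*
 * Note (not in the original source): OR-ing together the XORs of all the key
 * fields yields zero only when every field matches, so compare_keys() needs
 * a single test against zero instead of a chain of branches.  The 16-bit
 * load starting at ip4_u.tos presumably also covers the byte that follows
 * the TOS in the flow key (an assumption about the struct flowi layout in
 * this kernel version).
 */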
679 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
681 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
685 * Perform a full scan of the hash table and free all entries.
686 * Can be called by a softirq or a process.
687 * In the latter case, we want to reschedule if necessary
689 static void rt_do_flush(int process_context)
692 struct rtable *rth, *next;
694 for (i = 0; i <= rt_hash_mask; i++) {
695 if (process_context && need_resched())
697 rth = rt_hash_table[i].chain;
701 spin_lock_bh(rt_hash_lock_addr(i));
702 rth = rt_hash_table[i].chain;
703 rt_hash_table[i].chain = NULL;
704 spin_unlock_bh(rt_hash_lock_addr(i));
706 for (; rth; rth = next) {
707 next = rth->u.dst.rt_next;
713 static void rt_check_expire(void)
715 static unsigned int rover;
716 unsigned int i = rover, goal;
717 struct rtable *rth, **rthp;
720 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
721 if (ip_rt_gc_timeout > 1)
722 do_div(mult, ip_rt_gc_timeout);
723 goal = (unsigned int)mult;
724 if (goal > rt_hash_mask)
725 goal = rt_hash_mask + 1;
726 for (; goal > 0; goal--) {
727 unsigned long tmo = ip_rt_gc_timeout;
729 i = (i + 1) & rt_hash_mask;
730 rthp = &rt_hash_table[i].chain;
737 spin_lock_bh(rt_hash_lock_addr(i));
738 while ((rth = *rthp) != NULL) {
739 if (rth->rt_genid != atomic_read(&rt_genid)) {
740 *rthp = rth->u.dst.rt_next;
744 if (rth->u.dst.expires) {
745 /* Entry is expired even if it is in use */
746 if (time_before_eq(jiffies, rth->u.dst.expires)) {
748 rthp = &rth->u.dst.rt_next;
751 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
753 rthp = &rth->u.dst.rt_next;
757 /* Cleanup aged off entries. */
758 *rthp = rth->u.dst.rt_next;
761 spin_unlock_bh(rt_hash_lock_addr(i));
767 * rt_worker_func() is run in process context.
768 * we call rt_check_expire() to scan part of the hash table
770 static void rt_worker_func(struct work_struct *work)
773 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
777 * Perturbation of rt_genid by a small quantity [1..256].
778 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
779 * many times (2^24) without reusing a recent rt_genid.
780 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
782 static void rt_cache_invalidate(void)
784 unsigned char shuffle;
786 get_random_bytes(&shuffle, sizeof(shuffle));
787 atomic_add(shuffle + 1U, &rt_genid);
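/*
 * Sketch (drawn from rt_check_expire() above and rt_intern_hash() below,
 * not a new mechanism): entries created before the genid bump simply stop
 * matching the current generation and are unlinked lazily the next time a
 * writer walks their chain:
 *
 *	if (rth->rt_genid != atomic_read(&rt_genid)) {
 *		*rthp = rth->u.dst.rt_next;
 *		rt_free(rth);
 *		continue;
 *	}
 */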
791 * delay < 0 : invalidate cache (fast : entries will be deleted later)
792 * delay >= 0 : invalidate & flush cache (can be long)
794 void rt_cache_flush(struct net *net, int delay)
796 rt_cache_invalidate();
798 rt_do_flush(!in_softirq());
802 * We change rt_genid and let gc do the cleanup
804 static void rt_secret_rebuild(unsigned long dummy)
806 rt_cache_invalidate();
807 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
811 Short description of GC goals.
813 We want to build an algorithm which keeps the routing cache
814 at some equilibrium point, where the number of aged-off entries
815 is kept approximately equal to the number of newly generated ones.
817 The current expiration strength is the variable "expire".
818 We try to adjust it dynamically, so that when the network
819 is idle expire is large enough to keep enough warm entries,
820 and when load increases it shrinks to limit the cache size.
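/*
 * Worked example of the adaptation above, using the defaults defined near
 * the top of this file (a sketch, not normative): expire starts at
 * RT_GC_TIMEOUT (300*HZ).  Every pass of rt_garbage_collect() that misses
 * its goal halves it (300*HZ -> 150*HZ -> 75*HZ -> ...), so entries are
 * expired ever more aggressively under load.  On the way back, expire grows
 * by ip_rt_gc_min_interval (HZ/2) per invocation and is snapped back to
 * ip_rt_gc_timeout as soon as it overshoots or the cache falls below
 * gc_thresh (see the tail of rt_garbage_collect() below).
 */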
823 static int rt_garbage_collect(struct dst_ops *ops)
825 static unsigned long expire = RT_GC_TIMEOUT;
826 static unsigned long last_gc;
828 static int equilibrium;
829 struct rtable *rth, **rthp;
830 unsigned long now = jiffies;
834 * Garbage collection is pretty expensive,
835 * so do not run it too frequently.
838 RT_CACHE_STAT_INC(gc_total);
840 if (now - last_gc < ip_rt_gc_min_interval &&
841 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842 RT_CACHE_STAT_INC(gc_ignored);
846 /* Calculate number of entries, which we want to expire now. */
847 goal = atomic_read(&ipv4_dst_ops.entries) -
848 (ip_rt_gc_elasticity << rt_hash_log);
850 if (equilibrium < ipv4_dst_ops.gc_thresh)
851 equilibrium = ipv4_dst_ops.gc_thresh;
852 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
854 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
855 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858 /* We are in a dangerous area. Try to reduce the cache really aggressively. */
861 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
862 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
865 if (now - last_gc >= ip_rt_gc_min_interval)
876 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877 unsigned long tmo = expire;
879 k = (k + 1) & rt_hash_mask;
880 rthp = &rt_hash_table[k].chain;
881 spin_lock_bh(rt_hash_lock_addr(k));
882 while ((rth = *rthp) != NULL) {
883 if (rth->rt_genid == atomic_read(&rt_genid) &&
884 !rt_may_expire(rth, tmo, expire)) {
886 rthp = &rth->u.dst.rt_next;
889 *rthp = rth->u.dst.rt_next;
893 spin_unlock_bh(rt_hash_lock_addr(k));
902 /* Goal is not achieved. We stop the process if:
904 - expire has been reduced to zero (otherwise, expire is halved);
905 - the table is not full;
906 - we are called from interrupt context;
907 - the jiffies check is just a fallback/debug loop breaker.
908 We will not spin here for a long time in any case.
911 RT_CACHE_STAT_INC(gc_goal_miss);
917 #if RT_CACHE_DEBUG >= 2
918 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
919 atomic_read(&ipv4_dst_ops.entries), goal, i);
922 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
924 } while (!in_softirq() && time_before_eq(jiffies, now));
926 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
929 printk(KERN_WARNING "dst cache overflow\n");
930 RT_CACHE_STAT_INC(gc_dst_overflow);
934 expire += ip_rt_gc_min_interval;
935 if (expire > ip_rt_gc_timeout ||
936 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
937 expire = ip_rt_gc_timeout;
938 #if RT_CACHE_DEBUG >= 2
939 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
940 atomic_read(&ipv4_dst_ops.entries), goal, rover);
945 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
947 struct rtable *rth, **rthp;
949 struct rtable *cand, **candp;
952 int attempts = !in_softirq();
961 rthp = &rt_hash_table[hash].chain;
963 spin_lock_bh(rt_hash_lock_addr(hash));
964 while ((rth = *rthp) != NULL) {
965 if (rth->rt_genid != atomic_read(&rt_genid)) {
966 *rthp = rth->u.dst.rt_next;
970 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
972 *rthp = rth->u.dst.rt_next;
974 * Since lookup is lockfree, the deletion
975 * must be visible to another weakly ordered CPU before
976 * the insertion at the start of the hash chain.
978 rcu_assign_pointer(rth->u.dst.rt_next,
979 rt_hash_table[hash].chain);
981 * Since lookup is lockfree, the update writes
982 * must be ordered for consistency on SMP.
984 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
986 dst_use(&rth->u.dst, now);
987 spin_unlock_bh(rt_hash_lock_addr(hash));
994 if (!atomic_read(&rth->u.dst.__refcnt)) {
995 u32 score = rt_score(rth);
997 if (score <= min_score) {
1006 rthp = &rth->u.dst.rt_next;
1010 /* ip_rt_gc_elasticity used to be the average chain
1011 * length; when it is exceeded, gc becomes really aggressive.
1013 * The second limit is less certain. At the moment it allows
1014 * only 2 entries per bucket. We will see.
1016 if (chain_length > ip_rt_gc_elasticity) {
1017 *candp = cand->u.dst.rt_next;
1022 /* Try to bind the route to an ARP neighbour only if it is an output
1023 route or on the unicast forwarding path.
1025 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1026 int err = arp_bind_neighbour(&rt->u.dst);
1028 spin_unlock_bh(rt_hash_lock_addr(hash));
1030 if (err != -ENOBUFS) {
1035 /* Neighbour tables are full and nothing
1036 can be released. Try to shrink the route cache;
1037 it most likely holds some neighbour records.
1039 if (attempts-- > 0) {
1040 int saved_elasticity = ip_rt_gc_elasticity;
1041 int saved_int = ip_rt_gc_min_interval;
1042 ip_rt_gc_elasticity = 1;
1043 ip_rt_gc_min_interval = 0;
1044 rt_garbage_collect(&ipv4_dst_ops);
1045 ip_rt_gc_min_interval = saved_int;
1046 ip_rt_gc_elasticity = saved_elasticity;
1050 if (net_ratelimit())
1051 printk(KERN_WARNING "Neighbour table overflow.\n");
1057 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1058 #if RT_CACHE_DEBUG >= 2
1059 if (rt->u.dst.rt_next) {
1061 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1062 NIPQUAD(rt->rt_dst));
1063 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1064 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1068 rt_hash_table[hash].chain = rt;
1069 spin_unlock_bh(rt_hash_lock_addr(hash));
1074 void rt_bind_peer(struct rtable *rt, int create)
1076 static DEFINE_SPINLOCK(rt_peer_lock);
1077 struct inet_peer *peer;
1079 peer = inet_getpeer(rt->rt_dst, create);
1081 spin_lock_bh(&rt_peer_lock);
1082 if (rt->peer == NULL) {
1086 spin_unlock_bh(&rt_peer_lock);
1092 * Peer allocation may fail only in serious out-of-memory conditions. However,
1093 * we can still generate some output.
1094 * Random ID selection looks a bit dangerous because we have no chance of
1095 * selecting an ID that is unique within a reasonable period of time.
1096 * But a broken packet identifier may be better than no packet at all.
1098 static void ip_select_fb_ident(struct iphdr *iph)
1100 static DEFINE_SPINLOCK(ip_fb_id_lock);
1101 static u32 ip_fallback_id;
1104 spin_lock_bh(&ip_fb_id_lock);
1105 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1106 iph->id = htons(salt & 0xFFFF);
1107 ip_fallback_id = salt;
1108 spin_unlock_bh(&ip_fb_id_lock);
1111 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1113 struct rtable *rt = (struct rtable *) dst;
1116 if (rt->peer == NULL)
1117 rt_bind_peer(rt, 1);
1119 /* If a peer is attached to the destination, it is never detached,
1120 so we need not grab a lock to dereference it.
1123 iph->id = htons(inet_getid(rt->peer, more));
1127 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1128 __builtin_return_address(0));
1130 ip_select_fb_ident(iph);
1133 static void rt_del(unsigned hash, struct rtable *rt)
1135 struct rtable **rthp, *aux;
1137 rthp = &rt_hash_table[hash].chain;
1138 spin_lock_bh(rt_hash_lock_addr(hash));
1140 while ((aux = *rthp) != NULL) {
1141 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1142 *rthp = aux->u.dst.rt_next;
1146 rthp = &aux->u.dst.rt_next;
1148 spin_unlock_bh(rt_hash_lock_addr(hash));
1151 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1152 __be32 saddr, struct net_device *dev)
1155 struct in_device *in_dev = in_dev_get(dev);
1156 struct rtable *rth, **rthp;
1157 __be32 skeys[2] = { saddr, 0 };
1158 int ikeys[2] = { dev->ifindex, 0 };
1159 struct netevent_redirect netevent;
1166 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1167 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1168 || ipv4_is_zeronet(new_gw))
1169 goto reject_redirect;
1171 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1172 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1173 goto reject_redirect;
1174 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1175 goto reject_redirect;
1177 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1178 goto reject_redirect;
1181 for (i = 0; i < 2; i++) {
1182 for (k = 0; k < 2; k++) {
1183 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1185 rthp=&rt_hash_table[hash].chain;
1188 while ((rth = rcu_dereference(*rthp)) != NULL) {
1191 if (rth->fl.fl4_dst != daddr ||
1192 rth->fl.fl4_src != skeys[i] ||
1193 rth->fl.oif != ikeys[k] ||
1195 rth->rt_genid != atomic_read(&rt_genid) ||
1196 !net_eq(dev_net(rth->u.dst.dev), net)) {
1197 rthp = &rth->u.dst.rt_next;
1201 if (rth->rt_dst != daddr ||
1202 rth->rt_src != saddr ||
1204 rth->rt_gateway != old_gw ||
1205 rth->u.dst.dev != dev)
1208 dst_hold(&rth->u.dst);
1211 rt = dst_alloc(&ipv4_dst_ops);
1218 /* Copy all the information. */
1220 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1221 rt->u.dst.__use = 1;
1222 atomic_set(&rt->u.dst.__refcnt, 1);
1223 rt->u.dst.child = NULL;
1225 dev_hold(rt->u.dst.dev);
1227 in_dev_hold(rt->idev);
1228 rt->u.dst.obsolete = 0;
1229 rt->u.dst.lastuse = jiffies;
1230 rt->u.dst.path = &rt->u.dst;
1231 rt->u.dst.neighbour = NULL;
1232 rt->u.dst.hh = NULL;
1233 rt->u.dst.xfrm = NULL;
1234 rt->rt_genid = atomic_read(&rt_genid);
1235 rt->rt_flags |= RTCF_REDIRECTED;
1237 /* Gateway is different ... */
1238 rt->rt_gateway = new_gw;
1240 /* Redirect received -> path was valid */
1241 dst_confirm(&rth->u.dst);
1244 atomic_inc(&rt->peer->refcnt);
1246 if (arp_bind_neighbour(&rt->u.dst) ||
1247 !(rt->u.dst.neighbour->nud_state &
1249 if (rt->u.dst.neighbour)
1250 neigh_event_send(rt->u.dst.neighbour, NULL);
1256 netevent.old = &rth->u.dst;
1257 netevent.new = &rt->u.dst;
1258 call_netevent_notifiers(NETEVENT_REDIRECT,
1262 if (!rt_intern_hash(hash, rt, &rt))
1275 #ifdef CONFIG_IP_ROUTE_VERBOSE
1276 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1277 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1278 NIPQUAD_FMT " ignored.\n"
1279 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1280 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1281 NIPQUAD(saddr), NIPQUAD(daddr));
1286 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1288 struct rtable *rt = (struct rtable *)dst;
1289 struct dst_entry *ret = dst;
1292 if (dst->obsolete) {
1295 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1296 rt->u.dst.expires) {
1297 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1299 #if RT_CACHE_DEBUG >= 1
1300 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1301 NIPQUAD_FMT "/%02x dropped\n",
1302 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1313 * 1. The first ip_rt_redirect_number redirects are sent
1314 * with exponential backoff, then we stop sending them at all,
1315 * assuming that the host ignores our redirects.
1316 * 2. If we did not see packets requiring redirects
1317 * during ip_rt_redirect_silence, we assume that the host
1318 * has forgotten the redirected route, and we start sending redirects again.
1320 * This algorithm is much cheaper and more intelligent than dumb load limiting
1323 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1324 * and "frag. need" (breaks PMTU discovery) in icmp.c.
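/*
 * Worked example with the defaults above (a sketch, assuming HZ=1000):
 * ip_rt_redirect_load is HZ/50 = 20 jiffies, and the n-th redirect after
 * the first is held back by ip_rt_redirect_load << n jiffies, i.e. 40ms,
 * 80ms, 160ms, ...  Once ip_rt_redirect_number (9) redirects have been
 * ignored we go silent; ip_rt_redirect_silence = (HZ/50) << (9 + 1), about
 * 20.5 seconds of quiet, resets the token count and redirects resume.
 */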
1327 void ip_rt_send_redirect(struct sk_buff *skb)
1329 struct rtable *rt = skb->rtable;
1330 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1335 if (!IN_DEV_TX_REDIRECTS(in_dev))
1338 /* No redirected packets during ip_rt_redirect_silence;
1339 * reset the algorithm.
1341 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1342 rt->u.dst.rate_tokens = 0;
1344 /* Too many ignored redirects; do not send anything,
1345 * just set u.dst.rate_last to the last seen redirected packet.
1347 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1348 rt->u.dst.rate_last = jiffies;
1352 /* Check for the load limit; set rate_last to the latest sent redirect. */
1355 if (rt->u.dst.rate_tokens == 0 ||
1357 (rt->u.dst.rate_last +
1358 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1359 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1360 rt->u.dst.rate_last = jiffies;
1361 ++rt->u.dst.rate_tokens;
1362 #ifdef CONFIG_IP_ROUTE_VERBOSE
1363 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1364 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1366 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1367 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1368 NIPQUAD(rt->rt_src), rt->rt_iif,
1369 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1376 static int ip_error(struct sk_buff *skb)
1378 struct rtable *rt = skb->rtable;
1382 switch (rt->u.dst.error) {
1387 code = ICMP_HOST_UNREACH;
1390 code = ICMP_NET_UNREACH;
1391 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394 code = ICMP_PKT_FILTERED;
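/*
 * Descriptive note (not in the original source): the lines below implement
 * a token bucket for the ICMP errors generated here.  Tokens accrue at one
 * per jiffy since rate_last, capped at ip_rt_error_burst (5*HZ); each
 * ICMP_DEST_UNREACH sent costs ip_rt_error_cost (HZ), i.e. roughly one
 * error per second with bursts of up to five.
 */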
1399 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1400 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1401 rt->u.dst.rate_tokens = ip_rt_error_burst;
1402 rt->u.dst.rate_last = now;
1403 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1404 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1405 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408 out: kfree_skb(skb);
1413 * The last two values are not from the RFC but
1414 * are needed for AMPRnet AX.25 paths.
1417 static const unsigned short mtu_plateau[] =
1418 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1420 static inline unsigned short guess_mtu(unsigned short old_mtu)
1424 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1425 if (old_mtu > mtu_plateau[i])
1426 return mtu_plateau[i];
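/*
 * Worked examples (a sketch): guess_mtu() returns the largest plateau value
 * strictly below the old MTU, so guess_mtu(1500) -> 1492, guess_mtu(1006)
 * -> 576 and guess_mtu(576) -> 296.  Values at or below the smallest
 * plateau presumably fall back to the 68-byte IPv4 minimum, consistent with
 * the new_mtu < 68 check in ip_rt_frag_needed() below.
 */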
1430 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1431 unsigned short new_mtu,
1432 struct net_device *dev)
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1437 int ikeys[2] = { dev->ifindex, 0 };
1438 __be32 skeys[2] = { iph->saddr, 0, };
1439 __be32 daddr = iph->daddr;
1440 unsigned short est_mtu = 0;
1442 if (ipv4_config.no_pmtu_disc)
1445 for (k = 0; k < 2; k++) {
1446 for (i = 0; i < 2; i++) {
1447 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1450 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1451 rth = rcu_dereference(rth->u.dst.rt_next)) {
1452 unsigned short mtu = new_mtu;
1454 if (rth->fl.fl4_dst != daddr ||
1455 rth->fl.fl4_src != skeys[i] ||
1456 rth->rt_dst != daddr ||
1457 rth->rt_src != iph->saddr ||
1458 rth->fl.oif != ikeys[k] ||
1460 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1461 !net_eq(dev_net(rth->u.dst.dev), net) ||
1462 rth->rt_genid != atomic_read(&rt_genid))
1465 if (new_mtu < 68 || new_mtu >= old_mtu) {
1467 /* BSD 4.2 compatibility hack :-( */
1469 old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1470 old_mtu >= 68 + (iph->ihl << 2))
1471 old_mtu -= iph->ihl << 2;
1473 mtu = guess_mtu(old_mtu);
1475 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1476 if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1477 dst_confirm(&rth->u.dst);
1478 if (mtu < ip_rt_min_pmtu) {
1479 mtu = ip_rt_min_pmtu;
1480 rth->u.dst.metrics[RTAX_LOCK-1] |=
1483 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1484 dst_set_expires(&rth->u.dst,
1493 return est_mtu ? : new_mtu;
1496 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1498 if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1499 !(dst_metric_locked(dst, RTAX_MTU))) {
1500 if (mtu < ip_rt_min_pmtu) {
1501 mtu = ip_rt_min_pmtu;
1502 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1504 dst->metrics[RTAX_MTU-1] = mtu;
1505 dst_set_expires(dst, ip_rt_mtu_expires);
1506 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1510 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1515 static void ipv4_dst_destroy(struct dst_entry *dst)
1517 struct rtable *rt = (struct rtable *) dst;
1518 struct inet_peer *peer = rt->peer;
1519 struct in_device *idev = rt->idev;
1532 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1535 struct rtable *rt = (struct rtable *) dst;
1536 struct in_device *idev = rt->idev;
1537 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1538 struct in_device *loopback_idev =
1539 in_dev_get(dev_net(dev)->loopback_dev);
1540 if (loopback_idev) {
1541 rt->idev = loopback_idev;
1547 static void ipv4_link_failure(struct sk_buff *skb)
1551 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1555 dst_set_expires(&rt->u.dst, 0);
1558 static int ip_rt_bug(struct sk_buff *skb)
1560 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1561 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1562 skb->dev ? skb->dev->name : "?");
1568 We do not cache the source address of the outgoing interface,
1569 because it is used only by the IP RR, TS and SRR options,
1570 so it is out of the fast path.
1572 BTW remember: "addr" is allowed to be unaligned
1576 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1579 struct fib_result res;
1581 if (rt->fl.iif == 0)
1583 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1584 src = FIB_RES_PREFSRC(res);
1587 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1589 memcpy(addr, &src, 4);
1592 #ifdef CONFIG_NET_CLS_ROUTE
1593 static void set_class_tag(struct rtable *rt, u32 tag)
1595 if (!(rt->u.dst.tclassid & 0xFFFF))
1596 rt->u.dst.tclassid |= tag & 0xFFFF;
1597 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1598 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1602 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1604 struct fib_info *fi = res->fi;
1607 if (FIB_RES_GW(*res) &&
1608 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1609 rt->rt_gateway = FIB_RES_GW(*res);
1610 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1611 sizeof(rt->u.dst.metrics));
1612 if (fi->fib_mtu == 0) {
1613 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1614 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1615 rt->rt_gateway != rt->rt_dst &&
1616 rt->u.dst.dev->mtu > 576)
1617 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1619 #ifdef CONFIG_NET_CLS_ROUTE
1620 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1623 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1625 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1626 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1627 if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1628 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1629 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1630 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1632 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1633 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 #ifdef CONFIG_IP_MULTIPLE_TABLES
1637 set_class_tag(rt, fib_rules_tclass(res));
1639 set_class_tag(rt, itag);
1641 rt->rt_type = res->type;
1644 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1645 u8 tos, struct net_device *dev, int our)
1650 struct in_device *in_dev = in_dev_get(dev);
1653 /* Primary sanity checks. */
1658 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1659 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1662 if (ipv4_is_zeronet(saddr)) {
1663 if (!ipv4_is_local_multicast(daddr))
1665 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1666 } else if (fib_validate_source(saddr, 0, tos, 0,
1667 dev, &spec_dst, &itag) < 0)
1670 rth = dst_alloc(&ipv4_dst_ops);
1674 rth->u.dst.output= ip_rt_bug;
1676 atomic_set(&rth->u.dst.__refcnt, 1);
1677 rth->u.dst.flags= DST_HOST;
1678 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1679 rth->u.dst.flags |= DST_NOPOLICY;
1680 rth->fl.fl4_dst = daddr;
1681 rth->rt_dst = daddr;
1682 rth->fl.fl4_tos = tos;
1683 rth->fl.mark = skb->mark;
1684 rth->fl.fl4_src = saddr;
1685 rth->rt_src = saddr;
1686 #ifdef CONFIG_NET_CLS_ROUTE
1687 rth->u.dst.tclassid = itag;
1690 rth->fl.iif = dev->ifindex;
1691 rth->u.dst.dev = init_net.loopback_dev;
1692 dev_hold(rth->u.dst.dev);
1693 rth->idev = in_dev_get(rth->u.dst.dev);
1695 rth->rt_gateway = daddr;
1696 rth->rt_spec_dst= spec_dst;
1697 rth->rt_genid = atomic_read(&rt_genid);
1698 rth->rt_flags = RTCF_MULTICAST;
1699 rth->rt_type = RTN_MULTICAST;
1701 rth->u.dst.input= ip_local_deliver;
1702 rth->rt_flags |= RTCF_LOCAL;
1705 #ifdef CONFIG_IP_MROUTE
1706 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1707 rth->u.dst.input = ip_mr_input;
1709 RT_CACHE_STAT_INC(in_slow_mc);
1712 hash = rt_hash(daddr, saddr, dev->ifindex);
1713 return rt_intern_hash(hash, rth, &skb->rtable);
1725 static void ip_handle_martian_source(struct net_device *dev,
1726 struct in_device *in_dev,
1727 struct sk_buff *skb,
1731 RT_CACHE_STAT_INC(in_martian_src);
1732 #ifdef CONFIG_IP_ROUTE_VERBOSE
1733 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1735 * Per the RFC1812 recommendation, if the source is martian,
1736 * the only hint is the MAC header.
1738 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1739 NIPQUAD_FMT", on dev %s\n",
1740 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1741 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1743 const unsigned char *p = skb_mac_header(skb);
1744 printk(KERN_WARNING "ll header: ");
1745 for (i = 0; i < dev->hard_header_len; i++, p++) {
1747 if (i < (dev->hard_header_len - 1))
1756 static int __mkroute_input(struct sk_buff *skb,
1757 struct fib_result *res,
1758 struct in_device *in_dev,
1759 __be32 daddr, __be32 saddr, u32 tos,
1760 struct rtable **result)
1765 struct in_device *out_dev;
1770 /* get a working reference to the output device */
1771 out_dev = in_dev_get(FIB_RES_DEV(*res));
1772 if (out_dev == NULL) {
1773 if (net_ratelimit())
1774 printk(KERN_CRIT "Bug in ip_route_input" \
1775 "_slow(). Please, report\n");
1780 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1781 in_dev->dev, &spec_dst, &itag);
1783 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1791 flags |= RTCF_DIRECTSRC;
1793 if (out_dev == in_dev && err &&
1794 (IN_DEV_SHARED_MEDIA(out_dev) ||
1795 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796 flags |= RTCF_DOREDIRECT;
1798 if (skb->protocol != htons(ETH_P_IP)) {
1799 /* Not IP (i.e. ARP). Do not create a route if it is
1800 * invalid for proxy ARP. DNAT routes are always valid.
1802 if (out_dev == in_dev) {
1809 rth = dst_alloc(&ipv4_dst_ops);
1815 atomic_set(&rth->u.dst.__refcnt, 1);
1816 rth->u.dst.flags= DST_HOST;
1817 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1818 rth->u.dst.flags |= DST_NOPOLICY;
1819 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1820 rth->u.dst.flags |= DST_NOXFRM;
1821 rth->fl.fl4_dst = daddr;
1822 rth->rt_dst = daddr;
1823 rth->fl.fl4_tos = tos;
1824 rth->fl.mark = skb->mark;
1825 rth->fl.fl4_src = saddr;
1826 rth->rt_src = saddr;
1827 rth->rt_gateway = daddr;
1829 rth->fl.iif = in_dev->dev->ifindex;
1830 rth->u.dst.dev = (out_dev)->dev;
1831 dev_hold(rth->u.dst.dev);
1832 rth->idev = in_dev_get(rth->u.dst.dev);
1834 rth->rt_spec_dst= spec_dst;
1836 rth->u.dst.input = ip_forward;
1837 rth->u.dst.output = ip_output;
1838 rth->rt_genid = atomic_read(&rt_genid);
1840 rt_set_nexthop(rth, res, itag);
1842 rth->rt_flags = flags;
1847 /* release the working reference to the output device */
1848 in_dev_put(out_dev);
1852 static int ip_mkroute_input(struct sk_buff *skb,
1853 struct fib_result *res,
1854 const struct flowi *fl,
1855 struct in_device *in_dev,
1856 __be32 daddr, __be32 saddr, u32 tos)
1858 struct rtable* rth = NULL;
1862 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1863 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1864 fib_select_multipath(fl, res);
1867 /* create a routing cache entry */
1868 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1872 /* put it into the cache */
1873 hash = rt_hash(daddr, saddr, fl->iif);
1874 return rt_intern_hash(hash, rth, &skb->rtable);
1878 * NOTE. We drop all packets that have local source
1879 * addresses, because every properly looped-back packet
1880 * must already have the correct destination attached by the output routine.
1882 * Such an approach solves two big problems:
1883 * 1. Non-simplex devices are handled properly.
1884 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1887 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1888 u8 tos, struct net_device *dev)
1890 struct fib_result res;
1891 struct in_device *in_dev = in_dev_get(dev);
1892 struct flowi fl = { .nl_u = { .ip4_u =
1896 .scope = RT_SCOPE_UNIVERSE,
1899 .iif = dev->ifindex };
1902 struct rtable * rth;
1907 struct net * net = dev_net(dev);
1909 /* IP on this device is disabled. */
1914 /* Check for the weirdest martians, which cannot be detected by fib_lookup. */
1918 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1919 ipv4_is_loopback(saddr))
1920 goto martian_source;
1922 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1925 /* Accept zero addresses only for limited broadcast;
1926 * I do not even know whether to fix it or not. Waiting for complaints :-)
1928 if (ipv4_is_zeronet(saddr))
1929 goto martian_source;
1931 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1932 ipv4_is_loopback(daddr))
1933 goto martian_destination;
1936 * Now we are ready to route the packet.
1938 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1939 if (!IN_DEV_FORWARD(in_dev))
1945 RT_CACHE_STAT_INC(in_slow_tot);
1947 if (res.type == RTN_BROADCAST)
1950 if (res.type == RTN_LOCAL) {
1952 result = fib_validate_source(saddr, daddr, tos,
1953 net->loopback_dev->ifindex,
1954 dev, &spec_dst, &itag);
1956 goto martian_source;
1958 flags |= RTCF_DIRECTSRC;
1963 if (!IN_DEV_FORWARD(in_dev))
1965 if (res.type != RTN_UNICAST)
1966 goto martian_destination;
1968 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1976 if (skb->protocol != htons(ETH_P_IP))
1979 if (ipv4_is_zeronet(saddr))
1980 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1982 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1985 goto martian_source;
1987 flags |= RTCF_DIRECTSRC;
1989 flags |= RTCF_BROADCAST;
1990 res.type = RTN_BROADCAST;
1991 RT_CACHE_STAT_INC(in_brd);
1994 rth = dst_alloc(&ipv4_dst_ops);
1998 rth->u.dst.output= ip_rt_bug;
1999 rth->rt_genid = atomic_read(&rt_genid);
2001 atomic_set(&rth->u.dst.__refcnt, 1);
2002 rth->u.dst.flags= DST_HOST;
2003 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2004 rth->u.dst.flags |= DST_NOPOLICY;
2005 rth->fl.fl4_dst = daddr;
2006 rth->rt_dst = daddr;
2007 rth->fl.fl4_tos = tos;
2008 rth->fl.mark = skb->mark;
2009 rth->fl.fl4_src = saddr;
2010 rth->rt_src = saddr;
2011 #ifdef CONFIG_NET_CLS_ROUTE
2012 rth->u.dst.tclassid = itag;
2015 rth->fl.iif = dev->ifindex;
2016 rth->u.dst.dev = net->loopback_dev;
2017 dev_hold(rth->u.dst.dev);
2018 rth->idev = in_dev_get(rth->u.dst.dev);
2019 rth->rt_gateway = daddr;
2020 rth->rt_spec_dst= spec_dst;
2021 rth->u.dst.input= ip_local_deliver;
2022 rth->rt_flags = flags|RTCF_LOCAL;
2023 if (res.type == RTN_UNREACHABLE) {
2024 rth->u.dst.input= ip_error;
2025 rth->u.dst.error= -err;
2026 rth->rt_flags &= ~RTCF_LOCAL;
2028 rth->rt_type = res.type;
2029 hash = rt_hash(daddr, saddr, fl.iif);
2030 err = rt_intern_hash(hash, rth, &skb->rtable);
2034 RT_CACHE_STAT_INC(in_no_route);
2035 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2036 res.type = RTN_UNREACHABLE;
2042 * Do not cache martian addresses: they should be logged (RFC1812)
2044 martian_destination:
2045 RT_CACHE_STAT_INC(in_martian_dst);
2046 #ifdef CONFIG_IP_ROUTE_VERBOSE
2047 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2048 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2049 NIPQUAD_FMT ", dev %s\n",
2050 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2054 err = -EHOSTUNREACH;
2066 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2070 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2071 u8 tos, struct net_device *dev)
2073 struct rtable * rth;
2075 int iif = dev->ifindex;
2079 tos &= IPTOS_RT_MASK;
2080 hash = rt_hash(daddr, saddr, iif);
2083 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2084 rth = rcu_dereference(rth->u.dst.rt_next)) {
2085 if (((rth->fl.fl4_dst ^ daddr) |
2086 (rth->fl.fl4_src ^ saddr) |
2087 (rth->fl.iif ^ iif) |
2089 (rth->fl.fl4_tos ^ tos)) == 0 &&
2090 rth->fl.mark == skb->mark &&
2091 net_eq(dev_net(rth->u.dst.dev), net) &&
2092 rth->rt_genid == atomic_read(&rt_genid)) {
2093 dst_use(&rth->u.dst, jiffies);
2094 RT_CACHE_STAT_INC(in_hit);
2099 RT_CACHE_STAT_INC(in_hlist_search);
2103 /* Multicast recognition logic is moved from the route cache to here.
2104 The problem was that too many Ethernet cards have broken/missing
2105 hardware multicast filters :-( As a result, a host on a multicast
2106 network acquires a lot of useless route cache entries, e.g. for
2107 SDR messages from all over the world. Now we try to get rid of them.
2108 Really, provided the software IP multicast filter is organized
2109 reasonably (at least, hashed), it does not result in a slowdown
2110 compared with route cache reject entries.
2111 Note that multicast routers are not affected, because
2112 a route cache entry is created eventually.
2114 if (ipv4_is_multicast(daddr)) {
2115 struct in_device *in_dev;
2118 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2119 int our = ip_check_mc(in_dev, daddr, saddr,
2120 ip_hdr(skb)->protocol);
2122 #ifdef CONFIG_IP_MROUTE
2123 || (!ipv4_is_local_multicast(daddr) &&
2124 IN_DEV_MFORWARD(in_dev))
2128 return ip_route_input_mc(skb, daddr, saddr,
2135 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2138 static int __mkroute_output(struct rtable **result,
2139 struct fib_result *res,
2140 const struct flowi *fl,
2141 const struct flowi *oldflp,
2142 struct net_device *dev_out,
2146 struct in_device *in_dev;
2147 u32 tos = RT_FL_TOS(oldflp);
2150 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2153 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2154 res->type = RTN_BROADCAST;
2155 else if (ipv4_is_multicast(fl->fl4_dst))
2156 res->type = RTN_MULTICAST;
2157 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2160 if (dev_out->flags & IFF_LOOPBACK)
2161 flags |= RTCF_LOCAL;
2163 /* get a working reference to the inet device */
2164 in_dev = in_dev_get(dev_out);
2168 if (res->type == RTN_BROADCAST) {
2169 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2171 fib_info_put(res->fi);
2174 } else if (res->type == RTN_MULTICAST) {
2175 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2176 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2178 flags &= ~RTCF_LOCAL;
2179 /* If the multicast route does not exist, use the
2180 default one, but do not gateway in this case.
2183 if (res->fi && res->prefixlen < 4) {
2184 fib_info_put(res->fi);
2190 rth = dst_alloc(&ipv4_dst_ops);
2196 atomic_set(&rth->u.dst.__refcnt, 1);
2197 rth->u.dst.flags= DST_HOST;
2198 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2199 rth->u.dst.flags |= DST_NOXFRM;
2200 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2201 rth->u.dst.flags |= DST_NOPOLICY;
2203 rth->fl.fl4_dst = oldflp->fl4_dst;
2204 rth->fl.fl4_tos = tos;
2205 rth->fl.fl4_src = oldflp->fl4_src;
2206 rth->fl.oif = oldflp->oif;
2207 rth->fl.mark = oldflp->mark;
2208 rth->rt_dst = fl->fl4_dst;
2209 rth->rt_src = fl->fl4_src;
2210 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2211 /* get references to the devices that are to be held by the routing
2213 rth->u.dst.dev = dev_out;
2215 rth->idev = in_dev_get(dev_out);
2216 rth->rt_gateway = fl->fl4_dst;
2217 rth->rt_spec_dst= fl->fl4_src;
2219 rth->u.dst.output=ip_output;
2220 rth->rt_genid = atomic_read(&rt_genid);
2222 RT_CACHE_STAT_INC(out_slow_tot);
2224 if (flags & RTCF_LOCAL) {
2225 rth->u.dst.input = ip_local_deliver;
2226 rth->rt_spec_dst = fl->fl4_dst;
2228 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2229 rth->rt_spec_dst = fl->fl4_src;
2230 if (flags & RTCF_LOCAL &&
2231 !(dev_out->flags & IFF_LOOPBACK)) {
2232 rth->u.dst.output = ip_mc_output;
2233 RT_CACHE_STAT_INC(out_slow_mc);
2235 #ifdef CONFIG_IP_MROUTE
2236 if (res->type == RTN_MULTICAST) {
2237 if (IN_DEV_MFORWARD(in_dev) &&
2238 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2239 rth->u.dst.input = ip_mr_input;
2240 rth->u.dst.output = ip_mc_output;
2246 rt_set_nexthop(rth, res, 0);
2248 rth->rt_flags = flags;
2252 /* release the working reference to the inet device */
2258 static int ip_mkroute_output(struct rtable **rp,
2259 struct fib_result *res,
2260 const struct flowi *fl,
2261 const struct flowi *oldflp,
2262 struct net_device *dev_out,
2265 struct rtable *rth = NULL;
2266 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2269 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2270 err = rt_intern_hash(hash, rth, rp);
2277 * Major route resolver routine.
2280 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2281 const struct flowi *oldflp)
2283 u32 tos = RT_FL_TOS(oldflp);
2284 struct flowi fl = { .nl_u = { .ip4_u =
2285 { .daddr = oldflp->fl4_dst,
2286 .saddr = oldflp->fl4_src,
2287 .tos = tos & IPTOS_RT_MASK,
2288 .scope = ((tos & RTO_ONLINK) ?
2292 .mark = oldflp->mark,
2293 .iif = net->loopback_dev->ifindex,
2294 .oif = oldflp->oif };
2295 struct fib_result res;
2297 struct net_device *dev_out = NULL;
2303 #ifdef CONFIG_IP_MULTIPLE_TABLES
2307 if (oldflp->fl4_src) {
2309 if (ipv4_is_multicast(oldflp->fl4_src) ||
2310 ipv4_is_lbcast(oldflp->fl4_src) ||
2311 ipv4_is_zeronet(oldflp->fl4_src))
2314 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2315 dev_out = ip_dev_find(net, oldflp->fl4_src);
2316 if (dev_out == NULL)
2319 /* I removed check for oif == dev_out->oif here.
2320 It was wrong for two reasons:
2321 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2322 is assigned to multiple interfaces.
2323 2. Moreover, we are allowed to send packets with saddr
2324 of another iface. --ANK
2327 if (oldflp->oif == 0
2328 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2329 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2330 /* Special hack: the user can direct multicasts
2331 and limited broadcast via the necessary interface
2332 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2333 This hack is not just for fun, it allows
2334 vic, vat and friends to work.
2335 They bind a socket to loopback, set ttl to zero
2336 and expect that it will work.
2337 From the viewpoint of the routing cache they are broken,
2338 because we are not allowed to build a multicast path
2339 with a loopback source addr (the routing cache
2340 cannot know that ttl is zero, so the packet
2341 will not leave this host and the route is valid).
2342 Luckily, this hack is a good workaround.
2345 fl.oif = dev_out->ifindex;
2355 dev_out = dev_get_by_index(net, oldflp->oif);
2357 if (dev_out == NULL)
2360 /* RACE: Check return value of inet_select_addr instead. */
2361 if (__in_dev_get_rtnl(dev_out) == NULL) {
2363 goto out; /* Wrong error code */
2366 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2367 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2369 fl.fl4_src = inet_select_addr(dev_out, 0,
2374 if (ipv4_is_multicast(oldflp->fl4_dst))
2375 fl.fl4_src = inet_select_addr(dev_out, 0,
2377 else if (!oldflp->fl4_dst)
2378 fl.fl4_src = inet_select_addr(dev_out, 0,
2384 fl.fl4_dst = fl.fl4_src;
2386 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2389 dev_out = net->loopback_dev;
2391 fl.oif = net->loopback_dev->ifindex;
2392 res.type = RTN_LOCAL;
2393 flags |= RTCF_LOCAL;
2397 if (fib_lookup(net, &fl, &res)) {
2400 /* Apparently, the routing tables are wrong. Assume
2401 that the destination is on-link.
2404 Because we are allowed to send to an iface
2405 even if it has NO routes and NO assigned
2406 addresses. When oif is specified, the routing
2407 tables are looked up with only one purpose:
2408 to catch whether the destination is gatewayed, rather than
2409 direct. Moreover, if MSG_DONTROUTE is set,
2410 we send the packet, ignoring both the routing tables
2411 and the ifaddr state. --ANK
2414 We could do the same even if oif is unknown,
2415 as IPv6 likely does, but we do not.
2418 if (fl.fl4_src == 0)
2419 fl.fl4_src = inet_select_addr(dev_out, 0,
2421 res.type = RTN_UNICAST;
2431 if (res.type == RTN_LOCAL) {
2433 fl.fl4_src = fl.fl4_dst;
2436 dev_out = net->loopback_dev;
2438 fl.oif = dev_out->ifindex;
2440 fib_info_put(res.fi);
2442 flags |= RTCF_LOCAL;
2446 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2447 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2448 fib_select_multipath(&fl, &res);
2451 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2452 fib_select_default(net, &fl, &res);
2455 fl.fl4_src = FIB_RES_PREFSRC(res);
2459 dev_out = FIB_RES_DEV(res);
2461 fl.oif = dev_out->ifindex;
2465 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2475 int __ip_route_output_key(struct net *net, struct rtable **rp,
2476 const struct flowi *flp)
2481 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2484 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2485 rth = rcu_dereference(rth->u.dst.rt_next)) {
2486 if (rth->fl.fl4_dst == flp->fl4_dst &&
2487 rth->fl.fl4_src == flp->fl4_src &&
2489 rth->fl.oif == flp->oif &&
2490 rth->fl.mark == flp->mark &&
2491 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2492 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2493 net_eq(dev_net(rth->u.dst.dev), net) &&
2494 rth->rt_genid == atomic_read(&rt_genid)) {
2495 dst_use(&rth->u.dst, jiffies);
2496 RT_CACHE_STAT_INC(out_hit);
2497 rcu_read_unlock_bh();
2501 RT_CACHE_STAT_INC(out_hlist_search);
2503 rcu_read_unlock_bh();
2505 return ip_route_output_slow(net, rp, flp);
2508 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2510 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2514 static struct dst_ops ipv4_dst_blackhole_ops = {
2516 .protocol = __constant_htons(ETH_P_IP),
2517 .destroy = ipv4_dst_destroy,
2518 .check = ipv4_dst_check,
2519 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2520 .entry_size = sizeof(struct rtable),
2521 .entries = ATOMIC_INIT(0),
2525 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2527 struct rtable *ort = *rp;
2528 struct rtable *rt = (struct rtable *)
2529 dst_alloc(&ipv4_dst_blackhole_ops);
2532 struct dst_entry *new = &rt->u.dst;
2534 atomic_set(&new->__refcnt, 1);
2536 new->input = dst_discard;
2537 new->output = dst_discard;
2538 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2540 new->dev = ort->u.dst.dev;
2546 rt->idev = ort->idev;
2548 in_dev_hold(rt->idev);
2549 rt->rt_genid = atomic_read(&rt_genid);
2550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
2552 rt->rt_dst = ort->rt_dst;
2553 rt->rt_src = ort->rt_src;
2554 rt->rt_iif = ort->rt_iif;
2555 rt->rt_gateway = ort->rt_gateway;
2556 rt->rt_spec_dst = ort->rt_spec_dst;
2557 rt->peer = ort->peer;
2559 atomic_inc(&rt->peer->refcnt);
2564 dst_release(&(*rp)->u.dst);
2566 return (rt ? 0 : -ENOMEM);
2569 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2570 struct sock *sk, int flags)
2574 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2579 flp->fl4_src = (*rp)->rt_src;
2581 flp->fl4_dst = (*rp)->rt_dst;
2582 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2583 flags ? XFRM_LOOKUP_WAIT : 0);
2584 if (err == -EREMOTE)
2585 err = ipv4_dst_blackhole(rp, flp);
2593 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2595 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2597 return ip_route_output_flow(net, rp, flp, NULL, 0);
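/*
 * Usage sketch (illustrative only, not part of the original file): a
 * typical caller fills a struct flowi with what it knows and lets the
 * cache/FIB pick the device and source address.  "net", "dst_ip" and
 * "tos" are placeholder variables.
 *
 *	struct flowi fl = {
 *		.nl_u = { .ip4_u = { .daddr = dst_ip,
 *				     .tos   = RT_TOS(tos) } },
 *	};
 *	struct rtable *rt;
 *	int err = ip_route_output_key(net, &rt, &fl);
 *
 *	if (err)
 *		return err;
 *	... use rt->u.dst, rt->rt_gateway, ... ...
 *	ip_rt_put(rt);
 */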
2600 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2601 int nowait, unsigned int flags)
2603 struct rtable *rt = skb->rtable;
2605 struct nlmsghdr *nlh;
2607 u32 id = 0, ts = 0, tsage = 0, error;
2609 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2613 r = nlmsg_data(nlh);
2614 r->rtm_family = AF_INET;
2615 r->rtm_dst_len = 32;
2617 r->rtm_tos = rt->fl.fl4_tos;
2618 r->rtm_table = RT_TABLE_MAIN;
2619 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2620 r->rtm_type = rt->rt_type;
2621 r->rtm_scope = RT_SCOPE_UNIVERSE;
2622 r->rtm_protocol = RTPROT_UNSPEC;
2623 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2624 if (rt->rt_flags & RTCF_NOTIFY)
2625 r->rtm_flags |= RTM_F_NOTIFY;
2627 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2629 if (rt->fl.fl4_src) {
2630 r->rtm_src_len = 32;
2631 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2634 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2635 #ifdef CONFIG_NET_CLS_ROUTE
2636 if (rt->u.dst.tclassid)
2637 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2638 #endif
2639 if (rt->fl.iif)
2640 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2641 else if (rt->rt_src != rt->fl.fl4_src)
2642 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2644 if (rt->rt_dst != rt->rt_gateway)
2645 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2647 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2648 goto nla_put_failure;
2650 error = rt->u.dst.error;
2651 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2653 id = rt->peer->ip_id_count;
2654 if (rt->peer->tcp_ts_stamp) {
2655 ts = rt->peer->tcp_ts;
2656 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2661 #ifdef CONFIG_IP_MROUTE
2662 __be32 dst = rt->rt_dst;
2664 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2666 int err = ipmr_get_route(skb, r, nowait);
2671 goto nla_put_failure;
2673 if (err == -EMSGSIZE)
2674 goto nla_put_failure;
2680 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2683 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2684 expires, error) < 0)
2685 goto nla_put_failure;
2687 return nlmsg_end(skb, nlh);
2690 nlmsg_cancel(skb, nlh);
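/*
 * inet_rtm_getroute() below answers RTM_GETROUTE requests -- the
 * netlink message that e.g. "ip route get <addr>" sends -- by resolving
 * the route through ip_route_input() or ip_route_output_key() and
 * replying with a single RTM_NEWROUTE built by rt_fill_info().
 */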
2694 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2696 struct net *net = sock_net(in_skb->sk);
2698 struct nlattr *tb[RTA_MAX+1];
2699 struct rtable *rt = NULL;
2704 struct sk_buff *skb;
2706 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2710 rtm = nlmsg_data(nlh);
2712 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718 /* Reserve room for dummy headers; this skb can pass
2719 through a good chunk of the routing engine.
2721 skb_reset_mac_header(skb);
2722 skb_reset_network_header(skb);
2724 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2725 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2726 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2728 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2729 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2730 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733 struct net_device *dev;
2735 dev = __dev_get_by_index(net, iif);
2741 skb->protocol = htons(ETH_P_IP);
2744 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2748 if (err == 0 && rt->u.dst.error)
2749 err = -rt->u.dst.error;
2756 .tos = rtm->rtm_tos,
2759 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2761 err = ip_route_output_key(net, &rt, &fl);
2768 if (rtm->rtm_flags & RTM_F_NOTIFY)
2769 rt->rt_flags |= RTCF_NOTIFY;
2771 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2772 RTM_NEWROUTE, 0, 0);
2776 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
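/*
 * ip_rt_dump() below serves dump requests for the routing cache (what
 * e.g. "ip route show cache" issues): it walks every hash bucket under
 * rcu_read_lock_bh(), skips entries from other namespaces or with a
 * stale rt_genid, and emits one RTM_NEWROUTE per cached route via
 * rt_fill_info().
 */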
2785 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2792 net = sock_net(skb->sk);
2797 s_idx = idx = cb->args[1];
2798 for (h = s_h; h <= rt_hash_mask; h++) {
2800 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2801 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2802 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2804 if (rt->rt_genid != atomic_read(&rt_genid))
2806 skb->dst = dst_clone(&rt->u.dst);
2807 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2808 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2809 1, NLM_F_MULTI) <= 0) {
2810 dst_release(xchg(&skb->dst, NULL));
2811 rcu_read_unlock_bh();
2814 dst_release(xchg(&skb->dst, NULL));
2816 rcu_read_unlock_bh();
2826 void ip_rt_multicast_event(struct in_device *in_dev)
2828 rt_cache_flush(dev_net(in_dev->dev), 0);
2831 #ifdef CONFIG_SYSCTL
2832 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833 struct file *filp, void __user *buffer,
2834 size_t *lenp, loff_t *ppos)
2839 static DEFINE_MUTEX(flush_mutex);
2841 mutex_lock(&flush_mutex);
2842 ctl->data = &flush_delay;
2843 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2845 mutex_unlock(&flush_mutex);
2847 net = (struct net *)ctl->extra1;
2848 rt_cache_flush(net, flush_delay);
2855 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2858 void __user *oldval,
2859 size_t __user *oldlenp,
2860 void __user *newval,
2865 if (newlen != sizeof(int))
2867 if (get_user(delay, (int __user *)newval))
2869 net = (struct net *)table->extra1;
2870 rt_cache_flush(net, delay);
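/*
 * The tables below are exported under /proc/sys/net/ipv4/route/.
 * Illustrative userspace usage (not part of this file):
 *
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */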
2874 ctl_table ipv4_route_table[] = {
2876 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2877 .procname = "gc_thresh",
2878 .data = &ipv4_dst_ops.gc_thresh,
2879 .maxlen = sizeof(int),
2881 .proc_handler = &proc_dointvec,
2884 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2885 .procname = "max_size",
2886 .data = &ip_rt_max_size,
2887 .maxlen = sizeof(int),
2889 .proc_handler = &proc_dointvec,
2892 /* Deprecated. Use gc_min_interval_ms */
2894 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2895 .procname = "gc_min_interval",
2896 .data = &ip_rt_gc_min_interval,
2897 .maxlen = sizeof(int),
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2903 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2904 .procname = "gc_min_interval_ms",
2905 .data = &ip_rt_gc_min_interval,
2906 .maxlen = sizeof(int),
2908 .proc_handler = &proc_dointvec_ms_jiffies,
2909 .strategy = &sysctl_ms_jiffies,
2912 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2913 .procname = "gc_timeout",
2914 .data = &ip_rt_gc_timeout,
2915 .maxlen = sizeof(int),
2917 .proc_handler = &proc_dointvec_jiffies,
2918 .strategy = &sysctl_jiffies,
2921 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2922 .procname = "gc_interval",
2923 .data = &ip_rt_gc_interval,
2924 .maxlen = sizeof(int),
2926 .proc_handler = &proc_dointvec_jiffies,
2927 .strategy = &sysctl_jiffies,
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2931 .procname = "redirect_load",
2932 .data = &ip_rt_redirect_load,
2933 .maxlen = sizeof(int),
2935 .proc_handler = &proc_dointvec,
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2939 .procname = "redirect_number",
2940 .data = &ip_rt_redirect_number,
2941 .maxlen = sizeof(int),
2943 .proc_handler = &proc_dointvec,
2946 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2947 .procname = "redirect_silence",
2948 .data = &ip_rt_redirect_silence,
2949 .maxlen = sizeof(int),
2951 .proc_handler = &proc_dointvec,
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2955 .procname = "error_cost",
2956 .data = &ip_rt_error_cost,
2957 .maxlen = sizeof(int),
2959 .proc_handler = &proc_dointvec,
2962 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2963 .procname = "error_burst",
2964 .data = &ip_rt_error_burst,
2965 .maxlen = sizeof(int),
2967 .proc_handler = &proc_dointvec,
2970 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2971 .procname = "gc_elasticity",
2972 .data = &ip_rt_gc_elasticity,
2973 .maxlen = sizeof(int),
2975 .proc_handler = &proc_dointvec,
2978 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2979 .procname = "mtu_expires",
2980 .data = &ip_rt_mtu_expires,
2981 .maxlen = sizeof(int),
2983 .proc_handler = &proc_dointvec_jiffies,
2984 .strategy = &sysctl_jiffies,
2987 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3004 .procname = "secret_interval",
3005 .data = &ip_rt_secret_interval,
3006 .maxlen = sizeof(int),
3008 .proc_handler = &proc_dointvec_jiffies,
3009 .strategy = &sysctl_jiffies,
3014 static __net_initdata struct ctl_path ipv4_route_path[] = {
3015 { .procname = "net", .ctl_name = CTL_NET, },
3016 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3017 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3022 static struct ctl_table ipv4_route_flush_table[] = {
3024 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3025 .procname = "flush",
3026 .maxlen = sizeof(int),
3028 .proc_handler = &ipv4_sysctl_rtcache_flush,
3029 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
3034 static __net_init int sysctl_route_net_init(struct net *net)
3036 struct ctl_table *tbl;
3038 tbl = ipv4_route_flush_table;
3039 if (net != &init_net) {
3040 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3044 tbl[0].extra1 = net;
3046 net->ipv4.route_hdr =
3047 register_net_sysctl_table(net, ipv4_route_path, tbl);
3048 if (net->ipv4.route_hdr == NULL)
3053 if (tbl != ipv4_route_flush_table)
3059 static __net_exit void sysctl_route_net_exit(struct net *net)
3061 struct ctl_table *tbl;
3063 tbl = net->ipv4.route_hdr->ctl_table_arg;
3064 unregister_net_sysctl_table(net->ipv4.route_hdr);
3065 BUG_ON(tbl == ipv4_route_flush_table);
3069 static __net_initdata struct pernet_operations sysctl_route_ops = {
3070 .init = sysctl_route_net_init,
3071 .exit = sysctl_route_net_exit,
3072 };
3073 #endif
3075 #ifdef CONFIG_NET_CLS_ROUTE
3076 struct ip_rt_acct *ip_rt_acct __read_mostly;
3077 #endif /* CONFIG_NET_CLS_ROUTE */
3079 static __initdata unsigned long rhash_entries;
3080 static int __init set_rhash_entries(char *str)
3084 rhash_entries = simple_strtoul(str, &str, 0);
3087 __setup("rhash_entries=", set_rhash_entries);
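/*
 * "rhash_entries=N" on the kernel command line (for example
 * rhash_entries=65536 -- an illustrative value) overrides the automatic
 * sizing of the route cache hash table performed in ip_rt_init() below.
 */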
3089 int __init ip_rt_init(void)
3093 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3094 (jiffies ^ (jiffies >> 7))));
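	/* rt_genid starts at a boot-dependent value; cache lookups compare
	 * each entry's rt_genid against the current one (see
	 * __ip_route_output_key() and ip_rt_dump()), so changing it
	 * effectively invalidates every cached entry without walking the
	 * table. */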
3096 #ifdef CONFIG_NET_CLS_ROUTE
3097 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3098 if (!ip_rt_acct)
3099 panic("IP: failed to allocate ip_rt_acct\n");
3100 #endif
3102 ipv4_dst_ops.kmem_cachep =
3103 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3104 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3106 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3108 rt_hash_table = (struct rt_hash_bucket *)
3109 alloc_large_system_hash("IP route cache",
3110 sizeof(struct rt_hash_bucket),
3112 (num_physpages >= 128 * 1024) ?
3118 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3119 rt_hash_lock_init();
3121 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3122 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3127 rt_secret_timer.function = rt_secret_rebuild;
3128 rt_secret_timer.data = 0;
3129 init_timer_deferrable(&rt_secret_timer);
3131 /* All the timers started at system startup tend
3132 to synchronize. Perturb them a bit.
3134 schedule_delayed_work(&expires_work,
3135 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3137 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3138 ip_rt_secret_interval;
3139 add_timer(&rt_secret_timer);
3141 if (ip_rt_proc_init())
3142 printk(KERN_ERR "Unable to create route proc files\n");
3147 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3149 #ifdef CONFIG_SYSCTL
3150 register_pernet_subsys(&sysctl_route_ops);
3151 #endif
3155 EXPORT_SYMBOL(__ip_select_ident);
3156 EXPORT_SYMBOL(ip_route_input);
3157 EXPORT_SYMBOL(ip_route_output_key);