2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <net/protocol.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
99 #include <net/ip_fib.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/ip_mp_alg.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
108 #include <linux/sysctl.h>
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
/*
 * Routing cache tunables and flush state.  The notes below describe only
 * the uses that are visible in this file.
 */
/* Flush delay that rt_cache_flush() substitutes for the caller's value
 * (NOTE(review): the guarding condition is not visible here - confirm). */
118 static int ip_rt_min_delay = 2 * HZ;
/* rt_cache_flush() sets rt_deadline to "now + ip_rt_max_delay" when no
 * deadline is pending. */
119 static int ip_rt_max_delay = 10 * HZ;
/* Entry-count limit compared against ipv4_dst_ops.entries by
 * rt_garbage_collect(). */
120 static int ip_rt_max_size;
/* Timeout passed to rt_may_expire() by rt_check_expire(); also the upper
 * bound for the adaptive "expire" value in rt_garbage_collect(). */
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
/* Period with which rt_check_expire() re-arms rt_periodic_timer. */
122 static int ip_rt_gc_interval = 60 * HZ;
/* rt_garbage_collect() counts a call as gc_ignored when less than this
 * has elapsed since last_gc and the cache is below ip_rt_max_size. */
123 static int ip_rt_gc_min_interval = HZ / 2;
/* ip_rt_send_redirect() stops sending once rate_tokens reaches this. */
124 static int ip_rt_redirect_number = 9;
/* Base interval between redirects; shifted left by rate_tokens. */
125 static int ip_rt_redirect_load = HZ / 50;
/* After this long past rate_last, ip_rt_send_redirect() resets rate_tokens. */
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
/* Tokens consumed by ip_error() for each ICMP error it sends. */
127 static int ip_rt_error_cost = HZ;
/* Cap on the tokens ip_error() lets a route accumulate. */
128 static int ip_rt_error_burst = 5 * HZ;
/* Chain-length threshold checked in rt_intern_hash(); shifted by
 * rt_hash_log to form the initial goal in rt_garbage_collect(). */
129 static int ip_rt_gc_elasticity = 8;
/* Expiry applied by ip_rt_update_pmtu() after it lowers a path MTU. */
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
/* Smallest path MTU accepted; lower values are raised to this. */
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
/* NOTE(review): no use of this variable is visible in this chunk. */
132 static int ip_rt_min_advmss = 256;
/* Period with which rt_secret_rebuild() re-arms rt_secret_timer. */
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
/* Deadline for a pending delayed flush; 0 is treated as "none set" by
 * rt_cache_flush(). */
134 static unsigned long rt_deadline;
136 #define RTprint(a...) printk(KERN_DEBUG a)
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
143 * Interface to generic destination cache.
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void ipv4_link_failure(struct sk_buff *skb);
152 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
156 static struct dst_ops ipv4_dst_ops = {
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
169 #define ECN_OR_COST(class) TC_PRIO_##class
171 __u8 ip_tos2prio[16] = {
175 ECN_OR_COST(BESTEFFORT),
181 ECN_OR_COST(INTERACTIVE),
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
195 /* The locking scheme is rather straightforward:
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
205 struct rt_hash_bucket {
206 struct rtable *chain;
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209 defined(CONFIG_PROVE_LOCKING)
211 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
212 * The size of this table is a power of two and depends on the number of CPUS.
213 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215 #ifdef CONFIG_LOCKDEP
216 # define RT_HASH_LOCK_SZ 256
219 # define RT_HASH_LOCK_SZ 4096
221 # define RT_HASH_LOCK_SZ 2048
223 # define RT_HASH_LOCK_SZ 1024
225 # define RT_HASH_LOCK_SZ 512
227 # define RT_HASH_LOCK_SZ 256
231 static spinlock_t *rt_hash_locks;
232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 # define rt_hash_lock_init() { \
235 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238 spin_lock_init(&rt_hash_locks[i]); \
241 # define rt_hash_lock_addr(slot) NULL
242 # define rt_hash_lock_init()
245 static struct rt_hash_bucket *rt_hash_table;
246 static unsigned rt_hash_mask;
247 static int rt_hash_log;
248 static unsigned int rt_hash_rnd;
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 (__raw_get_cpu_var(rt_cache_stat).field++)
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
263 #define rt_hash(daddr, saddr, idx) \
264 rt_hash_code((__force u32)(__be32)(daddr),\
265 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267 #ifdef CONFIG_PROC_FS
268 struct rt_cache_iter_state {
272 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 struct rtable *r = NULL;
275 struct rt_cache_iter_state *st = seq->private;
277 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 r = rt_hash_table[st->bucket].chain;
282 rcu_read_unlock_bh();
287 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291 r = r->u.dst.rt_next;
293 rcu_read_unlock_bh();
294 if (--st->bucket < 0)
297 r = rt_hash_table[st->bucket].chain;
302 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 struct rtable *r = rt_cache_get_first(seq);
307 while (pos && (r = rt_cache_get_next(seq, r)))
309 return pos ? NULL : r;
312 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 struct rtable *r = NULL;
321 if (v == SEQ_START_TOKEN)
322 r = rt_cache_get_first(seq);
324 r = rt_cache_get_next(seq, v);
329 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 if (v && v != SEQ_START_TOKEN)
332 rcu_read_unlock_bh();
335 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 if (v == SEQ_START_TOKEN)
338 seq_printf(seq, "%-127s\n",
339 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343 struct rtable *r = v;
346 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348 r->u.dst.dev ? r->u.dst.dev->name : "*",
349 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351 r->u.dst.__use, 0, (unsigned long)r->rt_src,
352 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354 dst_metric(&r->u.dst, RTAX_WINDOW),
355 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356 dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362 seq_printf(seq, "%-127s\n", temp);
367 static const struct seq_operations rt_cache_seq_ops = {
368 .start = rt_cache_seq_start,
369 .next = rt_cache_seq_next,
370 .stop = rt_cache_seq_stop,
371 .show = rt_cache_seq_show,
374 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 struct seq_file *seq;
378 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
382 rc = seq_open(file, &rt_cache_seq_ops);
385 seq = file->private_data;
387 memset(s, 0, sizeof(*s));
395 static const struct file_operations rt_cache_seq_fops = {
396 .owner = THIS_MODULE,
397 .open = rt_cache_seq_open,
400 .release = seq_release_private,
404 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
409 return SEQ_START_TOKEN;
411 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
415 return &per_cpu(rt_cache_stat, cpu);
420 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
424 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425 if (!cpu_possible(cpu))
428 return &per_cpu(rt_cache_stat, cpu);
434 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
439 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 struct rt_cache_stat *st = v;
443 if (v == SEQ_START_TOKEN) {
444 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
448 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
449 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450 atomic_read(&ipv4_dst_ops.entries),
473 static const struct seq_operations rt_cpu_seq_ops = {
474 .start = rt_cpu_seq_start,
475 .next = rt_cpu_seq_next,
476 .stop = rt_cpu_seq_stop,
477 .show = rt_cpu_seq_show,
481 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 return seq_open(file, &rt_cpu_seq_ops);
486 static const struct file_operations rt_cpu_seq_fops = {
487 .owner = THIS_MODULE,
488 .open = rt_cpu_seq_open,
491 .release = seq_release,
494 #endif /* CONFIG_PROC_FS */
496 static __inline__ void rt_free(struct rtable *rt)
498 multipath_remove(rt);
499 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
502 static __inline__ void rt_drop(struct rtable *rt)
504 multipath_remove(rt);
506 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
509 static __inline__ int rt_fast_clean(struct rtable *rth)
511 /* Kill broadcast/multicast entries very aggressively, if they
512 collide in hash table with more useful entries */
513 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
514 rth->fl.iif && rth->u.dst.rt_next;
517 static __inline__ int rt_valuable(struct rtable *rth)
519 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
523 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
528 if (atomic_read(&rth->u.dst.__refcnt))
532 if (rth->u.dst.expires &&
533 time_after_eq(jiffies, rth->u.dst.expires))
536 age = jiffies - rth->u.dst.lastuse;
538 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
539 (age <= tmo2 && rt_valuable(rth)))
545 /* Bits of score are:
547 * 30: not quite useless
548 * 29..0: usage counter
550 static inline u32 rt_score(struct rtable *rt)
552 u32 score = jiffies - rt->u.dst.lastuse;
554 score = ~score & ~(3<<30);
560 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
566 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
569 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
570 (fl1->mark ^ fl2->mark) |
571 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
572 *(u16 *)&fl2->nl_u.ip4_u.tos) |
573 (fl1->oif ^ fl2->oif) |
574 (fl1->iif ^ fl2->iif)) == 0;
577 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
578 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
579 struct rtable *expentry,
582 int passedexpired = 0;
583 struct rtable **nextstep = NULL;
584 struct rtable **rthp = chain_head;
590 while ((rth = *rthp) != NULL) {
594 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
595 compare_keys(&(*rthp)->fl, &expentry->fl)) {
596 if (*rthp == expentry) {
597 *rthp = rth->u.dst.rt_next;
600 *rthp = rth->u.dst.rt_next;
606 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
607 passedexpired && !nextstep)
608 nextstep = &rth->u.dst.rt_next;
610 rthp = &rth->u.dst.rt_next;
620 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
623 /* This runs via a timer and thus is always in BH context. */
/*
 * Periodic expiry scan of the route cache.
 *
 * Starting from the bucket after the static "rover", walk up to "goal"
 * hash buckets under the per-bucket lock and unlink entries that have
 * expired or that rt_may_expire() allows to be removed.  Before returning,
 * re-arm rt_periodic_timer to fire after ip_rt_gc_interval.
 *
 * @dummy: timer argument; not referenced in the lines visible here.
 */
624 static void rt_check_expire(unsigned long dummy)
626 static unsigned int rover;
627 unsigned int i = rover, goal;
628 struct rtable *rth, **rthp;
629 unsigned long now = jiffies;
/* Buckets to scan this run: (ip_rt_gc_interval << rt_hash_log) divided by
 * ip_rt_gc_timeout (division skipped when the timeout is <= 1), capped at
 * rt_hash_mask + 1. */
632 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
633 if (ip_rt_gc_timeout > 1)
634 do_div(mult, ip_rt_gc_timeout);
635 goal = (unsigned int)mult;
636 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
637 for (; goal > 0; goal--) {
638 unsigned long tmo = ip_rt_gc_timeout;
/* Advance to the next bucket, wrapping around via the hash mask. */
640 i = (i + 1) & rt_hash_mask;
641 rthp = &rt_hash_table[i].chain;
645 spin_lock(rt_hash_lock_addr(i));
646 while ((rth = *rthp) != NULL) {
647 if (rth->u.dst.expires) {
648 /* Entry is expired even if it is in use */
/* Expiry time not yet passed: step over the entry. */
649 if (time_before_eq(now, rth->u.dst.expires)) {
651 rthp = &rth->u.dst.rt_next;
/* No explicit expiry: step over the entry unless rt_may_expire() permits
 * removing it. */
654 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 rthp = &rth->u.dst.rt_next;
660 /* Cleanup aged off entries. */
661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662 /* remove all related balanced entries if necessary */
663 if (rth->u.dst.flags & DST_BALANCED) {
664 rthp = rt_remove_balanced_route(
665 &rt_hash_table[i].chain,
/* Unlink the entry from the chain. */
670 *rthp = rth->u.dst.rt_next;
673 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
674 *rthp = rth->u.dst.rt_next;
676 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 spin_unlock(rt_hash_lock_addr(i));
680 /* Fallback loop breaker. */
681 if (time_after(jiffies, now))
/* Schedule the next periodic scan. */
685 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
688 /* This can run from both BH and non-BH contexts, the latter
689 * in the case of a forced flush event.
691 static void rt_run_flush(unsigned long dummy)
694 struct rtable *rth, *next;
698 get_random_bytes(&rt_hash_rnd, 4);
700 for (i = rt_hash_mask; i >= 0; i--) {
701 spin_lock_bh(rt_hash_lock_addr(i));
702 rth = rt_hash_table[i].chain;
704 rt_hash_table[i].chain = NULL;
705 spin_unlock_bh(rt_hash_lock_addr(i));
707 for (; rth; rth = next) {
708 next = rth->u.dst.rt_next;
714 static DEFINE_SPINLOCK(rt_flush_lock);
716 void rt_cache_flush(int delay)
718 unsigned long now = jiffies;
719 int user_mode = !in_softirq();
722 delay = ip_rt_min_delay;
724 /* flush existing multipath state*/
727 spin_lock_bh(&rt_flush_lock);
729 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
730 long tmo = (long)(rt_deadline - now);
732 /* If flush timer is already running
733 and flush request is not immediate (delay > 0):
735 if deadline is not achieved, extend the timer to "delay",
736 otherwise fire it at deadline time.
739 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
747 spin_unlock_bh(&rt_flush_lock);
752 if (rt_deadline == 0)
753 rt_deadline = now + ip_rt_max_delay;
755 mod_timer(&rt_flush_timer, now+delay);
756 spin_unlock_bh(&rt_flush_lock);
759 static void rt_secret_rebuild(unsigned long dummy)
761 unsigned long now = jiffies;
764 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
768 Short description of GC goals.
770 We want to build an algorithm which keeps the routing cache
771 at some equilibrium point, where the number of aged-off entries
772 is kept approximately equal to the number of newly generated ones.
774 The current expiration strength is the variable "expire".
775 We try to adjust it dynamically, so that when networking
776 is idle "expire" is large enough to keep enough warm entries,
777 and when load increases it is reduced to limit the cache size.
780 static int rt_garbage_collect(void)
782 static unsigned long expire = RT_GC_TIMEOUT;
783 static unsigned long last_gc;
785 static int equilibrium;
786 struct rtable *rth, **rthp;
787 unsigned long now = jiffies;
791 * Garbage collection is pretty expensive,
792 * do not make it too frequently.
795 RT_CACHE_STAT_INC(gc_total);
797 if (now - last_gc < ip_rt_gc_min_interval &&
798 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
799 RT_CACHE_STAT_INC(gc_ignored);
803 /* Calculate number of entries, which we want to expire now. */
804 goal = atomic_read(&ipv4_dst_ops.entries) -
805 (ip_rt_gc_elasticity << rt_hash_log);
807 if (equilibrium < ipv4_dst_ops.gc_thresh)
808 equilibrium = ipv4_dst_ops.gc_thresh;
809 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
812 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
815 /* We are in dangerous area. Try to reduce cache really
818 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
819 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
822 if (now - last_gc >= ip_rt_gc_min_interval)
833 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
834 unsigned long tmo = expire;
836 k = (k + 1) & rt_hash_mask;
837 rthp = &rt_hash_table[k].chain;
838 spin_lock_bh(rt_hash_lock_addr(k));
839 while ((rth = *rthp) != NULL) {
840 if (!rt_may_expire(rth, tmo, expire)) {
842 rthp = &rth->u.dst.rt_next;
845 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
846 /* remove all related balanced entries
849 if (rth->u.dst.flags & DST_BALANCED) {
852 rthp = rt_remove_balanced_route(
853 &rt_hash_table[k].chain,
860 *rthp = rth->u.dst.rt_next;
864 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
865 *rthp = rth->u.dst.rt_next;
868 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 spin_unlock_bh(rt_hash_lock_addr(k));
879 /* Goal is not achieved. We stop process if:
881 - if expire reduced to zero. Otherwise, expire is halved.
882 - if table is not full.
883 - if we are called from interrupt.
884 - jiffies check is just fallback/debug loop breaker.
885 We will not spin here for long time in any case.
888 RT_CACHE_STAT_INC(gc_goal_miss);
894 #if RT_CACHE_DEBUG >= 2
895 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
896 atomic_read(&ipv4_dst_ops.entries), goal, i);
899 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 } while (!in_softirq() && time_before_eq(jiffies, now));
903 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
906 printk(KERN_WARNING "dst cache overflow\n");
907 RT_CACHE_STAT_INC(gc_dst_overflow);
911 expire += ip_rt_gc_min_interval;
912 if (expire > ip_rt_gc_timeout ||
913 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
914 expire = ip_rt_gc_timeout;
915 #if RT_CACHE_DEBUG >= 2
916 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
917 atomic_read(&ipv4_dst_ops.entries), goal, rover);
922 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924 struct rtable *rth, **rthp;
926 struct rtable *cand, **candp;
929 int attempts = !in_softirq();
938 rthp = &rt_hash_table[hash].chain;
940 spin_lock_bh(rt_hash_lock_addr(hash));
941 while ((rth = *rthp) != NULL) {
942 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
943 if (!(rth->u.dst.flags & DST_BALANCED) &&
944 compare_keys(&rth->fl, &rt->fl)) {
946 if (compare_keys(&rth->fl, &rt->fl)) {
949 *rthp = rth->u.dst.rt_next;
951 * Since lookup is lockfree, the deletion
952 * must be visible to another weakly ordered CPU before
953 * the insertion at the start of the hash chain.
955 rcu_assign_pointer(rth->u.dst.rt_next,
956 rt_hash_table[hash].chain);
958 * Since lookup is lockfree, the update writes
959 * must be ordered for consistency on SMP.
961 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
964 dst_hold(&rth->u.dst);
965 rth->u.dst.lastuse = now;
966 spin_unlock_bh(rt_hash_lock_addr(hash));
973 if (!atomic_read(&rth->u.dst.__refcnt)) {
974 u32 score = rt_score(rth);
976 if (score <= min_score) {
985 rthp = &rth->u.dst.rt_next;
989 /* ip_rt_gc_elasticity used to be the average chain
990 * length; when it is exceeded gc becomes really aggressive.
992 * The second limit is less certain. At the moment it allows
993 * only 2 entries per bucket. We will see.
995 if (chain_length > ip_rt_gc_elasticity) {
996 *candp = cand->u.dst.rt_next;
1001 /* Try to bind route to arp only if it is output
1002 route or unicast forwarding path.
1004 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1005 int err = arp_bind_neighbour(&rt->u.dst);
1007 spin_unlock_bh(rt_hash_lock_addr(hash));
1009 if (err != -ENOBUFS) {
1014 /* Neighbour tables are full and nothing
1015 can be released. Try to shrink route cache,
1016 it is most likely it holds some neighbour records.
1018 if (attempts-- > 0) {
1019 int saved_elasticity = ip_rt_gc_elasticity;
1020 int saved_int = ip_rt_gc_min_interval;
1021 ip_rt_gc_elasticity = 1;
1022 ip_rt_gc_min_interval = 0;
1023 rt_garbage_collect();
1024 ip_rt_gc_min_interval = saved_int;
1025 ip_rt_gc_elasticity = saved_elasticity;
1029 if (net_ratelimit())
1030 printk(KERN_WARNING "Neighbour table overflow.\n");
1036 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1037 #if RT_CACHE_DEBUG >= 2
1038 if (rt->u.dst.rt_next) {
1040 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1041 NIPQUAD(rt->rt_dst));
1042 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1043 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1047 rt_hash_table[hash].chain = rt;
1048 spin_unlock_bh(rt_hash_lock_addr(hash));
1053 void rt_bind_peer(struct rtable *rt, int create)
1055 static DEFINE_SPINLOCK(rt_peer_lock);
1056 struct inet_peer *peer;
1058 peer = inet_getpeer(rt->rt_dst, create);
1060 spin_lock_bh(&rt_peer_lock);
1061 if (rt->peer == NULL) {
1065 spin_unlock_bh(&rt_peer_lock);
1071 * Peer allocation may fail only in serious out-of-memory conditions. However
1072 * we still can generate some output.
1073 * Random ID selection looks a bit dangerous because we have no chances to
1074 * select ID being unique in a reasonable period of time.
1075 * But broken packet identifier may be better than no packet at all.
1077 static void ip_select_fb_ident(struct iphdr *iph)
1079 static DEFINE_SPINLOCK(ip_fb_id_lock);
1080 static u32 ip_fallback_id;
1083 spin_lock_bh(&ip_fb_id_lock);
1084 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1085 iph->id = htons(salt & 0xFFFF);
1086 ip_fallback_id = salt;
1087 spin_unlock_bh(&ip_fb_id_lock);
1090 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092 struct rtable *rt = (struct rtable *) dst;
1095 if (rt->peer == NULL)
1096 rt_bind_peer(rt, 1);
1098 /* If peer is attached to destination, it is never detached,
1099 so we do not need to grab a lock to dereference it.
1102 iph->id = htons(inet_getid(rt->peer, more));
1106 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1107 __builtin_return_address(0));
1109 ip_select_fb_ident(iph);
1112 static void rt_del(unsigned hash, struct rtable *rt)
1114 struct rtable **rthp;
1116 spin_lock_bh(rt_hash_lock_addr(hash));
1118 for (rthp = &rt_hash_table[hash].chain; *rthp;
1119 rthp = &(*rthp)->u.dst.rt_next)
1121 *rthp = rt->u.dst.rt_next;
1125 spin_unlock_bh(rt_hash_lock_addr(hash));
1128 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1129 __be32 saddr, struct net_device *dev)
1132 struct in_device *in_dev = in_dev_get(dev);
1133 struct rtable *rth, **rthp;
1134 __be32 skeys[2] = { saddr, 0 };
1135 int ikeys[2] = { dev->ifindex, 0 };
1136 struct netevent_redirect netevent;
1141 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1142 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1143 goto reject_redirect;
1145 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147 goto reject_redirect;
1148 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149 goto reject_redirect;
1151 if (inet_addr_type(new_gw) != RTN_UNICAST)
1152 goto reject_redirect;
1155 for (i = 0; i < 2; i++) {
1156 for (k = 0; k < 2; k++) {
1157 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1159 rthp=&rt_hash_table[hash].chain;
1162 while ((rth = rcu_dereference(*rthp)) != NULL) {
1165 if (rth->fl.fl4_dst != daddr ||
1166 rth->fl.fl4_src != skeys[i] ||
1167 rth->fl.oif != ikeys[k] ||
1169 rthp = &rth->u.dst.rt_next;
1173 if (rth->rt_dst != daddr ||
1174 rth->rt_src != saddr ||
1176 rth->rt_gateway != old_gw ||
1177 rth->u.dst.dev != dev)
1180 dst_hold(&rth->u.dst);
1183 rt = dst_alloc(&ipv4_dst_ops);
1190 /* Copy all the information. */
1192 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193 rt->u.dst.__use = 1;
1194 atomic_set(&rt->u.dst.__refcnt, 1);
1195 rt->u.dst.child = NULL;
1197 dev_hold(rt->u.dst.dev);
1199 in_dev_hold(rt->idev);
1200 rt->u.dst.obsolete = 0;
1201 rt->u.dst.lastuse = jiffies;
1202 rt->u.dst.path = &rt->u.dst;
1203 rt->u.dst.neighbour = NULL;
1204 rt->u.dst.hh = NULL;
1205 rt->u.dst.xfrm = NULL;
1207 rt->rt_flags |= RTCF_REDIRECTED;
1209 /* Gateway is different ... */
1210 rt->rt_gateway = new_gw;
1212 /* Redirect received -> path was valid */
1213 dst_confirm(&rth->u.dst);
1216 atomic_inc(&rt->peer->refcnt);
1218 if (arp_bind_neighbour(&rt->u.dst) ||
1219 !(rt->u.dst.neighbour->nud_state &
1221 if (rt->u.dst.neighbour)
1222 neigh_event_send(rt->u.dst.neighbour, NULL);
1228 netevent.old = &rth->u.dst;
1229 netevent.new = &rt->u.dst;
1230 call_netevent_notifiers(NETEVENT_REDIRECT,
1234 if (!rt_intern_hash(hash, rt, &rt))
1247 #ifdef CONFIG_IP_ROUTE_VERBOSE
1248 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250 "%u.%u.%u.%u ignored.\n"
1251 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253 NIPQUAD(saddr), NIPQUAD(daddr));
1258 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260 struct rtable *rt = (struct rtable*)dst;
1261 struct dst_entry *ret = dst;
1264 if (dst->obsolete) {
1267 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268 rt->u.dst.expires) {
1269 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 #if RT_CACHE_DEBUG >= 1
1272 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1273 "%u.%u.%u.%u/%02x dropped\n",
1274 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1285 * 1. The first ip_rt_redirect_number redirects are sent
1286 * with exponential backoff, then we stop sending them at all,
1287 * assuming that the host ignores our redirects.
1288 * 2. If we did not see packets requiring redirects
1289 * during ip_rt_redirect_silence, we assume that the host
1290 * forgot redirected route and start to send redirects again.
1292 * This algorithm is much cheaper and more intelligent than dumb load limiting
1295 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1299 void ip_rt_send_redirect(struct sk_buff *skb)
1301 struct rtable *rt = (struct rtable*)skb->dst;
1302 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1307 if (!IN_DEV_TX_REDIRECTS(in_dev))
1310 /* No redirected packets during ip_rt_redirect_silence;
1311 * reset the algorithm.
1313 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314 rt->u.dst.rate_tokens = 0;
1316 /* Too many ignored redirects; do not send anything
1317 * set u.dst.rate_last to the last seen redirected packet.
1319 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320 rt->u.dst.rate_last = jiffies;
1324 /* Check for load limit; set rate_last to the latest sent
1327 if (rt->u.dst.rate_tokens == 0 ||
1329 (rt->u.dst.rate_last +
1330 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 rt->u.dst.rate_last = jiffies;
1333 ++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1338 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 NIPQUAD(rt->rt_src), rt->rt_iif,
1341 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1348 static int ip_error(struct sk_buff *skb)
1350 struct rtable *rt = (struct rtable*)skb->dst;
1354 switch (rt->u.dst.error) {
1359 code = ICMP_HOST_UNREACH;
1362 code = ICMP_NET_UNREACH;
1365 code = ICMP_PKT_FILTERED;
1370 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 rt->u.dst.rate_last = now;
1374 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1379 out: kfree_skb(skb);
1384 * The last two values are not from the RFC but
1385 * are needed for AMPRnet AX.25 paths.
1388 static const unsigned short mtu_plateau[] =
1389 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1395 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396 if (old_mtu > mtu_plateau[i])
1397 return mtu_plateau[i];
/*
 * ip_rt_frag_needed - record a smaller path MTU in matching cache entries.
 * @iph:     IP header; saddr, daddr, tot_len and ihl are read here
 * @new_mtu: candidate MTU
 *
 * Walks the hash chains for the keys (daddr, saddr) and (daddr, 0).  For
 * each entry whose flow and route addresses match and whose MTU metric is
 * not locked, the cached MTU may be lowered.  When new_mtu is below 68 or
 * not smaller than the packet length, an estimate from guess_mtu() is used
 * in its place.
 * Returns est_mtu when it is non-zero, otherwise new_mtu.
 */
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1404 unsigned short old_mtu = ntohs(iph->tot_len);
/* Two lookup keys: the real source address and the wildcard source 0. */
1406 __be32 skeys[2] = { iph->saddr, 0, };
1407 __be32 daddr = iph->daddr;
1408 unsigned short est_mtu = 0;
1410 if (ipv4_config.no_pmtu_disc)
1413 for (i = 0; i < 2; i++) {
1414 unsigned hash = rt_hash(daddr, skeys[i], 0);
1417 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418 rth = rcu_dereference(rth->u.dst.rt_next)) {
1419 if (rth->fl.fl4_dst == daddr &&
1420 rth->fl.fl4_src == skeys[i] &&
1421 rth->rt_dst == daddr &&
1422 rth->rt_src == iph->saddr &&
1424 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425 unsigned short mtu = new_mtu;
/* The reported MTU is unusable: fall back to a plateau estimate. */
1427 if (new_mtu < 68 || new_mtu >= old_mtu) {
1429 /* BSD 4.2 compatibility hack :-( */
1431 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432 old_mtu >= 68 + (iph->ihl << 2))
1433 old_mtu -= iph->ihl << 2;
1435 mtu = guess_mtu(old_mtu);
1437 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439 dst_confirm(&rth->u.dst);
/* Clamp to ip_rt_min_pmtu; the RTAX_LOCK metric is updated when clamping. */
1440 if (mtu < ip_rt_min_pmtu) {
1441 mtu = ip_rt_min_pmtu;
1442 rth->u.dst.metrics[RTAX_LOCK-1] |=
1445 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446 dst_set_expires(&rth->u.dst,
1455 return est_mtu ? : new_mtu;
/*
 * ip_rt_update_pmtu - lower the MTU metric of a dst entry.
 *
 * Applies only when mtu is smaller than the current metric, is at least 68,
 * and the MTU metric is not locked.  A value below ip_rt_min_pmtu is raised
 * to ip_rt_min_pmtu and the RTAX_MTU bit is set in the lock metric.  The
 * new MTU is stored, the entry gets an expiry of ip_rt_mtu_expires, and
 * NETEVENT_PMTU_UPDATE is raised through call_netevent_notifiers().
 */
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461 !(dst_metric_locked(dst, RTAX_MTU))) {
1462 if (mtu < ip_rt_min_pmtu) {
1463 mtu = ip_rt_min_pmtu;
1464 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466 dst->metrics[RTAX_MTU-1] = mtu;
1467 dst_set_expires(dst, ip_rt_mtu_expires);
1468 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
/*
 * ipv4_dst_check - installed as the .check callback of ipv4_dst_blackhole_ops.
 * NOTE(review): the body is not visible in this excerpt.
 */
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
/*
 * ipv4_dst_destroy - installed as the .destroy callback of
 * ipv4_dst_blackhole_ops.
 *
 * Views dst as a struct rtable and picks up its peer and idev pointers.
 * NOTE(review): what is done with them is not visible in this excerpt.
 */
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1479 struct rtable *rt = (struct rtable *) dst;
1480 struct inet_peer *peer = rt->peer;
1481 struct in_device *idev = rt->idev;
/*
 * ipv4_dst_ifdown - repoint a route's idev when its device is involved.
 *
 * When dev is not the loopback device and the route's idev belongs to dev,
 * a reference to the loopback in_device is taken and, if one exists,
 * rt->idev is switched to it.
 * NOTE(review): the handling of the previous idev reference is not visible
 * in this excerpt.
 */
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1497 struct rtable *rt = (struct rtable *) dst;
1498 struct in_device *idev = rt->idev;
1499 if (dev != &loopback_dev && idev && idev->dev == dev) {
1500 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501 if (loopback_idev) {
1502 rt->idev = loopback_idev;
/*
 * ipv4_link_failure - react to a link-level delivery failure for skb.
 *
 * Sends an ICMP destination-unreachable / host-unreachable message for the
 * packet, then calls dst_set_expires() with a timeout of 0 on the route
 * attached to the skb.
 */
1508 static void ipv4_link_failure(struct sk_buff *skb)
1512 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514 rt = (struct rtable *) skb->dst;
1516 dst_set_expires(&rt->u.dst, 0);
/*
 * ip_rt_bug - output handler for routes that are created for input only
 * (ip_route_input_mc and ip_route_input_slow install it as dst.output).
 *
 * Logs the packet's source and destination addresses and the device name
 * at KERN_DEBUG; "?" is printed when the skb has no device.
 */
1519 static int ip_rt_bug(struct sk_buff *skb)
1521 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1523 skb->dev ? skb->dev->name : "?");
1529 We do not cache the source address of the outgoing interface,
1530 because it is used only by the IP RR, TS and SRR options,
1531 so it is off the fast path.
1533 BTW remember: "addr" is allowed to be unaligned
/*
 * ip_rt_get_source - write a 4-byte source address for rt into addr.
 *
 * For routes with a non-zero input interface the address is taken from a
 * FIB lookup on the route's flow (FIB_RES_PREFSRC) when that succeeds, and
 * from inet_select_addr() on the route's device and gateway otherwise.
 * The result is stored with memcpy(), which does not require addr to be
 * aligned.
 * NOTE(review): the branch taken when rt->fl.iif == 0 is not visible in
 * this excerpt.
 */
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1540 struct fib_result res;
1542 if (rt->fl.iif == 0)
1544 else if (fib_lookup(&rt->fl, &res) == 0) {
1545 src = FIB_RES_PREFSRC(res);
1548 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550 memcpy(addr, &src, 4);
1553 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - merge tag into the route's tclassid, half by half.
 *
 * The low 16 bits and the high 16 bits are treated independently: each half
 * of tclassid is filled from the corresponding half of tag only when that
 * half is currently zero, so already-set values are never overwritten.
 */
1554 static void set_class_tag(struct rtable *rt, u32 tag)
1556 if (!(rt->u.dst.tclassid & 0xFFFF))
1557 rt->u.dst.tclassid |= tag & 0xFFFF;
1558 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1559 rt->u.dst.tclassid |= tag & 0xFFFF0000;
/*
 * rt_set_nexthop - fill gateway, metrics and class tags of a new route
 * from a FIB lookup result.
 *
 * With the result's fib_info: the gateway is taken from the result when one
 * is set and the nexthop scope is RT_SCOPE_LINK, and the metrics array is
 * copied from the fib_info.  If the fib_info has no MTU, the device MTU is
 * used, reduced to 576 when the MTU metric is locked, the gateway differs
 * from the destination and the device MTU exceeds 576.
 * Afterwards defaults and limits are applied: hop limit defaults to
 * sysctl_ip_default_ttl, MTU is capped at IP_MAX_MTU, and ADVMSS is
 * defaulted from the device MTU minus 40 and capped at 65535 - 40.
 * Finally the class tag is merged (CONFIG_NET_CLS_ROUTE) and rt_type is set
 * from res->type.
 * NOTE(review): line 1584 presumably belongs to the path without a
 * fib_info; the enclosing condition is not visible in this excerpt.
 */
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565 struct fib_info *fi = res->fi;
1568 if (FIB_RES_GW(*res) &&
1569 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570 rt->rt_gateway = FIB_RES_GW(*res);
1571 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572 sizeof(rt->u.dst.metrics));
1573 if (fi->fib_mtu == 0) {
1574 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576 rt->rt_gateway != rt->rt_dst &&
1577 rt->u.dst.dev->mtu > 576)
1578 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1584 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
/* Fill in defaults for unset metrics and enforce upper bounds. */
1586 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598 set_class_tag(rt, fib_rules_tclass(res));
1600 set_class_tag(rt, itag);
1602 rt->rt_type = res->type;
/*
 * ip_route_input_mc - build and cache an input route for a multicast packet.
 *
 * Sanity-checks the source address (multicast, bad-class and loopback
 * sources, and non-ETH_P_IP packets, are singled out) and validates it with
 * fib_validate_source() unless it lies in the zero network, in which case
 * spec_dst comes from inet_select_addr().  The new entry uses loopback_dev
 * as its dst device, has ip_rt_bug as output handler, is typed
 * RTN_MULTICAST / RTCF_MULTICAST, and is inserted with rt_intern_hash(),
 * which also stores the result in skb->dst.
 * NOTE(review): the error exits and the condition guarding the
 * ip_local_deliver / RTCF_LOCAL assignment (presumably the "our" argument)
 * are not visible in this excerpt.
 */
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606 u8 tos, struct net_device *dev, int our)
1611 struct in_device *in_dev = in_dev_get(dev);
1614 /* Primary sanity checks. */
1619 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620 skb->protocol != htons(ETH_P_IP))
1623 if (ZERONET(saddr)) {
1624 if (!LOCAL_MCAST(daddr))
1626 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627 } else if (fib_validate_source(saddr, 0, tos, 0,
1628 dev, &spec_dst, &itag) < 0)
1631 rth = dst_alloc(&ipv4_dst_ops);
1635 rth->u.dst.output= ip_rt_bug;
1637 atomic_set(&rth->u.dst.__refcnt, 1);
1638 rth->u.dst.flags= DST_HOST;
1639 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1640 rth->u.dst.flags |= DST_NOPOLICY;
1641 rth->fl.fl4_dst = daddr;
1642 rth->rt_dst = daddr;
1643 rth->fl.fl4_tos = tos;
1644 rth->fl.mark = skb->mark;
1645 rth->fl.fl4_src = saddr;
1646 rth->rt_src = saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648 rth->u.dst.tclassid = itag;
1651 rth->fl.iif = dev->ifindex;
/* The dst device of this input entry is the loopback device. */
1652 rth->u.dst.dev = &loopback_dev;
1653 dev_hold(rth->u.dst.dev);
1654 rth->idev = in_dev_get(rth->u.dst.dev);
1656 rth->rt_gateway = daddr;
1657 rth->rt_spec_dst= spec_dst;
1658 rth->rt_type = RTN_MULTICAST;
1659 rth->rt_flags = RTCF_MULTICAST;
1661 rth->u.dst.input= ip_local_deliver;
1662 rth->rt_flags |= RTCF_LOCAL;
1665 #ifdef CONFIG_IP_MROUTE
1666 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667 rth->u.dst.input = ip_mr_input;
1669 RT_CACHE_STAT_INC(in_slow_mc);
1672 hash = rt_hash(daddr, saddr, dev->ifindex);
1673 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
/*
 * ip_handle_martian_source - account for and optionally log a packet with
 * a martian source address.
 *
 * Always bumps the in_martian_src statistic.  With CONFIG_IP_ROUTE_VERBOSE,
 * and when martian logging is enabled on in_dev and net_ratelimit() allows
 * it, a warning with both addresses and the device name is printed; if the
 * device has a link-layer header and the skb's MAC header is set, the
 * header bytes are walked for a "ll header:" line.
 */
1685 static void ip_handle_martian_source(struct net_device *dev,
1686 struct in_device *in_dev,
1687 struct sk_buff *skb,
1691 RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695 * RFC1812 recommendation, if source is martian,
1696 * the only hint is MAC header.
1698 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 "%u.%u.%u.%u, on dev %s\n",
1700 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1703 const unsigned char *p = skb_mac_header(skb);
1704 printk(KERN_WARNING "ll header: ");
1705 for (i = 0; i < dev->hard_header_len; i++, p++) {
1707 if (i < (dev->hard_header_len - 1))
/*
 * __mkroute_input - construct a forwarding route entry for an input packet;
 * inserting it into the cache is left to the caller.
 *
 * Takes a working reference on the in_device of the FIB result's output
 * device, validates the source with fib_validate_source() (reporting a
 * martian source through ip_handle_martian_source()), and works out the
 * RTCF_DIRECTSRC and RTCF_DOREDIRECT flags.  The rtable is then allocated,
 * keyed on daddr/saddr/tos/mark and the input ifindex, bound to the output
 * device, given ip_forward / ip_output as handlers and completed by
 * rt_set_nexthop().  The working reference on out_dev is dropped at the end.
 * NOTE(review): the error exits and the store through "result" are not
 * visible in this excerpt.
 */
1716 static inline int __mkroute_input(struct sk_buff *skb,
1717 struct fib_result* res,
1718 struct in_device *in_dev,
1719 __be32 daddr, __be32 saddr, u32 tos,
1720 struct rtable **result)
1725 struct in_device *out_dev;
1730 /* get a working reference to the output device */
1731 out_dev = in_dev_get(FIB_RES_DEV(*res));
1732 if (out_dev == NULL) {
1733 if (net_ratelimit())
1734 printk(KERN_CRIT "Bug in ip_route_input" \
1735 "_slow(). Please, report\n");
1740 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741 in_dev->dev, &spec_dst, &itag);
1743 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1751 flags |= RTCF_DIRECTSRC;
1753 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754 (IN_DEV_SHARED_MEDIA(out_dev) ||
1755 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756 flags |= RTCF_DOREDIRECT;
1758 if (skb->protocol != htons(ETH_P_IP)) {
1759 /* Not IP (i.e. ARP). Do not create route, if it is
1760 * invalid for proxy arp. DNAT routes are always valid.
1762 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1769 rth = dst_alloc(&ipv4_dst_ops);
1775 atomic_set(&rth->u.dst.__refcnt, 1);
1776 rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778 if (res->fi->fib_nhs > 1)
1779 rth->u.dst.flags |= DST_BALANCED;
1781 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1782 rth->u.dst.flags |= DST_NOPOLICY;
1783 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1784 rth->u.dst.flags |= DST_NOXFRM;
1785 rth->fl.fl4_dst = daddr;
1786 rth->rt_dst = daddr;
1787 rth->fl.fl4_tos = tos;
1788 rth->fl.mark = skb->mark;
1789 rth->fl.fl4_src = saddr;
1790 rth->rt_src = saddr;
1791 rth->rt_gateway = daddr;
1793 rth->fl.iif = in_dev->dev->ifindex;
1794 rth->u.dst.dev = (out_dev)->dev;
1795 dev_hold(rth->u.dst.dev);
1796 rth->idev = in_dev_get(rth->u.dst.dev);
1798 rth->rt_spec_dst= spec_dst;
1800 rth->u.dst.input = ip_forward;
1801 rth->u.dst.output = ip_output;
1803 rt_set_nexthop(rth, res, itag);
1805 rth->rt_flags = flags;
1810 /* release the working reference to the output device */
1811 in_dev_put(out_dev);
/*
 * ip_mkroute_input_def - default (single entry) input route creation.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a nexthop is first chosen by
 * fib_select_multipath() when the fib_info has more than one nexthop and
 * the flow has no oif.  The entry is then built by __mkroute_input() and
 * inserted under the hash of (daddr, saddr, fl->iif); rt_intern_hash()
 * stores the cached entry in skb->dst and its result is returned.
 */
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816 struct fib_result* res,
1817 const struct flowi *fl,
1818 struct in_device *in_dev,
1819 __be32 daddr, __be32 saddr, u32 tos)
1821 struct rtable* rth = NULL;
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827 fib_select_multipath(fl, res);
1830 /* create a routing cache entry */
1831 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1835 /* put it into the cache */
1836 hash = rt_hash(daddr, saddr, fl->iif);
1837 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
/*
 * ip_mkroute_input - create the cache entry or entries for a forwarded flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is a plain call to
 * ip_mkroute_input_def().  With it, the single-path case still goes to
 * ip_mkroute_input_def(), while for multipath results one entry per nexthop
 * (fib_nhs of them) is built with __mkroute_input(), inserted with
 * rt_intern_hash() and reported to multipath_set_nhinfo(); skb->dst is set
 * from the interned result.
 * NOTE(review): how the nexthop is selected for each hop and the error
 * handling inside the loop are not visible in this excerpt.
 */
1840 static inline int ip_mkroute_input(struct sk_buff *skb,
1841 struct fib_result* res,
1842 const struct flowi *fl,
1843 struct in_device *in_dev,
1844 __be32 daddr, __be32 saddr, u32 tos)
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847 struct rtable* rth = NULL, *rtres;
1848 unsigned char hop, hopcount;
1853 hopcount = res->fi->fib_nhs;
1857 /* distinguish between multipath and singlepath */
1859 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1862 /* add all alternatives to the routing cache */
1863 for (hop = 0; hop < hopcount; hop++) {
1866 /* put reference to previous result */
1870 /* create a routing cache entry */
1871 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1876 /* put it into the cache */
1877 hash = rt_hash(daddr, saddr, fl->iif);
1878 err = rt_intern_hash(hash, rth, &rtres);
1882 /* forward hop information to multipath impl. */
1883 multipath_set_nhinfo(rth,
1884 FIB_RES_NETWORK(*res),
1885 FIB_RES_NETMASK(*res),
1889 skb->dst = &rtres->u.dst;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1892 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1898 * NOTE. We drop all packets that have local source
1899 * addresses, because every properly looped-back packet
1900 * must already have the correct destination attached by the output routine.
1902 * This approach solves two big problems:
1903 * 1. Non-simplex devices are handled properly.
1904 * 2. IP spoofing attempts are filtered with a 100% guarantee.
/*
 * ip_route_input_slow - resolve an input route when the cache search in
 * ip_route_input() found nothing.
 *
 * Filters martian source and destination addresses, looks the flow up in
 * the FIB and then takes one of these paths:
 *  - unicast results are handed to ip_mkroute_input() (forwarding must be
 *    enabled on the input device);
 *  - local and broadcast results get an entry built here, with
 *    loopback_dev as dst device, ip_local_deliver as input handler and
 *    ip_rt_bug as output handler;
 *  - when no route exists the entry is typed RTN_UNREACHABLE, uses ip_error
 *    as input handler and stores -err in dst.error.
 * Martian destinations are counted, optionally logged and answered with
 * -EHOSTUNREACH; martian sources go through ip_handle_martian_source().
 * NOTE(review): labels, gotos and cleanup between these paths are only
 * partly visible in this excerpt.
 */
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908 u8 tos, struct net_device *dev)
1910 struct fib_result res;
1911 struct in_device *in_dev = in_dev_get(dev);
1912 struct flowi fl = { .nl_u = { .ip4_u =
1916 .scope = RT_SCOPE_UNIVERSE,
1919 .iif = dev->ifindex };
1922 struct rtable * rth;
1928 /* IP on this device is disabled. */
1933 /* Check for the most weird martians, which can be not detected
1937 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938 goto martian_source;
1940 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1943 /* Accept zero addresses only to limited broadcast;
1944 * I even do not know to fix it or not. Waiting for complains :-)
1947 goto martian_source;
1949 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950 goto martian_destination;
1953 * Now we are ready to route packet.
1955 if ((err = fib_lookup(&fl, &res)) != 0) {
1956 if (!IN_DEV_FORWARD(in_dev))
1962 RT_CACHE_STAT_INC(in_slow_tot);
1964 if (res.type == RTN_BROADCAST)
1967 if (res.type == RTN_LOCAL) {
1969 result = fib_validate_source(saddr, daddr, tos,
1970 loopback_dev.ifindex,
1971 dev, &spec_dst, &itag);
1973 goto martian_source;
1975 flags |= RTCF_DIRECTSRC;
1980 if (!IN_DEV_FORWARD(in_dev))
1982 if (res.type != RTN_UNICAST)
1983 goto martian_destination;
1985 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986 if (err == -ENOBUFS)
1998 if (skb->protocol != htons(ETH_P_IP))
2002 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2007 goto martian_source;
2009 flags |= RTCF_DIRECTSRC;
2011 flags |= RTCF_BROADCAST;
2012 res.type = RTN_BROADCAST;
2013 RT_CACHE_STAT_INC(in_brd);
2016 rth = dst_alloc(&ipv4_dst_ops);
2020 rth->u.dst.output= ip_rt_bug;
2022 atomic_set(&rth->u.dst.__refcnt, 1);
2023 rth->u.dst.flags= DST_HOST;
2024 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2025 rth->u.dst.flags |= DST_NOPOLICY;
2026 rth->fl.fl4_dst = daddr;
2027 rth->rt_dst = daddr;
2028 rth->fl.fl4_tos = tos;
2029 rth->fl.mark = skb->mark;
2030 rth->fl.fl4_src = saddr;
2031 rth->rt_src = saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 rth->u.dst.tclassid = itag;
2036 rth->fl.iif = dev->ifindex;
2037 rth->u.dst.dev = &loopback_dev;
2038 dev_hold(rth->u.dst.dev);
2039 rth->idev = in_dev_get(rth->u.dst.dev);
2040 rth->rt_gateway = daddr;
2041 rth->rt_spec_dst= spec_dst;
2042 rth->u.dst.input= ip_local_deliver;
2043 rth->rt_flags = flags|RTCF_LOCAL;
2044 if (res.type == RTN_UNREACHABLE) {
2045 rth->u.dst.input= ip_error;
2046 rth->u.dst.error= -err;
2047 rth->rt_flags &= ~RTCF_LOCAL;
2049 rth->rt_type = res.type;
2050 hash = rt_hash(daddr, saddr, fl.iif);
2051 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2055 RT_CACHE_STAT_INC(in_no_route);
2056 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057 res.type = RTN_UNREACHABLE;
2061 * Do not cache martian addresses: they should be logged (RFC1812)
2063 martian_destination:
2064 RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068 "%u.%u.%u.%u, dev %s\n",
2069 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2073 err = -EHOSTUNREACH;
2085 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/*
 * ip_route_input - attach an input route to skb, using the cache if possible.
 *
 * Masks tos with IPTOS_RT_MASK and walks the hash chain for
 * (daddr, saddr, input ifindex).  An entry matches when destination, source,
 * input interface, mark and tos all agree; on a hit its lastuse is
 * refreshed, a reference is taken, in_hit is counted and the entry is
 * stored in skb->dst.  Each non-matching entry counts as in_hlist_search.
 * On a miss, multicast destinations are checked against the device with
 * ip_check_mc() (and, with CONFIG_IP_MROUTE, multicast forwarding) and may
 * be passed to ip_route_input_mc(); everything else is resolved by
 * ip_route_input_slow().
 * NOTE(review): the RCU lock/unlock calls and the return values on the
 * multicast path are not visible in this excerpt.
 */
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090 u8 tos, struct net_device *dev)
2092 struct rtable * rth;
2094 int iif = dev->ifindex;
2096 tos &= IPTOS_RT_MASK;
2097 hash = rt_hash(daddr, saddr, iif);
2100 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101 rth = rcu_dereference(rth->u.dst.rt_next)) {
2102 if (rth->fl.fl4_dst == daddr &&
2103 rth->fl.fl4_src == saddr &&
2104 rth->fl.iif == iif &&
2106 rth->fl.mark == skb->mark &&
2107 rth->fl.fl4_tos == tos) {
2108 rth->u.dst.lastuse = jiffies;
2109 dst_hold(&rth->u.dst);
2111 RT_CACHE_STAT_INC(in_hit);
2113 skb->dst = (struct dst_entry*)rth;
2116 RT_CACHE_STAT_INC(in_hlist_search);
2120 /* Multicast recognition logic is moved from route cache to here.
2121 The problem was that too many Ethernet cards have broken/missing
2122 hardware multicast filters :-( As result the host on multicasting
2123 network acquires a lot of useless route cache entries, sort of
2124 SDR messages from all the world. Now we try to get rid of them.
2125 Really, provided software IP multicast filter is organized
2126 reasonably (at least, hashed), it does not result in a slowdown
2127 comparing with route cache reject entries.
2128 Note, that multicast routers are not affected, because
2129 route cache entry is created eventually.
2131 if (MULTICAST(daddr)) {
2132 struct in_device *in_dev;
2135 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136 int our = ip_check_mc(in_dev, daddr, saddr,
2137 ip_hdr(skb)->protocol);
2139 #ifdef CONFIG_IP_MROUTE
2140 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2144 return ip_route_input_mc(skb, daddr, saddr,
2151 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
/*
 * __mkroute_output - construct an output route entry bound to dev_out;
 * inserting it into the cache is left to the caller.
 *
 * Checks for a loopback source on a non-loopback device, classifies the
 * destination into res->type (limited broadcast, multicast) and singles out
 * bad-class and zero-network destinations.  A working reference on the
 * in_device of dev_out is taken while the entry is built.  The flow key of
 * the new rtable is copied from oldflp (dst, src, oif, mark) with the tos
 * from RT_FL_TOS(oldflp); rt_dst, rt_src, gateway and spec_dst come from fl.
 * ip_output is the default output handler; local entries get
 * ip_local_deliver as input handler, and broadcast/multicast entries may
 * use ip_mc_output and (CONFIG_IP_MROUTE) ip_mr_input.  rt_set_nexthop()
 * completes the entry with an itag of 0.
 * NOTE(review): the error exits and the store through "result" are not
 * visible in this excerpt.
 */
2154 static inline int __mkroute_output(struct rtable **result,
2155 struct fib_result* res,
2156 const struct flowi *fl,
2157 const struct flowi *oldflp,
2158 struct net_device *dev_out,
2162 struct in_device *in_dev;
2163 u32 tos = RT_FL_TOS(oldflp);
2166 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2169 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170 res->type = RTN_BROADCAST;
2171 else if (MULTICAST(fl->fl4_dst))
2172 res->type = RTN_MULTICAST;
2173 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2176 if (dev_out->flags & IFF_LOOPBACK)
2177 flags |= RTCF_LOCAL;
2179 /* get work reference to inet device */
2180 in_dev = in_dev_get(dev_out);
2184 if (res->type == RTN_BROADCAST) {
2185 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187 fib_info_put(res->fi);
2190 } else if (res->type == RTN_MULTICAST) {
2191 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194 flags &= ~RTCF_LOCAL;
2195 /* If multicast route do not exist use
2196 default one, but do not gateway in this case.
2199 if (res->fi && res->prefixlen < 4) {
2200 fib_info_put(res->fi);
2206 rth = dst_alloc(&ipv4_dst_ops);
2212 atomic_set(&rth->u.dst.__refcnt, 1);
2213 rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217 if (res->fi->fib_nhs > 1)
2218 rth->u.dst.flags |= DST_BALANCED;
2221 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2222 rth->u.dst.flags |= DST_NOXFRM;
2223 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2224 rth->u.dst.flags |= DST_NOPOLICY;
2226 rth->fl.fl4_dst = oldflp->fl4_dst;
2227 rth->fl.fl4_tos = tos;
2228 rth->fl.fl4_src = oldflp->fl4_src;
2229 rth->fl.oif = oldflp->oif;
2230 rth->fl.mark = oldflp->mark;
2231 rth->rt_dst = fl->fl4_dst;
2232 rth->rt_src = fl->fl4_src;
2233 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2234 /* get references to the devices that are to be hold by the routing
2236 rth->u.dst.dev = dev_out;
2238 rth->idev = in_dev_get(dev_out);
2239 rth->rt_gateway = fl->fl4_dst;
2240 rth->rt_spec_dst= fl->fl4_src;
2242 rth->u.dst.output=ip_output;
2244 RT_CACHE_STAT_INC(out_slow_tot);
2246 if (flags & RTCF_LOCAL) {
2247 rth->u.dst.input = ip_local_deliver;
2248 rth->rt_spec_dst = fl->fl4_dst;
2250 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 rth->rt_spec_dst = fl->fl4_src;
2252 if (flags & RTCF_LOCAL &&
2253 !(dev_out->flags & IFF_LOOPBACK)) {
2254 rth->u.dst.output = ip_mc_output;
2255 RT_CACHE_STAT_INC(out_slow_mc);
2257 #ifdef CONFIG_IP_MROUTE
2258 if (res->type == RTN_MULTICAST) {
2259 if (IN_DEV_MFORWARD(in_dev) &&
2260 !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 rth->u.dst.input = ip_mr_input;
2262 rth->u.dst.output = ip_mc_output;
2268 rt_set_nexthop(rth, res, 0);
2270 rth->rt_flags = flags;
2274 /* release work reference to inet device */
/*
 * ip_mkroute_output_def - default (single entry) output route creation.
 *
 * Builds the entry with __mkroute_output() and inserts it into the cache
 * under the hash of the caller's original flow (oldflp dst, src and oif);
 * rt_intern_hash() hands the cached entry back through rp.
 * NOTE(review): the check of err between the two calls and the final
 * return are not visible in this excerpt.
 */
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281 struct fib_result* res,
2282 const struct flowi *fl,
2283 const struct flowi *oldflp,
2284 struct net_device *dev_out,
2287 struct rtable *rth = NULL;
2288 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2291 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292 err = rt_intern_hash(hash, rth, rp);
/*
 * ip_mkroute_output - create the cache entry or entries for an output flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is a plain call to
 * ip_mkroute_output_def().  With it, results whose fib_info has more than
 * one nexthop get one entry per nexthop: for each hop a work reference is
 * held on the nexthop device while __mkroute_output() builds the entry,
 * rt_intern_hash() caches it and multipath_set_nhinfo() is told about it;
 * other results go to ip_mkroute_output_def().
 * NOTE(review): per-hop nexthop selection and error handling inside the
 * loop are not visible in this excerpt.
 */
2298 static inline int ip_mkroute_output(struct rtable** rp,
2299 struct fib_result* res,
2300 const struct flowi *fl,
2301 const struct flowi *oldflp,
2302 struct net_device *dev_out,
2305 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309 struct rtable *rth = NULL;
2311 if (res->fi && res->fi->fib_nhs > 1) {
2312 unsigned char hopcount = res->fi->fib_nhs;
2314 for (hop = 0; hop < hopcount; hop++) {
2315 struct net_device *dev2nexthop;
2319 /* hold a work reference to the output device */
2320 dev2nexthop = FIB_RES_DEV(*res);
2321 dev_hold(dev2nexthop);
2323 /* put reference to previous result */
2327 err = __mkroute_output(&rth, res, fl, oldflp,
2328 dev2nexthop, flags);
2333 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2335 err = rt_intern_hash(hash, rth, rp);
2337 /* forward hop information to multipath impl. */
2338 multipath_set_nhinfo(rth,
2339 FIB_RES_NETWORK(*res),
2340 FIB_RES_NETMASK(*res),
2344 /* release work reference to output device */
2345 dev_put(dev2nexthop);
2352 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2355 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2356 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2361 * Major route resolver routine.
/*
 * ip_route_output_slow - resolve an output route when the cache search in
 * __ip_route_output_key() found nothing.
 *
 * Builds a working lookup key "fl" from oldflp (tos masked with
 * IPTOS_RT_MASK, iif set to the loopback ifindex) and then, in order:
 *  - if a source address is given, checks it (multicast, bad-class and
 *    zero-network sources are singled out) and looks up its device with
 *    ip_dev_find();
 *  - if an output interface is given, looks it up with dev_get_by_index()
 *    and may pick a source with inet_select_addr();
 *  - with no destination, the flow is pointed at the loopback device as
 *    RTN_LOCAL;
 *  - otherwise consults the FIB; on failure with an oif the destination is
 *    assumed to be on-link (RTN_UNICAST), local results are redirected to
 *    the loopback device, and multipath/default selection may adjust the
 *    result.
 * The entry is finally created through ip_mkroute_output().
 * NOTE(review): error codes, reference drops and several branch conditions
 * are not visible in this excerpt.
 */
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366 u32 tos = RT_FL_TOS(oldflp);
2367 struct flowi fl = { .nl_u = { .ip4_u =
2368 { .daddr = oldflp->fl4_dst,
2369 .saddr = oldflp->fl4_src,
2370 .tos = tos & IPTOS_RT_MASK,
2371 .scope = ((tos & RTO_ONLINK) ?
2375 .mark = oldflp->mark,
2376 .iif = loopback_dev.ifindex,
2377 .oif = oldflp->oif };
2378 struct fib_result res;
2380 struct net_device *dev_out = NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2390 if (oldflp->fl4_src) {
2392 if (MULTICAST(oldflp->fl4_src) ||
2393 BADCLASS(oldflp->fl4_src) ||
2394 ZERONET(oldflp->fl4_src))
2397 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 dev_out = ip_dev_find(oldflp->fl4_src);
2399 if (dev_out == NULL)
2402 /* I removed check for oif == dev_out->oif here.
2403 It was wrong for two reasons:
2404 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405 assigned to multiple interfaces.
2406 2. Moreover, we are allowed to send packets with saddr
2407 of another iface. --ANK
2410 if (oldflp->oif == 0
2411 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412 /* Special hack: user can direct multicasts
2413 and limited broadcast via necessary interface
2414 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415 This hack is not just for fun, it allows
2416 vic,vat and friends to work.
2417 They bind socket to loopback, set ttl to zero
2418 and expect that it will work.
2419 From the viewpoint of routing cache they are broken,
2420 because we are not allowed to build multicast path
2421 with loopback source addr (look, routing cache
2422 cannot know, that ttl is zero, so that packet
2423 will not leave this host and route is valid).
2424 Luckily, this hack is good workaround.
2427 fl.oif = dev_out->ifindex;
2437 dev_out = dev_get_by_index(oldflp->oif);
2439 if (dev_out == NULL)
2442 /* RACE: Check return value of inet_select_addr instead. */
2443 if (__in_dev_get_rtnl(dev_out) == NULL) {
2445 goto out; /* Wrong error code */
2448 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2450 fl.fl4_src = inet_select_addr(dev_out, 0,
2455 if (MULTICAST(oldflp->fl4_dst))
2456 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 else if (!oldflp->fl4_dst)
2459 fl.fl4_src = inet_select_addr(dev_out, 0,
2465 fl.fl4_dst = fl.fl4_src;
2467 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2470 dev_out = &loopback_dev;
2472 fl.oif = loopback_dev.ifindex;
2473 res.type = RTN_LOCAL;
2474 flags |= RTCF_LOCAL;
2478 if (fib_lookup(&fl, &res)) {
2481 /* Apparently, routing tables are wrong. Assume,
2482 that the destination is on link.
2485 Because we are allowed to send to iface
2486 even if it has NO routes and NO assigned
2487 addresses. When oif is specified, routing
2488 tables are looked up with only one purpose:
2489 to catch if destination is gatewayed, rather than
2490 direct. Moreover, if MSG_DONTROUTE is set,
2491 we send packet, ignoring both routing tables
2492 and ifaddr state. --ANK
2495 We could make it even if oif is unknown,
2496 likely IPv6, but we do not.
2499 if (fl.fl4_src == 0)
2500 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 res.type = RTN_UNICAST;
2512 if (res.type == RTN_LOCAL) {
2514 fl.fl4_src = fl.fl4_dst;
2517 dev_out = &loopback_dev;
2519 fl.oif = dev_out->ifindex;
2521 fib_info_put(res.fi);
2523 flags |= RTCF_LOCAL;
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529 fib_select_multipath(&fl, &res);
2532 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533 fib_select_default(&fl, &res);
2536 fl.fl4_src = FIB_RES_PREFSRC(res);
2540 dev_out = FIB_RES_DEV(res);
2542 fl.oif = dev_out->ifindex;
2546 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
/*
 * __ip_route_output_key - look up an output route, trying the cache first.
 *
 * Walks the hash chain for (dst, src, oif) and accepts an entry whose flow
 * matches in destination, source, oif and mark, and whose tos agrees in the
 * IPTOS_RT_MASK | RTO_ONLINK bits.  On a hit a reference is taken (on the
 * entry chosen by multipath_select_route() when that call succeeds,
 * otherwise on the matched entry after refreshing lastuse), out_hit is
 * counted and the RCU-bh read section is left.  Non-matching entries count
 * as out_hlist_search.  On a miss the request is resolved by
 * ip_route_output_slow().  Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the matching rcu_read_lock_bh() and the return statements
 * on the hit paths are not visible in this excerpt.
 */
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2561 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2564 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565 rth = rcu_dereference(rth->u.dst.rt_next)) {
2566 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.oif == flp->oif &&
2570 rth->fl.mark == flp->mark &&
2571 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572 (IPTOS_RT_MASK | RTO_ONLINK))) {
2574 /* check for multipath routes and choose one if
2577 if (multipath_select_route(flp, rth, rp)) {
2578 dst_hold(&(*rp)->u.dst);
2579 RT_CACHE_STAT_INC(out_hit);
2580 rcu_read_unlock_bh();
2584 rth->u.dst.lastuse = jiffies;
2585 dst_hold(&rth->u.dst);
2587 RT_CACHE_STAT_INC(out_hit);
2588 rcu_read_unlock_bh();
2592 RT_CACHE_STAT_INC(out_hlist_search);
2594 rcu_read_unlock_bh();
2596 return ip_route_output_slow(rp, flp);
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
/*
 * ipv4_rt_blackhole_update_pmtu - installed as the .update_pmtu callback of
 * ipv4_dst_blackhole_ops.
 * NOTE(review): the body is not visible in this excerpt.
 */
2601 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
/*
 * dst_ops used for the entries allocated by ipv4_dst_blackhole().
 * It reuses ipv4_dst_destroy and ipv4_dst_check, uses
 * ipv4_rt_blackhole_update_pmtu for PMTU updates, and sizes entries as a
 * struct rtable.
 */
2605 static struct dst_ops ipv4_dst_blackhole_ops = {
2607 .protocol = __constant_htons(ETH_P_IP),
2608 .destroy = ipv4_dst_destroy,
2609 .check = ipv4_dst_check,
2610 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2611 .entry_size = sizeof(struct rtable),
/*
 * ipv4_blackhole_output - installed by ipv4_dst_blackhole() as both the
 * input and the output handler of a blackhole entry.
 * NOTE(review): the body is not visible in this excerpt.
 */
2615 static int ipv4_blackhole_output(struct sk_buff *skb)
/*
 * ipv4_dst_blackhole - replace *rp with a blackhole copy of itself.
 *
 * Allocates a new rtable from ipv4_dst_blackhole_ops with a reference count
 * of 1 and ipv4_blackhole_output as both input and output handler.  Metrics,
 * device, idev, flags, type, addresses, iif, gateway, spec_dst and peer are
 * copied from the original route, taking references on idev and peer.  The
 * original route is released with dst_release().
 * Returns 0 when the allocation succeeded and -ENOMEM otherwise.
 * NOTE(review): the NULL checks around the copy and the assignment of the
 * new entry to *rp are not visible in this excerpt.
 */
2621 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2623 struct rtable *ort = *rp;
2624 struct rtable *rt = (struct rtable *)
2625 dst_alloc(&ipv4_dst_blackhole_ops);
2628 struct dst_entry *new = &rt->u.dst;
2630 atomic_set(&new->__refcnt, 1);
2632 new->input = ipv4_blackhole_output;
2633 new->output = ipv4_blackhole_output;
2634 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2636 new->dev = ort->u.dst.dev;
2642 rt->idev = ort->idev;
2644 in_dev_hold(rt->idev);
2645 rt->rt_flags = ort->rt_flags;
2646 rt->rt_type = ort->rt_type;
2647 rt->rt_dst = ort->rt_dst;
2648 rt->rt_src = ort->rt_src;
2649 rt->rt_iif = ort->rt_iif;
2650 rt->rt_gateway = ort->rt_gateway;
2651 rt->rt_spec_dst = ort->rt_spec_dst;
2652 rt->peer = ort->peer;
2654 atomic_inc(&rt->peer->refcnt);
2659 dst_release(&(*rp)->u.dst);
2661 return (rt ? 0 : -ENOMEM);
/*
 * ip_route_output_flow - output route lookup followed by an xfrm lookup.
 *
 * Resolves the route with __ip_route_output_key(), copies the route's
 * source and destination addresses into flp, and passes the result through
 * __xfrm_lookup().  When that returns -EREMOTE the route is replaced by a
 * blackhole entry via ipv4_dst_blackhole().  Exported with
 * EXPORT_SYMBOL_GPL.
 * NOTE(review): the conditions guarding the address copy and the xfrm step
 * are not visible in this excerpt.
 */
2664 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2668 if ((err = __ip_route_output_key(rp, flp)) != 0)
2673 flp->fl4_src = (*rp)->rt_src;
2675 flp->fl4_dst = (*rp)->rt_dst;
2676 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2677 if (err == -EREMOTE)
2678 err = ipv4_dst_blackhole(rp, flp, sk);
2686 EXPORT_SYMBOL_GPL(ip_route_output_flow);
/*
 * ip_route_output_key - convenience wrapper around ip_route_output_flow()
 * for callers without a socket: passes sk = NULL and flags = 0.
 */
2688 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2690 return ip_route_output_flow(rp, flp, NULL, 0);
/*
 * rt_fill_info - encode the route attached to skb->dst as a netlink message.
 *
 * Writes an rtmsg header (AF_INET, destination length 32, table
 * RT_TABLE_MAIN, scope universe, protocol unspecified, flags taken from the
 * bits of rt_flags above the low 16 plus RTM_F_CLONED, and RTM_F_NOTIFY when
 * RTCF_NOTIFY is set) followed by attributes: table, destination, source
 * (only when the flow has one), output interface, class id and multipath
 * algorithm where configured, preferred source, gateway (only when it
 * differs from rt_dst), metrics, input interface and cache info (peer id,
 * TCP timestamp data, expiry, error).  With CONFIG_IP_MROUTE, non-local
 * multicast destinations may be answered through ipmr_get_route().
 * Returns the result of nlmsg_end() on success; the failure path cancels
 * the message with nlmsg_cancel().
 * NOTE(review): the conditions selecting RTA_IIF and RTA_PREFSRC and the
 * value returned on failure are not visible in this excerpt.
 */
2693 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2694 int nowait, unsigned int flags)
2696 struct rtable *rt = (struct rtable*)skb->dst;
2698 struct nlmsghdr *nlh;
2700 u32 id = 0, ts = 0, tsage = 0, error;
2702 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2706 r = nlmsg_data(nlh);
2707 r->rtm_family = AF_INET;
2708 r->rtm_dst_len = 32;
2710 r->rtm_tos = rt->fl.fl4_tos;
2711 r->rtm_table = RT_TABLE_MAIN;
2712 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2713 r->rtm_type = rt->rt_type;
2714 r->rtm_scope = RT_SCOPE_UNIVERSE;
2715 r->rtm_protocol = RTPROT_UNSPEC;
2716 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2717 if (rt->rt_flags & RTCF_NOTIFY)
2718 r->rtm_flags |= RTM_F_NOTIFY;
2720 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2722 if (rt->fl.fl4_src) {
2723 r->rtm_src_len = 32;
2724 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2727 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2728 #ifdef CONFIG_NET_CLS_ROUTE
2729 if (rt->u.dst.tclassid)
2730 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2732 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2733 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2734 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2737 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2738 else if (rt->rt_src != rt->fl.fl4_src)
2739 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2741 if (rt->rt_dst != rt->rt_gateway)
2742 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2744 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2745 goto nla_put_failure;
2747 error = rt->u.dst.error;
/* Expiry is reported relative to the current jiffies, or 0 when unset. */
2748 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2750 id = rt->peer->ip_id_count;
2751 if (rt->peer->tcp_ts_stamp) {
2752 ts = rt->peer->tcp_ts;
2753 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2758 #ifdef CONFIG_IP_MROUTE
2759 __be32 dst = rt->rt_dst;
2761 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2762 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2763 int err = ipmr_get_route(skb, r, nowait);
2768 goto nla_put_failure;
2770 if (err == -EMSGSIZE)
2771 goto nla_put_failure;
2777 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2780 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2781 expires, error) < 0)
2782 goto nla_put_failure;
2784 return nlmsg_end(skb, nlh);
2787 nlmsg_cancel(skb, nlh);
/*
 * inet_rtm_getroute - netlink handler answering a route query.
 *
 * Parses the request attributes against rtm_ipv4_policy, allocates a
 * scratch skb with room reserved for dummy headers, and reads source,
 * destination and input interface from RTA_SRC, RTA_DST and RTA_IIF
 * (0 when absent).  The route is resolved either through ip_route_input()
 * on the device found for the input interface index, or through
 * ip_route_output_key() with the requested tos and RTA_OIF.  The route is
 * attached to skb->dst, RTCF_NOTIFY is set on it when the request carries
 * RTM_F_NOTIFY, and the answer is encoded by rt_fill_info() as
 * RTM_NEWROUTE and unicast back to the requesting pid.
 * NOTE(review): the branch conditions and the error/cleanup paths are not
 * visible in this excerpt.
 */
2791 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2794 struct nlattr *tb[RTA_MAX+1];
2795 struct rtable *rt = NULL;
2800 struct sk_buff *skb;
2802 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2806 rtm = nlmsg_data(nlh);
2808 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2814 /* Reserve room for dummy headers, this skb can pass
2815 through good chunk of routing engine.
2817 skb_reset_mac_header(skb);
2818 skb_reset_network_header(skb);
2820 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2821 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2822 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2824 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2825 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2826 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2829 struct net_device *dev;
2831 dev = __dev_get_by_index(iif);
2837 skb->protocol = htons(ETH_P_IP);
2840 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2843 rt = (struct rtable*) skb->dst;
/* A route that carries an error is reported as that error. */
2844 if (err == 0 && rt->u.dst.error)
2845 err = -rt->u.dst.error;
2852 .tos = rtm->rtm_tos,
2855 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2857 err = ip_route_output_key(&rt, &fl);
2863 skb->dst = &rt->u.dst;
2864 if (rtm->rtm_flags & RTM_F_NOTIFY)
2865 rt->rt_flags |= RTCF_NOTIFY;
2867 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2868 RTM_NEWROUTE, 0, 0);
2872 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * ip_rt_dump - netlink dump callback that walks the route cache.
 *
 * Iterates over the hash buckets 0..rt_hash_mask, skipping buckets below
 * the saved start bucket, and walks each chain under rcu_dereference().
 * Every entry is temporarily attached to skb->dst with a reference
 * (dst_clone) and encoded by rt_fill_info() as an RTM_NEWROUTE message with
 * NLM_F_MULTI.  The reference is dropped again after each entry; when
 * rt_fill_info() returns <= 0 the reference is dropped and the RCU-bh read
 * section is left.
 * NOTE(review): how the resume position is saved back into cb->args and
 * the return value are not visible in this excerpt.
 */
2881 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2888 s_idx = idx = cb->args[1];
2889 for (h = 0; h <= rt_hash_mask; h++) {
2890 if (h < s_h) continue;
2894 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2895 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2898 skb->dst = dst_clone(&rt->u.dst);
2899 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2900 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2901 1, NLM_F_MULTI) <= 0) {
2902 dst_release(xchg(&skb->dst, NULL));
2903 rcu_read_unlock_bh();
2906 dst_release(xchg(&skb->dst, NULL));
2908 rcu_read_unlock_bh();
/*
 * ip_rt_multicast_event - NOTE(review): only the signature is visible in
 * this excerpt; presumably reacts to a multicast change on in_dev — confirm
 * against the full source.
 */
2917 void ip_rt_multicast_event(struct in_device *in_dev)
2922 #ifdef CONFIG_SYSCTL
/* Backing storage for the "flush" sysctl entry (ipv4_route_table .data). */
2923 static int flush_delay;
/*
 * ipv4_sysctl_rtcache_flush - proc handler of the "flush" sysctl entry.
 *
 * Lets proc_dointvec() transfer the integer to or from flush_delay and then
 * flushes the route cache with that delay via rt_cache_flush().
 * NOTE(review): the condition on "write" and the return values are not
 * visible in this excerpt; the proc_dointvec() result is not used in the
 * lines shown.
 */
2925 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2926 struct file *filp, void __user *buffer,
2927 size_t *lenp, loff_t *ppos)
2930 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2931 rt_cache_flush(flush_delay);
/*
 * ipv4_sysctl_rtcache_flush_strategy - sysctl strategy routine of the
 * "flush" entry.
 *
 * Checks that the new value is exactly sizeof(int) bytes, reads it from
 * user space with get_user() and flushes the route cache with that delay.
 * NOTE(review): the error codes returned when a check fails are not
 * visible in this excerpt.
 */
2938 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2941 void __user *oldval,
2942 size_t __user *oldlenp,
2943 void __user *newval,
2947 if (newlen != sizeof(int))
2949 if (get_user(delay, (int __user *)newval))
2951 rt_cache_flush(delay);
/*
 * Sysctl table for the IPv4 routing cache tunables.
 *
 * Every entry ties a NET_IPV4_ROUTE_* ctl_name and a procname to an
 * int-sized variable through .data/.maxlen and names the handler(s) that
 * read and write it.  Entries using proc_dointvec_jiffies/sysctl_jiffies
 * or proc_dointvec_ms_jiffies/sysctl_ms_jiffies presumably convert
 * between user-visible time units and jiffies -- confirm against the
 * handler definitions.
 */
2955 ctl_table ipv4_route_table[] = {
/* "flush": uses the dedicated handlers defined in this file. */
2957 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2958 .procname = "flush",
2959 .data = &flush_delay,
2960 .maxlen = sizeof(int),
2962 .proc_handler = &ipv4_sysctl_rtcache_flush,
2963 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2966 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2967 .procname = "min_delay",
2968 .data = &ip_rt_min_delay,
2969 .maxlen = sizeof(int),
2971 .proc_handler = &proc_dointvec_jiffies,
2972 .strategy = &sysctl_jiffies,
2975 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2976 .procname = "max_delay",
2977 .data = &ip_rt_max_delay,
2978 .maxlen = sizeof(int),
2980 .proc_handler = &proc_dointvec_jiffies,
2981 .strategy = &sysctl_jiffies,
/* gc_thresh lives inside ipv4_dst_ops rather than in a standalone variable. */
2984 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2985 .procname = "gc_thresh",
2986 .data = &ipv4_dst_ops.gc_thresh,
2987 .maxlen = sizeof(int),
2989 .proc_handler = &proc_dointvec,
2992 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2993 .procname = "max_size",
2994 .data = &ip_rt_max_size,
2995 .maxlen = sizeof(int),
2997 .proc_handler = &proc_dointvec,
3000 /* Deprecated. Use gc_min_interval_ms */
3002 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3003 .procname = "gc_min_interval",
3004 .data = &ip_rt_gc_min_interval,
3005 .maxlen = sizeof(int),
3007 .proc_handler = &proc_dointvec_jiffies,
3008 .strategy = &sysctl_jiffies,
/*
 * Same variable as "gc_min_interval" above (ip_rt_gc_min_interval),
 * exposed through the *_ms_jiffies handlers instead.
 */
3011 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3012 .procname = "gc_min_interval_ms",
3013 .data = &ip_rt_gc_min_interval,
3014 .maxlen = sizeof(int),
3016 .proc_handler = &proc_dointvec_ms_jiffies,
3017 .strategy = &sysctl_ms_jiffies,
3020 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3021 .procname = "gc_timeout",
3022 .data = &ip_rt_gc_timeout,
3023 .maxlen = sizeof(int),
3025 .proc_handler = &proc_dointvec_jiffies,
3026 .strategy = &sysctl_jiffies,
3029 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3030 .procname = "gc_interval",
3031 .data = &ip_rt_gc_interval,
3032 .maxlen = sizeof(int),
3034 .proc_handler = &proc_dointvec_jiffies,
3035 .strategy = &sysctl_jiffies,
3038 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3039 .procname = "redirect_load",
3040 .data = &ip_rt_redirect_load,
3041 .maxlen = sizeof(int),
3043 .proc_handler = &proc_dointvec,
3046 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3047 .procname = "redirect_number",
3048 .data = &ip_rt_redirect_number,
3049 .maxlen = sizeof(int),
3051 .proc_handler = &proc_dointvec,
3054 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3055 .procname = "redirect_silence",
3056 .data = &ip_rt_redirect_silence,
3057 .maxlen = sizeof(int),
3059 .proc_handler = &proc_dointvec,
3062 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3063 .procname = "error_cost",
3064 .data = &ip_rt_error_cost,
3065 .maxlen = sizeof(int),
3067 .proc_handler = &proc_dointvec,
3070 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3071 .procname = "error_burst",
3072 .data = &ip_rt_error_burst,
3073 .maxlen = sizeof(int),
3075 .proc_handler = &proc_dointvec,
3078 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3079 .procname = "gc_elasticity",
3080 .data = &ip_rt_gc_elasticity,
3081 .maxlen = sizeof(int),
3083 .proc_handler = &proc_dointvec,
3086 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3087 .procname = "mtu_expires",
3088 .data = &ip_rt_mtu_expires,
3089 .maxlen = sizeof(int),
3091 .proc_handler = &proc_dointvec_jiffies,
3092 .strategy = &sysctl_jiffies,
3095 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3096 .procname = "min_pmtu",
3097 .data = &ip_rt_min_pmtu,
3098 .maxlen = sizeof(int),
3100 .proc_handler = &proc_dointvec,
3103 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3104 .procname = "min_adv_mss",
3105 .data = &ip_rt_min_advmss,
3106 .maxlen = sizeof(int),
3108 .proc_handler = &proc_dointvec,
3111 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3112 .procname = "secret_interval",
3113 .data = &ip_rt_secret_interval,
3114 .maxlen = sizeof(int),
3116 .proc_handler = &proc_dointvec_jiffies,
3117 .strategy = &sysctl_jiffies,
3123 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area; allocated and zeroed in ip_rt_init(). */
3124 struct ip_rt_acct *ip_rt_acct;
3126 /* This code sucks. But you should have seen it before! --RR */
3128 /* IP route accounting ptr for this logical cpu number. */
/*
 * Each cpu index i owns a run of 256 struct ip_rt_acct entries starting
 * at ip_rt_acct + i * 256.
 * NOTE(review): `i` is not parenthesized in the expansion, so an argument
 * such as `a + b` would expand incorrectly; the visible uses pass 0 or a
 * plain loop variable.
 */
3129 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3131 #ifdef CONFIG_PROC_FS
/*
 * Read routine registered for the "rt_acct" proc entry in ip_rt_init().
 *
 * Serves a byte window (offset, length) of a table that is
 * sizeof(struct ip_rt_acct) * 256 bytes long.  The window is copied from
 * the cpu 0 area into buffer, and the areas of the possible cpus are then
 * walked to add their counters in as u32 words.
 *
 * NOTE(review): the return statements, the handling of *start / *eof and
 * the accumulation statement inside the per-cpu loop are not visible in
 * this view -- confirm against the full function.
 */
3132 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3133 int length, int *eof, void *data)
/* The data is handled as u32 words, so both values must be multiples of 4. */
3137 if ((offset & 3) || (length & 3))
/* Window starts at or beyond the end of the table. */
3140 if (offset >= sizeof(struct ip_rt_acct) * 256) {
/* Window reaches the end of the table: trim it to what is left. */
3145 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3146 length = sizeof(struct ip_rt_acct) * 256 - offset;
/* From here on offset counts u32 words rather than bytes. */
3150 offset /= sizeof(u32);
3153 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3154 u32 *dst = (u32 *) buffer;
3156 /* Copy first cpu. */
3158 memcpy(dst, src, length);
3160 /* Add the other cpus in, one int at a time */
3161 for_each_possible_cpu(i) {
/* Same word offset, but inside cpu i's 256-entry area. */
3164 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
/* length is in bytes; length/4 is the number of u32 words in the window. */
3166 for (j = 0; j < length/4; j++)
3172 #endif /* CONFIG_PROC_FS */
3173 #endif /* CONFIG_NET_CLS_ROUTE */
/*
 * Value parsed from the "rhash_entries=" boot parameter.  Being a static
 * object it is 0 unless set_rhash_entries() stores something else.
 */
3175 static __initdata unsigned long rhash_entries;
/*
 * Boot parameter parser registered below through __setup(): converts the
 * text after "rhash_entries=" with simple_strtoul() (base argument 0) and
 * stores the result in rhash_entries.
 * NOTE(review): argument checks and the return value are not visible in
 * this view.
 */
3176 static int __init set_rhash_entries(char *str)
3180 rhash_entries = simple_strtoul(str, &str, 0);
3183 __setup("rhash_entries=", set_rhash_entries);
/*
 * Boot-time initialisation of the IPv4 routing cache.
 *
 * In the order visible here it:
 *  - seeds rt_hash_rnd from a mix of num_physpages and jiffies;
 *  - with CONFIG_NET_CLS_ROUTE, allocates and zeroes the ip_rt_acct area,
 *    sized for 256 entries per cpu times NR_CPUS, panicking on failure;
 *  - creates the "ip_dst_cache" slab cache for struct rtable and shares
 *    it with ipv4_dst_blackhole_ops;
 *  - allocates and zeroes rt_hash_table and initialises its locks;
 *  - derives gc_thresh and ip_rt_max_size from the bucket count;
 *  - sets up the flush, periodic and secret timers and arms the latter
 *    two with a randomised first expiry;
 *  - creates the proc entries and registers the RTM_GETROUTE handler.
 *
 * NOTE(review): several statements and the return value are not visible
 * in this view.
 */
3185 int __init ip_rt_init(void)
3189 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3190 (jiffies ^ (jiffies >> 7)));
/*
 * The loop condition below advances `order` until PAGE_SIZE << order can
 * hold 256 struct ip_rt_acct entries for each of NR_CPUS cpus.
 */
3192 #ifdef CONFIG_NET_CLS_ROUTE
3196 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3198 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3200 panic("IP: failed to allocate ip_rt_acct\n");
3201 memset(ip_rt_acct, 0, PAGE_SIZE << order);
/* SLAB_PANIC is passed, and no NULL check follows in the visible code. */
3205 ipv4_dst_ops.kmem_cachep =
3206 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3207 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
/* The blackhole dst ops allocate from the same cache. */
3209 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3211 rt_hash_table = (struct rt_hash_bucket *)
3212 alloc_large_system_hash("IP route cache",
3213 sizeof(struct rt_hash_bucket),
3215 (num_physpages >= 128 * 1024) ?
/* rt_hash_mask + 1 is used as the number of buckets from here on. */
3221 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3222 rt_hash_lock_init();
/* gc_thresh equals the bucket count; ip_rt_max_size is 16 times that. */
3224 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3225 ip_rt_max_size = (rt_hash_mask + 1) * 16;
/* Attach the callbacks; only the periodic and secret timers are armed below. */
3230 init_timer(&rt_flush_timer);
3231 rt_flush_timer.function = rt_run_flush;
3232 init_timer(&rt_periodic_timer);
3233 rt_periodic_timer.function = rt_check_expire;
3234 init_timer(&rt_secret_timer);
3235 rt_secret_timer.function = rt_secret_rebuild;
3237 /* All the timers, started at system startup tend
3238 to synchronize. Perturb it a bit.
3240 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3242 add_timer(&rt_periodic_timer);
3244 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3245 ip_rt_secret_interval;
3246 add_timer(&rt_secret_timer);
3248 #ifdef CONFIG_PROC_FS
3250 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3251 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3252 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3256 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3258 #ifdef CONFIG_NET_CLS_ROUTE
3259 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
/* RTM_GETROUTE requests for PF_INET go to inet_rtm_getroute(); the last argument is NULL. */
3266 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
/* Symbols of this file made available to other kernel code via EXPORT_SYMBOL. */
3271 EXPORT_SYMBOL(__ip_select_ident);
3272 EXPORT_SYMBOL(ip_route_input);
3273 EXPORT_SYMBOL(ip_route_output_key);