2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
108 #include <linux/sysctl.h>
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
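/*
 * A minimal standalone sketch of what RT_FL_TOS keeps from a flow's TOS
 * byte: the routable TOS bits plus the RTO_ONLINK flag.  The numeric
 * values used below are assumed for illustration only; the real
 * IPTOS_RT_MASK and RTO_ONLINK definitions live in the IP headers.
 */
static unsigned int example_rt_fl_tos(unsigned char fl4_tos)
{
	const unsigned int ex_iptos_rt_mask = 0x1C;	/* assumed value */
	const unsigned int ex_rto_onlink    = 0x01;	/* assumed value */

	/* e.g. 0x10 (low delay) is kept, precedence/ECN bits are dropped */
	return (unsigned int)fl4_tos & (ex_iptos_rt_mask | ex_rto_onlink);
}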
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_min_delay = 2 * HZ;
119 static int ip_rt_max_delay = 10 * HZ;
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval = 60 * HZ;
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
134 static unsigned long rt_deadline;
136 #define RTprint(a...) printk(KERN_DEBUG a)
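/*
 * The tunables above are expressed in jiffies, i.e. multiples of HZ.
 * A standalone sketch of the conversion, with HZ passed in explicitly:
 * assuming HZ = 1000, ip_rt_redirect_silence = (HZ / 50) << (9 + 1)
 * = 20 << 10 = 20480 jiffies, roughly 20 seconds.
 */
static unsigned long example_jiffies_to_msecs(unsigned long j, unsigned long hz)
{
	return j * 1000UL / hz;	/* 20480 jiffies at HZ=1000 -> 20480 ms */
}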
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
143 * Interface to generic destination cache.
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void ipv4_link_failure(struct sk_buff *skb);
152 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
156 static struct dst_ops ipv4_dst_ops = {
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
169 #define ECN_OR_COST(class) TC_PRIO_##class
171 __u8 ip_tos2prio[16] = {
175 ECN_OR_COST(BESTEFFORT),
181 ECN_OR_COST(INTERACTIVE),
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
195 /* The locking scheme is rather straightforward:
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
205 struct rt_hash_bucket {
206 struct rtable *chain;
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209 defined(CONFIG_PROVE_LOCKING)
211 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
212 * The size of this table is a power of two and depends on the number of CPUs.
213 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215 #ifdef CONFIG_LOCKDEP
216 # define RT_HASH_LOCK_SZ 256
219 # define RT_HASH_LOCK_SZ 4096
221 # define RT_HASH_LOCK_SZ 2048
223 # define RT_HASH_LOCK_SZ 1024
225 # define RT_HASH_LOCK_SZ 512
227 # define RT_HASH_LOCK_SZ 256
231 static spinlock_t *rt_hash_locks;
232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 # define rt_hash_lock_init() { \
235 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238 spin_lock_init(&rt_hash_locks[i]); \
241 # define rt_hash_lock_addr(slot) NULL
242 # define rt_hash_lock_init()
245 static struct rt_hash_bucket *rt_hash_table;
246 static unsigned rt_hash_mask;
247 static int rt_hash_log;
248 static unsigned int rt_hash_rnd;
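/*
 * A standalone sketch of how a (daddr, saddr) pair is mapped to a cache
 * bucket and to one of the hashed spinlocks above.  The mixing step is
 * only a toy stand-in for the jhash_2words() call used by rt_hash_code()
 * below; the two masks mirror rt_hash_mask and the
 * "(slot) & (RT_HASH_LOCK_SZ - 1)" trick in rt_hash_lock_addr().
 */
static void example_bucket_and_lock(unsigned int daddr, unsigned int saddr,
				    unsigned int hash_mask,	/* rt_hash_mask    */
				    unsigned int lock_sz,	/* RT_HASH_LOCK_SZ */
				    unsigned int *bucket,
				    unsigned int *lock_slot)
{
	/* toy mixer standing in for jhash_2words(daddr, saddr, rt_hash_rnd) */
	unsigned int h = (daddr * 2654435761u) ^ (saddr * 40503u);

	*bucket    = h & hash_mask;		/* index into rt_hash_table[]   */
	*lock_slot = *bucket & (lock_sz - 1);	/* several buckets share a lock */
}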
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 (__raw_get_cpu_var(rt_cache_stat).field++)
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
263 #ifdef CONFIG_PROC_FS
264 struct rt_cache_iter_state {
268 static struct rtable *rt_cache_get_first(struct seq_file *seq)
270 struct rtable *r = NULL;
271 struct rt_cache_iter_state *st = seq->private;
273 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
275 r = rt_hash_table[st->bucket].chain;
278 rcu_read_unlock_bh();
283 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
285 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
289 rcu_read_unlock_bh();
290 if (--st->bucket < 0)
293 r = rt_hash_table[st->bucket].chain;
298 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
300 struct rtable *r = rt_cache_get_first(seq);
303 while (pos && (r = rt_cache_get_next(seq, r)))
305 return pos ? NULL : r;
308 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
310 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
313 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
315 struct rtable *r = NULL;
317 if (v == SEQ_START_TOKEN)
318 r = rt_cache_get_first(seq);
320 r = rt_cache_get_next(seq, v);
325 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
327 if (v && v != SEQ_START_TOKEN)
328 rcu_read_unlock_bh();
331 static int rt_cache_seq_show(struct seq_file *seq, void *v)
333 if (v == SEQ_START_TOKEN)
334 seq_printf(seq, "%-127s\n",
335 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
336 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
339 struct rtable *r = v;
342 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
343 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
344 r->u.dst.dev ? r->u.dst.dev->name : "*",
345 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
346 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
347 r->u.dst.__use, 0, (unsigned long)r->rt_src,
348 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
349 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
350 dst_metric(&r->u.dst, RTAX_WINDOW),
351 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
352 dst_metric(&r->u.dst, RTAX_RTTVAR)),
354 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
355 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
358 seq_printf(seq, "%-127s\n", temp);
363 static struct seq_operations rt_cache_seq_ops = {
364 .start = rt_cache_seq_start,
365 .next = rt_cache_seq_next,
366 .stop = rt_cache_seq_stop,
367 .show = rt_cache_seq_show,
370 static int rt_cache_seq_open(struct inode *inode, struct file *file)
372 struct seq_file *seq;
374 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
378 rc = seq_open(file, &rt_cache_seq_ops);
381 seq = file->private_data;
383 memset(s, 0, sizeof(*s));
391 static struct file_operations rt_cache_seq_fops = {
392 .owner = THIS_MODULE,
393 .open = rt_cache_seq_open,
396 .release = seq_release_private,
400 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
405 return SEQ_START_TOKEN;
407 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
408 if (!cpu_possible(cpu))
411 return &per_cpu(rt_cache_stat, cpu);
416 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
421 if (!cpu_possible(cpu))
424 return &per_cpu(rt_cache_stat, cpu);
430 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
435 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
437 struct rt_cache_stat *st = v;
439 if (v == SEQ_START_TOKEN) {
440 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
445 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
446 atomic_read(&ipv4_dst_ops.entries),
469 static struct seq_operations rt_cpu_seq_ops = {
470 .start = rt_cpu_seq_start,
471 .next = rt_cpu_seq_next,
472 .stop = rt_cpu_seq_stop,
473 .show = rt_cpu_seq_show,
477 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
479 return seq_open(file, &rt_cpu_seq_ops);
482 static struct file_operations rt_cpu_seq_fops = {
483 .owner = THIS_MODULE,
484 .open = rt_cpu_seq_open,
487 .release = seq_release,
490 #endif /* CONFIG_PROC_FS */
492 static __inline__ void rt_free(struct rtable *rt)
494 multipath_remove(rt);
495 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
498 static __inline__ void rt_drop(struct rtable *rt)
500 multipath_remove(rt);
502 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
505 static __inline__ int rt_fast_clean(struct rtable *rth)
507 /* Kill broadcast/multicast entries very aggressively, if they
508 collide in the hash table with more useful entries */
509 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
510 rth->fl.iif && rth->u.rt_next;
513 static __inline__ int rt_valuable(struct rtable *rth)
515 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
519 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
524 if (atomic_read(&rth->u.dst.__refcnt))
528 if (rth->u.dst.expires &&
529 time_after_eq(jiffies, rth->u.dst.expires))
532 age = jiffies - rth->u.dst.lastuse;
534 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
535 (age <= tmo2 && rt_valuable(rth)))
541 /* Bits of score are:
543 * 30: not quite useless
544 * 29..0: usage counter
546 static inline u32 rt_score(struct rtable *rt)
548 u32 score = jiffies - rt->u.dst.lastuse;
550 score = ~score & ~(3<<30);
556 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
564 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
565 fl1->oif == fl2->oif &&
566 fl1->iif == fl2->iif;
569 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
570 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
571 struct rtable *expentry,
574 int passedexpired = 0;
575 struct rtable **nextstep = NULL;
576 struct rtable **rthp = chain_head;
582 while ((rth = *rthp) != NULL) {
586 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
587 compare_keys(&(*rthp)->fl, &expentry->fl)) {
588 if (*rthp == expentry) {
589 *rthp = rth->u.rt_next;
592 *rthp = rth->u.rt_next;
598 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
599 passedexpired && !nextstep)
600 nextstep = &rth->u.rt_next;
602 rthp = &rth->u.rt_next;
612 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
615 /* This runs via a timer and thus is always in BH context. */
616 static void rt_check_expire(unsigned long dummy)
618 static unsigned int rover;
619 unsigned int i = rover, goal;
620 struct rtable *rth, **rthp;
621 unsigned long now = jiffies;
624 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
625 if (ip_rt_gc_timeout > 1)
626 do_div(mult, ip_rt_gc_timeout);
627 goal = (unsigned int)mult;
628 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
629 for (; goal > 0; goal--) {
630 unsigned long tmo = ip_rt_gc_timeout;
632 i = (i + 1) & rt_hash_mask;
633 rthp = &rt_hash_table[i].chain;
637 spin_lock(rt_hash_lock_addr(i));
638 while ((rth = *rthp) != NULL) {
639 if (rth->u.dst.expires) {
640 /* Entry is expired even if it is in use */
641 if (time_before_eq(now, rth->u.dst.expires)) {
643 rthp = &rth->u.rt_next;
646 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
648 rthp = &rth->u.rt_next;
652 /* Cleanup aged off entries. */
653 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 /* remove all related balanced entries if necessary */
655 if (rth->u.dst.flags & DST_BALANCED) {
656 rthp = rt_remove_balanced_route(
657 &rt_hash_table[i].chain,
662 *rthp = rth->u.rt_next;
665 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
666 *rthp = rth->u.rt_next;
668 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
670 spin_unlock(rt_hash_lock_addr(i));
672 /* Fallback loop breaker. */
673 if (time_after(jiffies, now))
677 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
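/*
 * A standalone sketch of the scan goal computed at the top of
 * rt_check_expire(): visiting (gc_interval / gc_timeout) of the buckets
 * on each run means the whole table is covered about once every
 * gc_timeout.  With the defaults above (60*HZ and 300*HZ) each run
 * walks roughly one fifth of the hash table.
 */
static unsigned int example_expire_scan_goal(unsigned int hash_log,
					     unsigned long gc_interval,
					     unsigned long gc_timeout)
{
	unsigned long long mult = (unsigned long long)gc_interval << hash_log;
	unsigned int buckets = 1u << hash_log;
	unsigned int goal;

	if (gc_timeout > 1)
		mult /= gc_timeout;	/* mirrors the do_div() above */
	goal = (unsigned int)mult;
	return goal > buckets ? buckets : goal;
}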
680 /* This can run from both BH and non-BH contexts, the latter
681 * in the case of a forced flush event.
683 static void rt_run_flush(unsigned long dummy)
686 struct rtable *rth, *next;
690 get_random_bytes(&rt_hash_rnd, 4);
692 for (i = rt_hash_mask; i >= 0; i--) {
693 spin_lock_bh(rt_hash_lock_addr(i));
694 rth = rt_hash_table[i].chain;
696 rt_hash_table[i].chain = NULL;
697 spin_unlock_bh(rt_hash_lock_addr(i));
699 for (; rth; rth = next) {
700 next = rth->u.rt_next;
706 static DEFINE_SPINLOCK(rt_flush_lock);
708 void rt_cache_flush(int delay)
710 unsigned long now = jiffies;
711 int user_mode = !in_softirq();
714 delay = ip_rt_min_delay;
716 /* flush existing multipath state */
719 spin_lock_bh(&rt_flush_lock);
721 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
722 long tmo = (long)(rt_deadline - now);
724 /* If the flush timer is already running
725 and the flush request is not immediate (delay > 0):
727 if the deadline has not been reached, prolong the timer to "delay",
728 otherwise fire it at the deadline time.
731 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
739 spin_unlock_bh(&rt_flush_lock);
744 if (rt_deadline == 0)
745 rt_deadline = now + ip_rt_max_delay;
747 mod_timer(&rt_flush_timer, now+delay);
748 spin_unlock_bh(&rt_flush_lock);
751 static void rt_secret_rebuild(unsigned long dummy)
753 unsigned long now = jiffies;
756 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
760 Short description of GC goals.
762 We want an algorithm that keeps the routing cache
763 at an equilibrium point, where the number of aged-off entries
764 is approximately equal to the number of newly generated ones.
766 The current expiration strength is the variable "expire".
767 We try to adjust it dynamically, so that when the network
768 is idle "expire" is large enough to keep plenty of warm entries,
769 and when load increases it shrinks to limit the cache size.
772 static int rt_garbage_collect(void)
774 static unsigned long expire = RT_GC_TIMEOUT;
775 static unsigned long last_gc;
777 static int equilibrium;
778 struct rtable *rth, **rthp;
779 unsigned long now = jiffies;
783 * Garbage collection is pretty expensive,
784 * do not make it too frequently.
787 RT_CACHE_STAT_INC(gc_total);
789 if (now - last_gc < ip_rt_gc_min_interval &&
790 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
791 RT_CACHE_STAT_INC(gc_ignored);
795 /* Calculate the number of entries which we want to expire now. */
796 goal = atomic_read(&ipv4_dst_ops.entries) -
797 (ip_rt_gc_elasticity << rt_hash_log);
799 if (equilibrium < ipv4_dst_ops.gc_thresh)
800 equilibrium = ipv4_dst_ops.gc_thresh;
801 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
803 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
807 /* We are in a dangerous area. Try to reduce the cache really
810 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
811 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
814 if (now - last_gc >= ip_rt_gc_min_interval)
825 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
826 unsigned long tmo = expire;
828 k = (k + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[k].chain;
830 spin_lock_bh(rt_hash_lock_addr(k));
831 while ((rth = *rthp) != NULL) {
832 if (!rt_may_expire(rth, tmo, expire)) {
834 rthp = &rth->u.rt_next;
837 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
838 /* remove all related balanced entries
841 if (rth->u.dst.flags & DST_BALANCED) {
844 rthp = rt_remove_balanced_route(
845 &rt_hash_table[k].chain,
852 *rthp = rth->u.rt_next;
856 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
857 *rthp = rth->u.rt_next;
860 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
862 spin_unlock_bh(rt_hash_lock_addr(k));
871 /* Goal was not achieved. We stop the process if:
873 - expire was reduced to zero; otherwise, expire is halved.
874 - the table is not full.
875 - we are called from interrupt context.
876 - the jiffies check is just a fallback/debug loop breaker.
877 We will not spin here for a long time in any case.
880 RT_CACHE_STAT_INC(gc_goal_miss);
886 #if RT_CACHE_DEBUG >= 2
887 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
888 atomic_read(&ipv4_dst_ops.entries), goal, i);
891 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
893 } while (!in_softirq() && time_before_eq(jiffies, now));
895 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
898 printk(KERN_WARNING "dst cache overflow\n");
899 RT_CACHE_STAT_INC(gc_dst_overflow);
903 expire += ip_rt_gc_min_interval;
904 if (expire > ip_rt_gc_timeout ||
905 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
906 expire = ip_rt_gc_timeout;
907 #if RT_CACHE_DEBUG >= 2
908 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
909 atomic_read(&ipv4_dst_ops.entries), goal, rover);
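/*
 * A standalone sketch of how rt_garbage_collect() sizes its goal: in the
 * normal case it aims to expire everything above
 * (ip_rt_gc_elasticity << rt_hash_log) entries, i.e. an average chain
 * length of ip_rt_gc_elasticity; once the cache has grown well beyond
 * that, it aims for at least a full table's worth of removals per pass.
 * The gentler equilibrium adjustment for the goal <= 0 case is omitted.
 */
static long example_gc_goal(long entries, int elasticity, unsigned int hash_log)
{
	long buckets = 1L << hash_log;
	long goal = entries - ((long)elasticity << hash_log);

	if (goal <= 0)
		return 0;		/* nothing urgent to expire       */
	if (goal / 2 > buckets)
		return goal / 2;	/* cache is dangerously oversized */
	return buckets;			/* at least one table's worth     */
}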
914 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
916 struct rtable *rth, **rthp;
918 struct rtable *cand, **candp;
921 int attempts = !in_softirq();
930 rthp = &rt_hash_table[hash].chain;
932 spin_lock_bh(rt_hash_lock_addr(hash));
933 while ((rth = *rthp) != NULL) {
934 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
935 if (!(rth->u.dst.flags & DST_BALANCED) &&
936 compare_keys(&rth->fl, &rt->fl)) {
938 if (compare_keys(&rth->fl, &rt->fl)) {
941 *rthp = rth->u.rt_next;
943 * Since lookup is lockfree, the deletion
944 * must be visible to another weakly ordered CPU before
945 * the insertion at the start of the hash chain.
947 rcu_assign_pointer(rth->u.rt_next,
948 rt_hash_table[hash].chain);
950 * Since lookup is lockfree, the update writes
951 * must be ordered for consistency on SMP.
953 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
956 dst_hold(&rth->u.dst);
957 rth->u.dst.lastuse = now;
958 spin_unlock_bh(rt_hash_lock_addr(hash));
965 if (!atomic_read(&rth->u.dst.__refcnt)) {
966 u32 score = rt_score(rth);
968 if (score <= min_score) {
977 rthp = &rth->u.rt_next;
981 /* ip_rt_gc_elasticity used to be the average chain
982 * length; when it is exceeded, gc becomes really aggressive.
984 * The second limit is less certain. At the moment it allows
985 * only 2 entries per bucket. We will see.
987 if (chain_length > ip_rt_gc_elasticity) {
988 *candp = cand->u.rt_next;
993 /* Try to bind the route to an ARP neighbour only if it is an output
994 route or on the unicast forwarding path.
996 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
997 int err = arp_bind_neighbour(&rt->u.dst);
999 spin_unlock_bh(rt_hash_lock_addr(hash));
1001 if (err != -ENOBUFS) {
1006 /* The neighbour tables are full and nothing
1007 can be released. Try to shrink the route cache;
1008 most likely it is holding some neighbour records.
1010 if (attempts-- > 0) {
1011 int saved_elasticity = ip_rt_gc_elasticity;
1012 int saved_int = ip_rt_gc_min_interval;
1013 ip_rt_gc_elasticity = 1;
1014 ip_rt_gc_min_interval = 0;
1015 rt_garbage_collect();
1016 ip_rt_gc_min_interval = saved_int;
1017 ip_rt_gc_elasticity = saved_elasticity;
1021 if (net_ratelimit())
1022 printk(KERN_WARNING "Neighbour table overflow.\n");
1028 rt->u.rt_next = rt_hash_table[hash].chain;
1029 #if RT_CACHE_DEBUG >= 2
1030 if (rt->u.rt_next) {
1032 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1033 NIPQUAD(rt->rt_dst));
1034 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1035 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1039 rt_hash_table[hash].chain = rt;
1040 spin_unlock_bh(rt_hash_lock_addr(hash));
1045 void rt_bind_peer(struct rtable *rt, int create)
1047 static DEFINE_SPINLOCK(rt_peer_lock);
1048 struct inet_peer *peer;
1050 peer = inet_getpeer(rt->rt_dst, create);
1052 spin_lock_bh(&rt_peer_lock);
1053 if (rt->peer == NULL) {
1057 spin_unlock_bh(&rt_peer_lock);
1063 * Peer allocation may fail only in serious out-of-memory conditions. However,
1064 * we can still generate some output.
1065 * Random ID selection looks a bit dangerous because we have no chance of
1066 * selecting an ID that is unique over a reasonable period of time,
1067 * but a broken packet identifier may be better than no packet at all.
1069 static void ip_select_fb_ident(struct iphdr *iph)
1071 static DEFINE_SPINLOCK(ip_fb_id_lock);
1072 static u32 ip_fallback_id;
1075 spin_lock_bh(&ip_fb_id_lock);
1076 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1077 iph->id = htons(salt & 0xFFFF);
1078 ip_fallback_id = salt;
1079 spin_unlock_bh(&ip_fb_id_lock);
1082 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1084 struct rtable *rt = (struct rtable *) dst;
1087 if (rt->peer == NULL)
1088 rt_bind_peer(rt, 1);
1090 /* If a peer is attached to the destination, it is never detached,
1091 so we do not need to grab a lock to dereference it.
1094 iph->id = htons(inet_getid(rt->peer, more));
1098 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1099 __builtin_return_address(0));
1101 ip_select_fb_ident(iph);
1104 static void rt_del(unsigned hash, struct rtable *rt)
1106 struct rtable **rthp;
1108 spin_lock_bh(rt_hash_lock_addr(hash));
1110 for (rthp = &rt_hash_table[hash].chain; *rthp;
1111 rthp = &(*rthp)->u.rt_next)
1113 *rthp = rt->u.rt_next;
1117 spin_unlock_bh(rt_hash_lock_addr(hash));
1120 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1121 u32 saddr, struct net_device *dev)
1124 struct in_device *in_dev = in_dev_get(dev);
1125 struct rtable *rth, **rthp;
1126 u32 skeys[2] = { saddr, 0 };
1127 int ikeys[2] = { dev->ifindex, 0 };
1132 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1133 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1134 goto reject_redirect;
1136 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1137 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1138 goto reject_redirect;
1139 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1140 goto reject_redirect;
1142 if (inet_addr_type(new_gw) != RTN_UNICAST)
1143 goto reject_redirect;
1146 for (i = 0; i < 2; i++) {
1147 for (k = 0; k < 2; k++) {
1148 unsigned hash = rt_hash_code(daddr,
1149 skeys[i] ^ (ikeys[k] << 5));
1151 rthp=&rt_hash_table[hash].chain;
1154 while ((rth = rcu_dereference(*rthp)) != NULL) {
1157 if (rth->fl.fl4_dst != daddr ||
1158 rth->fl.fl4_src != skeys[i] ||
1159 rth->fl.oif != ikeys[k] ||
1161 rthp = &rth->u.rt_next;
1165 if (rth->rt_dst != daddr ||
1166 rth->rt_src != saddr ||
1168 rth->rt_gateway != old_gw ||
1169 rth->u.dst.dev != dev)
1172 dst_hold(&rth->u.dst);
1175 rt = dst_alloc(&ipv4_dst_ops);
1182 /* Copy all the information. */
1184 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1185 rt->u.dst.__use = 1;
1186 atomic_set(&rt->u.dst.__refcnt, 1);
1187 rt->u.dst.child = NULL;
1189 dev_hold(rt->u.dst.dev);
1191 in_dev_hold(rt->idev);
1192 rt->u.dst.obsolete = 0;
1193 rt->u.dst.lastuse = jiffies;
1194 rt->u.dst.path = &rt->u.dst;
1195 rt->u.dst.neighbour = NULL;
1196 rt->u.dst.hh = NULL;
1197 rt->u.dst.xfrm = NULL;
1199 rt->rt_flags |= RTCF_REDIRECTED;
1201 /* Gateway is different ... */
1202 rt->rt_gateway = new_gw;
1204 /* Redirect received -> path was valid */
1205 dst_confirm(&rth->u.dst);
1208 atomic_inc(&rt->peer->refcnt);
1210 if (arp_bind_neighbour(&rt->u.dst) ||
1211 !(rt->u.dst.neighbour->nud_state &
1213 if (rt->u.dst.neighbour)
1214 neigh_event_send(rt->u.dst.neighbour, NULL);
1221 if (!rt_intern_hash(hash, rt, &rt))
1234 #ifdef CONFIG_IP_ROUTE_VERBOSE
1235 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1236 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1237 "%u.%u.%u.%u ignored.\n"
1238 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1239 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1240 NIPQUAD(saddr), NIPQUAD(daddr));
1245 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1247 struct rtable *rt = (struct rtable*)dst;
1248 struct dst_entry *ret = dst;
1251 if (dst->obsolete) {
1254 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1255 rt->u.dst.expires) {
1256 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1259 #if RT_CACHE_DEBUG >= 1
1260 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1261 "%u.%u.%u.%u/%02x dropped\n",
1262 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1273 * 1. The first ip_rt_redirect_number redirects are sent
1274 * with exponential backoff, then we stop sending them at all,
1275 * assuming that the host ignores our redirects.
1276 * 2. If we did not see any packets requiring redirects
1277 * during ip_rt_redirect_silence, we assume that the host
1278 * has forgotten the redirected route and start sending redirects again.
1280 * This algorithm is much cheaper and more intelligent than dumb load limiting
1283 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1284 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1287 void ip_rt_send_redirect(struct sk_buff *skb)
1289 struct rtable *rt = (struct rtable*)skb->dst;
1290 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1295 if (!IN_DEV_TX_REDIRECTS(in_dev))
1298 /* No redirected packets during ip_rt_redirect_silence;
1299 * reset the algorithm.
1301 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1302 rt->u.dst.rate_tokens = 0;
1304 /* Too many ignored redirects; do not send anything.
1305 * Set u.dst.rate_last to the last seen redirected packet.
1307 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1308 rt->u.dst.rate_last = jiffies;
1312 /* Check for load limit; set rate_last to the latest sent
1315 if (time_after(jiffies,
1316 (rt->u.dst.rate_last +
1317 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1318 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1319 rt->u.dst.rate_last = jiffies;
1320 ++rt->u.dst.rate_tokens;
1321 #ifdef CONFIG_IP_ROUTE_VERBOSE
1322 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1323 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1325 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1326 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1327 NIPQUAD(rt->rt_src), rt->rt_iif,
1328 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
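/*
 * A standalone sketch of the exponential backoff described above: the
 * n-th redirect for a destination is not sent before
 * rate_last + (ip_rt_redirect_load << n), and after
 * ip_rt_redirect_number redirects we stop until ip_rt_redirect_silence
 * has passed.  Assuming HZ = 1000 and the defaults above, the gaps grow
 * 20ms, 40ms, 80ms, ... for the nine redirects that are sent.
 */
static unsigned long example_next_redirect_time(unsigned long rate_last,
						unsigned long redirect_load,
						unsigned int rate_tokens)
{
	return rate_last + (redirect_load << rate_tokens);
}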
1335 static int ip_error(struct sk_buff *skb)
1337 struct rtable *rt = (struct rtable*)skb->dst;
1341 switch (rt->u.dst.error) {
1346 code = ICMP_HOST_UNREACH;
1349 code = ICMP_NET_UNREACH;
1352 code = ICMP_PKT_FILTERED;
1357 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1358 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1359 rt->u.dst.rate_tokens = ip_rt_error_burst;
1360 rt->u.dst.rate_last = now;
1361 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1362 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1363 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1366 out: kfree_skb(skb);
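/*
 * A standalone sketch of the token bucket used by ip_error() above:
 * tokens accumulate with elapsed jiffies up to ip_rt_error_burst, and
 * each ICMP error spends ip_rt_error_cost of them.  With the defaults
 * (cost = HZ, burst = 5*HZ) that allows on average one error per second
 * with bursts of up to five.
 */
static int example_error_ratelimit(unsigned long *tokens, unsigned long *rate_last,
				   unsigned long now, unsigned long cost,
				   unsigned long burst)
{
	*tokens += now - *rate_last;
	if (*tokens > burst)
		*tokens = burst;
	*rate_last = now;
	if (*tokens >= cost) {
		*tokens -= cost;
		return 1;	/* ok to send an ICMP error */
	}
	return 0;		/* rate limited */
}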
1371 * The last two values are not from the RFC but
1372 * are needed for AMPRnet AX.25 paths.
1375 static const unsigned short mtu_plateau[] =
1376 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1378 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1382 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1383 if (old_mtu > mtu_plateau[i])
1384 return mtu_plateau[i];
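/*
 * A self-contained version of the plateau search above, for
 * illustration: when a Fragmentation Needed message arrives without a
 * usable next-hop MTU, the old packet length is stepped down to the
 * next lower plateau, e.g. 1500 -> 1492 and 1006 -> 576.
 */
static unsigned short example_guess_mtu(unsigned short old_mtu)
{
	static const unsigned short plateau[] =
		{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
	unsigned int i;

	for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];
	return 68;	/* assumed floor: the minimum IPv4 MTU */
}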
1388 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1391 unsigned short old_mtu = ntohs(iph->tot_len);
1393 u32 skeys[2] = { iph->saddr, 0, };
1394 u32 daddr = iph->daddr;
1395 unsigned short est_mtu = 0;
1397 if (ipv4_config.no_pmtu_disc)
1400 for (i = 0; i < 2; i++) {
1401 unsigned hash = rt_hash_code(daddr, skeys[i]);
1404 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 rth = rcu_dereference(rth->u.rt_next)) {
1406 if (rth->fl.fl4_dst == daddr &&
1407 rth->fl.fl4_src == skeys[i] &&
1408 rth->rt_dst == daddr &&
1409 rth->rt_src == iph->saddr &&
1411 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1412 unsigned short mtu = new_mtu;
1414 if (new_mtu < 68 || new_mtu >= old_mtu) {
1416 /* BSD 4.2 compatibility hack :-( */
1418 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1419 old_mtu >= 68 + (iph->ihl << 2))
1420 old_mtu -= iph->ihl << 2;
1422 mtu = guess_mtu(old_mtu);
1424 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1425 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1426 dst_confirm(&rth->u.dst);
1427 if (mtu < ip_rt_min_pmtu) {
1428 mtu = ip_rt_min_pmtu;
1429 rth->u.dst.metrics[RTAX_LOCK-1] |=
1432 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1433 dst_set_expires(&rth->u.dst,
1442 return est_mtu ? : new_mtu;
1445 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1448 !(dst_metric_locked(dst, RTAX_MTU))) {
1449 if (mtu < ip_rt_min_pmtu) {
1450 mtu = ip_rt_min_pmtu;
1451 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 dst->metrics[RTAX_MTU-1] = mtu;
1454 dst_set_expires(dst, ip_rt_mtu_expires);
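/*
 * A standalone sketch of the clamp performed above: a learned path MTU
 * below ip_rt_min_pmtu is raised back to that floor and the MTU metric
 * is locked, so that further "fragmentation needed" messages cannot
 * shrink it again.  The metric lock is modelled here as a plain bit.
 */
static void example_clamp_pmtu(unsigned int *mtu_metric, unsigned int *lock_bits,
			       unsigned int new_mtu, unsigned int min_pmtu,
			       unsigned int mtu_lock_bit)
{
	if (new_mtu < min_pmtu) {
		new_mtu = min_pmtu;
		*lock_bits |= 1u << mtu_lock_bit;	/* lock the MTU metric */
	}
	*mtu_metric = new_mtu;
}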
1458 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1463 static void ipv4_dst_destroy(struct dst_entry *dst)
1465 struct rtable *rt = (struct rtable *) dst;
1466 struct inet_peer *peer = rt->peer;
1467 struct in_device *idev = rt->idev;
1480 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1483 struct rtable *rt = (struct rtable *) dst;
1484 struct in_device *idev = rt->idev;
1485 if (dev != &loopback_dev && idev && idev->dev == dev) {
1486 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1487 if (loopback_idev) {
1488 rt->idev = loopback_idev;
1494 static void ipv4_link_failure(struct sk_buff *skb)
1498 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500 rt = (struct rtable *) skb->dst;
1502 dst_set_expires(&rt->u.dst, 0);
1505 static int ip_rt_bug(struct sk_buff *skb)
1507 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1508 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1509 skb->dev ? skb->dev->name : "?");
1515 We do not cache the source address of the outgoing interface,
1516 because it is used only by the IP RR, TS and SRR options,
1517 so it is out of the fast path.
1519 BTW, remember: "addr" is allowed to be unaligned
1523 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1526 struct fib_result res;
1528 if (rt->fl.iif == 0)
1530 else if (fib_lookup(&rt->fl, &res) == 0) {
1531 src = FIB_RES_PREFSRC(res);
1534 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536 memcpy(addr, &src, 4);
1539 #ifdef CONFIG_NET_CLS_ROUTE
1540 static void set_class_tag(struct rtable *rt, u32 tag)
1542 if (!(rt->u.dst.tclassid & 0xFFFF))
1543 rt->u.dst.tclassid |= tag & 0xFFFF;
1544 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1545 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1549 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551 struct fib_info *fi = res->fi;
1554 if (FIB_RES_GW(*res) &&
1555 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1556 rt->rt_gateway = FIB_RES_GW(*res);
1557 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1558 sizeof(rt->u.dst.metrics));
1559 if (fi->fib_mtu == 0) {
1560 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1561 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1562 rt->rt_gateway != rt->rt_dst &&
1563 rt->u.dst.dev->mtu > 576)
1564 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 #ifdef CONFIG_NET_CLS_ROUTE
1567 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1570 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1573 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1574 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1575 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1576 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1577 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1580 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582 #ifdef CONFIG_NET_CLS_ROUTE
1583 #ifdef CONFIG_IP_MULTIPLE_TABLES
1584 set_class_tag(rt, fib_rules_tclass(res));
1586 set_class_tag(rt, itag);
1588 rt->rt_type = res->type;
1591 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1592 u8 tos, struct net_device *dev, int our)
1597 struct in_device *in_dev = in_dev_get(dev);
1600 /* Primary sanity checks. */
1605 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1606 skb->protocol != htons(ETH_P_IP))
1609 if (ZERONET(saddr)) {
1610 if (!LOCAL_MCAST(daddr))
1612 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1613 } else if (fib_validate_source(saddr, 0, tos, 0,
1614 dev, &spec_dst, &itag) < 0)
1617 rth = dst_alloc(&ipv4_dst_ops);
1621 rth->u.dst.output= ip_rt_bug;
1623 atomic_set(&rth->u.dst.__refcnt, 1);
1624 rth->u.dst.flags= DST_HOST;
1625 if (in_dev->cnf.no_policy)
1626 rth->u.dst.flags |= DST_NOPOLICY;
1627 rth->fl.fl4_dst = daddr;
1628 rth->rt_dst = daddr;
1629 rth->fl.fl4_tos = tos;
1630 #ifdef CONFIG_IP_ROUTE_FWMARK
1631 rth->fl.fl4_fwmark= skb->nfmark;
1633 rth->fl.fl4_src = saddr;
1634 rth->rt_src = saddr;
1635 #ifdef CONFIG_NET_CLS_ROUTE
1636 rth->u.dst.tclassid = itag;
1639 rth->fl.iif = dev->ifindex;
1640 rth->u.dst.dev = &loopback_dev;
1641 dev_hold(rth->u.dst.dev);
1642 rth->idev = in_dev_get(rth->u.dst.dev);
1644 rth->rt_gateway = daddr;
1645 rth->rt_spec_dst= spec_dst;
1646 rth->rt_type = RTN_MULTICAST;
1647 rth->rt_flags = RTCF_MULTICAST;
1649 rth->u.dst.input= ip_local_deliver;
1650 rth->rt_flags |= RTCF_LOCAL;
1653 #ifdef CONFIG_IP_MROUTE
1654 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1655 rth->u.dst.input = ip_mr_input;
1657 RT_CACHE_STAT_INC(in_slow_mc);
1660 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1661 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1673 static void ip_handle_martian_source(struct net_device *dev,
1674 struct in_device *in_dev,
1675 struct sk_buff *skb,
1679 RT_CACHE_STAT_INC(in_martian_src);
1680 #ifdef CONFIG_IP_ROUTE_VERBOSE
1681 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 * RFC 1812 recommendation: if the source is martian,
1684 * the only hint is the MAC header.
1686 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1687 "%u.%u.%u.%u, on dev %s\n",
1688 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1689 if (dev->hard_header_len && skb->mac.raw) {
1691 unsigned char *p = skb->mac.raw;
1692 printk(KERN_WARNING "ll header: ");
1693 for (i = 0; i < dev->hard_header_len; i++, p++) {
1695 if (i < (dev->hard_header_len - 1))
1704 static inline int __mkroute_input(struct sk_buff *skb,
1705 struct fib_result* res,
1706 struct in_device *in_dev,
1707 u32 daddr, u32 saddr, u32 tos,
1708 struct rtable **result)
1713 struct in_device *out_dev;
1717 /* get a working reference to the output device */
1718 out_dev = in_dev_get(FIB_RES_DEV(*res));
1719 if (out_dev == NULL) {
1720 if (net_ratelimit())
1721 printk(KERN_CRIT "Bug in ip_route_input" \
1722 "_slow(). Please, report\n");
1727 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1728 in_dev->dev, &spec_dst, &itag);
1730 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1738 flags |= RTCF_DIRECTSRC;
1740 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1741 (IN_DEV_SHARED_MEDIA(out_dev) ||
1742 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1743 flags |= RTCF_DOREDIRECT;
1745 if (skb->protocol != htons(ETH_P_IP)) {
1746 /* Not IP (i.e. ARP). Do not create a route if it is
1747 * invalid for proxy ARP. DNAT routes are always valid.
1749 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1756 rth = dst_alloc(&ipv4_dst_ops);
1762 atomic_set(&rth->u.dst.__refcnt, 1);
1763 rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 if (res->fi->fib_nhs > 1)
1766 rth->u.dst.flags |= DST_BALANCED;
1768 if (in_dev->cnf.no_policy)
1769 rth->u.dst.flags |= DST_NOPOLICY;
1770 if (in_dev->cnf.no_xfrm)
1771 rth->u.dst.flags |= DST_NOXFRM;
1772 rth->fl.fl4_dst = daddr;
1773 rth->rt_dst = daddr;
1774 rth->fl.fl4_tos = tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776 rth->fl.fl4_fwmark= skb->nfmark;
1778 rth->fl.fl4_src = saddr;
1779 rth->rt_src = saddr;
1780 rth->rt_gateway = daddr;
1782 rth->fl.iif = in_dev->dev->ifindex;
1783 rth->u.dst.dev = (out_dev)->dev;
1784 dev_hold(rth->u.dst.dev);
1785 rth->idev = in_dev_get(rth->u.dst.dev);
1787 rth->rt_spec_dst= spec_dst;
1789 rth->u.dst.input = ip_forward;
1790 rth->u.dst.output = ip_output;
1792 rt_set_nexthop(rth, res, itag);
1794 rth->rt_flags = flags;
1799 /* release the working reference to the output device */
1800 in_dev_put(out_dev);
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 struct fib_result* res,
1806 const struct flowi *fl,
1807 struct in_device *in_dev,
1808 u32 daddr, u32 saddr, u32 tos)
1810 struct rtable* rth = NULL;
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 fib_select_multipath(fl, res);
1819 /* create a routing cache entry */
1820 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1824 /* put it into the cache */
1825 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1826 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830 struct fib_result* res,
1831 const struct flowi *fl,
1832 struct in_device *in_dev,
1833 u32 daddr, u32 saddr, u32 tos)
1835 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1836 struct rtable* rth = NULL, *rtres;
1837 unsigned char hop, hopcount;
1842 hopcount = res->fi->fib_nhs;
1846 /* distinguish between multipath and singlepath */
1848 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1851 /* add all alternatives to the routing cache */
1852 for (hop = 0; hop < hopcount; hop++) {
1855 /* put reference to previous result */
1859 /* create a routing cache entry */
1860 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1865 /* put it into the cache */
1866 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1867 err = rt_intern_hash(hash, rth, &rtres);
1871 /* forward hop information to multipath impl. */
1872 multipath_set_nhinfo(rth,
1873 FIB_RES_NETWORK(*res),
1874 FIB_RES_NETMASK(*res),
1878 skb->dst = &rtres->u.dst;
1880 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1881 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1882 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1887 * NOTE. We drop all packets that have a local source
1888 * address, because every properly looped-back packet
1889 * must have the correct destination already attached by the output routine.
1891 * This approach solves two big problems:
1892 * 1. Non-simplex devices are handled properly.
1893 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1896 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1897 u8 tos, struct net_device *dev)
1899 struct fib_result res;
1900 struct in_device *in_dev = in_dev_get(dev);
1901 struct flowi fl = { .nl_u = { .ip4_u =
1905 .scope = RT_SCOPE_UNIVERSE,
1906 #ifdef CONFIG_IP_ROUTE_FWMARK
1907 .fwmark = skb->nfmark
1910 .iif = dev->ifindex };
1913 struct rtable * rth;
1919 /* IP on this device is disabled. */
1924 /* Check for the most weird martians, which cannot be detected
1928 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1929 goto martian_source;
1931 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1934 /* Accept zero addresses only for limited broadcast;
1935 * I do not even know whether to fix this or not. Waiting for complaints :-)
1938 goto martian_source;
1940 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1941 goto martian_destination;
1944 * Now we are ready to route packet.
1946 if ((err = fib_lookup(&fl, &res)) != 0) {
1947 if (!IN_DEV_FORWARD(in_dev))
1953 RT_CACHE_STAT_INC(in_slow_tot);
1955 if (res.type == RTN_BROADCAST)
1958 if (res.type == RTN_LOCAL) {
1960 result = fib_validate_source(saddr, daddr, tos,
1961 loopback_dev.ifindex,
1962 dev, &spec_dst, &itag);
1964 goto martian_source;
1966 flags |= RTCF_DIRECTSRC;
1971 if (!IN_DEV_FORWARD(in_dev))
1973 if (res.type != RTN_UNICAST)
1974 goto martian_destination;
1976 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1977 if (err == -ENOBUFS)
1989 if (skb->protocol != htons(ETH_P_IP))
1993 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1995 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1998 goto martian_source;
2000 flags |= RTCF_DIRECTSRC;
2002 flags |= RTCF_BROADCAST;
2003 res.type = RTN_BROADCAST;
2004 RT_CACHE_STAT_INC(in_brd);
2007 rth = dst_alloc(&ipv4_dst_ops);
2011 rth->u.dst.output= ip_rt_bug;
2013 atomic_set(&rth->u.dst.__refcnt, 1);
2014 rth->u.dst.flags= DST_HOST;
2015 if (in_dev->cnf.no_policy)
2016 rth->u.dst.flags |= DST_NOPOLICY;
2017 rth->fl.fl4_dst = daddr;
2018 rth->rt_dst = daddr;
2019 rth->fl.fl4_tos = tos;
2020 #ifdef CONFIG_IP_ROUTE_FWMARK
2021 rth->fl.fl4_fwmark= skb->nfmark;
2023 rth->fl.fl4_src = saddr;
2024 rth->rt_src = saddr;
2025 #ifdef CONFIG_NET_CLS_ROUTE
2026 rth->u.dst.tclassid = itag;
2029 rth->fl.iif = dev->ifindex;
2030 rth->u.dst.dev = &loopback_dev;
2031 dev_hold(rth->u.dst.dev);
2032 rth->idev = in_dev_get(rth->u.dst.dev);
2033 rth->rt_gateway = daddr;
2034 rth->rt_spec_dst= spec_dst;
2035 rth->u.dst.input= ip_local_deliver;
2036 rth->rt_flags = flags|RTCF_LOCAL;
2037 if (res.type == RTN_UNREACHABLE) {
2038 rth->u.dst.input= ip_error;
2039 rth->u.dst.error= -err;
2040 rth->rt_flags &= ~RTCF_LOCAL;
2042 rth->rt_type = res.type;
2043 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2044 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2048 RT_CACHE_STAT_INC(in_no_route);
2049 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2050 res.type = RTN_UNREACHABLE;
2054 * Do not cache martian addresses: they should be logged (RFC1812)
2056 martian_destination:
2057 RT_CACHE_STAT_INC(in_martian_dst);
2058 #ifdef CONFIG_IP_ROUTE_VERBOSE
2059 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2060 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2061 "%u.%u.%u.%u, dev %s\n",
2062 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2066 err = -EHOSTUNREACH;
2078 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2082 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2083 u8 tos, struct net_device *dev)
2085 struct rtable * rth;
2087 int iif = dev->ifindex;
2089 tos &= IPTOS_RT_MASK;
2090 hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2093 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2094 rth = rcu_dereference(rth->u.rt_next)) {
2095 if (rth->fl.fl4_dst == daddr &&
2096 rth->fl.fl4_src == saddr &&
2097 rth->fl.iif == iif &&
2099 #ifdef CONFIG_IP_ROUTE_FWMARK
2100 rth->fl.fl4_fwmark == skb->nfmark &&
2102 rth->fl.fl4_tos == tos) {
2103 rth->u.dst.lastuse = jiffies;
2104 dst_hold(&rth->u.dst);
2106 RT_CACHE_STAT_INC(in_hit);
2108 skb->dst = (struct dst_entry*)rth;
2111 RT_CACHE_STAT_INC(in_hlist_search);
2115 /* Multicast recognition logic is moved from route cache to here.
2116 The problem was that too many Ethernet cards have broken/missing
2117 hardware multicast filters :-( As result the host on multicasting
2118 network acquires a lot of useless route cache entries, sort of
2119 SDR messages from all the world. Now we try to get rid of them.
2120 Really, provided software IP multicast filter is organized
2121 reasonably (at least, hashed), it does not result in a slowdown
2122 comparing with route cache reject entries.
2123 Note, that multicast routers are not affected, because
2124 route cache entry is created eventually.
2126 if (MULTICAST(daddr)) {
2127 struct in_device *in_dev;
2130 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2131 int our = ip_check_mc(in_dev, daddr, saddr,
2132 skb->nh.iph->protocol);
2134 #ifdef CONFIG_IP_MROUTE
2135 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2139 return ip_route_input_mc(skb, daddr, saddr,
2146 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2149 static inline int __mkroute_output(struct rtable **result,
2150 struct fib_result* res,
2151 const struct flowi *fl,
2152 const struct flowi *oldflp,
2153 struct net_device *dev_out,
2157 struct in_device *in_dev;
2158 u32 tos = RT_FL_TOS(oldflp);
2161 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2164 if (fl->fl4_dst == 0xFFFFFFFF)
2165 res->type = RTN_BROADCAST;
2166 else if (MULTICAST(fl->fl4_dst))
2167 res->type = RTN_MULTICAST;
2168 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2171 if (dev_out->flags & IFF_LOOPBACK)
2172 flags |= RTCF_LOCAL;
2174 /* get work reference to inet device */
2175 in_dev = in_dev_get(dev_out);
2179 if (res->type == RTN_BROADCAST) {
2180 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182 fib_info_put(res->fi);
2185 } else if (res->type == RTN_MULTICAST) {
2186 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2187 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2189 flags &= ~RTCF_LOCAL;
2190 /* If a multicast route does not exist, use the
2191 default one, but do not use a gateway in this case.
2194 if (res->fi && res->prefixlen < 4) {
2195 fib_info_put(res->fi);
2201 rth = dst_alloc(&ipv4_dst_ops);
2207 atomic_set(&rth->u.dst.__refcnt, 1);
2208 rth->u.dst.flags= DST_HOST;
2209 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2212 if (res->fi->fib_nhs > 1)
2213 rth->u.dst.flags |= DST_BALANCED;
2216 if (in_dev->cnf.no_xfrm)
2217 rth->u.dst.flags |= DST_NOXFRM;
2218 if (in_dev->cnf.no_policy)
2219 rth->u.dst.flags |= DST_NOPOLICY;
2221 rth->fl.fl4_dst = oldflp->fl4_dst;
2222 rth->fl.fl4_tos = tos;
2223 rth->fl.fl4_src = oldflp->fl4_src;
2224 rth->fl.oif = oldflp->oif;
2225 #ifdef CONFIG_IP_ROUTE_FWMARK
2226 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 rth->rt_dst = fl->fl4_dst;
2229 rth->rt_src = fl->fl4_src;
2230 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2231 /* get references to the devices that are to be held by the routing
2233 rth->u.dst.dev = dev_out;
2235 rth->idev = in_dev_get(dev_out);
2236 rth->rt_gateway = fl->fl4_dst;
2237 rth->rt_spec_dst= fl->fl4_src;
2239 rth->u.dst.output=ip_output;
2241 RT_CACHE_STAT_INC(out_slow_tot);
2243 if (flags & RTCF_LOCAL) {
2244 rth->u.dst.input = ip_local_deliver;
2245 rth->rt_spec_dst = fl->fl4_dst;
2247 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2248 rth->rt_spec_dst = fl->fl4_src;
2249 if (flags & RTCF_LOCAL &&
2250 !(dev_out->flags & IFF_LOOPBACK)) {
2251 rth->u.dst.output = ip_mc_output;
2252 RT_CACHE_STAT_INC(out_slow_mc);
2254 #ifdef CONFIG_IP_MROUTE
2255 if (res->type == RTN_MULTICAST) {
2256 if (IN_DEV_MFORWARD(in_dev) &&
2257 !LOCAL_MCAST(oldflp->fl4_dst)) {
2258 rth->u.dst.input = ip_mr_input;
2259 rth->u.dst.output = ip_mc_output;
2265 rt_set_nexthop(rth, res, 0);
2267 rth->rt_flags = flags;
2271 /* release work reference to inet device */
2277 static inline int ip_mkroute_output_def(struct rtable **rp,
2278 struct fib_result* res,
2279 const struct flowi *fl,
2280 const struct flowi *oldflp,
2281 struct net_device *dev_out,
2284 struct rtable *rth = NULL;
2285 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2288 hash = rt_hash_code(oldflp->fl4_dst,
2289 oldflp->fl4_src ^ (oldflp->oif << 5));
2290 err = rt_intern_hash(hash, rth, rp);
2296 static inline int ip_mkroute_output(struct rtable** rp,
2297 struct fib_result* res,
2298 const struct flowi *fl,
2299 const struct flowi *oldflp,
2300 struct net_device *dev_out,
2303 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307 struct rtable *rth = NULL;
2309 if (res->fi && res->fi->fib_nhs > 1) {
2310 unsigned char hopcount = res->fi->fib_nhs;
2312 for (hop = 0; hop < hopcount; hop++) {
2313 struct net_device *dev2nexthop;
2317 /* hold a work reference to the output device */
2318 dev2nexthop = FIB_RES_DEV(*res);
2319 dev_hold(dev2nexthop);
2321 /* put reference to previous result */
2325 err = __mkroute_output(&rth, res, fl, oldflp,
2326 dev2nexthop, flags);
2331 hash = rt_hash_code(oldflp->fl4_dst,
2333 (oldflp->oif << 5));
2334 err = rt_intern_hash(hash, rth, rp);
2336 /* forward hop information to multipath impl. */
2337 multipath_set_nhinfo(rth,
2338 FIB_RES_NETWORK(*res),
2339 FIB_RES_NETMASK(*res),
2343 /* release work reference to output device */
2344 dev_put(dev2nexthop);
2351 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2354 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2355 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2360 * Major route resolver routine.
2363 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 u32 tos = RT_FL_TOS(oldflp);
2366 struct flowi fl = { .nl_u = { .ip4_u =
2367 { .daddr = oldflp->fl4_dst,
2368 .saddr = oldflp->fl4_src,
2369 .tos = tos & IPTOS_RT_MASK,
2370 .scope = ((tos & RTO_ONLINK) ?
2373 #ifdef CONFIG_IP_ROUTE_FWMARK
2374 .fwmark = oldflp->fl4_fwmark
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2381 struct net_device *dev_out = NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2391 if (oldflp->fl4_src) {
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2403 /* I removed the check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return the wrong iface if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with the saddr
2408 of another iface. --ANK
2411 if (oldflp->oif == 0
2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2413 /* Special hack: the user can direct multicasts
2414 and limited broadcast via the necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic, vat and friends to work.
2418 They bind a socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of the routing cache they are broken,
2421 because we are not allowed to build a multicast path
2422 with a loopback source addr (look, the routing cache
2423 cannot know that ttl is zero, so the packet
2424 will not leave this host and the route is valid).
2425 Luckily, this hack is a good workaround.
2428 fl.oif = dev_out->ifindex;
2438 dev_out = dev_get_by_index(oldflp->oif);
2440 if (dev_out == NULL)
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
2446 goto out; /* Wrong error code */
2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2466 fl.fl4_dst = fl.fl4_src;
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2471 dev_out = &loopback_dev;
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2479 if (fib_lookup(&fl, &res)) {
2482 /* Apparently, the routing tables are wrong. Assume
2483 that the destination is on-link.
2486 Because we are allowed to send to an iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, the routing
2489 tables are looked up with only one purpose:
2490 to determine whether the destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send the packet, ignoring both the routing tables
2493 and the ifaddr state. --ANK
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2503 res.type = RTN_UNICAST;
2513 if (res.type == RTN_LOCAL) {
2515 fl.fl4_src = fl.fl4_dst;
2518 dev_out = &loopback_dev;
2520 fl.oif = dev_out->ifindex;
2522 fib_info_put(res.fi);
2524 flags |= RTCF_LOCAL;
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2541 dev_out = FIB_RES_DEV(res);
2543 fl.oif = dev_out->ifindex;
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2562 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2570 rth->fl.oif == flp->oif &&
2571 #ifdef CONFIG_IP_ROUTE_FWMARK
2572 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2574 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2575 (IPTOS_RT_MASK | RTO_ONLINK))) {
2577 /* check for multipath routes and choose one if
2580 if (multipath_select_route(flp, rth, rp)) {
2581 dst_hold(&(*rp)->u.dst);
2582 RT_CACHE_STAT_INC(out_hit);
2583 rcu_read_unlock_bh();
2587 rth->u.dst.lastuse = jiffies;
2588 dst_hold(&rth->u.dst);
2590 RT_CACHE_STAT_INC(out_hit);
2591 rcu_read_unlock_bh();
2595 RT_CACHE_STAT_INC(out_hlist_search);
2597 rcu_read_unlock_bh();
2599 return ip_route_output_slow(rp, flp);
2602 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2604 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2608 if ((err = __ip_route_output_key(rp, flp)) != 0)
2613 flp->fl4_src = (*rp)->rt_src;
2615 flp->fl4_dst = (*rp)->rt_dst;
2616 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2622 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2624 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2626 return ip_route_output_flow(rp, flp, NULL, 0);
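#if 0
/*
 * Illustrative sketch, not part of the original file: a typical in-kernel
 * caller of the output-routing API above.  The flowi initializer style
 * follows the one used elsewhere in this file; the surrounding function is
 * hypothetical.
 */
static int example_output_route(u32 daddr, u32 saddr, int oif)
{
	struct flowi fl = { .oif = oif,
			    .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } } };
	struct rtable *rt;
	int err;

	/* Fast path: cache hit in __ip_route_output_key(); otherwise the
	 * slow path builds and caches a new entry. */
	err = ip_route_output_key(&rt, &fl);
	if (err)
		return err;

	/* ... attach &rt->u.dst to outgoing skbs ... */

	ip_rt_put(rt);	/* drop the reference taken by the lookup */
	return 0;
}
#endif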
2629 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2630 int nowait, unsigned int flags)
2632 struct rtable *rt = (struct rtable*)skb->dst;
2634 struct nlmsghdr *nlh;
2635 unsigned char *b = skb->tail;
2636 struct rta_cacheinfo ci;
2637 #ifdef CONFIG_IP_MROUTE
2638 struct rtattr *eptr;
2640 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2641 r = NLMSG_DATA(nlh);
2642 r->rtm_family = AF_INET;
2643 r->rtm_dst_len = 32;
2645 r->rtm_tos = rt->fl.fl4_tos;
2646 r->rtm_table = RT_TABLE_MAIN;
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
2653 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2654 if (rt->fl.fl4_src) {
2655 r->rtm_src_len = 32;
2656 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2659 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2660 #ifdef CONFIG_NET_CLS_ROUTE
2661 if (rt->u.dst.tclassid)
2662 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2664 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2665 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2666 __u32 alg = rt->rt_multipath_alg;
2668 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2672 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2673 else if (rt->rt_src != rt->fl.fl4_src)
2674 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2675 if (rt->rt_dst != rt->rt_gateway)
2676 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2677 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 goto rtattr_failure;
2679 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2680 ci.rta_used = rt->u.dst.__use;
2681 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2682 if (rt->u.dst.expires)
2683 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2686 ci.rta_error = rt->u.dst.error;
2687 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2689 ci.rta_id = rt->peer->ip_id_count;
2690 if (rt->peer->tcp_ts_stamp) {
2691 ci.rta_ts = rt->peer->tcp_ts;
2692 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2695 #ifdef CONFIG_IP_MROUTE
2696 eptr = (struct rtattr*)skb->tail;
2698 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2700 #ifdef CONFIG_IP_MROUTE
2701 u32 dst = rt->rt_dst;
2703 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2704 ipv4_devconf.mc_forwarding) {
2705 int err = ipmr_get_route(skb, r, nowait);
2712 if (err == -EMSGSIZE)
2714 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2719 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2722 nlh->nlmsg_len = skb->tail - b;
2727 skb_trim(skb, b - skb->data);
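#if 0
/*
 * Illustrative sketch, not part of the original file: the message built by
 * rt_fill_info() is an nlmsghdr, a fixed struct rtmsg, and then a series
 * of length/type-framed rtattr TLVs (RTA_DST, optionally RTA_SRC, RTA_OIF,
 * RTA_PREFSRC, RTA_GATEWAY, the metrics, RTA_CACHEINFO, ...).  A receiver
 * walks them roughly as below; the helper name is hypothetical.
 */
static void walk_route_reply(struct nlmsghdr *nlh)
{
	struct rtmsg *r = NLMSG_DATA(nlh);
	struct rtattr *rta = RTM_RTA(r);
	int len = RTM_PAYLOAD(nlh);

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type == RTA_DST) {
			/* 4-byte IPv4 destination, as put above. */
		}
	}
}
#endif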
2731 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2733 struct rtattr **rta = arg;
2734 struct rtmsg *rtm = NLMSG_DATA(nlh);
2735 struct rtable *rt = NULL;
2740 struct sk_buff *skb;
2742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2746 /* Reserve room for dummy headers; this skb can pass
2747 through a good chunk of the routing engine.
2749 skb->mac.raw = skb->nh.raw = skb->data;
2751 /* Bugfix: need to give ip_route_input enough of an IP header so it does not choke. */
2752 skb->nh.iph->protocol = IPPROTO_ICMP;
2753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 if (rta[RTA_SRC - 1])
2756 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 if (rta[RTA_DST - 1])
2758 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 if (rta[RTA_IIF - 1])
2760 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2763 struct net_device *dev = __dev_get_by_index(iif);
2767 skb->protocol = htons(ETH_P_IP);
2770 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772 rt = (struct rtable*)skb->dst;
2773 if (!err && rt->u.dst.error)
2774 err = -rt->u.dst.error;
2776 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2778 .tos = rtm->rtm_tos } } };
2780 if (rta[RTA_OIF - 1])
2781 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2783 err = ip_route_output_key(&rt, &fl);
2788 skb->dst = &rt->u.dst;
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2792 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 RTM_NEWROUTE, 0, 0);
2803 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
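#if 0
/*
 * Illustrative sketch, not part of the original file: the userspace side
 * of inet_rtm_getroute(), i.e. what "ip route get" does in essence.  A
 * single RTM_GETROUTE request carrying an RTA_DST attribute is sent over a
 * NETLINK_ROUTE socket; the kernel answers with the RTM_NEWROUTE message
 * assembled by rt_fill_info().  Error handling is omitted and the helper
 * name is hypothetical.
 */
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

static int query_route(in_addr_t dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
		char		attrbuf[64];
	} req;
	struct rtattr *rta;
	char reply[4096];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* Append RTA_DST holding the destination address. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(4);
	memcpy(RTA_DATA(rta), &dst, 4);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	send(fd, &req, req.nlh.nlmsg_len, 0);
	recv(fd, reply, sizeof(reply), 0);	/* RTM_NEWROUTE reply */
	close(fd);
	return 0;
}
#endif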
2813 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
2834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2838 dst_release(xchg(&skb->dst, NULL));
2840 rcu_read_unlock_bh();
2849 void ip_rt_multicast_event(struct in_device *in_dev)
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
2880 if (newlen != sizeof(int))
2882 if (get_user(delay, (int __user *)newval))
2884 rt_cache_flush(delay);
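#if 0
/*
 * Illustrative sketch, not part of the original file: triggering the flush
 * handler above from userspace.  Writing a delay (in seconds) to
 * /proc/sys/net/ipv4/route/flush ends up in rt_cache_flush(); writing 0
 * flushes immediately.  Equivalent to
 * "echo 0 > /proc/sys/net/ipv4/route/flush".  The helper name is
 * hypothetical.
 */
#include <fcntl.h>
#include <unistd.h>

static void flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		write(fd, "0\n", 2);	/* delay of 0 seconds */
		close(fd);
	}
}
#endif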
2888 ctl_table ipv4_route_table[] = {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2922 .proc_handler = &proc_dointvec,
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2930 .proc_handler = &proc_dointvec,
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2976 .proc_handler = &proc_dointvec,
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3008 .proc_handler = &proc_dointvec,
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3016 .proc_handler = &proc_dointvec,
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3033 .proc_handler = &proc_dointvec,
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3041 .proc_handler = &proc_dointvec,
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3059 /* This code sucks. But you should have seen it before! --RR */
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3070 if ((offset & 3) || (length & 3))
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3083 offset /= sizeof(u32);
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3094 for_each_possible_cpu(i) {
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099 for (j = 0; j < length/4; j++)
	dst[j] += src[j];
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3113 rhash_entries = simple_strtoul(str, &str, 0);
3116 __setup("rhash_entries=", set_rhash_entries);
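/*
 * Usage note, not part of the original file: the route cache hash table
 * sized in ip_rt_init() below can be fixed at boot time, e.g. by passing
 * "rhash_entries=262144" on the kernel command line; the value parsed here
 * is handed to alloc_large_system_hash() in place of its memory-based
 * default.
 */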
3118 int __init ip_rt_init(void)
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3138 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 sizeof(struct rtable),
3140 0, SLAB_HWCACHE_ALIGN,
3143 if (!ipv4_dst_ops.kmem_cachep)
3144 panic("IP: failed to allocate ip_dst_cache\n");
3146 rt_hash_table = (struct rt_hash_bucket *)
3147 alloc_large_system_hash("IP route cache",
3148 sizeof(struct rt_hash_bucket),
3150 (num_physpages >= 128 * 1024) ?
3156 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 rt_hash_lock_init();
3159 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 ip_rt_max_size = (rt_hash_mask + 1) * 16;
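	/*
	 * Worked example, not part of the original file: with a hash table
	 * of 131072 buckets (rt_hash_mask == 0x1ffff), gc_thresh becomes
	 * 131072 cached routes and ip_rt_max_size 131072 * 16 = 2097152,
	 * i.e. the garbage collector aims at roughly one entry per bucket
	 * and the cache is hard-limited to sixteen per bucket on average.
	 */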
3165 init_timer(&rt_flush_timer);
3166 rt_flush_timer.function = rt_run_flush;
3167 init_timer(&rt_periodic_timer);
3168 rt_periodic_timer.function = rt_check_expire;
3169 init_timer(&rt_secret_timer);
3170 rt_secret_timer.function = rt_secret_rebuild;
3172 /* All the timers started at system startup tend
3173 to synchronize. Perturb them a bit.
3175 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3177 add_timer(&rt_periodic_timer);
3179 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3180 ip_rt_secret_interval;
3181 add_timer(&rt_secret_timer);
3183 #ifdef CONFIG_PROC_FS
3185 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3186 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3187 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);