2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
100 #include <net/ip_fib.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
107 #include <linux/sysctl.h>
110 #define RT_FL_TOS(oldflp) \
111 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 #define IP_MAX_MTU 0xFFF0
115 #define RT_GC_TIMEOUT (300*HZ)
117 static int ip_rt_min_delay = 2 * HZ;
118 static int ip_rt_max_delay = 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval = 60 * HZ;
122 static int ip_rt_gc_min_interval = HZ / 2;
123 static int ip_rt_redirect_number = 9;
124 static int ip_rt_redirect_load = HZ / 50;
125 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost = HZ;
127 static int ip_rt_error_burst = 5 * HZ;
128 static int ip_rt_gc_elasticity = 8;
129 static int ip_rt_mtu_expires = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu = 512 + 20 + 20;
131 static int ip_rt_min_advmss = 256;
132 static int ip_rt_secret_interval = 10 * 60 * HZ;
133 static unsigned long rt_deadline;
135 #define RTprint(a...) printk(KERN_DEBUG a)
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
155 static struct dst_ops ipv4_dst_ops = {
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .entry_size = sizeof(struct rtable),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 __u8 ip_tos2prio[16] = {
174 ECN_OR_COST(BESTEFFORT),
180 ECN_OR_COST(INTERACTIVE),
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
213 #define RT_HASH_LOCK_SZ 4096
215 #define RT_HASH_LOCK_SZ 2048
217 #define RT_HASH_LOCK_SZ 1024
219 #define RT_HASH_LOCK_SZ 512
221 #define RT_HASH_LOCK_SZ 256
224 static spinlock_t *rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init() { \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
238 static struct rt_hash_bucket *rt_hash_table;
239 static unsigned rt_hash_mask;
240 static int rt_hash_log;
241 static unsigned int rt_hash_rnd;
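/*
 * A minimal sketch of the bucket locking pattern described above (kept in
 * this listing for illustration only and compiled out; the example_*
 * helpers are not real kernel functions, they merely mirror what
 * ip_route_input() and rt_del() do): readers walk a chain under
 * rcu_read_lock_bh() and take only atomic references, while writers
 * unlink entries under the per-bucket lock from rt_hash_lock_addr().
 */
#if 0
static struct rtable *example_cache_lookup(unsigned hash, u32 daddr,
					   u32 saddr, int iif)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif) {
			dst_hold(&rth->u.dst);	/* reader takes a reference */
			rcu_read_unlock_bh();
			return rth;
		}
	}
	rcu_read_unlock_bh();
	return NULL;
}

static void example_cache_unlink(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));	/* only writers take the lock */
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next) {
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;	/* unlink from the chain */
			rt_free(rt);		/* actual free waits for RCU */
			break;
		}
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
#endif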
243 struct rt_cache_stat *rt_cache_stat;
245 static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 struct rtable **res);
248 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
250 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
254 #ifdef CONFIG_PROC_FS
255 struct rt_cache_iter_state {
259 static struct rtable *rt_cache_get_first(struct seq_file *seq)
261 struct rtable *r = NULL;
262 struct rt_cache_iter_state *st = seq->private;
264 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
266 r = rt_hash_table[st->bucket].chain;
269 rcu_read_unlock_bh();
274 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
276 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
280 rcu_read_unlock_bh();
281 if (--st->bucket < 0)
284 r = rt_hash_table[st->bucket].chain;
289 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
291 struct rtable *r = rt_cache_get_first(seq);
294 while (pos && (r = rt_cache_get_next(seq, r)))
296 return pos ? NULL : r;
299 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
301 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
306 struct rtable *r = NULL;
308 if (v == SEQ_START_TOKEN)
309 r = rt_cache_get_first(seq);
311 r = rt_cache_get_next(seq, v);
316 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
318 if (v && v != SEQ_START_TOKEN)
319 rcu_read_unlock_bh();
322 static int rt_cache_seq_show(struct seq_file *seq, void *v)
324 if (v == SEQ_START_TOKEN)
325 seq_printf(seq, "%-127s\n",
326 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
327 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330 struct rtable *r = v;
333 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
334 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
335 r->u.dst.dev ? r->u.dst.dev->name : "*",
336 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
337 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
338 r->u.dst.__use, 0, (unsigned long)r->rt_src,
339 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
340 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
341 dst_metric(&r->u.dst, RTAX_WINDOW),
342 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
343 dst_metric(&r->u.dst, RTAX_RTTVAR)),
345 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
346 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349 seq_printf(seq, "%-127s\n", temp);
354 static struct seq_operations rt_cache_seq_ops = {
355 .start = rt_cache_seq_start,
356 .next = rt_cache_seq_next,
357 .stop = rt_cache_seq_stop,
358 .show = rt_cache_seq_show,
361 static int rt_cache_seq_open(struct inode *inode, struct file *file)
363 struct seq_file *seq;
365 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
369 rc = seq_open(file, &rt_cache_seq_ops);
372 seq = file->private_data;
374 memset(s, 0, sizeof(*s));
382 static struct file_operations rt_cache_seq_fops = {
383 .owner = THIS_MODULE,
384 .open = rt_cache_seq_open,
387 .release = seq_release_private,
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
396 return SEQ_START_TOKEN;
398 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 if (!cpu_possible(cpu))
402 return per_cpu_ptr(rt_cache_stat, cpu);
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
415 return per_cpu_ptr(rt_cache_stat, cpu);
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
428 struct rt_cache_stat *st = v;
430 if (v == SEQ_START_TOKEN) {
431 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
436 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 atomic_read(&ipv4_dst_ops.entries),
460 static struct seq_operations rt_cpu_seq_ops = {
461 .start = rt_cpu_seq_start,
462 .next = rt_cpu_seq_next,
463 .stop = rt_cpu_seq_stop,
464 .show = rt_cpu_seq_show,
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
470 return seq_open(file, &rt_cpu_seq_ops);
473 static struct file_operations rt_cpu_seq_fops = {
474 .owner = THIS_MODULE,
475 .open = rt_cpu_seq_open,
478 .release = seq_release,
481 #endif /* CONFIG_PROC_FS */
483 static __inline__ void rt_free(struct rtable *rt)
485 multipath_remove(rt);
486 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 static __inline__ void rt_drop(struct rtable *rt)
491 multipath_remove(rt);
493 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 static __inline__ int rt_fast_clean(struct rtable *rth)
498 /* Kill broadcast/multicast entries very aggressively, if they
499 collide in the hash table with more useful entries */
500 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
501 rth->fl.iif && rth->u.rt_next;
504 static __inline__ int rt_valuable(struct rtable *rth)
506 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
510 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
515 if (atomic_read(&rth->u.dst.__refcnt))
519 if (rth->u.dst.expires &&
520 time_after_eq(jiffies, rth->u.dst.expires))
523 age = jiffies - rth->u.dst.lastuse;
525 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
526 (age <= tmo2 && rt_valuable(rth)))
532 /* Bits of score are:
534 * 30: not quite useless
535 * 29..0: usage counter
537 static inline u32 rt_score(struct rtable *rt)
539 u32 score = jiffies - rt->u.dst.lastuse;
541 score = ~score & ~(3<<30);
547 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
553 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
555 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
556 fl1->oif == fl2->oif &&
557 fl1->iif == fl2->iif;
560 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
561 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
562 struct rtable *expentry,
565 int passedexpired = 0;
566 struct rtable **nextstep = NULL;
567 struct rtable **rthp = chain_head;
573 while ((rth = *rthp) != NULL) {
577 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
578 compare_keys(&(*rthp)->fl, &expentry->fl)) {
579 if (*rthp == expentry) {
580 *rthp = rth->u.rt_next;
583 *rthp = rth->u.rt_next;
589 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
590 passedexpired && !nextstep)
591 nextstep = &rth->u.rt_next;
593 rthp = &rth->u.rt_next;
603 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606 /* This runs via a timer and thus is always in BH context. */
607 static void rt_check_expire(unsigned long dummy)
609 static unsigned int rover;
610 unsigned int i = rover, goal;
611 struct rtable *rth, **rthp;
612 unsigned long now = jiffies;
615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
621 unsigned long tmo = ip_rt_gc_timeout;
623 i = (i + 1) & rt_hash_mask;
624 rthp = &rt_hash_table[i].chain;
628 spin_lock(rt_hash_lock_addr(i));
629 while ((rth = *rthp) != NULL) {
630 if (rth->u.dst.expires) {
631 /* Entry is expired even if it is in use */
632 if (time_before_eq(now, rth->u.dst.expires)) {
634 rthp = &rth->u.rt_next;
637 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
639 rthp = &rth->u.rt_next;
643 /* Cleanup aged off entries. */
644 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
645 /* remove all related balanced entries if necessary */
646 if (rth->u.dst.flags & DST_BALANCED) {
647 rthp = rt_remove_balanced_route(
648 &rt_hash_table[i].chain,
653 *rthp = rth->u.rt_next;
656 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
657 *rthp = rth->u.rt_next;
659 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
661 spin_unlock(rt_hash_lock_addr(i));
663 /* Fallback loop breaker. */
664 if (time_after(jiffies, now))
668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
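/*
 * Worked example of the scan goal computed at the top of rt_check_expire()
 * (illustration only, compiled out): roughly
 * buckets * ip_rt_gc_interval / ip_rt_gc_timeout chains are visited per
 * run, so the whole table is covered about once every ip_rt_gc_timeout.
 */
#if 0
static unsigned int example_expire_goal(unsigned int buckets,
					unsigned int gc_interval,
					unsigned int gc_timeout)
{
	unsigned long long mult = (unsigned long long)gc_interval * buckets;
	unsigned int goal;

	goal = gc_timeout > 1 ? (unsigned int)(mult / gc_timeout)
			      : (unsigned int)mult;
	if (goal > buckets)
		goal = buckets;	/* never more than one full sweep per run */
	return goal;
	/* e.g. 65536 buckets, interval 60*HZ, timeout 300*HZ -> 13107 */
}
#endif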
671 /* This can run from both BH and non-BH contexts, the latter
672 * in the case of a forced flush event.
674 static void rt_run_flush(unsigned long dummy)
677 struct rtable *rth, *next;
681 get_random_bytes(&rt_hash_rnd, 4);
683 for (i = rt_hash_mask; i >= 0; i--) {
684 spin_lock_bh(rt_hash_lock_addr(i));
685 rth = rt_hash_table[i].chain;
687 rt_hash_table[i].chain = NULL;
688 spin_unlock_bh(rt_hash_lock_addr(i));
690 for (; rth; rth = next) {
691 next = rth->u.rt_next;
697 static DEFINE_SPINLOCK(rt_flush_lock);
699 void rt_cache_flush(int delay)
701 unsigned long now = jiffies;
702 int user_mode = !in_softirq();
705 delay = ip_rt_min_delay;
707 /* flush existing multipath state */
710 spin_lock_bh(&rt_flush_lock);
712 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
713 long tmo = (long)(rt_deadline - now);
715 /* If the flush timer is already running
716 and the flush request is not immediate (delay > 0):
718 if the deadline has not been reached, prolong the timer to "delay",
719 otherwise fire it at the deadline.
722 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
730 spin_unlock_bh(&rt_flush_lock);
735 if (rt_deadline == 0)
736 rt_deadline = now + ip_rt_max_delay;
738 mod_timer(&rt_flush_timer, now+delay);
739 spin_unlock_bh(&rt_flush_lock);
742 static void rt_secret_rebuild(unsigned long dummy)
744 unsigned long now = jiffies;
747 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
751 Short description of GC goals.
753 We want to build an algorithm which keeps the routing cache
754 at an equilibrium point, where the number of aged-off entries
755 stays approximately equal to the number of newly generated ones.
757 The current expiration strength is the variable "expire".
758 We try to adjust it dynamically, so that when the network
759 is idle "expire" is large enough to keep plenty of warm entries,
760 and when load increases it shrinks to limit cache size.
763 static int rt_garbage_collect(void)
765 static unsigned long expire = RT_GC_TIMEOUT;
766 static unsigned long last_gc;
768 static int equilibrium;
769 struct rtable *rth, **rthp;
770 unsigned long now = jiffies;
774 * Garbage collection is pretty expensive,
775 * so do not run it too frequently.
778 RT_CACHE_STAT_INC(gc_total);
780 if (now - last_gc < ip_rt_gc_min_interval &&
781 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
782 RT_CACHE_STAT_INC(gc_ignored);
786 /* Calculate the number of entries which we want to expire now. */
787 goal = atomic_read(&ipv4_dst_ops.entries) -
788 (ip_rt_gc_elasticity << rt_hash_log);
790 if (equilibrium < ipv4_dst_ops.gc_thresh)
791 equilibrium = ipv4_dst_ops.gc_thresh;
792 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
794 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
795 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798 /* We are in a dangerous area. Try to reduce the cache really
801 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
802 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 if (now - last_gc >= ip_rt_gc_min_interval)
816 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
817 unsigned long tmo = expire;
819 k = (k + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[k].chain;
821 spin_lock_bh(rt_hash_lock_addr(k));
822 while ((rth = *rthp) != NULL) {
823 if (!rt_may_expire(rth, tmo, expire)) {
825 rthp = &rth->u.rt_next;
828 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
829 /* remove all related balanced entries
832 if (rth->u.dst.flags & DST_BALANCED) {
835 rthp = rt_remove_balanced_route(
836 &rt_hash_table[i].chain,
843 *rthp = rth->u.rt_next;
847 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
848 *rthp = rth->u.rt_next;
851 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
853 spin_unlock_bh(rt_hash_lock_addr(k));
862 /* The goal was not achieved. We stop the process if:
864 - expire has been reduced to zero; otherwise, expire is halved.
865 - the table is not full.
866 - we are called from interrupt context.
867 - the jiffies check is just a fallback/debug loop breaker.
868 We will not spin here for a long time in any case.
871 RT_CACHE_STAT_INC(gc_goal_miss);
877 #if RT_CACHE_DEBUG >= 2
878 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
879 atomic_read(&ipv4_dst_ops.entries), goal, i);
882 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
884 } while (!in_softirq() && time_before_eq(jiffies, now));
886 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889 printk(KERN_WARNING "dst cache overflow\n");
890 RT_CACHE_STAT_INC(gc_dst_overflow);
894 expire += ip_rt_gc_min_interval;
895 if (expire > ip_rt_gc_timeout ||
896 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
897 expire = ip_rt_gc_timeout;
898 #if RT_CACHE_DEBUG >= 2
899 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
900 atomic_read(&ipv4_dst_ops.entries), goal, rover);
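/*
 * Illustrative numbers for the goal/equilibrium computation above
 * (compiled out, simplified): the cache is considered comfortable up to
 * ip_rt_gc_elasticity entries per bucket, and only the excess above that
 * is targeted; while the goal is missed "expire" is halved, and it grows
 * back by ip_rt_gc_min_interval per call once the pressure is gone.
 */
#if 0
static int example_gc_goal(int entries, int elasticity, int hash_log,
			   int gc_thresh)
{
	int goal = entries - (elasticity << hash_log);

	if (goal <= 0)
		goal = entries - gc_thresh;	/* only trim toward gc_thresh */
	return goal > 0 ? goal : 0;
	/* e.g. entries = 600000, elasticity = 8, hash_log = 16 -> 75712 */
}
#endif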
905 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
907 struct rtable *rth, **rthp;
909 struct rtable *cand, **candp;
912 int attempts = !in_softirq();
921 rthp = &rt_hash_table[hash].chain;
923 spin_lock_bh(rt_hash_lock_addr(hash));
924 while ((rth = *rthp) != NULL) {
925 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
926 if (!(rth->u.dst.flags & DST_BALANCED) &&
927 compare_keys(&rth->fl, &rt->fl)) {
929 if (compare_keys(&rth->fl, &rt->fl)) {
932 *rthp = rth->u.rt_next;
934 * Since lookup is lockfree, the deletion
935 * must be visible to another weakly ordered CPU before
936 * the insertion at the start of the hash chain.
938 rcu_assign_pointer(rth->u.rt_next,
939 rt_hash_table[hash].chain);
941 * Since lookup is lockfree, the update writes
942 * must be ordered for consistency on SMP.
944 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947 dst_hold(&rth->u.dst);
948 rth->u.dst.lastuse = now;
949 spin_unlock_bh(rt_hash_lock_addr(hash));
956 if (!atomic_read(&rth->u.dst.__refcnt)) {
957 u32 score = rt_score(rth);
959 if (score <= min_score) {
968 rthp = &rth->u.rt_next;
972 /* ip_rt_gc_elasticity used to be the average chain
973 * length; when it is exceeded, gc becomes really aggressive.
975 * The second limit is less certain. At the moment it allows
976 * only 2 entries per bucket. We will see.
978 if (chain_length > ip_rt_gc_elasticity) {
979 *candp = cand->u.rt_next;
984 /* Try to bind the route to ARP only if it is an output
985 route or a unicast forwarding path.
987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
988 int err = arp_bind_neighbour(&rt->u.dst);
990 spin_unlock_bh(rt_hash_lock_addr(hash));
992 if (err != -ENOBUFS) {
997 /* Neighbour tables are full and nothing
998 can be released. Try to shrink the route cache;
999 it most likely holds some neighbour records.
1001 if (attempts-- > 0) {
1002 int saved_elasticity = ip_rt_gc_elasticity;
1003 int saved_int = ip_rt_gc_min_interval;
1004 ip_rt_gc_elasticity = 1;
1005 ip_rt_gc_min_interval = 0;
1006 rt_garbage_collect();
1007 ip_rt_gc_min_interval = saved_int;
1008 ip_rt_gc_elasticity = saved_elasticity;
1012 if (net_ratelimit())
1013 printk(KERN_WARNING "Neighbour table overflow.\n");
1019 rt->u.rt_next = rt_hash_table[hash].chain;
1020 #if RT_CACHE_DEBUG >= 2
1021 if (rt->u.rt_next) {
1023 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1024 NIPQUAD(rt->rt_dst));
1025 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1026 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1030 rt_hash_table[hash].chain = rt;
1031 spin_unlock_bh(rt_hash_lock_addr(hash));
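/*
 * A compact restatement of the eviction rule used above (illustration
 * only, compiled out): while walking the chain we remember the
 * unreferenced entry with the lowest rt_score(), and once the chain grows
 * past ip_rt_gc_elasticity that candidate is unlinked to make room for
 * the new route.
 */
#if 0
static void example_evict_candidate(struct rtable **candp, struct rtable *cand,
				    int chain_length, int elasticity)
{
	if (cand && chain_length > elasticity) {
		*candp = cand->u.rt_next;	/* unlink the worst entry */
		rt_free(cand);			/* freed after an RCU grace period */
	}
}
#endif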
1036 void rt_bind_peer(struct rtable *rt, int create)
1038 static DEFINE_SPINLOCK(rt_peer_lock);
1039 struct inet_peer *peer;
1041 peer = inet_getpeer(rt->rt_dst, create);
1043 spin_lock_bh(&rt_peer_lock);
1044 if (rt->peer == NULL) {
1048 spin_unlock_bh(&rt_peer_lock);
1054 * Peer allocation may fail only in serious out-of-memory conditions. However,
1055 * we can still generate some output.
1056 * Random ID selection looks a bit dangerous because we have no chance of
1057 * selecting an ID that stays unique for a reasonable period of time.
1058 * But a broken packet identifier may be better than no packet at all.
1060 static void ip_select_fb_ident(struct iphdr *iph)
1062 static DEFINE_SPINLOCK(ip_fb_id_lock);
1063 static u32 ip_fallback_id;
1066 spin_lock_bh(&ip_fb_id_lock);
1067 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1068 iph->id = htons(salt & 0xFFFF);
1069 ip_fallback_id = salt;
1070 spin_unlock_bh(&ip_fb_id_lock);
1073 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1075 struct rtable *rt = (struct rtable *) dst;
1078 if (rt->peer == NULL)
1079 rt_bind_peer(rt, 1);
1081 /* If a peer is attached to the destination, it is never detached,
1082 so we need not grab a lock to dereference it.
1085 iph->id = htons(inet_getid(rt->peer, more));
1089 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1090 __builtin_return_address(0));
1092 ip_select_fb_ident(iph);
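/*
 * Sketch of the fallback ID scheme used by ip_select_fb_ident() above
 * (illustration only, compiled out): a rolling per-host value is mixed
 * with the destination address through a keyed hash (secure_ip_id() in
 * the real code; example_mix() below is a plain integer hash standing in
 * for it) and the low 16 bits become the datagram ID.
 */
#if 0
static u32 example_mix(u32 x)
{
	x ^= x >> 16;		/* NOT cryptographic, illustration only */
	x *= 0x45d9f3b;
	x ^= x >> 16;
	return x;
}

static u16 example_fallback_id(u32 *rolling, u32 daddr)
{
	u32 salt = example_mix(*rolling ^ daddr);

	*rolling = salt;	/* remembered for the next packet */
	return (u16)(salt & 0xFFFF);
}
#endif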
1095 static void rt_del(unsigned hash, struct rtable *rt)
1097 struct rtable **rthp;
1099 spin_lock_bh(rt_hash_lock_addr(hash));
1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1102 rthp = &(*rthp)->u.rt_next)
1104 *rthp = rt->u.rt_next;
1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1111 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1112 u32 saddr, u8 tos, struct net_device *dev)
1115 struct in_device *in_dev = in_dev_get(dev);
1116 struct rtable *rth, **rthp;
1117 u32 skeys[2] = { saddr, 0 };
1118 int ikeys[2] = { dev->ifindex, 0 };
1120 tos &= IPTOS_RT_MASK;
1125 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1126 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1127 goto reject_redirect;
1129 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1130 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1131 goto reject_redirect;
1132 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1133 goto reject_redirect;
1135 if (inet_addr_type(new_gw) != RTN_UNICAST)
1136 goto reject_redirect;
1139 for (i = 0; i < 2; i++) {
1140 for (k = 0; k < 2; k++) {
1141 unsigned hash = rt_hash_code(daddr,
1142 skeys[i] ^ (ikeys[k] << 5),
1145 rthp=&rt_hash_table[hash].chain;
1148 while ((rth = rcu_dereference(*rthp)) != NULL) {
1151 if (rth->fl.fl4_dst != daddr ||
1152 rth->fl.fl4_src != skeys[i] ||
1153 rth->fl.fl4_tos != tos ||
1154 rth->fl.oif != ikeys[k] ||
1156 rthp = &rth->u.rt_next;
1160 if (rth->rt_dst != daddr ||
1161 rth->rt_src != saddr ||
1163 rth->rt_gateway != old_gw ||
1164 rth->u.dst.dev != dev)
1167 dst_hold(&rth->u.dst);
1170 rt = dst_alloc(&ipv4_dst_ops);
1177 /* Copy all the information. */
1179 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1180 rt->u.dst.__use = 1;
1181 atomic_set(&rt->u.dst.__refcnt, 1);
1182 rt->u.dst.child = NULL;
1184 dev_hold(rt->u.dst.dev);
1186 in_dev_hold(rt->idev);
1187 rt->u.dst.obsolete = 0;
1188 rt->u.dst.lastuse = jiffies;
1189 rt->u.dst.path = &rt->u.dst;
1190 rt->u.dst.neighbour = NULL;
1191 rt->u.dst.hh = NULL;
1192 rt->u.dst.xfrm = NULL;
1194 rt->rt_flags |= RTCF_REDIRECTED;
1196 /* Gateway is different ... */
1197 rt->rt_gateway = new_gw;
1199 /* Redirect received -> path was valid */
1200 dst_confirm(&rth->u.dst);
1203 atomic_inc(&rt->peer->refcnt);
1205 if (arp_bind_neighbour(&rt->u.dst) ||
1206 !(rt->u.dst.neighbour->nud_state &
1208 if (rt->u.dst.neighbour)
1209 neigh_event_send(rt->u.dst.neighbour, NULL);
1216 if (!rt_intern_hash(hash, rt, &rt))
1229 #ifdef CONFIG_IP_ROUTE_VERBOSE
1230 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1231 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1232 "%u.%u.%u.%u ignored.\n"
1233 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1235 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1236 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1241 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1243 struct rtable *rt = (struct rtable*)dst;
1244 struct dst_entry *ret = dst;
1247 if (dst->obsolete) {
1250 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1251 rt->u.dst.expires) {
1252 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1256 #if RT_CACHE_DEBUG >= 1
1257 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1258 "%u.%u.%u.%u/%02x dropped\n",
1259 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1270 * 1. The first ip_rt_redirect_number redirects are sent
1271 * with exponential backoff, then we stop sending them at all,
1272 * assuming that the host ignores our redirects.
1273 * 2. If we did not see packets requiring redirects
1274 * during ip_rt_redirect_silence, we assume that the host
1275 * forgot the redirected route and start sending redirects again.
1277 * This algorithm is much cheaper and more intelligent than dumb load limiting
1280 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1281 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284 void ip_rt_send_redirect(struct sk_buff *skb)
1286 struct rtable *rt = (struct rtable*)skb->dst;
1287 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1292 if (!IN_DEV_TX_REDIRECTS(in_dev))
1295 /* No redirected packets during ip_rt_redirect_silence;
1296 * reset the algorithm.
1298 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1299 rt->u.dst.rate_tokens = 0;
1301 /* Too many ignored redirects; do not send anything.
1302 * Set u.dst.rate_last to the time of the last seen redirected packet.
1304 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1305 rt->u.dst.rate_last = jiffies;
1309 /* Check for load limit; set rate_last to the latest sent
1312 if (time_after(jiffies,
1313 (rt->u.dst.rate_last +
1314 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1315 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1316 rt->u.dst.rate_last = jiffies;
1317 ++rt->u.dst.rate_tokens;
1318 #ifdef CONFIG_IP_ROUTE_VERBOSE
1319 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1320 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1322 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1323 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1324 NIPQUAD(rt->rt_src), rt->rt_iif,
1325 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
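/*
 * Worked example of the redirect backoff implemented above (illustration
 * only, compiled out): with the defaults ip_rt_redirect_load = HZ/50 and
 * ip_rt_redirect_number = 9, redirects to one peer are spaced
 * load << rate_tokens jiffies apart (20ms, 40ms, ... ~5.1s at HZ=1000);
 * after nine of them we stay silent until ip_rt_redirect_silence
 * (= load << 10, roughly 20s) elapses without further provocation.
 */
#if 0
static int example_may_send_redirect(unsigned long now, unsigned long rate_last,
				     unsigned int rate_tokens,
				     unsigned long load, unsigned int number)
{
	if (rate_tokens >= number)
		return 0;	/* the peer ignores us: give up */
	return time_after(now, rate_last + (load << rate_tokens));
}
#endif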
1332 static int ip_error(struct sk_buff *skb)
1334 struct rtable *rt = (struct rtable*)skb->dst;
1338 switch (rt->u.dst.error) {
1343 code = ICMP_HOST_UNREACH;
1346 code = ICMP_NET_UNREACH;
1349 code = ICMP_PKT_FILTERED;
1354 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1355 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1356 rt->u.dst.rate_tokens = ip_rt_error_burst;
1357 rt->u.dst.rate_last = now;
1358 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1359 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1360 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 out: kfree_skb(skb);
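/*
 * Token-bucket view of the rate limit above (illustration only, compiled
 * out): tokens accumulate one per jiffy, are capped at ip_rt_error_burst
 * (5*HZ) and each ICMP error costs ip_rt_error_cost (HZ) of them, i.e.
 * bursts of up to five errors, then one per second sustained.
 */
#if 0
static int example_error_allowed(unsigned long now, unsigned long *rate_last,
				 unsigned long *tokens,
				 unsigned long cost, unsigned long burst)
{
	*tokens += now - *rate_last;	/* refill with the elapsed jiffies */
	if (*tokens > burst)
		*tokens = burst;
	*rate_last = now;
	if (*tokens >= cost) {
		*tokens -= cost;	/* pay for this ICMP error */
		return 1;
	}
	return 0;
}
#endif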
1368 * The last two values are not from the RFC but
1369 * are needed for AMPRnet AX.25 paths.
1372 static unsigned short mtu_plateau[] =
1373 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1375 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1379 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1380 if (old_mtu > mtu_plateau[i])
1381 return mtu_plateau[i];
1385 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1388 unsigned short old_mtu = ntohs(iph->tot_len);
1390 u32 skeys[2] = { iph->saddr, 0, };
1391 u32 daddr = iph->daddr;
1392 u8 tos = iph->tos & IPTOS_RT_MASK;
1393 unsigned short est_mtu = 0;
1395 if (ipv4_config.no_pmtu_disc)
1398 for (i = 0; i < 2; i++) {
1399 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1402 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1403 rth = rcu_dereference(rth->u.rt_next)) {
1404 if (rth->fl.fl4_dst == daddr &&
1405 rth->fl.fl4_src == skeys[i] &&
1406 rth->rt_dst == daddr &&
1407 rth->rt_src == iph->saddr &&
1408 rth->fl.fl4_tos == tos &&
1410 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1411 unsigned short mtu = new_mtu;
1413 if (new_mtu < 68 || new_mtu >= old_mtu) {
1415 /* BSD 4.2 compatibility hack :-( */
1417 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1418 old_mtu >= 68 + (iph->ihl << 2))
1419 old_mtu -= iph->ihl << 2;
1421 mtu = guess_mtu(old_mtu);
1423 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1424 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1425 dst_confirm(&rth->u.dst);
1426 if (mtu < ip_rt_min_pmtu) {
1427 mtu = ip_rt_min_pmtu;
1428 rth->u.dst.metrics[RTAX_LOCK-1] |=
1431 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1432 dst_set_expires(&rth->u.dst,
1441 return est_mtu ? : new_mtu;
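/*
 * Illustrative use of the plateau table above (compiled out): when the
 * router supplies no usable MTU in its "fragmentation needed" message we
 * fall back to the first plateau strictly below the size of the rejected
 * datagram, e.g. 1500 bytes yields a guess of 1492, and a later rejection
 * at 1492 yields 576.
 */
#if 0
static unsigned short example_guess_mtu(unsigned short old_mtu)
{
	/* same plateaus as mtu_plateau[] above */
	static const unsigned short plateau[] =
		{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
	unsigned int i;

	for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];	/* first plateau below old_mtu */
	return 68;				/* minimal IPv4 MTU */
}
#endif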
1444 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1446 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1447 !(dst_metric_locked(dst, RTAX_MTU))) {
1448 if (mtu < ip_rt_min_pmtu) {
1449 mtu = ip_rt_min_pmtu;
1450 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1452 dst->metrics[RTAX_MTU-1] = mtu;
1453 dst_set_expires(dst, ip_rt_mtu_expires);
1457 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1464 struct rtable *rt = (struct rtable *) dst;
1465 struct inet_peer *peer = rt->peer;
1466 struct in_device *idev = rt->idev;
1479 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482 struct rtable *rt = (struct rtable *) dst;
1483 struct in_device *idev = rt->idev;
1484 if (dev != &loopback_dev && idev && idev->dev == dev) {
1485 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1486 if (loopback_idev) {
1487 rt->idev = loopback_idev;
1493 static void ipv4_link_failure(struct sk_buff *skb)
1497 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1499 rt = (struct rtable *) skb->dst;
1501 dst_set_expires(&rt->u.dst, 0);
1504 static int ip_rt_bug(struct sk_buff *skb)
1506 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1507 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1508 skb->dev ? skb->dev->name : "?");
1514 We do not cache the source address of the outgoing interface,
1515 because it is used only by the IP RR, TS and SRR options,
1516 so it is out of the fast path.
1518 BTW remember: "addr" is allowed to be unaligned
1522 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525 struct fib_result res;
1527 if (rt->fl.iif == 0)
1529 else if (fib_lookup(&rt->fl, &res) == 0) {
1530 src = FIB_RES_PREFSRC(res);
1533 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1535 memcpy(addr, &src, 4);
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539 static void set_class_tag(struct rtable *rt, u32 tag)
1541 if (!(rt->u.dst.tclassid & 0xFFFF))
1542 rt->u.dst.tclassid |= tag & 0xFFFF;
1543 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1544 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1548 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1550 struct fib_info *fi = res->fi;
1553 if (FIB_RES_GW(*res) &&
1554 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1555 rt->rt_gateway = FIB_RES_GW(*res);
1556 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1557 sizeof(rt->u.dst.metrics));
1558 if (fi->fib_mtu == 0) {
1559 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1560 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1561 rt->rt_gateway != rt->rt_dst &&
1562 rt->u.dst.dev->mtu > 576)
1563 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1571 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1572 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1573 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1574 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1575 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1576 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1578 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1579 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 #ifdef CONFIG_IP_MULTIPLE_TABLES
1583 set_class_tag(rt, fib_rules_tclass(res));
1585 set_class_tag(rt, itag);
1587 rt->rt_type = res->type;
1590 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1591 u8 tos, struct net_device *dev, int our)
1596 struct in_device *in_dev = in_dev_get(dev);
1599 /* Primary sanity checks. */
1604 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1605 skb->protocol != htons(ETH_P_IP))
1608 if (ZERONET(saddr)) {
1609 if (!LOCAL_MCAST(daddr))
1611 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1612 } else if (fib_validate_source(saddr, 0, tos, 0,
1613 dev, &spec_dst, &itag) < 0)
1616 rth = dst_alloc(&ipv4_dst_ops);
1620 rth->u.dst.output= ip_rt_bug;
1622 atomic_set(&rth->u.dst.__refcnt, 1);
1623 rth->u.dst.flags= DST_HOST;
1624 if (in_dev->cnf.no_policy)
1625 rth->u.dst.flags |= DST_NOPOLICY;
1626 rth->fl.fl4_dst = daddr;
1627 rth->rt_dst = daddr;
1628 rth->fl.fl4_tos = tos;
1629 #ifdef CONFIG_IP_ROUTE_FWMARK
1630 rth->fl.fl4_fwmark= skb->nfmark;
1632 rth->fl.fl4_src = saddr;
1633 rth->rt_src = saddr;
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635 rth->u.dst.tclassid = itag;
1638 rth->fl.iif = dev->ifindex;
1639 rth->u.dst.dev = &loopback_dev;
1640 dev_hold(rth->u.dst.dev);
1641 rth->idev = in_dev_get(rth->u.dst.dev);
1643 rth->rt_gateway = daddr;
1644 rth->rt_spec_dst= spec_dst;
1645 rth->rt_type = RTN_MULTICAST;
1646 rth->rt_flags = RTCF_MULTICAST;
1648 rth->u.dst.input= ip_local_deliver;
1649 rth->rt_flags |= RTCF_LOCAL;
1652 #ifdef CONFIG_IP_MROUTE
1653 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1654 rth->u.dst.input = ip_mr_input;
1656 RT_CACHE_STAT_INC(in_slow_mc);
1659 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1660 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1672 static void ip_handle_martian_source(struct net_device *dev,
1673 struct in_device *in_dev,
1674 struct sk_buff *skb,
1678 RT_CACHE_STAT_INC(in_martian_src);
1679 #ifdef CONFIG_IP_ROUTE_VERBOSE
1680 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1682 * RFC1812 recommendation: if the source is martian,
1683 * the only hint we have is the MAC header.
1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686 "%u.%u.%u.%u, on dev %s\n",
1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1688 if (dev->hard_header_len) {
1690 unsigned char *p = skb->mac.raw;
1691 printk(KERN_WARNING "ll header: ");
1692 for (i = 0; i < dev->hard_header_len; i++, p++) {
1694 if (i < (dev->hard_header_len - 1))
1703 static inline int __mkroute_input(struct sk_buff *skb,
1704 struct fib_result* res,
1705 struct in_device *in_dev,
1706 u32 daddr, u32 saddr, u32 tos,
1707 struct rtable **result)
1712 struct in_device *out_dev;
1716 /* get a working reference to the output device */
1717 out_dev = in_dev_get(FIB_RES_DEV(*res));
1718 if (out_dev == NULL) {
1719 if (net_ratelimit())
1720 printk(KERN_CRIT "Bug in ip_route_input" \
1721 "_slow(). Please, report\n");
1726 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1727 in_dev->dev, &spec_dst, &itag);
1729 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1737 flags |= RTCF_DIRECTSRC;
1739 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1740 (IN_DEV_SHARED_MEDIA(out_dev) ||
1741 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1742 flags |= RTCF_DOREDIRECT;
1744 if (skb->protocol != htons(ETH_P_IP)) {
1745 /* Not IP (i.e. ARP). Do not create a route if it is
1746 * invalid for proxy ARP. DNAT routes are always valid.
1748 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1755 rth = dst_alloc(&ipv4_dst_ops);
1761 rth->u.dst.flags= DST_HOST;
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1763 if (res->fi->fib_nhs > 1)
1764 rth->u.dst.flags |= DST_BALANCED;
1766 if (in_dev->cnf.no_policy)
1767 rth->u.dst.flags |= DST_NOPOLICY;
1768 if (in_dev->cnf.no_xfrm)
1769 rth->u.dst.flags |= DST_NOXFRM;
1770 rth->fl.fl4_dst = daddr;
1771 rth->rt_dst = daddr;
1772 rth->fl.fl4_tos = tos;
1773 #ifdef CONFIG_IP_ROUTE_FWMARK
1774 rth->fl.fl4_fwmark= skb->nfmark;
1776 rth->fl.fl4_src = saddr;
1777 rth->rt_src = saddr;
1778 rth->rt_gateway = daddr;
1780 rth->fl.iif = in_dev->dev->ifindex;
1781 rth->u.dst.dev = (out_dev)->dev;
1782 dev_hold(rth->u.dst.dev);
1783 rth->idev = in_dev_get(rth->u.dst.dev);
1785 rth->rt_spec_dst= spec_dst;
1787 rth->u.dst.input = ip_forward;
1788 rth->u.dst.output = ip_output;
1790 rt_set_nexthop(rth, res, itag);
1792 rth->rt_flags = flags;
1797 /* release the working reference to the output device */
1798 in_dev_put(out_dev);
1802 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1803 struct fib_result* res,
1804 const struct flowi *fl,
1805 struct in_device *in_dev,
1806 u32 daddr, u32 saddr, u32 tos)
1808 struct rtable* rth = NULL;
1812 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1813 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1814 fib_select_multipath(fl, res);
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821 atomic_set(&rth->u.dst.__refcnt, 1);
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829 struct fib_result* res,
1830 const struct flowi *fl,
1831 struct in_device *in_dev,
1832 u32 daddr, u32 saddr, u32 tos)
1834 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1835 struct rtable* rth = NULL;
1836 unsigned char hop, hopcount, lasthop;
1841 hopcount = res->fi->fib_nhs;
1845 lasthop = hopcount - 1;
1847 /* distinguish between multipath and singlepath */
1849 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1852 /* add all alternatives to the routing cache */
1853 for (hop = 0; hop < hopcount; hop++) {
1856 /* create a routing cache entry */
1857 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1862 /* put it into the cache */
1863 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1864 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1868 /* forward hop information to multipath impl. */
1869 multipath_set_nhinfo(rth,
1870 FIB_RES_NETWORK(*res),
1871 FIB_RES_NETMASK(*res),
1875 /* only for the last hop the reference count is handled
1879 atomic_set(&(skb->dst->__refcnt), 1);
1882 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1883 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1884 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1889 * NOTE. We drop all packets that have local source
1890 * addresses, because every properly looped-back packet
1891 * must already have the correct destination attached by the output routine.
1893 * This approach solves two big problems:
1894 * 1. Non-simplex devices are handled properly.
1895 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1898 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1899 u8 tos, struct net_device *dev)
1901 struct fib_result res;
1902 struct in_device *in_dev = in_dev_get(dev);
1903 struct flowi fl = { .nl_u = { .ip4_u =
1907 .scope = RT_SCOPE_UNIVERSE,
1908 #ifdef CONFIG_IP_ROUTE_FWMARK
1909 .fwmark = skb->nfmark
1912 .iif = dev->ifindex };
1915 struct rtable * rth;
1921 /* IP on this device is disabled. */
1926 /* Check for the most weird martians, which cannot be detected
1930 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1931 goto martian_source;
1933 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1936 /* Accept zero addresses only for limited broadcast;
1937 * I do not even know whether to fix this or not. Waiting for complaints :-)
1940 goto martian_source;
1942 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1943 goto martian_destination;
1946 * Now we are ready to route packet.
1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1949 if (!IN_DEV_FORWARD(in_dev))
1955 RT_CACHE_STAT_INC(in_slow_tot);
1957 if (res.type == RTN_BROADCAST)
1960 if (res.type == RTN_LOCAL) {
1962 result = fib_validate_source(saddr, daddr, tos,
1963 loopback_dev.ifindex,
1964 dev, &spec_dst, &itag);
1966 goto martian_source;
1968 flags |= RTCF_DIRECTSRC;
1973 if (!IN_DEV_FORWARD(in_dev))
1975 if (res.type != RTN_UNICAST)
1976 goto martian_destination;
1978 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1979 if (err == -ENOBUFS)
1991 if (skb->protocol != htons(ETH_P_IP))
1995 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1997 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2000 goto martian_source;
2002 flags |= RTCF_DIRECTSRC;
2004 flags |= RTCF_BROADCAST;
2005 res.type = RTN_BROADCAST;
2006 RT_CACHE_STAT_INC(in_brd);
2009 rth = dst_alloc(&ipv4_dst_ops);
2013 rth->u.dst.output= ip_rt_bug;
2015 atomic_set(&rth->u.dst.__refcnt, 1);
2016 rth->u.dst.flags= DST_HOST;
2017 if (in_dev->cnf.no_policy)
2018 rth->u.dst.flags |= DST_NOPOLICY;
2019 rth->fl.fl4_dst = daddr;
2020 rth->rt_dst = daddr;
2021 rth->fl.fl4_tos = tos;
2022 #ifdef CONFIG_IP_ROUTE_FWMARK
2023 rth->fl.fl4_fwmark= skb->nfmark;
2025 rth->fl.fl4_src = saddr;
2026 rth->rt_src = saddr;
2027 #ifdef CONFIG_NET_CLS_ROUTE
2028 rth->u.dst.tclassid = itag;
2031 rth->fl.iif = dev->ifindex;
2032 rth->u.dst.dev = &loopback_dev;
2033 dev_hold(rth->u.dst.dev);
2034 rth->idev = in_dev_get(rth->u.dst.dev);
2035 rth->rt_gateway = daddr;
2036 rth->rt_spec_dst= spec_dst;
2037 rth->u.dst.input= ip_local_deliver;
2038 rth->rt_flags = flags|RTCF_LOCAL;
2039 if (res.type == RTN_UNREACHABLE) {
2040 rth->u.dst.input= ip_error;
2041 rth->u.dst.error= -err;
2042 rth->rt_flags &= ~RTCF_LOCAL;
2044 rth->rt_type = res.type;
2045 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2046 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2050 RT_CACHE_STAT_INC(in_no_route);
2051 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2052 res.type = RTN_UNREACHABLE;
2056 * Do not cache martian addresses: they should be logged (RFC1812)
2058 martian_destination:
2059 RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2062 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2063 "%u.%u.%u.%u, dev %s\n",
2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2068 err = -EHOSTUNREACH;
2080 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2084 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2085 u8 tos, struct net_device *dev)
2087 struct rtable * rth;
2089 int iif = dev->ifindex;
2091 tos &= IPTOS_RT_MASK;
2092 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2095 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2096 rth = rcu_dereference(rth->u.rt_next)) {
2097 if (rth->fl.fl4_dst == daddr &&
2098 rth->fl.fl4_src == saddr &&
2099 rth->fl.iif == iif &&
2101 #ifdef CONFIG_IP_ROUTE_FWMARK
2102 rth->fl.fl4_fwmark == skb->nfmark &&
2104 rth->fl.fl4_tos == tos) {
2105 rth->u.dst.lastuse = jiffies;
2106 dst_hold(&rth->u.dst);
2108 RT_CACHE_STAT_INC(in_hit);
2110 skb->dst = (struct dst_entry*)rth;
2113 RT_CACHE_STAT_INC(in_hlist_search);
2117 /* Multicast recognition logic was moved from the route cache to here.
2118 The problem was that too many Ethernet cards have broken/missing
2119 hardware multicast filters :-( As a result, a host on a multicast
2120 network acquires a lot of useless route cache entries, e.g. for
2121 SDR messages from all over the world. Now we try to get rid of them.
2122 Really, provided the software IP multicast filter is organized
2123 reasonably (at least, hashed), it does not result in a slowdown
2124 compared with route cache reject entries.
2125 Note that multicast routers are not affected, because a
2126 route cache entry is created eventually.
2128 if (MULTICAST(daddr)) {
2129 struct in_device *in_dev;
2132 if ((in_dev = __in_dev_get(dev)) != NULL) {
2133 int our = ip_check_mc(in_dev, daddr, saddr,
2134 skb->nh.iph->protocol);
2136 #ifdef CONFIG_IP_MROUTE
2137 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 return ip_route_input_mc(skb, daddr, saddr,
2148 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2151 static inline int __mkroute_output(struct rtable **result,
2152 struct fib_result* res,
2153 const struct flowi *fl,
2154 const struct flowi *oldflp,
2155 struct net_device *dev_out,
2159 struct in_device *in_dev;
2160 u32 tos = RT_FL_TOS(oldflp);
2163 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2166 if (fl->fl4_dst == 0xFFFFFFFF)
2167 res->type = RTN_BROADCAST;
2168 else if (MULTICAST(fl->fl4_dst))
2169 res->type = RTN_MULTICAST;
2170 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2173 if (dev_out->flags & IFF_LOOPBACK)
2174 flags |= RTCF_LOCAL;
2176 /* get work reference to inet device */
2177 in_dev = in_dev_get(dev_out);
2181 if (res->type == RTN_BROADCAST) {
2182 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2184 fib_info_put(res->fi);
2187 } else if (res->type == RTN_MULTICAST) {
2188 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2189 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2191 flags &= ~RTCF_LOCAL;
2192 /* If a multicast route does not exist, use the
2193 default one, but do not gateway in this case.
2196 if (res->fi && res->prefixlen < 4) {
2197 fib_info_put(res->fi);
2203 rth = dst_alloc(&ipv4_dst_ops);
2209 rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2212 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 if (res->fi->fib_nhs > 1)
2214 rth->u.dst.flags |= DST_BALANCED;
2217 if (in_dev->cnf.no_xfrm)
2218 rth->u.dst.flags |= DST_NOXFRM;
2219 if (in_dev->cnf.no_policy)
2220 rth->u.dst.flags |= DST_NOPOLICY;
2222 rth->fl.fl4_dst = oldflp->fl4_dst;
2223 rth->fl.fl4_tos = tos;
2224 rth->fl.fl4_src = oldflp->fl4_src;
2225 rth->fl.oif = oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2229 rth->rt_dst = fl->fl4_dst;
2230 rth->rt_src = fl->fl4_src;
2231 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2232 /* get references to the devices that are to be held by the routing
2234 rth->u.dst.dev = dev_out;
2236 rth->idev = in_dev_get(dev_out);
2237 rth->rt_gateway = fl->fl4_dst;
2238 rth->rt_spec_dst= fl->fl4_src;
2240 rth->u.dst.output=ip_output;
2242 RT_CACHE_STAT_INC(out_slow_tot);
2244 if (flags & RTCF_LOCAL) {
2245 rth->u.dst.input = ip_local_deliver;
2246 rth->rt_spec_dst = fl->fl4_dst;
2248 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 rth->rt_spec_dst = fl->fl4_src;
2250 if (flags & RTCF_LOCAL &&
2251 !(dev_out->flags & IFF_LOOPBACK)) {
2252 rth->u.dst.output = ip_mc_output;
2253 RT_CACHE_STAT_INC(out_slow_mc);
2255 #ifdef CONFIG_IP_MROUTE
2256 if (res->type == RTN_MULTICAST) {
2257 if (IN_DEV_MFORWARD(in_dev) &&
2258 !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 rth->u.dst.input = ip_mr_input;
2260 rth->u.dst.output = ip_mc_output;
2266 rt_set_nexthop(rth, res, 0);
2268 rth->rt_flags = flags;
2272 /* release work reference to inet device */
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279 struct fib_result* res,
2280 const struct flowi *fl,
2281 const struct flowi *oldflp,
2282 struct net_device *dev_out,
2285 struct rtable *rth = NULL;
2286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 u32 tos = RT_FL_TOS(oldflp);
2291 atomic_set(&rth->u.dst.__refcnt, 1);
2293 hash = rt_hash_code(oldflp->fl4_dst,
2294 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2295 err = rt_intern_hash(hash, rth, rp);
2301 static inline int ip_mkroute_output(struct rtable** rp,
2302 struct fib_result* res,
2303 const struct flowi *fl,
2304 const struct flowi *oldflp,
2305 struct net_device *dev_out,
2308 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309 u32 tos = RT_FL_TOS(oldflp);
2313 struct rtable *rth = NULL;
2315 if (res->fi && res->fi->fib_nhs > 1) {
2316 unsigned char hopcount = res->fi->fib_nhs;
2318 for (hop = 0; hop < hopcount; hop++) {
2319 struct net_device *dev2nexthop;
2323 /* hold a work reference to the output device */
2324 dev2nexthop = FIB_RES_DEV(*res);
2325 dev_hold(dev2nexthop);
2327 err = __mkroute_output(&rth, res, fl, oldflp,
2328 dev2nexthop, flags);
2333 hash = rt_hash_code(oldflp->fl4_dst,
2335 (oldflp->oif << 5), tos);
2336 err = rt_intern_hash(hash, rth, rp);
2338 /* forward hop information to multipath impl. */
2339 multipath_set_nhinfo(rth,
2340 FIB_RES_NETWORK(*res),
2341 FIB_RES_NETMASK(*res),
2345 /* release work reference to output device */
2346 dev_put(dev2nexthop);
2351 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2354 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2357 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2358 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2363 * Major route resolver routine.
2366 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368 u32 tos = RT_FL_TOS(oldflp);
2369 struct flowi fl = { .nl_u = { .ip4_u =
2370 { .daddr = oldflp->fl4_dst,
2371 .saddr = oldflp->fl4_src,
2372 .tos = tos & IPTOS_RT_MASK,
2373 .scope = ((tos & RTO_ONLINK) ?
2376 #ifdef CONFIG_IP_ROUTE_FWMARK
2377 .fwmark = oldflp->fl4_fwmark
2380 .iif = loopback_dev.ifindex,
2381 .oif = oldflp->oif };
2382 struct fib_result res;
2384 struct net_device *dev_out = NULL;
2390 #ifdef CONFIG_IP_MULTIPLE_TABLES
2394 if (oldflp->fl4_src) {
2396 if (MULTICAST(oldflp->fl4_src) ||
2397 BADCLASS(oldflp->fl4_src) ||
2398 ZERONET(oldflp->fl4_src))
2401 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2402 dev_out = ip_dev_find(oldflp->fl4_src);
2403 if (dev_out == NULL)
2406 /* I removed check for oif == dev_out->oif here.
2407 It was wrong for two reasons:
2408 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2409 assigned to multiple interfaces.
2410 2. Moreover, we are allowed to send packets with saddr
2411 of another iface. --ANK
2414 if (oldflp->oif == 0
2415 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2416 /* Special hack: the user can direct multicasts
2417 and limited broadcast via the necessary interface
2418 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2419 This hack is not just for fun, it allows
2420 vic, vat and friends to work.
2421 They bind a socket to loopback, set ttl to zero
2422 and expect that it will work.
2423 From the viewpoint of the routing cache they are broken,
2424 because we are not allowed to build a multicast path
2425 with a loopback source addr (look, the routing cache
2426 cannot know that ttl is zero, so the packet
2427 will not leave this host and the route is valid).
2428 Luckily, this hack is a good workaround.
2431 fl.oif = dev_out->ifindex;
2441 dev_out = dev_get_by_index(oldflp->oif);
2443 if (dev_out == NULL)
2445 if (__in_dev_get(dev_out) == NULL) {
2447 goto out; /* Wrong error code */
2450 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2452 fl.fl4_src = inet_select_addr(dev_out, 0,
2457 if (MULTICAST(oldflp->fl4_dst))
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2460 else if (!oldflp->fl4_dst)
2461 fl.fl4_src = inet_select_addr(dev_out, 0,
2467 fl.fl4_dst = fl.fl4_src;
2469 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2472 dev_out = &loopback_dev;
2474 fl.oif = loopback_dev.ifindex;
2475 res.type = RTN_LOCAL;
2476 flags |= RTCF_LOCAL;
2480 if (fib_lookup(&fl, &res)) {
2483 /* Apparently, the routing tables are wrong. Assume
2484 that the destination is on link.
2487 Because we are allowed to send to an iface
2488 even if it has NO routes and NO assigned
2489 addresses. When oif is specified, the routing
2490 tables are looked up with only one purpose:
2491 to check whether the destination is gatewayed, rather than
2492 direct. Moreover, if MSG_DONTROUTE is set,
2493 we send the packet, ignoring both the routing tables
2494 and the ifaddr state. --ANK
2497 We could do this even if oif is unknown
2498 (likely for IPv6), but we do not.
2501 if (fl.fl4_src == 0)
2502 fl.fl4_src = inet_select_addr(dev_out, 0,
2504 res.type = RTN_UNICAST;
2514 if (res.type == RTN_LOCAL) {
2516 fl.fl4_src = fl.fl4_dst;
2519 dev_out = &loopback_dev;
2521 fl.oif = dev_out->ifindex;
2523 fib_info_put(res.fi);
2525 flags |= RTCF_LOCAL;
2529 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2530 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2531 fib_select_multipath(&fl, &res);
2534 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2535 fib_select_default(&fl, &res);
2538 fl.fl4_src = FIB_RES_PREFSRC(res);
2542 dev_out = FIB_RES_DEV(res);
2544 fl.oif = dev_out->ifindex;
2548 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2558 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2563 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2566 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2567 rth = rcu_dereference(rth->u.rt_next)) {
2568 if (rth->fl.fl4_dst == flp->fl4_dst &&
2569 rth->fl.fl4_src == flp->fl4_src &&
2571 rth->fl.oif == flp->oif &&
2572 #ifdef CONFIG_IP_ROUTE_FWMARK
2573 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2575 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2576 (IPTOS_RT_MASK | RTO_ONLINK))) {
2578 /* check for multipath routes and choose one if
2581 if (multipath_select_route(flp, rth, rp)) {
2582 dst_hold(&(*rp)->u.dst);
2583 RT_CACHE_STAT_INC(out_hit);
2584 rcu_read_unlock_bh();
2588 rth->u.dst.lastuse = jiffies;
2589 dst_hold(&rth->u.dst);
2591 RT_CACHE_STAT_INC(out_hit);
2592 rcu_read_unlock_bh();
2596 RT_CACHE_STAT_INC(out_hlist_search);
2598 rcu_read_unlock_bh();
2600 return ip_route_output_slow(rp, flp);
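/*
 * Sketch of the TOS comparison used in the lookup above (illustration
 * only, compiled out): only the routing-relevant TOS bits (IPTOS_RT_MASK)
 * plus the RTO_ONLINK scope hint take part in the key, so flows that
 * differ only in the two low ECN bits still hit the same cached route.
 */
#if 0
static int example_tos_match(u32 cached_tos, u32 wanted_tos)
{
	return !((cached_tos ^ wanted_tos) & (IPTOS_RT_MASK | RTO_ONLINK));
}
#endif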
2603 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2607 if ((err = __ip_route_output_key(rp, flp)) != 0)
2612 flp->fl4_src = (*rp)->rt_src;
2614 flp->fl4_dst = (*rp)->rt_dst;
2615 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623 return ip_route_output_flow(rp, flp, NULL, 0);
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627 int nowait, unsigned int flags)
2629 struct rtable *rt = (struct rtable*)skb->dst;
2631 struct nlmsghdr *nlh;
2632 unsigned char *b = skb->tail;
2633 struct rta_cacheinfo ci;
2634 #ifdef CONFIG_IP_MROUTE
2635 struct rtattr *eptr;
2637 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2638 r = NLMSG_DATA(nlh);
2639 r->rtm_family = AF_INET;
2640 r->rtm_dst_len = 32;
2642 r->rtm_tos = rt->fl.fl4_tos;
2643 r->rtm_table = RT_TABLE_MAIN;
2644 r->rtm_type = rt->rt_type;
2645 r->rtm_scope = RT_SCOPE_UNIVERSE;
2646 r->rtm_protocol = RTPROT_UNSPEC;
2647 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 if (rt->rt_flags & RTCF_NOTIFY)
2649 r->rtm_flags |= RTM_F_NOTIFY;
2650 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2651 if (rt->fl.fl4_src) {
2652 r->rtm_src_len = 32;
2653 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2656 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2657 #ifdef CONFIG_NET_CLS_ROUTE
2658 if (rt->u.dst.tclassid)
2659 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2662 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2663 __u32 alg = rt->rt_multipath_alg;
2665 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2669 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2670 else if (rt->rt_src != rt->fl.fl4_src)
2671 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2672 if (rt->rt_dst != rt->rt_gateway)
2673 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2674 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2675 goto rtattr_failure;
2676 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2677 ci.rta_used = rt->u.dst.__use;
2678 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2679 if (rt->u.dst.expires)
2680 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2683 ci.rta_error = rt->u.dst.error;
2684 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2686 ci.rta_id = rt->peer->ip_id_count;
2687 if (rt->peer->tcp_ts_stamp) {
2688 ci.rta_ts = rt->peer->tcp_ts;
2689 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2692 #ifdef CONFIG_IP_MROUTE
2693 eptr = (struct rtattr*)skb->tail;
2695 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2697 #ifdef CONFIG_IP_MROUTE
2698 u32 dst = rt->rt_dst;
2700 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701 ipv4_devconf.mc_forwarding) {
2702 int err = ipmr_get_route(skb, r, nowait);
2709 if (err == -EMSGSIZE)
2711 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2716 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2719 nlh->nlmsg_len = skb->tail - b;
2724 skb_trim(skb, b - skb->data);
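/* Added summary of the message built above: a struct rtmsg followed by
 * RTA_DST, optionally RTA_SRC, RTA_IIF and RTA_GATEWAY/RTA_PREFSRC/RTA_FLOW,
 * plus RTA_OIF, the route metrics and an RTA_CACHEINFO block carrying the
 * use counts, expiry and error state of this cache entry.
 */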
2728 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2730 struct rtattr **rta = arg;
2731 struct rtmsg *rtm = NLMSG_DATA(nlh);
2732 struct rtable *rt = NULL;
2737 struct sk_buff *skb;
2739 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743 /* Reserve room for dummy headers; this skb can pass
2744 through a good chunk of the routing engine.
2746 skb->mac.raw = skb->data;
2747 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2749 if (rta[RTA_SRC - 1])
2750 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 if (rta[RTA_DST - 1])
2752 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 if (rta[RTA_IIF - 1])
2754 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2757 struct net_device *dev = __dev_get_by_index(iif);
2761 skb->protocol = htons(ETH_P_IP);
2764 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2766 rt = (struct rtable*)skb->dst;
2767 if (!err && rt->u.dst.error)
2768 err = -rt->u.dst.error;
2770 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2772 .tos = rtm->rtm_tos } } };
2774 if (rta[RTA_OIF - 1])
2775 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2777 err = ip_route_output_key(&rt, &fl);
2782 skb->dst = &rt->u.dst;
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789 RTM_NEWROUTE, 0, 0);
2797 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
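/* Hedged user-space sketch (mine): roughly what "ip route get" sends to
 * exercise inet_rtm_getroute() above: an RTM_GETROUTE request carrying a
 * single RTA_DST attribute on a NETLINK_ROUTE socket.  The reply is an
 * RTM_NEWROUTE message built by rt_fill_info().  Kept under #if 0; error
 * handling and reply parsing are illustrative only.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_route_get(in_addr_t dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char buf[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append RTA_DST with the IPv4 destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	memcpy(RTA_DATA(rta), &dst, 4);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		close(fd);
		return -1;
	}
	/* recv() the RTM_NEWROUTE reply and parse its rtattrs here. */
	close(fd);
	return 0;
}
#endif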
2807 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2814 s_idx = idx = cb->args[1];
2815 for (h = 0; h <= rt_hash_mask; h++) {
2816 if (h < s_h) continue;
2820 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 rt = rcu_dereference(rt->u.rt_next), idx++) {
2824 skb->dst = dst_clone(&rt->u.dst);
2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827 1, NLM_F_MULTI) <= 0) {
2828 dst_release(xchg(&skb->dst, NULL));
2829 rcu_read_unlock_bh();
2832 dst_release(xchg(&skb->dst, NULL));
2834 rcu_read_unlock_bh();
2843 void ip_rt_multicast_event(struct in_device *in_dev)
2848 #ifdef CONFIG_SYSCTL
2849 static int flush_delay;
2851 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 struct file *filp, void __user *buffer,
2853 size_t *lenp, loff_t *ppos)
2856 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 rt_cache_flush(flush_delay);
2864 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2867 void __user *oldval,
2868 size_t __user *oldlenp,
2869 void __user *newval,
2874 if (newlen != sizeof(int))
2876 if (get_user(delay, (int __user *)newval))
2878 rt_cache_flush(delay);
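/* Added note: both the proc handler and the sysctl strategy end up in
 * rt_cache_flush(), so the cache can be flushed from user space with e.g.
 *
 *   echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * where the written integer is the delay passed to rt_cache_flush().
 */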
2882 ctl_table ipv4_route_table[] = {
2884 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2885 .procname = "flush",
2886 .data = &flush_delay,
2887 .maxlen = sizeof(int),
2889 .proc_handler = &ipv4_sysctl_rtcache_flush,
2890 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2893 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2894 .procname = "min_delay",
2895 .data = &ip_rt_min_delay,
2896 .maxlen = sizeof(int),
2898 .proc_handler = &proc_dointvec_jiffies,
2899 .strategy = &sysctl_jiffies,
2902 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2903 .procname = "max_delay",
2904 .data = &ip_rt_max_delay,
2905 .maxlen = sizeof(int),
2907 .proc_handler = &proc_dointvec_jiffies,
2908 .strategy = &sysctl_jiffies,
2911 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2912 .procname = "gc_thresh",
2913 .data = &ipv4_dst_ops.gc_thresh,
2914 .maxlen = sizeof(int),
2916 .proc_handler = &proc_dointvec,
2919 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2920 .procname = "max_size",
2921 .data = &ip_rt_max_size,
2922 .maxlen = sizeof(int),
2924 .proc_handler = &proc_dointvec,
2927 /* Deprecated. Use gc_min_interval_ms */
2929 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 .procname = "gc_min_interval",
2931 .data = &ip_rt_gc_min_interval,
2932 .maxlen = sizeof(int),
2934 .proc_handler = &proc_dointvec_jiffies,
2935 .strategy = &sysctl_jiffies,
2938 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 .procname = "gc_min_interval_ms",
2940 .data = &ip_rt_gc_min_interval,
2941 .maxlen = sizeof(int),
2943 .proc_handler = &proc_dointvec_ms_jiffies,
2944 .strategy = &sysctl_ms_jiffies,
2947 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2948 .procname = "gc_timeout",
2949 .data = &ip_rt_gc_timeout,
2950 .maxlen = sizeof(int),
2952 .proc_handler = &proc_dointvec_jiffies,
2953 .strategy = &sysctl_jiffies,
2956 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2957 .procname = "gc_interval",
2958 .data = &ip_rt_gc_interval,
2959 .maxlen = sizeof(int),
2961 .proc_handler = &proc_dointvec_jiffies,
2962 .strategy = &sysctl_jiffies,
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 .procname = "redirect_load",
2967 .data = &ip_rt_redirect_load,
2968 .maxlen = sizeof(int),
2970 .proc_handler = &proc_dointvec,
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 .procname = "redirect_number",
2975 .data = &ip_rt_redirect_number,
2976 .maxlen = sizeof(int),
2978 .proc_handler = &proc_dointvec,
2981 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 .procname = "redirect_silence",
2983 .data = &ip_rt_redirect_silence,
2984 .maxlen = sizeof(int),
2986 .proc_handler = &proc_dointvec,
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2990 .procname = "error_cost",
2991 .data = &ip_rt_error_cost,
2992 .maxlen = sizeof(int),
2994 .proc_handler = &proc_dointvec,
2997 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2998 .procname = "error_burst",
2999 .data = &ip_rt_error_burst,
3000 .maxlen = sizeof(int),
3002 .proc_handler = &proc_dointvec,
3005 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3006 .procname = "gc_elasticity",
3007 .data = &ip_rt_gc_elasticity,
3008 .maxlen = sizeof(int),
3010 .proc_handler = &proc_dointvec,
3013 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3014 .procname = "mtu_expires",
3015 .data = &ip_rt_mtu_expires,
3016 .maxlen = sizeof(int),
3018 .proc_handler = &proc_dointvec_jiffies,
3019 .strategy = &sysctl_jiffies,
3022 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3023 .procname = "min_pmtu",
3024 .data = &ip_rt_min_pmtu,
3025 .maxlen = sizeof(int),
3027 .proc_handler = &proc_dointvec,
3030 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3031 .procname = "min_adv_mss",
3032 .data = &ip_rt_min_advmss,
3033 .maxlen = sizeof(int),
3035 .proc_handler = &proc_dointvec,
3038 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 .procname = "secret_interval",
3040 .data = &ip_rt_secret_interval,
3041 .maxlen = sizeof(int),
3043 .proc_handler = &proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies,
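/* Added note: every entry above is exposed under /proc/sys/net/ipv4/route/,
 * e.g. gc_thresh becomes /proc/sys/net/ipv4/route/gc_thresh; the entries
 * using proc_dointvec_jiffies / sysctl_jiffies accept values in seconds
 * (or milliseconds for gc_min_interval_ms) and store them as jiffies.
 */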
3050 #ifdef CONFIG_NET_CLS_ROUTE
3051 struct ip_rt_acct *ip_rt_acct;
3053 /* This code sucks. But you should have seen it before! --RR */
3055 /* IP route accounting ptr for this logical cpu number. */
3056 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
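/* Added note: ip_rt_acct is allocated below as NR_CPUS consecutive blocks
 * of 256 struct ip_rt_acct counters, one block per CPU and one slot per
 * routing realm, so IP_RT_ACCT_CPU(i) simply indexes the i-th block.
 */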
3058 #ifdef CONFIG_PROC_FS
3059 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 int length, int *eof, void *data)
3064 if ((offset & 3) || (length & 3))
3067 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3072 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 length = sizeof(struct ip_rt_acct) * 256 - offset;
3077 offset /= sizeof(u32);
3080 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 u32 *dst = (u32 *) buffer;
3083 /* Copy first cpu. */
3085 memcpy(dst, src, length);
3087 /* Add the other cpus in, one int at a time */
3091 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3093 for (j = 0; j < length/4; j++)
3099 #endif /* CONFIG_PROC_FS */
3100 #endif /* CONFIG_NET_CLS_ROUTE */
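/* Hedged user-space sketch (mine): the proc handler above sums the per-CPU
 * counters, so a reader sees a single array of 256 accounting records
 * indexed by realm.  The struct layout below is assumed to mirror the
 * kernel's ip_rt_acct (four u32 counters per realm).
 */
#if 0
#include <stdio.h>

struct ip_rt_acct_user {	/* assumed mirror of struct ip_rt_acct */
	unsigned int o_bytes, o_packets, i_bytes, i_packets;
};

int main(void)
{
	struct ip_rt_acct_user acct[256];
	FILE *f = fopen("/proc/net/rt_acct", "r");
	int i;

	if (!f)
		return 1;
	if (fread(acct, sizeof(acct[0]), 256, f) != 256) {
		fclose(f);
		return 1;
	}
	for (i = 0; i < 256; i++)
		if (acct[i].i_packets || acct[i].o_packets)
			printf("realm %d: %u pkts in, %u pkts out\n",
			       i, acct[i].i_packets, acct[i].o_packets);
	fclose(f);
	return 0;
}
#endif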
3102 static __initdata unsigned long rhash_entries;
3103 static int __init set_rhash_entries(char *str)
3107 rhash_entries = simple_strtoul(str, &str, 0);
3110 __setup("rhash_entries=", set_rhash_entries);
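/* Added note: the route cache hash size can thus be forced from the kernel
 * command line, e.g. booting with "rhash_entries=65536"; otherwise
 * alloc_large_system_hash() below picks a size based on available memory.
 */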
3112 int __init ip_rt_init(void)
3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 (jiffies ^ (jiffies >> 7)));
3119 #ifdef CONFIG_NET_CLS_ROUTE
3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3125 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3127 panic("IP: failed to allocate ip_rt_acct\n");
3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 sizeof(struct rtable),
3134 0, SLAB_HWCACHE_ALIGN,
3137 if (!ipv4_dst_ops.kmem_cachep)
3138 panic("IP: failed to allocate ip_dst_cache\n");
3140 rt_hash_table = (struct rt_hash_bucket *)
3141 alloc_large_system_hash("IP route cache",
3142 sizeof(struct rt_hash_bucket),
3144 (num_physpages >= 128 * 1024) ?
3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 rt_hash_lock_init();
3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
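/* Added sizing note: with a hash table of N buckets the GC threshold is N
 * cached routes and the hard cap is 16 * N, e.g. 65536 buckets allow up to
 * 1048576 entries before new allocations start to fail.
 */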
3157 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3164 init_timer(&rt_flush_timer);
3165 rt_flush_timer.function = rt_run_flush;
3166 init_timer(&rt_periodic_timer);
3167 rt_periodic_timer.function = rt_check_expire;
3168 init_timer(&rt_secret_timer);
3169 rt_secret_timer.function = rt_secret_rebuild;
3171 /* All the timers started at system startup tend
3172 to synchronize. Perturb them a bit.
3174 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3176 add_timer(&rt_periodic_timer);
3178 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3179 ip_rt_secret_interval;
3180 add_timer(&rt_secret_timer);
3182 #ifdef CONFIG_PROC_FS
3184 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3185 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3186 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3188 free_percpu(rt_cache_stat);
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);