2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
100 #include <net/ip_fib.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
107 #include <linux/sysctl.h>
/*
 * RT_FL_TOS(oldflp): the fl4_tos field of the flow pointed to by oldflp,
 * masked down to the IPTOS_RT_MASK and RTO_ONLINK bits and cast to u32.
 * NOTE(review): "oldflp" is not parenthesised inside the expansion, so the
 * argument has to be a plain pointer expression (not e.g. "p + 1").
 */
110 #define RT_FL_TOS(oldflp) \
111 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 #define IP_MAX_MTU 0xFFF0
115 #define RT_GC_TIMEOUT (300*HZ)
/*
 * Routing cache tunables.  Values written as multiples of HZ are time
 * spans in jiffies: the code in this file adds them to, or compares them
 * with, jiffies-based timestamps.
 */
/* Flush delays used by rt_cache_flush(): ip_rt_min_delay is the delay it
 * can substitute for its argument, ip_rt_max_delay is how far ahead of
 * "now" it places rt_deadline. */
117 static int ip_rt_min_delay = 2 * HZ;
118 static int ip_rt_max_delay = 10 * HZ;
/* Entry-count limit compared against ipv4_dst_ops.entries in
 * rt_garbage_collect(); no initial value is given here. */
119 static int ip_rt_max_size;
/* Garbage collection: ip_rt_gc_timeout is the age limit handed to
 * rt_may_expire() by rt_check_expire(), ip_rt_gc_interval is the re-arm
 * period of rt_periodic_timer, and ip_rt_gc_min_interval is the spacing
 * rt_garbage_collect() checks against the time of its previous run. */
120 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval = 60 * HZ;
122 static int ip_rt_gc_min_interval = HZ / 2;
/* ICMP redirect rate limiting, see ip_rt_send_redirect().  The default
 * silence period equals ip_rt_redirect_load << (ip_rt_redirect_number + 1). */
123 static int ip_rt_redirect_number = 9;
124 static int ip_rt_redirect_load = HZ / 50;
125 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
/* Token bucket used by ip_error(): cost of one ICMP error and the cap on
 * accumulated tokens. */
126 static int ip_rt_error_cost = HZ;
127 static int ip_rt_error_burst = 5 * HZ;
/* Used with rt_hash_log to size the GC goal in rt_garbage_collect() and as
 * the chain-length threshold in rt_intern_hash(). */
128 static int ip_rt_gc_elasticity = 8;
/* Path MTU handling: expiry applied after an MTU is lowered, and the floor
 * (552 bytes by default) below which a learned MTU is not accepted as is. */
129 static int ip_rt_mtu_expires = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu = 512 + 20 + 20;
131 static int ip_rt_min_advmss = 256;
/* Re-arm period of rt_secret_timer (see rt_secret_rebuild()). */
132 static int ip_rt_secret_interval = 10 * 60 * HZ;
/* Set by rt_cache_flush() to now + ip_rt_max_delay while it is still zero. */
133 static unsigned long rt_deadline;
135 #define RTprint(a...) printk(KERN_DEBUG a)
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
155 static struct dst_ops ipv4_dst_ops = {
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .entry_size = sizeof(struct rtable),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 __u8 ip_tos2prio[16] = {
174 ECN_OR_COST(BESTEFFORT),
180 ECN_OR_COST(INTERACTIVE),
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
213 #define RT_HASH_LOCK_SZ 4096
215 #define RT_HASH_LOCK_SZ 2048
217 #define RT_HASH_LOCK_SZ 1024
219 #define RT_HASH_LOCK_SZ 512
221 #define RT_HASH_LOCK_SZ 256
224 static spinlock_t *rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init() { \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
238 static struct rt_hash_bucket *rt_hash_table;
239 static unsigned rt_hash_mask;
240 static int rt_hash_log;
241 static unsigned int rt_hash_rnd;
243 static struct rt_cache_stat *rt_cache_stat;
244 #define RT_CACHE_STAT_INC(field) \
245 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
247 static int rt_intern_hash(unsigned hash, struct rtable *rth,
248 struct rtable **res);
250 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
252 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
256 #ifdef CONFIG_PROC_FS
257 struct rt_cache_iter_state {
261 static struct rtable *rt_cache_get_first(struct seq_file *seq)
263 struct rtable *r = NULL;
264 struct rt_cache_iter_state *st = seq->private;
266 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
268 r = rt_hash_table[st->bucket].chain;
271 rcu_read_unlock_bh();
276 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
278 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
282 rcu_read_unlock_bh();
283 if (--st->bucket < 0)
286 r = rt_hash_table[st->bucket].chain;
291 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
293 struct rtable *r = rt_cache_get_first(seq);
296 while (pos && (r = rt_cache_get_next(seq, r)))
298 return pos ? NULL : r;
301 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
303 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
306 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
308 struct rtable *r = NULL;
310 if (v == SEQ_START_TOKEN)
311 r = rt_cache_get_first(seq);
313 r = rt_cache_get_next(seq, v);
318 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
320 if (v && v != SEQ_START_TOKEN)
321 rcu_read_unlock_bh();
/*
 * rt_cache_seq_show - seq_file "show" callback for the route cache listing
 * (installed as rt_cache_seq_ops.show).
 *
 * For SEQ_START_TOKEN it prints the column header; for a cache entry it
 * formats the entry into "temp" and prints that.  Both kinds of record are
 * left-justified and padded to 127 characters by the "%-127s" format.
 * Column notes: Metric is always printed as 0; MTU is the RTAX_ADVMSS
 * metric plus 40 (or 0 when the metric is unset); IRTT is
 * (RTAX_RTT >> 3) + RTAX_RTTVAR; HHRef is -1 when the entry has no hh.
 *
 * NOTE(review): sprintf() does not bound the write into "temp"; a bounded
 * formatter would be safer - confirm the size of temp against the longest
 * possible record.
 */
324 static int rt_cache_seq_show(struct seq_file *seq, void *v)
326 if (v == SEQ_START_TOKEN)
327 seq_printf(seq, "%-127s\n",
328 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
332 struct rtable *r = v;
335 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337 r->u.dst.dev ? r->u.dst.dev->name : "*",
338 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340 r->u.dst.__use, 0, (unsigned long)r->rt_src,
341 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343 dst_metric(&r->u.dst, RTAX_WINDOW),
344 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345 dst_metric(&r->u.dst, RTAX_RTTVAR)),
347 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
351 seq_printf(seq, "%-127s\n", temp);
356 static struct seq_operations rt_cache_seq_ops = {
357 .start = rt_cache_seq_start,
358 .next = rt_cache_seq_next,
359 .stop = rt_cache_seq_stop,
360 .show = rt_cache_seq_show,
363 static int rt_cache_seq_open(struct inode *inode, struct file *file)
365 struct seq_file *seq;
367 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
371 rc = seq_open(file, &rt_cache_seq_ops);
374 seq = file->private_data;
376 memset(s, 0, sizeof(*s));
384 static struct file_operations rt_cache_seq_fops = {
385 .owner = THIS_MODULE,
386 .open = rt_cache_seq_open,
389 .release = seq_release_private,
393 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
398 return SEQ_START_TOKEN;
400 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401 if (!cpu_possible(cpu))
404 return per_cpu_ptr(rt_cache_stat, cpu);
409 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
413 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414 if (!cpu_possible(cpu))
417 return per_cpu_ptr(rt_cache_stat, cpu);
423 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
428 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430 struct rt_cache_stat *st = v;
432 if (v == SEQ_START_TOKEN) {
433 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
437 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
438 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439 atomic_read(&ipv4_dst_ops.entries),
462 static struct seq_operations rt_cpu_seq_ops = {
463 .start = rt_cpu_seq_start,
464 .next = rt_cpu_seq_next,
465 .stop = rt_cpu_seq_stop,
466 .show = rt_cpu_seq_show,
470 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472 return seq_open(file, &rt_cpu_seq_ops);
475 static struct file_operations rt_cpu_seq_fops = {
476 .owner = THIS_MODULE,
477 .open = rt_cpu_seq_open,
480 .release = seq_release,
483 #endif /* CONFIG_PROC_FS */
/*
 * rt_free - release a route cache entry.
 * Calls multipath_remove() on the entry, then queues its dst with
 * call_rcu_bh() using dst_rcu_free as the callback, so the free itself is
 * carried out by the RCU callback rather than in this function.
 */
485 static __inline__ void rt_free(struct rtable *rt)
487 multipath_remove(rt);
488 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
491 static __inline__ void rt_drop(struct rtable *rt)
493 multipath_remove(rt);
495 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
/*
 * rt_fast_clean - should this entry be aged out early?
 * Returns nonzero when rth is a broadcast or multicast entry with a
 * non-zero input interface (fl.iif) that is followed by another entry in
 * its hash chain; zero otherwise.
 */
498 static __inline__ int rt_fast_clean(struct rtable *rth)
500 /* Kill broadcast/multicast entries very aggressively, if they
501 collide in hash table with more useful entries */
502 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503 rth->fl.iif && rth->u.rt_next;
506 static __inline__ int rt_valuable(struct rtable *rth)
508 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
512 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
517 if (atomic_read(&rth->u.dst.__refcnt))
521 if (rth->u.dst.expires &&
522 time_after_eq(jiffies, rth->u.dst.expires))
525 age = jiffies - rth->u.dst.lastuse;
527 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528 (age <= tmo2 && rt_valuable(rth)))
534 /* Bits of score are:
536 * 30: not quite useless
537 * 29..0: usage counter
539 static inline u32 rt_score(struct rtable *rt)
541 u32 score = jiffies - rt->u.dst.lastuse;
543 score = ~score & ~(3<<30);
549 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
/*
 * compare_keys - compare two flow keys.
 * Returns nonzero when the keys match, i.e. the nl_u.ip4_u members are
 * bytewise identical (memcmp over the whole member) and both the oif and
 * iif fields are equal; returns zero otherwise.
 */
555 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
557 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558 fl1->oif == fl2->oif &&
559 fl1->iif == fl2->iif;
562 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564 struct rtable *expentry,
567 int passedexpired = 0;
568 struct rtable **nextstep = NULL;
569 struct rtable **rthp = chain_head;
575 while ((rth = *rthp) != NULL) {
579 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
580 compare_keys(&(*rthp)->fl, &expentry->fl)) {
581 if (*rthp == expentry) {
582 *rthp = rth->u.rt_next;
585 *rthp = rth->u.rt_next;
591 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592 passedexpired && !nextstep)
593 nextstep = &rth->u.rt_next;
595 rthp = &rth->u.rt_next;
605 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
608 /* This runs via a timer and thus is always in BH context. */
609 static void rt_check_expire(unsigned long dummy)
611 static unsigned int rover;
612 unsigned int i = rover, goal;
613 struct rtable *rth, **rthp;
614 unsigned long now = jiffies;
617 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618 if (ip_rt_gc_timeout > 1)
619 do_div(mult, ip_rt_gc_timeout);
620 goal = (unsigned int)mult;
621 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622 for (; goal > 0; goal--) {
623 unsigned long tmo = ip_rt_gc_timeout;
625 i = (i + 1) & rt_hash_mask;
626 rthp = &rt_hash_table[i].chain;
630 spin_lock(rt_hash_lock_addr(i));
631 while ((rth = *rthp) != NULL) {
632 if (rth->u.dst.expires) {
633 /* Entry is expired even if it is in use */
634 if (time_before_eq(now, rth->u.dst.expires)) {
636 rthp = &rth->u.rt_next;
639 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
641 rthp = &rth->u.rt_next;
645 /* Cleanup aged off entries. */
646 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647 /* remove all related balanced entries if necessary */
648 if (rth->u.dst.flags & DST_BALANCED) {
649 rthp = rt_remove_balanced_route(
650 &rt_hash_table[i].chain,
655 *rthp = rth->u.rt_next;
658 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659 *rthp = rth->u.rt_next;
661 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
663 spin_unlock(rt_hash_lock_addr(i));
665 /* Fallback loop breaker. */
666 if (time_after(jiffies, now))
670 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
673 /* This can run from both BH and non-BH contexts, the latter
674 * in the case of a forced flush event.
676 static void rt_run_flush(unsigned long dummy)
679 struct rtable *rth, *next;
683 get_random_bytes(&rt_hash_rnd, 4);
685 for (i = rt_hash_mask; i >= 0; i--) {
686 spin_lock_bh(rt_hash_lock_addr(i));
687 rth = rt_hash_table[i].chain;
689 rt_hash_table[i].chain = NULL;
690 spin_unlock_bh(rt_hash_lock_addr(i));
692 for (; rth; rth = next) {
693 next = rth->u.rt_next;
699 static DEFINE_SPINLOCK(rt_flush_lock);
701 void rt_cache_flush(int delay)
703 unsigned long now = jiffies;
704 int user_mode = !in_softirq();
707 delay = ip_rt_min_delay;
709 /* flush existing multipath state*/
712 spin_lock_bh(&rt_flush_lock);
714 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715 long tmo = (long)(rt_deadline - now);
717 /* If the flush timer is already running
718 and the flush request is not immediate (delay > 0):
720 if the deadline has not been reached yet, extend the timer to "delay",
721 otherwise fire it at the deadline.
724 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
732 spin_unlock_bh(&rt_flush_lock);
737 if (rt_deadline == 0)
738 rt_deadline = now + ip_rt_max_delay;
740 mod_timer(&rt_flush_timer, now+delay);
741 spin_unlock_bh(&rt_flush_lock);
744 static void rt_secret_rebuild(unsigned long dummy)
746 unsigned long now = jiffies;
749 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
753 Short description of GC goals.
755 We want an algorithm that keeps the routing cache
756 at an equilibrium point, where the number of aged-off entries
757 stays approximately equal to the number of newly created ones.
759 The current expiration strength is the variable "expire".
760 We adjust it dynamically, so that when networking is idle
761 "expire" is large enough to keep a good number of warm entries,
762 and when load increases it shrinks to limit the cache size.
/*
 * rt_garbage_collect - shrink the route cache (installed as
 * ipv4_dst_ops.gc).
 *
 * Bumps the gc_total counter on every call, and counts the call as
 * gc_ignored when the previous run was less than ip_rt_gc_min_interval ago
 * and the cache holds fewer than ip_rt_max_size entries.  "goal", the
 * number of entries to expire, is derived from ipv4_dst_ops.entries,
 * ip_rt_gc_elasticity << rt_hash_log and the running "equilibrium"
 * estimate.  The sweep walks the hash buckets, each under its bucket lock,
 * and unlinks the entries that rt_may_expire() allows to go.  A
 * "dst cache overflow" warning and the gc_dst_overflow counter exist for
 * the case where the cache is still not below ip_rt_max_size afterwards.
 * In the tail of the function the static "expire" timeout is increased by
 * ip_rt_gc_min_interval and reset to ip_rt_gc_timeout when it exceeds that
 * value or when the entry count is below ipv4_dst_ops.gc_thresh.
 *
 * NOTE(review): in the CONFIG_IP_ROUTE_MULTIPATH_CACHED branch the chain
 * head handed to rt_remove_balanced_route() is rt_hash_table[i].chain,
 * while the bucket that is locked and walked is k; i is only the loop
 * countdown.  This looks like it should be rt_hash_table[k].chain -
 * confirm and fix.
 * NOTE(review): "expire" is unsigned long but both RT_CACHE_DEBUG printk()
 * calls format it with %u; %lu would match the type.
 */
765 static int rt_garbage_collect(void)
767 static unsigned long expire = RT_GC_TIMEOUT;
768 static unsigned long last_gc;
770 static int equilibrium;
771 struct rtable *rth, **rthp;
772 unsigned long now = jiffies;
776 * Garbage collection is pretty expensive,
777 * do not make it too frequently.
780 RT_CACHE_STAT_INC(gc_total);
782 if (now - last_gc < ip_rt_gc_min_interval &&
783 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784 RT_CACHE_STAT_INC(gc_ignored);
788 /* Calculate number of entries, which we want to expire now. */
789 goal = atomic_read(&ipv4_dst_ops.entries) -
790 (ip_rt_gc_elasticity << rt_hash_log);
792 if (equilibrium < ipv4_dst_ops.gc_thresh)
793 equilibrium = ipv4_dst_ops.gc_thresh;
794 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
796 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
800 /* We are in dangerous area. Try to reduce cache really
803 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
807 if (now - last_gc >= ip_rt_gc_min_interval)
818 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819 unsigned long tmo = expire;
821 k = (k + 1) & rt_hash_mask;
822 rthp = &rt_hash_table[k].chain;
823 spin_lock_bh(rt_hash_lock_addr(k));
824 while ((rth = *rthp) != NULL) {
825 if (!rt_may_expire(rth, tmo, expire)) {
827 rthp = &rth->u.rt_next;
830 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831 /* remove all related balanced entries
834 if (rth->u.dst.flags & DST_BALANCED) {
837 rthp = rt_remove_balanced_route(
838 &rt_hash_table[i].chain,
845 *rthp = rth->u.rt_next;
849 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850 *rthp = rth->u.rt_next;
853 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
855 spin_unlock_bh(rt_hash_lock_addr(k));
864 /* Goal is not achieved. We stop process if:
866 - if expire reduced to zero. Otherwise, expire is halfed.
867 - if table is not full.
868 - if we are called from interrupt.
869 - jiffies check is just fallback/debug loop breaker.
870 We will not spin here for long time in any case.
873 RT_CACHE_STAT_INC(gc_goal_miss);
879 #if RT_CACHE_DEBUG >= 2
880 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881 atomic_read(&ipv4_dst_ops.entries), goal, i);
884 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
886 } while (!in_softirq() && time_before_eq(jiffies, now));
888 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
891 printk(KERN_WARNING "dst cache overflow\n");
892 RT_CACHE_STAT_INC(gc_dst_overflow);
896 expire += ip_rt_gc_min_interval;
897 if (expire > ip_rt_gc_timeout ||
898 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899 expire = ip_rt_gc_timeout;
900 #if RT_CACHE_DEBUG >= 2
901 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902 atomic_read(&ipv4_dst_ops.entries), goal, rover);
907 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
909 struct rtable *rth, **rthp;
911 struct rtable *cand, **candp;
914 int attempts = !in_softirq();
923 rthp = &rt_hash_table[hash].chain;
925 spin_lock_bh(rt_hash_lock_addr(hash));
926 while ((rth = *rthp) != NULL) {
927 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928 if (!(rth->u.dst.flags & DST_BALANCED) &&
929 compare_keys(&rth->fl, &rt->fl)) {
931 if (compare_keys(&rth->fl, &rt->fl)) {
934 *rthp = rth->u.rt_next;
936 * Since lookup is lockfree, the deletion
937 * must be visible to another weakly ordered CPU before
938 * the insertion at the start of the hash chain.
940 rcu_assign_pointer(rth->u.rt_next,
941 rt_hash_table[hash].chain);
943 * Since lookup is lockfree, the update writes
944 * must be ordered for consistency on SMP.
946 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
949 dst_hold(&rth->u.dst);
950 rth->u.dst.lastuse = now;
951 spin_unlock_bh(rt_hash_lock_addr(hash));
958 if (!atomic_read(&rth->u.dst.__refcnt)) {
959 u32 score = rt_score(rth);
961 if (score <= min_score) {
970 rthp = &rth->u.rt_next;
974 /* ip_rt_gc_elasticity used to be average length of chain
975 * length, when exceeded gc becomes really aggressive.
977 * The second limit is less certain. At the moment it allows
978 * only 2 entries per bucket. We will see.
980 if (chain_length > ip_rt_gc_elasticity) {
981 *candp = cand->u.rt_next;
986 /* Try to bind route to arp only if it is output
987 route or unicast forwarding path.
989 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990 int err = arp_bind_neighbour(&rt->u.dst);
992 spin_unlock_bh(rt_hash_lock_addr(hash));
994 if (err != -ENOBUFS) {
999 /* Neighbour tables are full and nothing
1000 can be released. Try to shrink route cache,
1001 it is most likely it holds some neighbour records.
1003 if (attempts-- > 0) {
1004 int saved_elasticity = ip_rt_gc_elasticity;
1005 int saved_int = ip_rt_gc_min_interval;
1006 ip_rt_gc_elasticity = 1;
1007 ip_rt_gc_min_interval = 0;
1008 rt_garbage_collect();
1009 ip_rt_gc_min_interval = saved_int;
1010 ip_rt_gc_elasticity = saved_elasticity;
1014 if (net_ratelimit())
1015 printk(KERN_WARNING "Neighbour table overflow.\n");
1021 rt->u.rt_next = rt_hash_table[hash].chain;
1022 #if RT_CACHE_DEBUG >= 2
1023 if (rt->u.rt_next) {
1025 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026 NIPQUAD(rt->rt_dst));
1027 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1032 rt_hash_table[hash].chain = rt;
1033 spin_unlock_bh(rt_hash_lock_addr(hash));
1038 void rt_bind_peer(struct rtable *rt, int create)
1040 static DEFINE_SPINLOCK(rt_peer_lock);
1041 struct inet_peer *peer;
1043 peer = inet_getpeer(rt->rt_dst, create);
1045 spin_lock_bh(&rt_peer_lock);
1046 if (rt->peer == NULL) {
1050 spin_unlock_bh(&rt_peer_lock);
1056 * Peer allocation may fail only in serious out-of-memory conditions. However,
1057 * we can still generate some output.
1058 * Random ID selection looks a bit dangerous because we have no way to
1059 * guarantee that the ID stays unique for a reasonable period of time.
1060 * But a broken packet identifier may be better than no packet at all.
/*
 * ip_select_fb_ident - choose a fallback IP identification value.
 * With ip_fb_id_lock held, feeds the previous fallback value XORed with the
 * packet's destination address to secure_ip_id(), stores the low 16 bits
 * of the result in iph->id (converted with htons()), and keeps the full
 * result in the static ip_fallback_id as the seed for the next call.
 */
1062 static void ip_select_fb_ident(struct iphdr *iph)
1064 static DEFINE_SPINLOCK(ip_fb_id_lock);
1065 static u32 ip_fallback_id;
1068 spin_lock_bh(&ip_fb_id_lock);
1069 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070 iph->id = htons(salt & 0xFFFF);
1071 ip_fallback_id = salt;
1072 spin_unlock_bh(&ip_fb_id_lock);
1075 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1077 struct rtable *rt = (struct rtable *) dst;
1080 if (rt->peer == NULL)
1081 rt_bind_peer(rt, 1);
1083 /* If a peer is attached to the destination, it is never detached,
1084 so we do not need to grab a lock to dereference it.
1087 iph->id = htons(inet_getid(rt->peer, more));
1091 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092 __builtin_return_address(0));
1094 ip_select_fb_ident(iph);
1097 static void rt_del(unsigned hash, struct rtable *rt)
1099 struct rtable **rthp;
1101 spin_lock_bh(rt_hash_lock_addr(hash));
1103 for (rthp = &rt_hash_table[hash].chain; *rthp;
1104 rthp = &(*rthp)->u.rt_next)
1106 *rthp = rt->u.rt_next;
1110 spin_unlock_bh(rt_hash_lock_addr(hash));
1113 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1114 u32 saddr, u8 tos, struct net_device *dev)
1117 struct in_device *in_dev = in_dev_get(dev);
1118 struct rtable *rth, **rthp;
1119 u32 skeys[2] = { saddr, 0 };
1120 int ikeys[2] = { dev->ifindex, 0 };
1122 tos &= IPTOS_RT_MASK;
1127 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1128 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1129 goto reject_redirect;
1131 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1132 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1133 goto reject_redirect;
1134 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1135 goto reject_redirect;
1137 if (inet_addr_type(new_gw) != RTN_UNICAST)
1138 goto reject_redirect;
1141 for (i = 0; i < 2; i++) {
1142 for (k = 0; k < 2; k++) {
1143 unsigned hash = rt_hash_code(daddr,
1144 skeys[i] ^ (ikeys[k] << 5),
1147 rthp=&rt_hash_table[hash].chain;
1150 while ((rth = rcu_dereference(*rthp)) != NULL) {
1153 if (rth->fl.fl4_dst != daddr ||
1154 rth->fl.fl4_src != skeys[i] ||
1155 rth->fl.fl4_tos != tos ||
1156 rth->fl.oif != ikeys[k] ||
1158 rthp = &rth->u.rt_next;
1162 if (rth->rt_dst != daddr ||
1163 rth->rt_src != saddr ||
1165 rth->rt_gateway != old_gw ||
1166 rth->u.dst.dev != dev)
1169 dst_hold(&rth->u.dst);
1172 rt = dst_alloc(&ipv4_dst_ops);
1179 /* Copy all the information. */
1181 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1182 rt->u.dst.__use = 1;
1183 atomic_set(&rt->u.dst.__refcnt, 1);
1184 rt->u.dst.child = NULL;
1186 dev_hold(rt->u.dst.dev);
1188 in_dev_hold(rt->idev);
1189 rt->u.dst.obsolete = 0;
1190 rt->u.dst.lastuse = jiffies;
1191 rt->u.dst.path = &rt->u.dst;
1192 rt->u.dst.neighbour = NULL;
1193 rt->u.dst.hh = NULL;
1194 rt->u.dst.xfrm = NULL;
1196 rt->rt_flags |= RTCF_REDIRECTED;
1198 /* Gateway is different ... */
1199 rt->rt_gateway = new_gw;
1201 /* Redirect received -> path was valid */
1202 dst_confirm(&rth->u.dst);
1205 atomic_inc(&rt->peer->refcnt);
1207 if (arp_bind_neighbour(&rt->u.dst) ||
1208 !(rt->u.dst.neighbour->nud_state &
1210 if (rt->u.dst.neighbour)
1211 neigh_event_send(rt->u.dst.neighbour, NULL);
1218 if (!rt_intern_hash(hash, rt, &rt))
1231 #ifdef CONFIG_IP_ROUTE_VERBOSE
1232 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1233 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1234 "%u.%u.%u.%u ignored.\n"
1235 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1237 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1238 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1243 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1245 struct rtable *rt = (struct rtable*)dst;
1246 struct dst_entry *ret = dst;
1249 if (dst->obsolete) {
1252 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253 rt->u.dst.expires) {
1254 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1258 #if RT_CACHE_DEBUG >= 1
1259 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260 "%u.%u.%u.%u/%02x dropped\n",
1261 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1272 * 1. The first ip_rt_redirect_number redirects are sent
1273 * with exponential backoff, then we stop sending them at all,
1274 * assuming that the host ignores our redirects.
1275 * 2. If we did not see packets requiring redirects
1276 * during ip_rt_redirect_silence, we assume that the host
1277 * forgot redirected route and start to send redirects again.
1279 * This algorithm is much cheaper and more intelligent than dumb load limiting
1282 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1286 void ip_rt_send_redirect(struct sk_buff *skb)
1288 struct rtable *rt = (struct rtable*)skb->dst;
1289 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1294 if (!IN_DEV_TX_REDIRECTS(in_dev))
1297 /* No redirected packets during ip_rt_redirect_silence;
1298 * reset the algorithm.
1300 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301 rt->u.dst.rate_tokens = 0;
1303 /* Too many ignored redirects; do not send anything
1304 * set u.dst.rate_last to the last seen redirected packet.
1306 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307 rt->u.dst.rate_last = jiffies;
1311 /* Check for load limit; set rate_last to the latest sent
1314 if (time_after(jiffies,
1315 (rt->u.dst.rate_last +
1316 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318 rt->u.dst.rate_last = jiffies;
1319 ++rt->u.dst.rate_tokens;
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1324 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326 NIPQUAD(rt->rt_src), rt->rt_iif,
1327 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
/*
 * ip_error - answer a packet routed to an error route.
 * Picks an ICMP destination-unreachable code (host unreachable, net
 * unreachable or packet filtered) according to rt->u.dst.error, then
 * applies a token bucket kept in the route itself: the time elapsed since
 * rate_last is added to rate_tokens, the total is capped at
 * ip_rt_error_burst, and an ICMP_DEST_UNREACH is sent only when at least
 * ip_rt_error_cost tokens are available (that many are then consumed).
 * The skb is freed at the "out" label.
 */
1334 static int ip_error(struct sk_buff *skb)
1336 struct rtable *rt = (struct rtable*)skb->dst;
1340 switch (rt->u.dst.error) {
1345 code = ICMP_HOST_UNREACH;
1348 code = ICMP_NET_UNREACH;
1351 code = ICMP_PKT_FILTERED;
1356 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358 rt->u.dst.rate_tokens = ip_rt_error_burst;
1359 rt->u.dst.rate_last = now;
1360 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1365 out: kfree_skb(skb);
1370 * The last two values are not from the RFC but
1371 * are needed for AMPRnet AX.25 paths.
/* Candidate MTU values in descending order; scanned by guess_mtu(). */
1374 static unsigned short mtu_plateau[] =
1375 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
/*
 * guess_mtu - pick the next lower plateau for an MTU.
 * Walks mtu_plateau from the largest value down and returns the first
 * entry that is strictly smaller than old_mtu.
 */
1377 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1381 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1382 if (old_mtu > mtu_plateau[i])
1383 return mtu_plateau[i];
1387 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1390 unsigned short old_mtu = ntohs(iph->tot_len);
1392 u32 skeys[2] = { iph->saddr, 0, };
1393 u32 daddr = iph->daddr;
1394 u8 tos = iph->tos & IPTOS_RT_MASK;
1395 unsigned short est_mtu = 0;
1397 if (ipv4_config.no_pmtu_disc)
1400 for (i = 0; i < 2; i++) {
1401 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1404 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 rth = rcu_dereference(rth->u.rt_next)) {
1406 if (rth->fl.fl4_dst == daddr &&
1407 rth->fl.fl4_src == skeys[i] &&
1408 rth->rt_dst == daddr &&
1409 rth->rt_src == iph->saddr &&
1410 rth->fl.fl4_tos == tos &&
1412 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1413 unsigned short mtu = new_mtu;
1415 if (new_mtu < 68 || new_mtu >= old_mtu) {
1417 /* BSD 4.2 compatibility hack :-( */
1419 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1420 old_mtu >= 68 + (iph->ihl << 2))
1421 old_mtu -= iph->ihl << 2;
1423 mtu = guess_mtu(old_mtu);
1425 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1426 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1427 dst_confirm(&rth->u.dst);
1428 if (mtu < ip_rt_min_pmtu) {
1429 mtu = ip_rt_min_pmtu;
1430 rth->u.dst.metrics[RTAX_LOCK-1] |=
1433 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1434 dst_set_expires(&rth->u.dst,
1443 return est_mtu ? : new_mtu;
/*
 * ip_rt_update_pmtu - lower the path MTU recorded in a dst entry
 * (installed as ipv4_dst_ops.update_pmtu).
 * @dst: entry whose RTAX_MTU metric may be lowered
 * @mtu: proposed new MTU
 *
 * Acts only when @mtu is at least 68, smaller than the current RTAX_MTU
 * metric, and that metric is not locked.  A value below ip_rt_min_pmtu is
 * raised to ip_rt_min_pmtu and the RTAX_MTU bit is set in the lock metric.
 * The new MTU is stored and the entry is given an expiry of
 * ip_rt_mtu_expires via dst_set_expires().
 */
1446 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1448 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449 !(dst_metric_locked(dst, RTAX_MTU))) {
1450 if (mtu < ip_rt_min_pmtu) {
1451 mtu = ip_rt_min_pmtu;
1452 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1454 dst->metrics[RTAX_MTU-1] = mtu;
1455 dst_set_expires(dst, ip_rt_mtu_expires);
/*
 * Validity-check hook taking a dst entry and a cookie.
 * NOTE(review): the body is not visible in this excerpt, so the value
 * returned is not documented here.
 */
1459 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
/*
 * Destroy hook for an IPv4 route cache entry.  The dst is viewed as a
 * struct rtable and its peer and idev pointers are fetched.
 * NOTE(review): presumably both references are released here; the
 * releasing statements are not visible in this excerpt.
 */
1464 static void ipv4_dst_destroy(struct dst_entry *dst)
1466 struct rtable *rt = (struct rtable *) dst;
1467 struct inet_peer *peer = rt->peer;
1468 struct in_device *idev = rt->idev;
/*
 * Device-down hook for a route cache entry.
 *
 * When dev is not the loopback device and the entry's idev belongs to
 * dev, a reference to the loopback in_device is taken and, if obtained,
 * stored in rt->idev in place of the old one.
 * NOTE(review): release of the previous idev is not visible here.
 */
1481 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1484 struct rtable *rt = (struct rtable *) dst;
1485 struct in_device *idev = rt->idev;
1486 if (dev != &loopback_dev && idev && idev->dev == dev) {
1487 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488 if (loopback_idev) {
1489 rt->idev = loopback_idev;
/*
 * Link-failure hook: send an ICMP destination-unreachable /
 * host-unreachable message for skb, then call dst_set_expires() with a
 * timeout of 0 on the route attached to skb.
 * NOTE(review): any NULL check on the route is not visible here.
 */
1495 static void ipv4_link_failure(struct sk_buff *skb)
1499 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1501 rt = (struct rtable *) skb->dst;
1503 dst_set_expires(&rt->u.dst, 0);
/*
 * Handler installed as the output routine of input-only routes (see the
 * "output= ip_rt_bug" assignments in this file).  It logs the source and
 * destination address of the packet and the device name, or "?" when the
 * skb has no device.
 * NOTE(review): what happens to skb afterwards and the return value are
 * not visible in this excerpt.
 */
1506 static int ip_rt_bug(struct sk_buff *skb)
1508 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510 skb->dev ? skb->dev->name : "?");
1516 We do not cache source address of outgoing interface,
1517 because it is used only by IP RR, TS and SRR options,
1518 so that it is out of the fast path.
1520 BTW remember: "addr" is allowed to be not aligned
/*
 * Write the 4-byte source address to use for route rt into addr.
 *
 * addr: destination buffer; filled with memcpy(), so no alignment is
 *       required of it.
 * rt:   route whose flow key and device select the address.
 *
 * Three cases are distinguished: rt->fl.iif == 0; a successful
 * fib_lookup() on rt->fl, where the preferred source of the FIB result
 * is used; and otherwise inet_select_addr() on the output device with
 * the route gateway.
 * NOTE(review): the assignment for the iif == 0 case is not visible.
 */
1524 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1527 struct fib_result res;
1529 if (rt->fl.iif == 0)
1531 else if (fib_lookup(&rt->fl, &res) == 0) {
1532 src = FIB_RES_PREFSRC(res);
1535 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1537 memcpy(addr, &src, 4);
/*
 * Merge tag into rt->u.dst.tclassid without overwriting existing data:
 * the low 16 bits are taken from tag only if the low 16 bits of tclassid
 * are currently zero, and likewise for the high 16 bits.
 */
1540 #ifdef CONFIG_NET_CLS_ROUTE
1541 static void set_class_tag(struct rtable *rt, u32 tag)
1543 if (!(rt->u.dst.tclassid & 0xFFFF))
1544 rt->u.dst.tclassid |= tag & 0xFFFF;
1545 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1546 rt->u.dst.tclassid |= tag & 0xFFFF0000;
/*
 * Fill the next-hop dependent fields of a new route cache entry from a
 * FIB lookup result.
 *
 * rt:   entry being initialised (rt->u.dst.dev must already be set, it
 *       is read for the device MTU).
 * res:  FIB result; res->fi supplies gateway, metrics and class id.
 * itag: class tag merged in through set_class_tag().
 *
 * Visible effects: the gateway is taken from the result when it has one
 * and the next-hop scope is RT_SCOPE_LINK; the FIB metrics are copied;
 * a zero fib_mtu is replaced by the device MTU, reduced to 576 when the
 * MTU metric is locked, the route is gatewayed and the device MTU is
 * larger than 576.  Afterwards defaults and caps are applied to the hop
 * limit, MTU (IP_MAX_MTU) and advertised MSS (65535 - 40), and rt_type
 * is set from res->type.
 * NOTE(review): the condition guarding the use of res->fi is not
 * visible in this excerpt.
 */
1550 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1552 struct fib_info *fi = res->fi;
1555 if (FIB_RES_GW(*res) &&
1556 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557 rt->rt_gateway = FIB_RES_GW(*res);
1558 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559 sizeof(rt->u.dst.metrics));
1560 if (fi->fib_mtu == 0) {
1561 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563 rt->rt_gateway != rt->rt_dst &&
1564 rt->u.dst.dev->mtu > 576)
1565 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1567 #ifdef CONFIG_NET_CLS_ROUTE
1568 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1571 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
/* Defaults and upper bounds for metrics left unset or too large. */
1573 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1580 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1583 #ifdef CONFIG_NET_CLS_ROUTE
1584 #ifdef CONFIG_IP_MULTIPLE_TABLES
1585 set_class_tag(rt, fib_rules_tclass(res));
1587 set_class_tag(rt, itag);
1589 rt->rt_type = res->type;
/*
 * Create and cache an input route for a multicast destination.
 *
 * skb:   received packet; the resulting route is stored in skb->dst by
 *        rt_intern_hash().
 * daddr: destination (multicast) address.
 * saddr: source address.
 * tos:   type of service used in the flow key and hash.
 * dev:   receiving device.
 * our:   NOTE(review): presumably non-zero when the host is a member of
 *        the group; its use is not visible in this excerpt.
 *
 * The source is sanity-checked first: multicast, bad-class and loopback
 * sources and non-IP protocols are tested, a zero-net source is tied to
 * a LOCAL_MCAST() test on daddr, and other sources go through
 * fib_validate_source().  The new entry uses loopback_dev as its device,
 * ip_rt_bug as output routine, type RTN_MULTICAST, and is inserted with
 * a hash of daddr, saddr ^ (ifindex << 5) and tos.
 */
1592 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1593 u8 tos, struct net_device *dev, int our)
1598 struct in_device *in_dev = in_dev_get(dev);
1601 /* Primary sanity checks. */
1606 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1607 skb->protocol != htons(ETH_P_IP))
1610 if (ZERONET(saddr)) {
1611 if (!LOCAL_MCAST(daddr))
1613 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1614 } else if (fib_validate_source(saddr, 0, tos, 0,
1615 dev, &spec_dst, &itag) < 0)
1618 rth = dst_alloc(&ipv4_dst_ops);
/* Input-only route: the output hook only reports a bug. */
1622 rth->u.dst.output= ip_rt_bug;
1624 atomic_set(&rth->u.dst.__refcnt, 1);
1625 rth->u.dst.flags= DST_HOST;
1626 if (in_dev->cnf.no_policy)
1627 rth->u.dst.flags |= DST_NOPOLICY;
1628 rth->fl.fl4_dst = daddr;
1629 rth->rt_dst = daddr;
1630 rth->fl.fl4_tos = tos;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632 rth->fl.fl4_fwmark= skb->nfmark;
1634 rth->fl.fl4_src = saddr;
1635 rth->rt_src = saddr;
1636 #ifdef CONFIG_NET_CLS_ROUTE
1637 rth->u.dst.tclassid = itag;
1640 rth->fl.iif = dev->ifindex;
1641 rth->u.dst.dev = &loopback_dev;
1642 dev_hold(rth->u.dst.dev);
1643 rth->idev = in_dev_get(rth->u.dst.dev);
1645 rth->rt_gateway = daddr;
1646 rth->rt_spec_dst= spec_dst;
1647 rth->rt_type = RTN_MULTICAST;
1648 rth->rt_flags = RTCF_MULTICAST;
1650 rth->u.dst.input= ip_local_deliver;
1651 rth->rt_flags |= RTCF_LOCAL;
1654 #ifdef CONFIG_IP_MROUTE
/* Non-local groups go to the multicast router when forwarding is on. */
1655 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1656 rth->u.dst.input = ip_mr_input;
1658 RT_CACHE_STAT_INC(in_slow_mc);
1661 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1662 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
/*
 * Account for, and optionally log, a packet with a martian source.
 *
 * The in_martian_src statistic is always incremented.  With
 * CONFIG_IP_ROUTE_VERBOSE, and when martian logging is enabled for
 * in_dev and the rate limit allows, a warning with both addresses and
 * the device name is printed; if the device has a link-layer header and
 * skb->mac.raw is set, a dump of hard_header_len bytes of that header
 * follows.
 * Note that the first address printed after "martian source" is daddr
 * and the one after "from" is saddr.
 */
1674 static void ip_handle_martian_source(struct net_device *dev,
1675 struct in_device *in_dev,
1676 struct sk_buff *skb,
1680 RT_CACHE_STAT_INC(in_martian_src);
1681 #ifdef CONFIG_IP_ROUTE_VERBOSE
1682 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1684 * RFC1812 recommendation, if source is martian,
1685 * the only hint is MAC header.
1687 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688 "%u.%u.%u.%u, on dev %s\n",
1689 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1690 if (dev->hard_header_len && skb->mac.raw) {
1692 unsigned char *p = skb->mac.raw;
1693 printk(KERN_WARNING "ll header: ");
1694 for (i = 0; i < dev->hard_header_len; i++, p++) {
1696 if (i < (dev->hard_header_len - 1))
/*
 * Build (but do not cache) a forwarding route cache entry for an input
 * packet.
 *
 * skb:    received packet (protocol and nfmark are read).
 * res:    FIB lookup result selecting the output device and next hop.
 * in_dev: in_device of the receiving interface.
 * daddr, saddr, tos: flow key of the packet.
 * result: NOTE(review): presumably receives the new entry; the store is
 *         not visible in this excerpt.
 *
 * A temporary reference to the output in_device is taken at the start
 * and dropped with in_dev_put() at the end.  The source is validated
 * with fib_validate_source(); a redirect flag (RTCF_DOREDIRECT) is set
 * when input and output device are the same and the visible conditions
 * hold.  The new entry forwards with ip_forward / ip_output and gets its
 * next-hop data from rt_set_nexthop().
 */
1705 static inline int __mkroute_input(struct sk_buff *skb,
1706 struct fib_result* res,
1707 struct in_device *in_dev,
1708 u32 daddr, u32 saddr, u32 tos,
1709 struct rtable **result)
1714 struct in_device *out_dev;
1718 /* get a working reference to the output device */
1719 out_dev = in_dev_get(FIB_RES_DEV(*res));
1720 if (out_dev == NULL) {
1721 if (net_ratelimit())
1722 printk(KERN_CRIT "Bug in ip_route_input" \
1723 "_slow(). Please, report\n");
1728 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1729 in_dev->dev, &spec_dst, &itag);
1731 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1739 flags |= RTCF_DIRECTSRC;
1741 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1742 (IN_DEV_SHARED_MEDIA(out_dev) ||
1743 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1744 flags |= RTCF_DOREDIRECT;
1746 if (skb->protocol != htons(ETH_P_IP)) {
1747 /* Not IP (i.e. ARP). Do not create route, if it is
1748 * invalid for proxy arp. DNAT routes are always valid.
1750 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1757 rth = dst_alloc(&ipv4_dst_ops);
1763 rth->u.dst.flags= DST_HOST;
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 if (res->fi->fib_nhs > 1)
1766 rth->u.dst.flags |= DST_BALANCED;
1768 if (in_dev->cnf.no_policy)
1769 rth->u.dst.flags |= DST_NOPOLICY;
1770 if (in_dev->cnf.no_xfrm)
1771 rth->u.dst.flags |= DST_NOXFRM;
1772 rth->fl.fl4_dst = daddr;
1773 rth->rt_dst = daddr;
1774 rth->fl.fl4_tos = tos;
1775 #ifdef CONFIG_IP_ROUTE_FWMARK
1776 rth->fl.fl4_fwmark= skb->nfmark;
1778 rth->fl.fl4_src = saddr;
1779 rth->rt_src = saddr;
1780 rth->rt_gateway = daddr;
1782 rth->fl.iif = in_dev->dev->ifindex;
1783 rth->u.dst.dev = (out_dev)->dev;
1784 dev_hold(rth->u.dst.dev);
1785 rth->idev = in_dev_get(rth->u.dst.dev);
1787 rth->rt_spec_dst= spec_dst;
1789 rth->u.dst.input = ip_forward;
1790 rth->u.dst.output = ip_output;
1792 rt_set_nexthop(rth, res, itag);
1794 rth->rt_flags = flags;
1799 /* release the working reference to the output device */
1800 in_dev_put(out_dev);
/*
 * Default (single entry) way of creating and caching an input route.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a next hop is first chosen through
 * fib_select_multipath() when the result has more than one next hop and
 * no output interface is fixed in fl.  The entry is then built by
 * __mkroute_input(), its reference count is set to 1, and it is inserted
 * with rt_intern_hash() under a hash of daddr, saddr ^ (fl->iif << 5)
 * and tos; the cached route ends up in skb->dst.
 * Returns the result of rt_intern_hash() on the visible path.
 * NOTE(review): handling of an error from __mkroute_input() is not
 * visible in this excerpt.
 */
1804 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 struct fib_result* res,
1806 const struct flowi *fl,
1807 struct in_device *in_dev,
1808 u32 daddr, u32 saddr, u32 tos)
1810 struct rtable* rth = NULL;
1814 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 fib_select_multipath(fl, res);
1819 /* create a routing cache entry */
1820 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1823 atomic_set(&rth->u.dst.__refcnt, 1);
1825 /* put it into the cache */
1826 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
/*
 * Create and cache the input route(s) for a packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this simply calls
 * ip_mkroute_input_def().  With it, the number of next hops in res->fi
 * is examined: the single-path case is delegated to
 * ip_mkroute_input_def(), otherwise one cache entry per next hop is
 * built with __mkroute_input(), inserted with rt_intern_hash(), and
 * reported to the multipath code via multipath_set_nhinfo().  The
 * reference count of skb->dst is set to 1 (see the "last hop" comment
 * in the body).
 * NOTE(review): the exact single-path test and the per-hop error
 * handling are not visible in this excerpt.
 */
1830 static inline int ip_mkroute_input(struct sk_buff *skb,
1831 struct fib_result* res,
1832 const struct flowi *fl,
1833 struct in_device *in_dev,
1834 u32 daddr, u32 saddr, u32 tos)
1836 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1837 struct rtable* rth = NULL;
1838 unsigned char hop, hopcount, lasthop;
1843 hopcount = res->fi->fib_nhs;
1847 lasthop = hopcount - 1;
1849 /* distinguish between multipath and singlepath */
1851 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1854 /* add all alternatives to the routing cache */
1855 for (hop = 0; hop < hopcount; hop++) {
1858 /* create a routing cache entry */
1859 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1864 /* put it into the cache */
1865 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1866 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1870 /* forward hop information to multipath impl. */
1871 multipath_set_nhinfo(rth,
1872 FIB_RES_NETWORK(*res),
1873 FIB_RES_NETMASK(*res),
1877 /* only for the last hop the reference count is handled
1881 atomic_set(&(skb->dst->__refcnt), 1);
1884 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1885 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1886 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1891 * NOTE. We drop all the packets that have local source
1892 * addresses, because every properly looped back packet
1893 * must have correct destination already attached by output routine.
1895 * Such approach solves two big problems:
1896 * 1. Not simplex devices are handled properly.
1897 * 2. IP spoofing attempts are filtered with 100% of guarantee.
/*
 * Slow path of input routing, used when the route cache has no entry.
 *
 * skb:   received packet; a created route is stored in skb->dst.
 * daddr, saddr, tos: addresses and type of service of the packet.
 * dev:   receiving device.
 *
 * Visible stages, in order:
 *  - reject martian sources (multicast, bad class, loopback) and
 *    martian destinations (bad class, zero net, loopback); limited
 *    broadcast and the all-zero case are tested separately;
 *  - look the flow up with fib_lookup();
 *  - RTN_LOCAL results validate the source against loopback_dev;
 *  - other results require forwarding to be enabled on in_dev and type
 *    RTN_UNICAST (anything else is a martian destination) and are
 *    handed to ip_mkroute_input();
 *  - local, broadcast and unreachable results get an entry on
 *    loopback_dev with ip_rt_bug as output; unreachable entries use
 *    ip_error as input with dst.error = -err and lose RTCF_LOCAL;
 *  - martian destinations yield -EHOSTUNREACH; martian sources are
 *    passed to ip_handle_martian_source().
 * NOTE(review): labels, gotos and most return statements are not
 * visible in this excerpt; the control flow above is inferred from the
 * remaining statements and should be confirmed.
 */
1900 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1901 u8 tos, struct net_device *dev)
1903 struct fib_result res;
1904 struct in_device *in_dev = in_dev_get(dev);
1905 struct flowi fl = { .nl_u = { .ip4_u =
1909 .scope = RT_SCOPE_UNIVERSE,
1910 #ifdef CONFIG_IP_ROUTE_FWMARK
1911 .fwmark = skb->nfmark
1914 .iif = dev->ifindex };
1917 struct rtable * rth;
1923 /* IP on this device is disabled. */
1928 /* Check for the most weird martians, which can be not detected
1932 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1933 goto martian_source;
1935 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1938 /* Accept zero addresses only to limited broadcast;
1939 * I even do not know to fix it or not. Waiting for complains :-)
1942 goto martian_source;
1944 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1945 goto martian_destination;
1948 * Now we are ready to route packet.
1950 if ((err = fib_lookup(&fl, &res)) != 0) {
1951 if (!IN_DEV_FORWARD(in_dev))
1957 RT_CACHE_STAT_INC(in_slow_tot);
1959 if (res.type == RTN_BROADCAST)
1962 if (res.type == RTN_LOCAL) {
1964 result = fib_validate_source(saddr, daddr, tos,
1965 loopback_dev.ifindex,
1966 dev, &spec_dst, &itag);
1968 goto martian_source;
1970 flags |= RTCF_DIRECTSRC;
1975 if (!IN_DEV_FORWARD(in_dev))
1977 if (res.type != RTN_UNICAST)
1978 goto martian_destination;
1980 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1981 if (err == -ENOBUFS)
1993 if (skb->protocol != htons(ETH_P_IP))
1997 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1999 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2002 goto martian_source;
2004 flags |= RTCF_DIRECTSRC;
2006 flags |= RTCF_BROADCAST;
2007 res.type = RTN_BROADCAST;
2008 RT_CACHE_STAT_INC(in_brd);
2011 rth = dst_alloc(&ipv4_dst_ops);
2015 rth->u.dst.output= ip_rt_bug;
2017 atomic_set(&rth->u.dst.__refcnt, 1);
2018 rth->u.dst.flags= DST_HOST;
2019 if (in_dev->cnf.no_policy)
2020 rth->u.dst.flags |= DST_NOPOLICY;
2021 rth->fl.fl4_dst = daddr;
2022 rth->rt_dst = daddr;
2023 rth->fl.fl4_tos = tos;
2024 #ifdef CONFIG_IP_ROUTE_FWMARK
2025 rth->fl.fl4_fwmark= skb->nfmark;
2027 rth->fl.fl4_src = saddr;
2028 rth->rt_src = saddr;
2029 #ifdef CONFIG_NET_CLS_ROUTE
2030 rth->u.dst.tclassid = itag;
2033 rth->fl.iif = dev->ifindex;
2034 rth->u.dst.dev = &loopback_dev;
2035 dev_hold(rth->u.dst.dev);
2036 rth->idev = in_dev_get(rth->u.dst.dev);
2037 rth->rt_gateway = daddr;
2038 rth->rt_spec_dst= spec_dst;
2039 rth->u.dst.input= ip_local_deliver;
2040 rth->rt_flags = flags|RTCF_LOCAL;
2041 if (res.type == RTN_UNREACHABLE) {
2042 rth->u.dst.input= ip_error;
2043 rth->u.dst.error= -err;
2044 rth->rt_flags &= ~RTCF_LOCAL;
2046 rth->rt_type = res.type;
2047 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2048 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052 RT_CACHE_STAT_INC(in_no_route);
2053 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2054 res.type = RTN_UNREACHABLE;
2058 * Do not cache martian addresses: they should be logged (RFC1812)
2060 martian_destination:
2061 RT_CACHE_STAT_INC(in_martian_dst);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2064 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2065 "%u.%u.%u.%u, dev %s\n",
2066 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 err = -EHOSTUNREACH;
2082 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
// Route an incoming packet: route cache lookup first, slow path on miss.
//
// skb:   received packet; on a cache hit the entry is stored in skb->dst.
// daddr, saddr: destination and source address of the packet.
// tos:   type of service; masked with IPTOS_RT_MASK before use.
// dev:   receiving device; its ifindex is part of the lookup key.
//
// The hash chain is walked with rcu_dereference().  An entry matches
// when destination, source, input interface, TOS and (with
// CONFIG_IP_ROUTE_FWMARK) the firewall mark all agree; it is then
// time-stamped, referenced with dst_hold() and counted as in_hit.
// On a miss, multicast destinations are checked with ip_check_mc() and
// may be routed by ip_route_input_mc(); everything else goes to
// ip_route_input_slow().
// NOTE(review): the RCU lock/unlock calls and the return statements of
// the hit and multicast paths are not visible in this excerpt.
2086 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2087 u8 tos, struct net_device *dev)
2089 struct rtable * rth;
2091 int iif = dev->ifindex;
2093 tos &= IPTOS_RT_MASK;
2094 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2097 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2098 rth = rcu_dereference(rth->u.rt_next)) {
2099 if (rth->fl.fl4_dst == daddr &&
2100 rth->fl.fl4_src == saddr &&
2101 rth->fl.iif == iif &&
2103 #ifdef CONFIG_IP_ROUTE_FWMARK
2104 rth->fl.fl4_fwmark == skb->nfmark &&
2106 rth->fl.fl4_tos == tos) {
2107 rth->u.dst.lastuse = jiffies;
2108 dst_hold(&rth->u.dst);
2110 RT_CACHE_STAT_INC(in_hit);
2112 skb->dst = (struct dst_entry*)rth;
2115 RT_CACHE_STAT_INC(in_hlist_search);
2119 /* Multicast recognition logic is moved from route cache to here.
2120 The problem was that too many Ethernet cards have broken/missing
2121 hardware multicast filters :-( As result the host on multicasting
2122 network acquires a lot of useless route cache entries, sort of
2123 SDR messages from all the world. Now we try to get rid of them.
2124 Really, provided software IP multicast filter is organized
2125 reasonably (at least, hashed), it does not result in a slowdown
2126 comparing with route cache reject entries.
2127 Note, that multicast routers are not affected, because
2128 route cache entry is created eventually.
2130 if (MULTICAST(daddr)) {
2131 struct in_device *in_dev;
2134 if ((in_dev = __in_dev_get(dev)) != NULL) {
2135 int our = ip_check_mc(in_dev, daddr, saddr,
2136 skb->nh.iph->protocol);
2138 #ifdef CONFIG_IP_MROUTE
2139 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2143 return ip_route_input_mc(skb, daddr, saddr,
2150 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
// Build (but do not cache) an output route cache entry.
//
// result:  NOTE(review): presumably receives the new entry; the store
//          is not visible in this excerpt.
// res:     FIB result; its type may be overridden to RTN_BROADCAST or
//          RTN_MULTICAST depending on the destination in fl.
// fl:      resolved flow (addresses after source selection).
// oldflp:  flow as originally requested; it becomes the cache key
//          stored in rth->fl.
// dev_out: output device.
//
// Visible behaviour: a loopback source on a non-loopback device is
// tested first; RTCF_LOCAL is set for loopback devices; broadcast and
// multicast results get RTCF_BROADCAST / RTCF_MULTICAST plus RTCF_LOCAL,
// with RTCF_LOCAL removed again for multicast when ip_check_mc() fails.
// The entry outputs through ip_output, or ip_mc_output for local
// broadcast/multicast on a non-loopback device; local routes take
// ip_local_deliver as input.  rt_set_nexthop() fills the next-hop data.
// NOTE(review): error returns and the statements following several of
// the visible conditions are missing from this excerpt.
2153 static inline int __mkroute_output(struct rtable **result,
2154 struct fib_result* res,
2155 const struct flowi *fl,
2156 const struct flowi *oldflp,
2157 struct net_device *dev_out,
2161 struct in_device *in_dev;
2162 u32 tos = RT_FL_TOS(oldflp);
2165 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168 if (fl->fl4_dst == 0xFFFFFFFF)
2169 res->type = RTN_BROADCAST;
2170 else if (MULTICAST(fl->fl4_dst))
2171 res->type = RTN_MULTICAST;
2172 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175 if (dev_out->flags & IFF_LOOPBACK)
2176 flags |= RTCF_LOCAL;
2178 /* get work reference to inet device */
2179 in_dev = in_dev_get(dev_out);
2183 if (res->type == RTN_BROADCAST) {
2184 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186 fib_info_put(res->fi);
2189 } else if (res->type == RTN_MULTICAST) {
2190 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2191 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193 flags &= ~RTCF_LOCAL;
2194 /* If multicast route do not exist use
2195 default one, but do not gateway in this case.
2198 if (res->fi && res->prefixlen < 4) {
2199 fib_info_put(res->fi);
2205 rth = dst_alloc(&ipv4_dst_ops);
2211 rth->u.dst.flags= DST_HOST;
2212 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2214 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2215 if (res->fi->fib_nhs > 1)
2216 rth->u.dst.flags |= DST_BALANCED;
2219 if (in_dev->cnf.no_xfrm)
2220 rth->u.dst.flags |= DST_NOXFRM;
2221 if (in_dev->cnf.no_policy)
2222 rth->u.dst.flags |= DST_NOPOLICY;
2224 rth->fl.fl4_dst = oldflp->fl4_dst;
2225 rth->fl.fl4_tos = tos;
2226 rth->fl.fl4_src = oldflp->fl4_src;
2227 rth->fl.oif = oldflp->oif;
2228 #ifdef CONFIG_IP_ROUTE_FWMARK
2229 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2231 rth->rt_dst = fl->fl4_dst;
2232 rth->rt_src = fl->fl4_src;
2233 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2234 /* get references to the devices that are to be hold by the routing
2236 rth->u.dst.dev = dev_out;
2238 rth->idev = in_dev_get(dev_out);
2239 rth->rt_gateway = fl->fl4_dst;
2240 rth->rt_spec_dst= fl->fl4_src;
2242 rth->u.dst.output=ip_output;
2244 RT_CACHE_STAT_INC(out_slow_tot);
2246 if (flags & RTCF_LOCAL) {
2247 rth->u.dst.input = ip_local_deliver;
2248 rth->rt_spec_dst = fl->fl4_dst;
2250 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 rth->rt_spec_dst = fl->fl4_src;
2252 if (flags & RTCF_LOCAL &&
2253 !(dev_out->flags & IFF_LOOPBACK)) {
2254 rth->u.dst.output = ip_mc_output;
2255 RT_CACHE_STAT_INC(out_slow_mc);
2257 #ifdef CONFIG_IP_MROUTE
2258 if (res->type == RTN_MULTICAST) {
2259 if (IN_DEV_MFORWARD(in_dev) &&
2260 !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 rth->u.dst.input = ip_mr_input;
2262 rth->u.dst.output = ip_mc_output;
2268 rt_set_nexthop(rth, res, 0);
2270 rth->rt_flags = flags;
2274 /* release work reference to inet device */
/*
 * Default (single entry) way of creating and caching an output route.
 *
 * The entry is built by __mkroute_output(), its reference count is set
 * to 1, and it is inserted with rt_intern_hash() under a hash of the
 * originally requested destination, source ^ (oif << 5) and TOS taken
 * from oldflp.  The cached route is returned through rp.
 * NOTE(review): the test on err between creation and insertion, and the
 * final return, are not visible in this excerpt.
 */
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281 struct fib_result* res,
2282 const struct flowi *fl,
2283 const struct flowi *oldflp,
2284 struct net_device *dev_out,
2287 struct rtable *rth = NULL;
2288 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2291 u32 tos = RT_FL_TOS(oldflp);
2293 atomic_set(&rth->u.dst.__refcnt, 1);
2295 hash = rt_hash_code(oldflp->fl4_dst,
2296 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2297 err = rt_intern_hash(hash, rth, rp);
/*
 * Create and cache the output route(s) for a flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this simply calls
 * ip_mkroute_output_def().  With it, a result with more than one next
 * hop gets one cache entry per hop: for each hop the output device of
 * the result is held for the duration of the iteration, the entry is
 * built with __mkroute_output(), inserted with rt_intern_hash() and
 * reported via multipath_set_nhinfo().  Afterwards the reference count
 * of *rp is set to 1.  Single-path results fall back to
 * ip_mkroute_output_def().
 * NOTE(review): per-hop next-hop selection and error handling are not
 * visible in this excerpt.
 */
2303 static inline int ip_mkroute_output(struct rtable** rp,
2304 struct fib_result* res,
2305 const struct flowi *fl,
2306 const struct flowi *oldflp,
2307 struct net_device *dev_out,
2310 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2311 u32 tos = RT_FL_TOS(oldflp);
2315 struct rtable *rth = NULL;
2317 if (res->fi && res->fi->fib_nhs > 1) {
2318 unsigned char hopcount = res->fi->fib_nhs;
2320 for (hop = 0; hop < hopcount; hop++) {
2321 struct net_device *dev2nexthop;
2325 /* hold a work reference to the output device */
2326 dev2nexthop = FIB_RES_DEV(*res);
2327 dev_hold(dev2nexthop);
2329 err = __mkroute_output(&rth, res, fl, oldflp,
2330 dev2nexthop, flags);
2335 hash = rt_hash_code(oldflp->fl4_dst,
2337 (oldflp->oif << 5), tos);
2338 err = rt_intern_hash(hash, rth, rp);
2340 /* forward hop information to multipath impl. */
2341 multipath_set_nhinfo(rth,
2342 FIB_RES_NETWORK(*res),
2343 FIB_RES_NETMASK(*res),
2347 /* release work reference to output device */
2348 dev_put(dev2nexthop);
2353 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2356 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2359 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2360 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2365 * Major route resolver routine.
/*
 * Slow path of output routing, used when the route cache has no entry.
 *
 * rp:     receives the created route (through ip_mkroute_output()).
 * oldflp: flow as requested by the caller; a working copy fl is built
 *         from it with the TOS masked by IPTOS_RT_MASK and the input
 *         interface set to the loopback ifindex.
 *
 * Visible stages, in order:
 *  - a caller-supplied source address is tested for multicast, bad
 *    class and zero net, and its device is looked up with
 *    ip_dev_find(); for multicast / limited-broadcast destinations
 *    without oif, that device becomes the output interface;
 *  - a requested oif is resolved with dev_get_by_index() and a source
 *    address may be picked with inet_select_addr();
 *  - one path routes via loopback_dev with res.type = RTN_LOCAL;
 *  - fib_lookup() is called; on failure the destination is treated as
 *    on-link with type RTN_UNICAST (see the comment in the body);
 *  - RTN_LOCAL results are redirected to loopback_dev;
 *  - otherwise multipath and default-route selection may adjust res,
 *    the preferred source and output device are taken from it, and the
 *    entry is created by ip_mkroute_output().
 * NOTE(review): the conditions selecting some of these paths, the error
 * codes and the cleanup code are not visible in this excerpt.
 */
2368 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2370 u32 tos = RT_FL_TOS(oldflp);
2371 struct flowi fl = { .nl_u = { .ip4_u =
2372 { .daddr = oldflp->fl4_dst,
2373 .saddr = oldflp->fl4_src,
2374 .tos = tos & IPTOS_RT_MASK,
2375 .scope = ((tos & RTO_ONLINK) ?
2378 #ifdef CONFIG_IP_ROUTE_FWMARK
2379 .fwmark = oldflp->fl4_fwmark
2382 .iif = loopback_dev.ifindex,
2383 .oif = oldflp->oif };
2384 struct fib_result res;
2386 struct net_device *dev_out = NULL;
2392 #ifdef CONFIG_IP_MULTIPLE_TABLES
2396 if (oldflp->fl4_src) {
2398 if (MULTICAST(oldflp->fl4_src) ||
2399 BADCLASS(oldflp->fl4_src) ||
2400 ZERONET(oldflp->fl4_src))
2403 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2404 dev_out = ip_dev_find(oldflp->fl4_src);
2405 if (dev_out == NULL)
2408 /* I removed check for oif == dev_out->oif here.
2409 It was wrong for two reasons:
2410 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2411 assigned to multiple interfaces.
2412 2. Moreover, we are allowed to send packets with saddr
2413 of another iface. --ANK
2416 if (oldflp->oif == 0
2417 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2418 /* Special hack: user can direct multicasts
2419 and limited broadcast via necessary interface
2420 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2421 This hack is not just for fun, it allows
2422 vic,vat and friends to work.
2423 They bind socket to loopback, set ttl to zero
2424 and expect that it will work.
2425 From the viewpoint of routing cache they are broken,
2426 because we are not allowed to build multicast path
2427 with loopback source addr (look, routing cache
2428 cannot know, that ttl is zero, so that packet
2429 will not leave this host and route is valid).
2430 Luckily, this hack is good workaround.
2433 fl.oif = dev_out->ifindex;
2443 dev_out = dev_get_by_index(oldflp->oif);
2445 if (dev_out == NULL)
2447 if (__in_dev_get(dev_out) == NULL) {
2449 goto out; /* Wrong error code */
2452 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2454 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 if (MULTICAST(oldflp->fl4_dst))
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2462 else if (!oldflp->fl4_dst)
2463 fl.fl4_src = inet_select_addr(dev_out, 0,
2469 fl.fl4_dst = fl.fl4_src;
2471 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2474 dev_out = &loopback_dev;
2476 fl.oif = loopback_dev.ifindex;
2477 res.type = RTN_LOCAL;
2478 flags |= RTCF_LOCAL;
2482 if (fib_lookup(&fl, &res)) {
2485 /* Apparently, routing tables are wrong. Assume,
2486 that the destination is on link.
2489 Because we are allowed to send to iface
2490 even if it has NO routes and NO assigned
2491 addresses. When oif is specified, routing
2492 tables are looked up with only one purpose:
2493 to catch if destination is gatewayed, rather than
2494 direct. Moreover, if MSG_DONTROUTE is set,
2495 we send packet, ignoring both routing tables
2496 and ifaddr state. --ANK
2499 We could make it even if oif is unknown,
2500 likely IPv6, but we do not.
2503 if (fl.fl4_src == 0)
2504 fl.fl4_src = inet_select_addr(dev_out, 0,
2506 res.type = RTN_UNICAST;
2516 if (res.type == RTN_LOCAL) {
2518 fl.fl4_src = fl.fl4_dst;
2521 dev_out = &loopback_dev;
2523 fl.oif = dev_out->ifindex;
2525 fib_info_put(res.fi);
2527 flags |= RTCF_LOCAL;
2531 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2532 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2533 fib_select_multipath(&fl, &res);
2536 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2537 fib_select_default(&fl, &res);
2540 fl.fl4_src = FIB_RES_PREFSRC(res);
2544 dev_out = FIB_RES_DEV(res);
2546 fl.oif = dev_out->ifindex;
2550 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
// Resolve an output route for flow flp: route cache first, then the
// slow path.
//
// rp:  receives the route.
// flp: flow key (destination, source, oif, TOS and, with
//      CONFIG_IP_ROUTE_FWMARK, the firewall mark).
//
// The hash chain is walked with rcu_dereference().  An entry matches
// when destination, source, oif and (optionally) fwmark are equal and
// the TOS values agree in the IPTOS_RT_MASK and RTO_ONLINK bits.  When
// multipath_select_route() returns non-zero, the route it placed in *rp
// is held and the lookup counts as out_hit.  Otherwise the matching
// entry itself is time-stamped and held.  Each visible exit from the
// loop drops the RCU read side with rcu_read_unlock_bh().
// On a miss the result of ip_route_output_slow() is returned.
// NOTE(review): the matching lock call, the store into *rp on a plain
// hit and the hit-path return statements are not visible here.
2560 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2565 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2568 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2569 rth = rcu_dereference(rth->u.rt_next)) {
2570 if (rth->fl.fl4_dst == flp->fl4_dst &&
2571 rth->fl.fl4_src == flp->fl4_src &&
2573 rth->fl.oif == flp->oif &&
2574 #ifdef CONFIG_IP_ROUTE_FWMARK
2575 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2577 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2578 (IPTOS_RT_MASK | RTO_ONLINK))) {
2580 /* check for multipath routes and choose one if
2583 if (multipath_select_route(flp, rth, rp)) {
2584 dst_hold(&(*rp)->u.dst);
2585 RT_CACHE_STAT_INC(out_hit);
2586 rcu_read_unlock_bh();
2590 rth->u.dst.lastuse = jiffies;
2591 dst_hold(&rth->u.dst);
2593 RT_CACHE_STAT_INC(out_hit);
2594 rcu_read_unlock_bh();
2598 RT_CACHE_STAT_INC(out_hlist_search);
2600 rcu_read_unlock_bh();
2602 return ip_route_output_slow(rp, flp);
2605 EXPORT_SYMBOL_GPL(__ip_route_output_key);
// Resolve an output route and pass it through the xfrm lookup.
//
// rp:    receives the route.
// flp:   flow key; its source and destination may be overwritten with
//        the addresses of the route that was found.
// sk:    socket handed to xfrm_lookup() (may be NULL, see
//        ip_route_output_key()).
// flags: flags handed to xfrm_lookup().
//
// __ip_route_output_key() is called first and its non-zero result is
// treated as an error.  The value of xfrm_lookup() is returned on the
// visible path.
// NOTE(review): the conditions guarding the address copy-back and the
// xfrm_lookup() call are not visible in this excerpt.
2607 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2611 if ((err = __ip_route_output_key(rp, flp)) != 0)
2616 flp->fl4_src = (*rp)->rt_src;
2618 flp->fl4_dst = (*rp)->rt_dst;
2619 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2625 EXPORT_SYMBOL_GPL(ip_route_output_flow);
// Convenience wrapper: resolve an output route for flp without a socket
// and with no flags, by calling ip_route_output_flow(rp, flp, NULL, 0).
// Returns whatever ip_route_output_flow() returns.
2627 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2629 return ip_route_output_flow(rp, flp, NULL, 0);
// Append a netlink route message describing the route attached to skb.
//
// skb:    message buffer; the route is taken from skb->dst.
// pid, seq, event, flags: values passed to NLMSG_NEW() for the header.
// nowait: passed on to ipmr_get_route() for multicast routes.
//
// The rtmsg header describes an AF_INET host route (dst_len 32) in
// RT_TABLE_MAIN with RTM_F_CLONED set.  Attributes visible here:
// RTA_DST, RTA_SRC (when the flow has a source), RTA_OIF, RTA_FLOW,
// RTA_MP_ALGO, RTA_PREFSRC, RTA_GATEWAY (when it differs from the
// destination), the metrics, RTA_CACHEINFO and RTA_IIF.
// The cache info carries last-use age, use count, reference count,
// expiry, error and, from the peer entry, IP id and TCP timestamp data.
// On the final visible success step nlmsg_len is set to the number of
// bytes added; the failure path trims skb back to its original length.
// NOTE(review): return values, labels and several guarding conditions
// are not visible in this excerpt.
2632 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2633 int nowait, unsigned int flags)
2635 struct rtable *rt = (struct rtable*)skb->dst;
2637 struct nlmsghdr *nlh;
2638 unsigned char *b = skb->tail;
2639 struct rta_cacheinfo ci;
2640 #ifdef CONFIG_IP_MROUTE
2641 struct rtattr *eptr;
2643 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2644 r = NLMSG_DATA(nlh);
2645 r->rtm_family = AF_INET;
2646 r->rtm_dst_len = 32;
2648 r->rtm_tos = rt->fl.fl4_tos;
2649 r->rtm_table = RT_TABLE_MAIN;
2650 r->rtm_type = rt->rt_type;
2651 r->rtm_scope = RT_SCOPE_UNIVERSE;
2652 r->rtm_protocol = RTPROT_UNSPEC;
2653 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2654 if (rt->rt_flags & RTCF_NOTIFY)
2655 r->rtm_flags |= RTM_F_NOTIFY;
2656 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2657 if (rt->fl.fl4_src) {
2658 r->rtm_src_len = 32;
2659 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2662 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2663 #ifdef CONFIG_NET_CLS_ROUTE
2664 if (rt->u.dst.tclassid)
2665 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2667 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2668 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2669 __u32 alg = rt->rt_multipath_alg;
2671 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2675 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2676 else if (rt->rt_src != rt->fl.fl4_src)
2677 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2678 if (rt->rt_dst != rt->rt_gateway)
2679 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2680 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2681 goto rtattr_failure;
2682 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2683 ci.rta_used = rt->u.dst.__use;
2684 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2685 if (rt->u.dst.expires)
2686 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2689 ci.rta_error = rt->u.dst.error;
2690 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2692 ci.rta_id = rt->peer->ip_id_count;
2693 if (rt->peer->tcp_ts_stamp) {
2694 ci.rta_ts = rt->peer->tcp_ts;
2695 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2698 #ifdef CONFIG_IP_MROUTE
2699 eptr = (struct rtattr*)skb->tail;
2701 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2703 #ifdef CONFIG_IP_MROUTE
2704 u32 dst = rt->rt_dst;
2706 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2707 ipv4_devconf.mc_forwarding) {
2708 int err = ipmr_get_route(skb, r, nowait);
2715 if (err == -EMSGSIZE)
2717 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2722 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2725 nlh->nlmsg_len = skb->tail - b;
2730 skb_trim(skb, b - skb->data);
// Netlink handler answering a "get route" request.
//
// in_skb: request buffer; its netlink pid addresses the reply.
// nlh:    request header; the rtmsg payload supplies TOS and flags.
// arg:    attribute table (struct rtattr **), indexed by RTA_x - 1.
//
// A reply skb of NLMSG_GOODSIZE is allocated and given room for dummy
// headers.  RTA_SRC, RTA_DST and RTA_IIF are copied out when present.
// With an input interface the lookup simulates reception through
// ip_route_input() on the device found by __dev_get_by_index(), and an
// error stored in the resulting route is reported.  Otherwise an output
// lookup is done with ip_route_output_key() using RTA_OIF when present.
// The route is attached to the reply skb, RTCF_NOTIFY is set when the
// request carries RTM_F_NOTIFY, the message is built by rt_fill_info()
// and sent with netlink_unicast().
// NOTE(review): the attribute payloads are copied with fixed sizes and
// no length check is visible on these lines - confirm that the caller
// validates attribute lengths.  Error paths and the return statements
// are not visible in this excerpt.
2734 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2736 struct rtattr **rta = arg;
2737 struct rtmsg *rtm = NLMSG_DATA(nlh);
2738 struct rtable *rt = NULL;
2743 struct sk_buff *skb;
2745 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2749 /* Reserve room for dummy headers, this skb can pass
2750 through good chunk of routing engine.
2752 skb->mac.raw = skb->data;
2753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755 if (rta[RTA_SRC - 1])
2756 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 if (rta[RTA_DST - 1])
2758 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 if (rta[RTA_IIF - 1])
2760 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2763 struct net_device *dev = __dev_get_by_index(iif);
2767 skb->protocol = htons(ETH_P_IP);
2770 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772 rt = (struct rtable*)skb->dst;
2773 if (!err && rt->u.dst.error)
2774 err = -rt->u.dst.error;
2776 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2778 .tos = rtm->rtm_tos } } };
2780 if (rta[RTA_OIF - 1])
2781 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2783 err = ip_route_output_key(&rt, &fl);
2788 skb->dst = &rt->u.dst;
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2792 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 RTM_NEWROUTE, 0, 0);
2803 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
// Netlink dump callback that walks the whole route cache.
//
// skb: buffer being filled with one RTM_NEWROUTE message per entry.
// cb:  dump state; cb->args[1] holds the index to resume from within a
//      bucket (s_idx).  Buckets below s_h are skipped.
//
// Each bucket chain is walked with rcu_dereference().  For every entry
// a reference is attached to skb->dst with dst_clone(), the message is
// built by rt_fill_info() with NLM_F_MULTI, and the reference is
// released again.  A result <= 0 from rt_fill_info() ends the walk of
// the current bucket's RCU section early.
// NOTE(review): the origin of s_h, the saving of the resume position
// and the return value are not visible in this excerpt.
2813 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
2834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2838 dst_release(xchg(&skb->dst, NULL));
2840 rcu_read_unlock_bh();
// Notification hook for a multicast-related change on in_dev.
// NOTE(review): the body is not visible in this excerpt; presumably it
// flushes the route cache - confirm.
2849 void ip_rt_multicast_event(struct in_device *in_dev)
// proc handler behind the "flush" sysctl entry.
//
// The written integer is parsed into flush_delay by proc_dointvec() and
// the route cache is then flushed with that delay via rt_cache_flush().
// NOTE(review): on the visible lines the result of proc_dointvec() is
// not used, so a parse failure would flush with the previous value of
// flush_delay - confirm against the full function.  The guard on
// "write" and the return values are not visible in this excerpt.
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
// sysctl(2) strategy routine behind the "flush" entry.
//
// The new value must be exactly sizeof(int) bytes long; it is read from
// user space with get_user() and used as the delay for rt_cache_flush().
// NOTE(review): the error codes returned for a bad length or a faulting
// user pointer, and the success return, are not visible in this excerpt.
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
2880 if (newlen != sizeof(int))
2882 if (get_user(delay, (int __user *)newval))
2884 rt_cache_flush(delay);
/*
 * sysctl table for the IPv4 routing tunables.  Each entry binds a
 * procname to the kernel variable named in .data and lists the handler(s)
 * used to access it.  Entries handled by proc_dointvec_jiffies /
 * sysctl_jiffies (or the _ms_ variants) are, going by the handler names,
 * converted to and from jiffies -- confirm against the sysctl API; the
 * remaining entries use the plain integer handler proc_dointvec.
 */
2888 ctl_table ipv4_route_table[] = {
/* "flush": the strategy handler hands the written int to rt_cache_flush(). */
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
/* "min_delay" -> ip_rt_min_delay. */
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
/* "max_delay" -> ip_rt_max_delay. */
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
/*
 * "gc_thresh" is backed directly by ipv4_dst_ops.gc_thresh, which
 * ip_rt_init() sets to rt_hash_mask + 1.
 */
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2922 .proc_handler = &proc_dointvec,
/* "max_size": ip_rt_init() sets ip_rt_max_size to (rt_hash_mask + 1) * 16. */
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2930 .proc_handler = &proc_dointvec,
/*
 * "gc_min_interval" and "gc_min_interval_ms" both point at
 * ip_rt_gc_min_interval; they differ only in the handlers used.
 */
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
/* "gc_timeout" -> ip_rt_gc_timeout. */
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
/*
 * "gc_interval": ip_rt_init() also uses ip_rt_gc_interval when computing
 * the first expiry of rt_periodic_timer.
 */
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
/* "redirect_load", "redirect_number", "redirect_silence": plain ints. */
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2976 .proc_handler = &proc_dointvec,
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2992 .proc_handler = &proc_dointvec,
/* "error_cost" and "error_burst": plain ints. */
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3008 .proc_handler = &proc_dointvec,
/* "gc_elasticity" -> ip_rt_gc_elasticity. */
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3016 .proc_handler = &proc_dointvec,
/* "mtu_expires" -> ip_rt_mtu_expires. */
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
/* "min_pmtu" and "min_adv_mss": plain ints. */
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3033 .proc_handler = &proc_dointvec,
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3041 .proc_handler = &proc_dointvec,
/*
 * "secret_interval": ip_rt_init() also uses ip_rt_secret_interval when
 * computing the first expiry of rt_secret_timer.
 */
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3056 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * Base of the route accounting area.  ip_rt_init() allocates and zeroes
 * it; IP_RT_ACCT_CPU() offsets into it in steps of 256 entries per cpu.
 */
3057 struct ip_rt_acct *ip_rt_acct;
3059 /* This code sucks. But you should have seen it before! --RR */
/*
 * IP route accounting ptr for this logical cpu number.
 * Each cpu owns a run of 256 entries starting at ip_rt_acct.  The argument
 * is parenthesized so that an expression such as IP_RT_ACCT_CPU(a + b)
 * scales the whole index instead of only its last operand.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3064 #ifdef CONFIG_PROC_FS
/*
 * /proc read callback for the route accounting data; ip_rt_init()
 * registers it under the name "rt_acct".
 *
 * The accounting area is treated as an array of u32.  The reply is built
 * by copying cpu 0's slice into @buffer and then adding the other cpus'
 * slices on top of it.
 *
 * @buffer: destination for the data
 * @offset: byte offset into the sizeof(struct ip_rt_acct) * 256 byte area
 * @length: number of bytes requested
 * @start, @eof, @data: proc read-callback arguments
 */
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
/* Offset and length are tested for 4-byte (u32) granularity. */
3070 if ((offset & 3) || (length & 3))
/* Requests starting at or beyond the end of the area are special-cased. */
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
/* Trim the request so that it stops at the end of the area. */
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
/* From here on offset counts u32 words rather than bytes. */
3083 offset /= sizeof(u32);
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
/* length is in bytes, so length/4 is the number of u32 words. */
3099 for (j = 0; j < length/4; j++)
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
/*
 * Value parsed from the "rhash_entries=" boot parameter.
 * NOTE(review): presumably consumed when the route cache hash is sized in
 * ip_rt_init() -- confirm.
 */
3108 static __initdata unsigned long rhash_entries;
/*
 * Boot-parameter parser registered through __setup() below.
 * @str: text following "rhash_entries=" on the command line
 * Stores the number parsed by simple_strtoul() in rhash_entries; base 0
 * leaves the choice of radix to simple_strtoul().
 */
3109 static int __init set_rhash_entries(char *str)
3113 rhash_entries = simple_strtoul(str, &str, 0);
3116 __setup("rhash_entries=", set_rhash_entries);
/*
 * Initialisation of the IPv4 routing cache (marked __init).
 *
 * Visible steps, in order: seed rt_hash_rnd; allocate and zero the route
 * accounting area (CONFIG_NET_CLS_ROUTE only); create the "ip_dst_cache"
 * slab cache; allocate and zero the route hash table; derive gc_thresh and
 * ip_rt_max_size from the table size; allocate the per-cpu statistics;
 * set up the flush, periodic and secret timers; create the /proc entries.
 * Failure to allocate the accounting area or the slab cache panics.
 *
 * NOTE(review): return value semantics to be confirmed.
 */
3118 int __init ip_rt_init(void)
/* Seed for the hash, mixed from the physical page count and jiffies. */
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * Pick the page order whose size covers 256 accounting entries for each
 * of NR_CPUS cpus, then allocate and zero that many pages.
 */
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
/* Slab cache from which struct rtable objects are allocated. */
3138 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 sizeof(struct rtable),
3140 0, SLAB_HWCACHE_ALIGN,
3143 if (!ipv4_dst_ops.kmem_cachep)
3144 panic("IP: failed to allocate ip_dst_cache\n");
/* Route cache hash table; it holds rt_hash_mask + 1 buckets. */
3146 rt_hash_table = (struct rt_hash_bucket *)
3147 alloc_large_system_hash("IP route cache",
3148 sizeof(struct rt_hash_bucket),
3150 (num_physpages >= 128 * 1024) ?
3157 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3158 rt_hash_lock_init();
/* Defaults derived from the bucket count: threshold 1x, maximum 16x. */
3160 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3161 ip_rt_max_size = (rt_hash_mask + 1) * 16;
/* Per-cpu statistics; freed again below if the /proc entries fail. */
3163 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
/*
 * Three timers: flush (rt_run_flush), periodic expiry check
 * (rt_check_expire) and secret rebuild (rt_secret_rebuild).  The periodic
 * and secret timers are armed below with a random offset added to their
 * first expiry.
 */
3170 init_timer(&rt_flush_timer);
3171 rt_flush_timer.function = rt_run_flush;
3172 init_timer(&rt_periodic_timer);
3173 rt_periodic_timer.function = rt_check_expire;
3174 init_timer(&rt_secret_timer);
3175 rt_secret_timer.function = rt_secret_rebuild;
3177 /* All the timers, started at system startup tend
3178 to synchronize. Perturb it a bit.
3180 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3182 add_timer(&rt_periodic_timer);
3184 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3185 ip_rt_secret_interval;
3186 add_timer(&rt_secret_timer);
3188 #ifdef CONFIG_PROC_FS
3190 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
/*
 * Two entries named "rt_cache" are created: one served by
 * rt_cache_seq_fops, and a statistics entry whose proc_fops is set to
 * rt_cpu_seq_fops.  If either creation fails the per-cpu statistics are
 * released.
 */
3191 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3192 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3194 free_percpu(rt_cache_stat);
3197 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3199 #ifdef CONFIG_NET_CLS_ROUTE
/* Route accounting data is exposed as "rt_acct" via ip_rt_acct_read(). */
3200 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
/* Routing entry points exported with EXPORT_SYMBOL(). */
3210 EXPORT_SYMBOL(__ip_select_ident);
3211 EXPORT_SYMBOL(ip_route_input);
3212 EXPORT_SYMBOL(ip_route_output_key);