2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * though our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
108 #include <linux/sysctl.h>
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_min_delay = 2 * HZ;
119 static int ip_rt_max_delay = 10 * HZ;
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval = 60 * HZ;
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
134 static unsigned long rt_deadline;
136 #define RTprint(a...) printk(KERN_DEBUG a)
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
143 * Interface to generic destination cache.
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void ipv4_link_failure(struct sk_buff *skb);
152 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
156 static struct dst_ops ipv4_dst_ops = {
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
169 #define ECN_OR_COST(class) TC_PRIO_##class
171 __u8 ip_tos2prio[16] = {
175 ECN_OR_COST(BESTEFFORT),
181 ECN_OR_COST(INTERACTIVE),
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
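/*
 * Illustrative sketch (not part of the original file): the ip_tos2prio[]
 * table above maps the IPTOS precedence/TOS combinations to packet
 * scheduler priority bands, with ECN-marked variants sharing a band via
 * ECN_OR_COST().  A lookup along these lines turns a TOS byte into a
 * priority; the helper name here is hypothetical.
 */
static inline __u8 tos_to_prio_sketch(__u8 tos)
{
	/* Drop the low bit and index the 16-entry table. */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}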
195 /* The locking scheme is rather straightforward:
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
205 struct rt_hash_bucket {
206 struct rtable *chain;
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211 * The size of this table is a power of two and depends on the number of CPUs.
214 #define RT_HASH_LOCK_SZ 4096
216 #define RT_HASH_LOCK_SZ 2048
218 #define RT_HASH_LOCK_SZ 1024
220 #define RT_HASH_LOCK_SZ 512
222 #define RT_HASH_LOCK_SZ 256
225 static spinlock_t *rt_hash_locks;
226 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
227 # define rt_hash_lock_init() { \
229 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
230 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
231 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
232 spin_lock_init(&rt_hash_locks[i]); \
235 # define rt_hash_lock_addr(slot) NULL
236 # define rt_hash_lock_init()
239 static struct rt_hash_bucket *rt_hash_table;
240 static unsigned rt_hash_mask;
241 static int rt_hash_log;
242 static unsigned int rt_hash_rnd;
244 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
245 #define RT_CACHE_STAT_INC(field) \
246 (__raw_get_cpu_var(rt_cache_stat).field++)
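/*
 * Illustrative sketch (not part of the original file): the reader/writer
 * pattern implied by the locking scheme above.  Readers walk a chain under
 * rcu_read_lock_bh(); writers serialize on the spinlock that
 * rt_hash_lock_addr() assigns to the bucket.  The function name is
 * hypothetical.
 */
static inline void rt_hash_walk_sketch(unsigned int hash)
{
	struct rtable *rth;

	/* Reader side: RCU protects traversal of the chain. */
	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next))
		; /* inspect rth here */
	rcu_read_unlock_bh();

	/* Writer side: modify the chain only under the bucket lock. */
	spin_lock_bh(rt_hash_lock_addr(hash));
	/* ... unlink or insert entries on rt_hash_table[hash].chain ... */
	spin_unlock_bh(rt_hash_lock_addr(hash));
}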
248 static int rt_intern_hash(unsigned hash, struct rtable *rth,
249 struct rtable **res);
251 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
253 return (jhash_2words(daddr, saddr, rt_hash_rnd)
257 #ifdef CONFIG_PROC_FS
258 struct rt_cache_iter_state {
262 static struct rtable *rt_cache_get_first(struct seq_file *seq)
264 struct rtable *r = NULL;
265 struct rt_cache_iter_state *st = seq->private;
267 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
269 r = rt_hash_table[st->bucket].chain;
272 rcu_read_unlock_bh();
277 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
279 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
283 rcu_read_unlock_bh();
284 if (--st->bucket < 0)
287 r = rt_hash_table[st->bucket].chain;
292 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
294 struct rtable *r = rt_cache_get_first(seq);
297 while (pos && (r = rt_cache_get_next(seq, r)))
299 return pos ? NULL : r;
302 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
304 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
307 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
309 struct rtable *r = NULL;
311 if (v == SEQ_START_TOKEN)
312 r = rt_cache_get_first(seq);
314 r = rt_cache_get_next(seq, v);
319 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
321 if (v && v != SEQ_START_TOKEN)
322 rcu_read_unlock_bh();
325 static int rt_cache_seq_show(struct seq_file *seq, void *v)
327 if (v == SEQ_START_TOKEN)
328 seq_printf(seq, "%-127s\n",
329 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
330 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
333 struct rtable *r = v;
336 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
337 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
338 r->u.dst.dev ? r->u.dst.dev->name : "*",
339 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
340 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
341 r->u.dst.__use, 0, (unsigned long)r->rt_src,
342 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
343 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
344 dst_metric(&r->u.dst, RTAX_WINDOW),
345 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
346 dst_metric(&r->u.dst, RTAX_RTTVAR)),
348 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
349 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
352 seq_printf(seq, "%-127s\n", temp);
357 static struct seq_operations rt_cache_seq_ops = {
358 .start = rt_cache_seq_start,
359 .next = rt_cache_seq_next,
360 .stop = rt_cache_seq_stop,
361 .show = rt_cache_seq_show,
364 static int rt_cache_seq_open(struct inode *inode, struct file *file)
366 struct seq_file *seq;
368 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
372 rc = seq_open(file, &rt_cache_seq_ops);
375 seq = file->private_data;
377 memset(s, 0, sizeof(*s));
385 static struct file_operations rt_cache_seq_fops = {
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
390 .release = seq_release_private,
394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
399 return SEQ_START_TOKEN;
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
405 return &per_cpu(rt_cache_stat, cpu);
410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
418 return &per_cpu(rt_cache_stat, cpu);
424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
431 struct rt_cache_stat *st = v;
433 if (v == SEQ_START_TOKEN) {
434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
463 static struct seq_operations rt_cpu_seq_ops = {
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
473 return seq_open(file, &rt_cpu_seq_ops);
476 static struct file_operations rt_cpu_seq_fops = {
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
481 .release = seq_release,
484 #endif /* CONFIG_PROC_FS */
486 static __inline__ void rt_free(struct rtable *rt)
488 multipath_remove(rt);
489 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
492 static __inline__ void rt_drop(struct rtable *rt)
494 multipath_remove(rt);
496 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
499 static __inline__ int rt_fast_clean(struct rtable *rth)
501 /* Kill broadcast/multicast entries very aggressively, if they
502 collide in the hash table with more useful entries */
503 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
504 rth->fl.iif && rth->u.rt_next;
507 static __inline__ int rt_valuable(struct rtable *rth)
509 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
513 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
518 if (atomic_read(&rth->u.dst.__refcnt))
522 if (rth->u.dst.expires &&
523 time_after_eq(jiffies, rth->u.dst.expires))
526 age = jiffies - rth->u.dst.lastuse;
528 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
529 (age <= tmo2 && rt_valuable(rth)))
535 /* Bits of score are:
537 * 30: not quite useless
538 * 29..0: usage counter
540 static inline u32 rt_score(struct rtable *rt)
542 u32 score = jiffies - rt->u.dst.lastuse;
544 score = ~score & ~(3<<30);
550 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
556 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
558 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
559 fl1->oif == fl2->oif &&
560 fl1->iif == fl2->iif;
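/*
 * Illustrative sketch (not part of the original file): how the score bits
 * described above rt_score() combine.  rt_intern_hash() evicts the
 * unreferenced entry with the lowest score when a chain grows too long;
 * the helper below is hypothetical and only restates the idea.
 */
static inline u32 rt_score_sketch(u32 age, int valuable, int output_or_unicast)
{
	u32 score = ~age & ~(3 << 30);	/* bits 29..0: recency counter */

	if (valuable)
		score |= 1U << 31;	/* e.g. redirected or notify entries */
	if (output_or_unicast)
		score |= 1U << 30;	/* "not quite useless" */
	return score;
}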
563 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
564 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
565 struct rtable *expentry,
568 int passedexpired = 0;
569 struct rtable **nextstep = NULL;
570 struct rtable **rthp = chain_head;
576 while ((rth = *rthp) != NULL) {
580 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
581 compare_keys(&(*rthp)->fl, &expentry->fl)) {
582 if (*rthp == expentry) {
583 *rthp = rth->u.rt_next;
586 *rthp = rth->u.rt_next;
592 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
593 passedexpired && !nextstep)
594 nextstep = &rth->u.rt_next;
596 rthp = &rth->u.rt_next;
606 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
609 /* This runs via a timer and thus is always in BH context. */
610 static void rt_check_expire(unsigned long dummy)
612 static unsigned int rover;
613 unsigned int i = rover, goal;
614 struct rtable *rth, **rthp;
615 unsigned long now = jiffies;
618 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
619 if (ip_rt_gc_timeout > 1)
620 do_div(mult, ip_rt_gc_timeout);
621 goal = (unsigned int)mult;
622 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
623 for (; goal > 0; goal--) {
624 unsigned long tmo = ip_rt_gc_timeout;
626 i = (i + 1) & rt_hash_mask;
627 rthp = &rt_hash_table[i].chain;
631 spin_lock(rt_hash_lock_addr(i));
632 while ((rth = *rthp) != NULL) {
633 if (rth->u.dst.expires) {
634 /* Entry is expired even if it is in use */
635 if (time_before_eq(now, rth->u.dst.expires)) {
637 rthp = &rth->u.rt_next;
640 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
642 rthp = &rth->u.rt_next;
646 /* Clean up aged-off entries. */
647 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
648 /* remove all related balanced entries if necessary */
649 if (rth->u.dst.flags & DST_BALANCED) {
650 rthp = rt_remove_balanced_route(
651 &rt_hash_table[i].chain,
656 *rthp = rth->u.rt_next;
659 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
660 *rthp = rth->u.rt_next;
662 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
664 spin_unlock(rt_hash_lock_addr(i));
666 /* Fallback loop breaker. */
667 if (time_after(jiffies, now))
671 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
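/*
 * Illustrative sketch (not part of the original file): the per-tick scan
 * budget computed at the top of rt_check_expire() above.  Each run covers
 * roughly ip_rt_gc_interval/ip_rt_gc_timeout of the hash table, so the
 * whole table is scanned about once per ip_rt_gc_timeout.  The helper name
 * is hypothetical.
 */
static inline unsigned int rt_check_expire_goal_sketch(void)
{
	unsigned int goal;
	u64 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;

	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	return goal;
}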
674 /* This can run from both BH and non-BH contexts, the latter
675 * in the case of a forced flush event.
677 static void rt_run_flush(unsigned long dummy)
680 struct rtable *rth, *next;
684 get_random_bytes(&rt_hash_rnd, 4);
686 for (i = rt_hash_mask; i >= 0; i--) {
687 spin_lock_bh(rt_hash_lock_addr(i));
688 rth = rt_hash_table[i].chain;
690 rt_hash_table[i].chain = NULL;
691 spin_unlock_bh(rt_hash_lock_addr(i));
693 for (; rth; rth = next) {
694 next = rth->u.rt_next;
700 static DEFINE_SPINLOCK(rt_flush_lock);
702 void rt_cache_flush(int delay)
704 unsigned long now = jiffies;
705 int user_mode = !in_softirq();
708 delay = ip_rt_min_delay;
710 /* flush existing multipath state */
713 spin_lock_bh(&rt_flush_lock);
715 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
716 long tmo = (long)(rt_deadline - now);
718 /* If the flush timer is already running
719 and the flush request is not immediate (delay > 0):
721 if the deadline has not been reached, prolong the timer to "delay",
722 otherwise fire it at the deadline time.
725 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
733 spin_unlock_bh(&rt_flush_lock);
738 if (rt_deadline == 0)
739 rt_deadline = now + ip_rt_max_delay;
741 mod_timer(&rt_flush_timer, now+delay);
742 spin_unlock_bh(&rt_flush_lock);
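/*
 * Illustrative usage sketch (not part of the original file): rt_cache_flush()
 * takes a delay in jiffies.  A negative delay is replaced by ip_rt_min_delay,
 * zero asks for an immediate flush, and a positive delay is bounded by the
 * rt_deadline/ip_rt_max_delay logic above.  The wrapper name is hypothetical.
 */
static inline void rt_cache_flush_usage_sketch(void)
{
	rt_cache_flush(0);	/* flush as soon as possible */
	rt_cache_flush(-1);	/* flush after the default minimum delay */
	rt_cache_flush(2 * HZ);	/* flush roughly two seconds from now */
}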
745 static void rt_secret_rebuild(unsigned long dummy)
747 unsigned long now = jiffies;
750 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
754 Short description of GC goals.
756 We want to build an algorithm which keeps the routing cache
757 at some equilibrium point, where the number of aged-off entries
758 is kept approximately equal to the number of newly generated ones.
760 The current expiration strength is the variable "expire".
761 We try to adjust it dynamically, so that when the network
762 is idle, expire is large enough to keep enough warm entries,
763 and when load increases, it shrinks to limit the cache size.
766 static int rt_garbage_collect(void)
768 static unsigned long expire = RT_GC_TIMEOUT;
769 static unsigned long last_gc;
771 static int equilibrium;
772 struct rtable *rth, **rthp;
773 unsigned long now = jiffies;
777 * Garbage collection is pretty expensive,
778 * so do not run it too frequently.
781 RT_CACHE_STAT_INC(gc_total);
783 if (now - last_gc < ip_rt_gc_min_interval &&
784 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
785 RT_CACHE_STAT_INC(gc_ignored);
789 /* Calculate the number of entries we want to expire now. */
790 goal = atomic_read(&ipv4_dst_ops.entries) -
791 (ip_rt_gc_elasticity << rt_hash_log);
793 if (equilibrium < ipv4_dst_ops.gc_thresh)
794 equilibrium = ipv4_dst_ops.gc_thresh;
795 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
797 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
798 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
801 /* We are in a dangerous area. Try to reduce the cache really aggressively. */
804 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
805 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
808 if (now - last_gc >= ip_rt_gc_min_interval)
819 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
820 unsigned long tmo = expire;
822 k = (k + 1) & rt_hash_mask;
823 rthp = &rt_hash_table[k].chain;
824 spin_lock_bh(rt_hash_lock_addr(k));
825 while ((rth = *rthp) != NULL) {
826 if (!rt_may_expire(rth, tmo, expire)) {
828 rthp = &rth->u.rt_next;
831 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
832 /* remove all related balanced entries
835 if (rth->u.dst.flags & DST_BALANCED) {
838 rthp = rt_remove_balanced_route(
839 &rt_hash_table[k].chain,
846 *rthp = rth->u.rt_next;
850 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
851 *rthp = rth->u.rt_next;
854 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
856 spin_unlock_bh(rt_hash_lock_addr(k));
865 /* Goal is not achieved. We stop the process if:
867 - expire is reduced to zero; otherwise, expire is halved.
868 - the table is not full.
869 - we are called from interrupt context.
870 - the jiffies check is just a fallback/debug loop breaker.
871 We will not spin here for a long time in any case.
874 RT_CACHE_STAT_INC(gc_goal_miss);
880 #if RT_CACHE_DEBUG >= 2
881 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
882 atomic_read(&ipv4_dst_ops.entries), goal, i);
885 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
887 } while (!in_softirq() && time_before_eq(jiffies, now));
889 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
892 printk(KERN_WARNING "dst cache overflow\n");
893 RT_CACHE_STAT_INC(gc_dst_overflow);
897 expire += ip_rt_gc_min_interval;
898 if (expire > ip_rt_gc_timeout ||
899 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
900 expire = ip_rt_gc_timeout;
901 #if RT_CACHE_DEBUG >= 2
902 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
903 atomic_read(&ipv4_dst_ops.entries), goal, rover);
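/*
 * Illustrative sketch (not part of the original file): the feedback applied
 * to "expire" by rt_garbage_collect() above.  Missing the goal halves
 * expire so the next pass is more aggressive; a comfortable cache lets it
 * creep back up towards ip_rt_gc_timeout.  The helper name is hypothetical.
 */
static inline unsigned long rt_gc_adjust_expire_sketch(unsigned long expire,
							int goal_missed)
{
	if (goal_missed)
		return expire >> 1;

	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
	return expire;
}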
908 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
910 struct rtable *rth, **rthp;
912 struct rtable *cand, **candp;
915 int attempts = !in_softirq();
924 rthp = &rt_hash_table[hash].chain;
926 spin_lock_bh(rt_hash_lock_addr(hash));
927 while ((rth = *rthp) != NULL) {
928 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
929 if (!(rth->u.dst.flags & DST_BALANCED) &&
930 compare_keys(&rth->fl, &rt->fl)) {
932 if (compare_keys(&rth->fl, &rt->fl)) {
935 *rthp = rth->u.rt_next;
937 * Since lookup is lockfree, the deletion
938 * must be visible to another weakly ordered CPU before
939 * the insertion at the start of the hash chain.
941 rcu_assign_pointer(rth->u.rt_next,
942 rt_hash_table[hash].chain);
944 * Since lookup is lockfree, the update writes
945 * must be ordered for consistency on SMP.
947 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
950 dst_hold(&rth->u.dst);
951 rth->u.dst.lastuse = now;
952 spin_unlock_bh(rt_hash_lock_addr(hash));
959 if (!atomic_read(&rth->u.dst.__refcnt)) {
960 u32 score = rt_score(rth);
962 if (score <= min_score) {
971 rthp = &rth->u.rt_next;
975 /* ip_rt_gc_elasticity used to be the average chain length;
976 * when it is exceeded, gc becomes really aggressive.
978 * The second limit is less certain. At the moment it allows
979 * only 2 entries per bucket. We will see.
981 if (chain_length > ip_rt_gc_elasticity) {
982 *candp = cand->u.rt_next;
987 /* Try to bind the route to an ARP neighbour only if it is an output
988 route or on the unicast forwarding path.
990 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
991 int err = arp_bind_neighbour(&rt->u.dst);
993 spin_unlock_bh(rt_hash_lock_addr(hash));
995 if (err != -ENOBUFS) {
1000 /* Neighbour tables are full and nothing
1001 can be released. Try to shrink the route cache;
1002 most likely it holds some neighbour records.
1004 if (attempts-- > 0) {
1005 int saved_elasticity = ip_rt_gc_elasticity;
1006 int saved_int = ip_rt_gc_min_interval;
1007 ip_rt_gc_elasticity = 1;
1008 ip_rt_gc_min_interval = 0;
1009 rt_garbage_collect();
1010 ip_rt_gc_min_interval = saved_int;
1011 ip_rt_gc_elasticity = saved_elasticity;
1015 if (net_ratelimit())
1016 printk(KERN_WARNING "Neighbour table overflow.\n");
1022 rt->u.rt_next = rt_hash_table[hash].chain;
1023 #if RT_CACHE_DEBUG >= 2
1024 if (rt->u.rt_next) {
1026 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1027 NIPQUAD(rt->rt_dst));
1028 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1029 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1033 rt_hash_table[hash].chain = rt;
1034 spin_unlock_bh(rt_hash_lock_addr(hash));
1039 void rt_bind_peer(struct rtable *rt, int create)
1041 static DEFINE_SPINLOCK(rt_peer_lock);
1042 struct inet_peer *peer;
1044 peer = inet_getpeer(rt->rt_dst, create);
1046 spin_lock_bh(&rt_peer_lock);
1047 if (rt->peer == NULL) {
1051 spin_unlock_bh(&rt_peer_lock);
1057 * Peer allocation may fail only in serious out-of-memory conditions. However,
1058 * we can still generate some output.
1059 * Random ID selection looks a bit dangerous because we have no chance of
1060 * selecting an ID that is unique within a reasonable period of time.
1061 * But a broken packet identifier may be better than no packet at all.
1063 static void ip_select_fb_ident(struct iphdr *iph)
1065 static DEFINE_SPINLOCK(ip_fb_id_lock);
1066 static u32 ip_fallback_id;
1069 spin_lock_bh(&ip_fb_id_lock);
1070 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1071 iph->id = htons(salt & 0xFFFF);
1072 ip_fallback_id = salt;
1073 spin_unlock_bh(&ip_fb_id_lock);
1076 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1078 struct rtable *rt = (struct rtable *) dst;
1081 if (rt->peer == NULL)
1082 rt_bind_peer(rt, 1);
1084 /* If a peer is attached to the destination, it is never detached,
1085 so we do not need to grab a lock to dereference it.
1088 iph->id = htons(inet_getid(rt->peer, more));
1092 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1093 __builtin_return_address(0));
1095 ip_select_fb_ident(iph);
1098 static void rt_del(unsigned hash, struct rtable *rt)
1100 struct rtable **rthp;
1102 spin_lock_bh(rt_hash_lock_addr(hash));
1104 for (rthp = &rt_hash_table[hash].chain; *rthp;
1105 rthp = &(*rthp)->u.rt_next)
1107 *rthp = rt->u.rt_next;
1111 spin_unlock_bh(rt_hash_lock_addr(hash));
1114 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1115 u32 saddr, struct net_device *dev)
1118 struct in_device *in_dev = in_dev_get(dev);
1119 struct rtable *rth, **rthp;
1120 u32 skeys[2] = { saddr, 0 };
1121 int ikeys[2] = { dev->ifindex, 0 };
1126 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1127 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1128 goto reject_redirect;
1130 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1131 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1132 goto reject_redirect;
1133 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1134 goto reject_redirect;
1136 if (inet_addr_type(new_gw) != RTN_UNICAST)
1137 goto reject_redirect;
1140 for (i = 0; i < 2; i++) {
1141 for (k = 0; k < 2; k++) {
1142 unsigned hash = rt_hash_code(daddr,
1143 skeys[i] ^ (ikeys[k] << 5));
1145 rthp=&rt_hash_table[hash].chain;
1148 while ((rth = rcu_dereference(*rthp)) != NULL) {
1151 if (rth->fl.fl4_dst != daddr ||
1152 rth->fl.fl4_src != skeys[i] ||
1153 rth->fl.oif != ikeys[k] ||
1155 rthp = &rth->u.rt_next;
1159 if (rth->rt_dst != daddr ||
1160 rth->rt_src != saddr ||
1162 rth->rt_gateway != old_gw ||
1163 rth->u.dst.dev != dev)
1166 dst_hold(&rth->u.dst);
1169 rt = dst_alloc(&ipv4_dst_ops);
1176 /* Copy all the information. */
1178 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1179 rt->u.dst.__use = 1;
1180 atomic_set(&rt->u.dst.__refcnt, 1);
1181 rt->u.dst.child = NULL;
1183 dev_hold(rt->u.dst.dev);
1185 in_dev_hold(rt->idev);
1186 rt->u.dst.obsolete = 0;
1187 rt->u.dst.lastuse = jiffies;
1188 rt->u.dst.path = &rt->u.dst;
1189 rt->u.dst.neighbour = NULL;
1190 rt->u.dst.hh = NULL;
1191 rt->u.dst.xfrm = NULL;
1193 rt->rt_flags |= RTCF_REDIRECTED;
1195 /* Gateway is different ... */
1196 rt->rt_gateway = new_gw;
1198 /* Redirect received -> path was valid */
1199 dst_confirm(&rth->u.dst);
1202 atomic_inc(&rt->peer->refcnt);
1204 if (arp_bind_neighbour(&rt->u.dst) ||
1205 !(rt->u.dst.neighbour->nud_state &
1207 if (rt->u.dst.neighbour)
1208 neigh_event_send(rt->u.dst.neighbour, NULL);
1215 if (!rt_intern_hash(hash, rt, &rt))
1228 #ifdef CONFIG_IP_ROUTE_VERBOSE
1229 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1230 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1231 "%u.%u.%u.%u ignored.\n"
1232 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1233 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1234 NIPQUAD(saddr), NIPQUAD(daddr));
1239 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1241 struct rtable *rt = (struct rtable*)dst;
1242 struct dst_entry *ret = dst;
1245 if (dst->obsolete) {
1248 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1249 rt->u.dst.expires) {
1250 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1253 #if RT_CACHE_DEBUG >= 1
1254 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1255 "%u.%u.%u.%u/%02x dropped\n",
1256 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1267 * 1. The first ip_rt_redirect_number redirects are sent
1268 * with exponential backoff, then we stop sending them at all,
1269 * assuming that the host ignores our redirects.
1270 * 2. If we did not see packets requiring redirects
1271 * during ip_rt_redirect_silence, we assume that the host
1272 * forgot the redirected route, and we start sending redirects again.
1274 * This algorithm is much cheaper and more intelligent than dumb load limiting
1277 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1278 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1281 void ip_rt_send_redirect(struct sk_buff *skb)
1283 struct rtable *rt = (struct rtable*)skb->dst;
1284 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1289 if (!IN_DEV_TX_REDIRECTS(in_dev))
1292 /* No redirected packets during ip_rt_redirect_silence;
1293 * reset the algorithm.
1295 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1296 rt->u.dst.rate_tokens = 0;
1298 /* Too many ignored redirects; do not send anything,
1299 * just set u.dst.rate_last to the time of the last seen redirected packet.
1301 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1302 rt->u.dst.rate_last = jiffies;
1306 /* Check for load limit; set rate_last to the latest sent redirect. */
1309 if (time_after(jiffies,
1310 (rt->u.dst.rate_last +
1311 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1312 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1313 rt->u.dst.rate_last = jiffies;
1314 ++rt->u.dst.rate_tokens;
1315 #ifdef CONFIG_IP_ROUTE_VERBOSE
1316 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1317 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1319 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1320 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1321 NIPQUAD(rt->rt_src), rt->rt_iif,
1322 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
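/*
 * Illustrative sketch (not part of the original file): the exponential
 * backoff used by ip_rt_send_redirect() above.  The n-th redirect to a peer
 * is allowed only ip_rt_redirect_load << n jiffies after the previous one,
 * and once ip_rt_redirect_number redirects have been ignored we stay quiet
 * until ip_rt_redirect_silence has elapsed.  The helper name is hypothetical.
 */
static inline int rt_redirect_allowed_sketch(unsigned long rate_last,
					     unsigned long rate_tokens)
{
	if (rate_tokens >= ip_rt_redirect_number)
		return 0;
	return time_after(jiffies,
			  rate_last + (ip_rt_redirect_load << rate_tokens));
}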
1329 static int ip_error(struct sk_buff *skb)
1331 struct rtable *rt = (struct rtable*)skb->dst;
1335 switch (rt->u.dst.error) {
1340 code = ICMP_HOST_UNREACH;
1343 code = ICMP_NET_UNREACH;
1346 code = ICMP_PKT_FILTERED;
1351 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1352 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1353 rt->u.dst.rate_tokens = ip_rt_error_burst;
1354 rt->u.dst.rate_last = now;
1355 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1356 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1357 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1360 out: kfree_skb(skb);
1365 * The last two values are not from the RFC but
1366 * are needed for AMPRnet AX.25 paths.
1369 static const unsigned short mtu_plateau[] =
1370 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1372 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1376 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1377 if (old_mtu > mtu_plateau[i])
1378 return mtu_plateau[i];
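/*
 * Illustrative usage sketch (not part of the original file): guess_mtu()
 * returns the next RFC 1191 plateau value strictly below the old MTU, e.g.
 * 1500 -> 1492 and 576 -> 296.  The wrapper name is hypothetical.
 */
static inline unsigned short guess_mtu_demo(void)
{
	/* 1500 is not itself a plateau, so step down to 1492. */
	return guess_mtu(1500);
}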
1382 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1385 unsigned short old_mtu = ntohs(iph->tot_len);
1387 u32 skeys[2] = { iph->saddr, 0, };
1388 u32 daddr = iph->daddr;
1389 unsigned short est_mtu = 0;
1391 if (ipv4_config.no_pmtu_disc)
1394 for (i = 0; i < 2; i++) {
1395 unsigned hash = rt_hash_code(daddr, skeys[i]);
1398 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1399 rth = rcu_dereference(rth->u.rt_next)) {
1400 if (rth->fl.fl4_dst == daddr &&
1401 rth->fl.fl4_src == skeys[i] &&
1402 rth->rt_dst == daddr &&
1403 rth->rt_src == iph->saddr &&
1405 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1406 unsigned short mtu = new_mtu;
1408 if (new_mtu < 68 || new_mtu >= old_mtu) {
1410 /* BSD 4.2 compatibility hack :-( */
1412 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1413 old_mtu >= 68 + (iph->ihl << 2))
1414 old_mtu -= iph->ihl << 2;
1416 mtu = guess_mtu(old_mtu);
1418 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1419 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1420 dst_confirm(&rth->u.dst);
1421 if (mtu < ip_rt_min_pmtu) {
1422 mtu = ip_rt_min_pmtu;
1423 rth->u.dst.metrics[RTAX_LOCK-1] |=
1426 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1427 dst_set_expires(&rth->u.dst,
1436 return est_mtu ? : new_mtu;
1439 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1441 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1442 !(dst_metric_locked(dst, RTAX_MTU))) {
1443 if (mtu < ip_rt_min_pmtu) {
1444 mtu = ip_rt_min_pmtu;
1445 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1447 dst->metrics[RTAX_MTU-1] = mtu;
1448 dst_set_expires(dst, ip_rt_mtu_expires);
1452 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1457 static void ipv4_dst_destroy(struct dst_entry *dst)
1459 struct rtable *rt = (struct rtable *) dst;
1460 struct inet_peer *peer = rt->peer;
1461 struct in_device *idev = rt->idev;
1474 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1477 struct rtable *rt = (struct rtable *) dst;
1478 struct in_device *idev = rt->idev;
1479 if (dev != &loopback_dev && idev && idev->dev == dev) {
1480 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1481 if (loopback_idev) {
1482 rt->idev = loopback_idev;
1488 static void ipv4_link_failure(struct sk_buff *skb)
1492 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1494 rt = (struct rtable *) skb->dst;
1496 dst_set_expires(&rt->u.dst, 0);
1499 static int ip_rt_bug(struct sk_buff *skb)
1501 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1502 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1503 skb->dev ? skb->dev->name : "?");
1509 We do not cache the source address of the outgoing interface,
1510 because it is used only by the IP RR, TS and SRR options,
1511 so it is out of the fast path.
1513 BTW remember: "addr" is allowed to be unaligned
1517 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1520 struct fib_result res;
1522 if (rt->fl.iif == 0)
1524 else if (fib_lookup(&rt->fl, &res) == 0) {
1525 src = FIB_RES_PREFSRC(res);
1528 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1530 memcpy(addr, &src, 4);
1533 #ifdef CONFIG_NET_CLS_ROUTE
1534 static void set_class_tag(struct rtable *rt, u32 tag)
1536 if (!(rt->u.dst.tclassid & 0xFFFF))
1537 rt->u.dst.tclassid |= tag & 0xFFFF;
1538 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1539 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1543 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1545 struct fib_info *fi = res->fi;
1548 if (FIB_RES_GW(*res) &&
1549 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1550 rt->rt_gateway = FIB_RES_GW(*res);
1551 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1552 sizeof(rt->u.dst.metrics));
1553 if (fi->fib_mtu == 0) {
1554 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1555 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1556 rt->rt_gateway != rt->rt_dst &&
1557 rt->u.dst.dev->mtu > 576)
1558 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1560 #ifdef CONFIG_NET_CLS_ROUTE
1561 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1564 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1566 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1567 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1568 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1569 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1570 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1571 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1573 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1574 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1576 #ifdef CONFIG_NET_CLS_ROUTE
1577 #ifdef CONFIG_IP_MULTIPLE_TABLES
1578 set_class_tag(rt, fib_rules_tclass(res));
1580 set_class_tag(rt, itag);
1582 rt->rt_type = res->type;
1585 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1586 u8 tos, struct net_device *dev, int our)
1591 struct in_device *in_dev = in_dev_get(dev);
1594 /* Primary sanity checks. */
1599 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1600 skb->protocol != htons(ETH_P_IP))
1603 if (ZERONET(saddr)) {
1604 if (!LOCAL_MCAST(daddr))
1606 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1607 } else if (fib_validate_source(saddr, 0, tos, 0,
1608 dev, &spec_dst, &itag) < 0)
1611 rth = dst_alloc(&ipv4_dst_ops);
1615 rth->u.dst.output= ip_rt_bug;
1617 atomic_set(&rth->u.dst.__refcnt, 1);
1618 rth->u.dst.flags= DST_HOST;
1619 if (in_dev->cnf.no_policy)
1620 rth->u.dst.flags |= DST_NOPOLICY;
1621 rth->fl.fl4_dst = daddr;
1622 rth->rt_dst = daddr;
1623 rth->fl.fl4_tos = tos;
1624 #ifdef CONFIG_IP_ROUTE_FWMARK
1625 rth->fl.fl4_fwmark= skb->nfmark;
1627 rth->fl.fl4_src = saddr;
1628 rth->rt_src = saddr;
1629 #ifdef CONFIG_NET_CLS_ROUTE
1630 rth->u.dst.tclassid = itag;
1633 rth->fl.iif = dev->ifindex;
1634 rth->u.dst.dev = &loopback_dev;
1635 dev_hold(rth->u.dst.dev);
1636 rth->idev = in_dev_get(rth->u.dst.dev);
1638 rth->rt_gateway = daddr;
1639 rth->rt_spec_dst= spec_dst;
1640 rth->rt_type = RTN_MULTICAST;
1641 rth->rt_flags = RTCF_MULTICAST;
1643 rth->u.dst.input= ip_local_deliver;
1644 rth->rt_flags |= RTCF_LOCAL;
1647 #ifdef CONFIG_IP_MROUTE
1648 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1649 rth->u.dst.input = ip_mr_input;
1651 RT_CACHE_STAT_INC(in_slow_mc);
1654 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1655 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1667 static void ip_handle_martian_source(struct net_device *dev,
1668 struct in_device *in_dev,
1669 struct sk_buff *skb,
1673 RT_CACHE_STAT_INC(in_martian_src);
1674 #ifdef CONFIG_IP_ROUTE_VERBOSE
1675 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1677 * RFC 1812 recommendation: if the source is martian,
1678 * the only hint is the MAC header.
1680 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1681 "%u.%u.%u.%u, on dev %s\n",
1682 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1683 if (dev->hard_header_len && skb->mac.raw) {
1685 unsigned char *p = skb->mac.raw;
1686 printk(KERN_WARNING "ll header: ");
1687 for (i = 0; i < dev->hard_header_len; i++, p++) {
1689 if (i < (dev->hard_header_len - 1))
1698 static inline int __mkroute_input(struct sk_buff *skb,
1699 struct fib_result* res,
1700 struct in_device *in_dev,
1701 u32 daddr, u32 saddr, u32 tos,
1702 struct rtable **result)
1707 struct in_device *out_dev;
1711 /* get a working reference to the output device */
1712 out_dev = in_dev_get(FIB_RES_DEV(*res));
1713 if (out_dev == NULL) {
1714 if (net_ratelimit())
1715 printk(KERN_CRIT "Bug in ip_route_input" \
1716 "_slow(). Please, report\n");
1721 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1722 in_dev->dev, &spec_dst, &itag);
1724 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1732 flags |= RTCF_DIRECTSRC;
1734 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1735 (IN_DEV_SHARED_MEDIA(out_dev) ||
1736 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1737 flags |= RTCF_DOREDIRECT;
1739 if (skb->protocol != htons(ETH_P_IP)) {
1740 /* Not IP (i.e. ARP). Do not create a route if it is
1741 * invalid for proxy ARP. DNAT routes are always valid.
1743 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1750 rth = dst_alloc(&ipv4_dst_ops);
1756 atomic_set(&rth->u.dst.__refcnt, 1);
1757 rth->u.dst.flags= DST_HOST;
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1759 if (res->fi->fib_nhs > 1)
1760 rth->u.dst.flags |= DST_BALANCED;
1762 if (in_dev->cnf.no_policy)
1763 rth->u.dst.flags |= DST_NOPOLICY;
1764 if (in_dev->cnf.no_xfrm)
1765 rth->u.dst.flags |= DST_NOXFRM;
1766 rth->fl.fl4_dst = daddr;
1767 rth->rt_dst = daddr;
1768 rth->fl.fl4_tos = tos;
1769 #ifdef CONFIG_IP_ROUTE_FWMARK
1770 rth->fl.fl4_fwmark= skb->nfmark;
1772 rth->fl.fl4_src = saddr;
1773 rth->rt_src = saddr;
1774 rth->rt_gateway = daddr;
1776 rth->fl.iif = in_dev->dev->ifindex;
1777 rth->u.dst.dev = (out_dev)->dev;
1778 dev_hold(rth->u.dst.dev);
1779 rth->idev = in_dev_get(rth->u.dst.dev);
1781 rth->rt_spec_dst= spec_dst;
1783 rth->u.dst.input = ip_forward;
1784 rth->u.dst.output = ip_output;
1786 rt_set_nexthop(rth, res, itag);
1788 rth->rt_flags = flags;
1793 /* release the working reference to the output device */
1794 in_dev_put(out_dev);
1798 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1799 struct fib_result* res,
1800 const struct flowi *fl,
1801 struct in_device *in_dev,
1802 u32 daddr, u32 saddr, u32 tos)
1804 struct rtable* rth = NULL;
1808 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1809 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1810 fib_select_multipath(fl, res);
1813 /* create a routing cache entry */
1814 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1818 /* put it into the cache */
1819 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1820 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1823 static inline int ip_mkroute_input(struct sk_buff *skb,
1824 struct fib_result* res,
1825 const struct flowi *fl,
1826 struct in_device *in_dev,
1827 u32 daddr, u32 saddr, u32 tos)
1829 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1830 struct rtable* rth = NULL, *rtres;
1831 unsigned char hop, hopcount;
1836 hopcount = res->fi->fib_nhs;
1840 /* distinguish between multipath and singlepath */
1842 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1845 /* add all alternatives to the routing cache */
1846 for (hop = 0; hop < hopcount; hop++) {
1849 /* put reference to previous result */
1853 /* create a routing cache entry */
1854 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1859 /* put it into the cache */
1860 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1861 err = rt_intern_hash(hash, rth, &rtres);
1865 /* forward hop information to multipath impl. */
1866 multipath_set_nhinfo(rth,
1867 FIB_RES_NETWORK(*res),
1868 FIB_RES_NETMASK(*res),
1872 skb->dst = &rtres->u.dst;
1874 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1875 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1876 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1881 * NOTE. We drop all packets that have a local source
1882 * address, because every properly looped-back packet
1883 * must already have the correct destination attached by the output routine.
1885 * This approach solves two big problems:
1886 * 1. Non-simplex devices are handled properly.
1887 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1890 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1891 u8 tos, struct net_device *dev)
1893 struct fib_result res;
1894 struct in_device *in_dev = in_dev_get(dev);
1895 struct flowi fl = { .nl_u = { .ip4_u =
1899 .scope = RT_SCOPE_UNIVERSE,
1900 #ifdef CONFIG_IP_ROUTE_FWMARK
1901 .fwmark = skb->nfmark
1904 .iif = dev->ifindex };
1907 struct rtable * rth;
1913 /* IP on this device is disabled. */
1918 /* Check for the weirdest martians, which cannot be detected
1922 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1923 goto martian_source;
1925 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1928 /* Accept zero addresses only for limited broadcast;
1929 * I do not even know whether to fix it or not. Waiting for complaints :-)
1932 goto martian_source;
1934 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1935 goto martian_destination;
1938 * Now we are ready to route the packet.
1940 if ((err = fib_lookup(&fl, &res)) != 0) {
1941 if (!IN_DEV_FORWARD(in_dev))
1947 RT_CACHE_STAT_INC(in_slow_tot);
1949 if (res.type == RTN_BROADCAST)
1952 if (res.type == RTN_LOCAL) {
1954 result = fib_validate_source(saddr, daddr, tos,
1955 loopback_dev.ifindex,
1956 dev, &spec_dst, &itag);
1958 goto martian_source;
1960 flags |= RTCF_DIRECTSRC;
1965 if (!IN_DEV_FORWARD(in_dev))
1967 if (res.type != RTN_UNICAST)
1968 goto martian_destination;
1970 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 if (err == -ENOBUFS)
1983 if (skb->protocol != htons(ETH_P_IP))
1987 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1989 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1992 goto martian_source;
1994 flags |= RTCF_DIRECTSRC;
1996 flags |= RTCF_BROADCAST;
1997 res.type = RTN_BROADCAST;
1998 RT_CACHE_STAT_INC(in_brd);
2001 rth = dst_alloc(&ipv4_dst_ops);
2005 rth->u.dst.output= ip_rt_bug;
2007 atomic_set(&rth->u.dst.__refcnt, 1);
2008 rth->u.dst.flags= DST_HOST;
2009 if (in_dev->cnf.no_policy)
2010 rth->u.dst.flags |= DST_NOPOLICY;
2011 rth->fl.fl4_dst = daddr;
2012 rth->rt_dst = daddr;
2013 rth->fl.fl4_tos = tos;
2014 #ifdef CONFIG_IP_ROUTE_FWMARK
2015 rth->fl.fl4_fwmark= skb->nfmark;
2017 rth->fl.fl4_src = saddr;
2018 rth->rt_src = saddr;
2019 #ifdef CONFIG_NET_CLS_ROUTE
2020 rth->u.dst.tclassid = itag;
2023 rth->fl.iif = dev->ifindex;
2024 rth->u.dst.dev = &loopback_dev;
2025 dev_hold(rth->u.dst.dev);
2026 rth->idev = in_dev_get(rth->u.dst.dev);
2027 rth->rt_gateway = daddr;
2028 rth->rt_spec_dst= spec_dst;
2029 rth->u.dst.input= ip_local_deliver;
2030 rth->rt_flags = flags|RTCF_LOCAL;
2031 if (res.type == RTN_UNREACHABLE) {
2032 rth->u.dst.input= ip_error;
2033 rth->u.dst.error= -err;
2034 rth->rt_flags &= ~RTCF_LOCAL;
2036 rth->rt_type = res.type;
2037 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2038 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2042 RT_CACHE_STAT_INC(in_no_route);
2043 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2044 res.type = RTN_UNREACHABLE;
2048 * Do not cache martian addresses: they should be logged (RFC1812)
2050 martian_destination:
2051 RT_CACHE_STAT_INC(in_martian_dst);
2052 #ifdef CONFIG_IP_ROUTE_VERBOSE
2053 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2054 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2055 "%u.%u.%u.%u, dev %s\n",
2056 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2060 err = -EHOSTUNREACH;
2072 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2076 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2077 u8 tos, struct net_device *dev)
2079 struct rtable * rth;
2081 int iif = dev->ifindex;
2083 tos &= IPTOS_RT_MASK;
2084 hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2087 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088 rth = rcu_dereference(rth->u.rt_next)) {
2089 if (rth->fl.fl4_dst == daddr &&
2090 rth->fl.fl4_src == saddr &&
2091 rth->fl.iif == iif &&
2093 #ifdef CONFIG_IP_ROUTE_FWMARK
2094 rth->fl.fl4_fwmark == skb->nfmark &&
2096 rth->fl.fl4_tos == tos) {
2097 rth->u.dst.lastuse = jiffies;
2098 dst_hold(&rth->u.dst);
2100 RT_CACHE_STAT_INC(in_hit);
2102 skb->dst = (struct dst_entry*)rth;
2105 RT_CACHE_STAT_INC(in_hlist_search);
2109 /* Multicast recognition logic has been moved from the route cache to here.
2110 The problem was that too many Ethernet cards have broken/missing
2111 hardware multicast filters :-( As a result, a host on a multicast
2112 network acquires a lot of useless route cache entries, e.g. for
2113 SDR messages from all over the world. Now we try to get rid of them.
2114 Really, provided the software IP multicast filter is organized
2115 reasonably (at least, hashed), it does not result in a slowdown
2116 compared with route cache reject entries.
2117 Note that multicast routers are not affected, because
2118 a route cache entry is created eventually.
2120 if (MULTICAST(daddr)) {
2121 struct in_device *in_dev;
2124 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2125 int our = ip_check_mc(in_dev, daddr, saddr,
2126 skb->nh.iph->protocol);
2128 #ifdef CONFIG_IP_MROUTE
2129 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2133 return ip_route_input_mc(skb, daddr, saddr,
2140 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
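/*
 * Illustrative usage sketch (not part of the original file): roughly how the
 * receive path hands a packet to ip_route_input() above.  On success,
 * skb->dst points at the (possibly freshly created) cache entry.  The
 * function name is hypothetical.
 */
static inline int route_incoming_skb_sketch(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;

	return ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);
}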
2143 static inline int __mkroute_output(struct rtable **result,
2144 struct fib_result* res,
2145 const struct flowi *fl,
2146 const struct flowi *oldflp,
2147 struct net_device *dev_out,
2151 struct in_device *in_dev;
2152 u32 tos = RT_FL_TOS(oldflp);
2155 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2158 if (fl->fl4_dst == 0xFFFFFFFF)
2159 res->type = RTN_BROADCAST;
2160 else if (MULTICAST(fl->fl4_dst))
2161 res->type = RTN_MULTICAST;
2162 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2165 if (dev_out->flags & IFF_LOOPBACK)
2166 flags |= RTCF_LOCAL;
2168 /* get work reference to inet device */
2169 in_dev = in_dev_get(dev_out);
2173 if (res->type == RTN_BROADCAST) {
2174 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2176 fib_info_put(res->fi);
2179 } else if (res->type == RTN_MULTICAST) {
2180 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2181 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2183 flags &= ~RTCF_LOCAL;
2184 /* If a multicast route does not exist, use the
2185 default one, but do not gateway in this case.
2188 if (res->fi && res->prefixlen < 4) {
2189 fib_info_put(res->fi);
2195 rth = dst_alloc(&ipv4_dst_ops);
2201 atomic_set(&rth->u.dst.__refcnt, 1);
2202 rth->u.dst.flags= DST_HOST;
2203 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2205 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2206 if (res->fi->fib_nhs > 1)
2207 rth->u.dst.flags |= DST_BALANCED;
2210 if (in_dev->cnf.no_xfrm)
2211 rth->u.dst.flags |= DST_NOXFRM;
2212 if (in_dev->cnf.no_policy)
2213 rth->u.dst.flags |= DST_NOPOLICY;
2215 rth->fl.fl4_dst = oldflp->fl4_dst;
2216 rth->fl.fl4_tos = tos;
2217 rth->fl.fl4_src = oldflp->fl4_src;
2218 rth->fl.oif = oldflp->oif;
2219 #ifdef CONFIG_IP_ROUTE_FWMARK
2220 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2222 rth->rt_dst = fl->fl4_dst;
2223 rth->rt_src = fl->fl4_src;
2224 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2225 /* get references to the devices that are to be held by the routing
2227 rth->u.dst.dev = dev_out;
2229 rth->idev = in_dev_get(dev_out);
2230 rth->rt_gateway = fl->fl4_dst;
2231 rth->rt_spec_dst= fl->fl4_src;
2233 rth->u.dst.output=ip_output;
2235 RT_CACHE_STAT_INC(out_slow_tot);
2237 if (flags & RTCF_LOCAL) {
2238 rth->u.dst.input = ip_local_deliver;
2239 rth->rt_spec_dst = fl->fl4_dst;
2241 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2242 rth->rt_spec_dst = fl->fl4_src;
2243 if (flags & RTCF_LOCAL &&
2244 !(dev_out->flags & IFF_LOOPBACK)) {
2245 rth->u.dst.output = ip_mc_output;
2246 RT_CACHE_STAT_INC(out_slow_mc);
2248 #ifdef CONFIG_IP_MROUTE
2249 if (res->type == RTN_MULTICAST) {
2250 if (IN_DEV_MFORWARD(in_dev) &&
2251 !LOCAL_MCAST(oldflp->fl4_dst)) {
2252 rth->u.dst.input = ip_mr_input;
2253 rth->u.dst.output = ip_mc_output;
2259 rt_set_nexthop(rth, res, 0);
2261 rth->rt_flags = flags;
2265 /* release work reference to inet device */
2271 static inline int ip_mkroute_output_def(struct rtable **rp,
2272 struct fib_result* res,
2273 const struct flowi *fl,
2274 const struct flowi *oldflp,
2275 struct net_device *dev_out,
2278 struct rtable *rth = NULL;
2279 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2282 hash = rt_hash_code(oldflp->fl4_dst,
2283 oldflp->fl4_src ^ (oldflp->oif << 5));
2284 err = rt_intern_hash(hash, rth, rp);
2290 static inline int ip_mkroute_output(struct rtable** rp,
2291 struct fib_result* res,
2292 const struct flowi *fl,
2293 const struct flowi *oldflp,
2294 struct net_device *dev_out,
2297 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2301 struct rtable *rth = NULL;
2303 if (res->fi && res->fi->fib_nhs > 1) {
2304 unsigned char hopcount = res->fi->fib_nhs;
2306 for (hop = 0; hop < hopcount; hop++) {
2307 struct net_device *dev2nexthop;
2311 /* hold a work reference to the output device */
2312 dev2nexthop = FIB_RES_DEV(*res);
2313 dev_hold(dev2nexthop);
2315 /* put reference to previous result */
2319 err = __mkroute_output(&rth, res, fl, oldflp,
2320 dev2nexthop, flags);
2325 hash = rt_hash_code(oldflp->fl4_dst,
2327 (oldflp->oif << 5));
2328 err = rt_intern_hash(hash, rth, rp);
2330 /* forward hop information to multipath impl. */
2331 multipath_set_nhinfo(rth,
2332 FIB_RES_NETWORK(*res),
2333 FIB_RES_NETMASK(*res),
2337 /* release work reference to output device */
2338 dev_put(dev2nexthop);
2345 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2348 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2349 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2354 * Major route resolver routine.
2357 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2359 u32 tos = RT_FL_TOS(oldflp);
2360 struct flowi fl = { .nl_u = { .ip4_u =
2361 { .daddr = oldflp->fl4_dst,
2362 .saddr = oldflp->fl4_src,
2363 .tos = tos & IPTOS_RT_MASK,
2364 .scope = ((tos & RTO_ONLINK) ?
2367 #ifdef CONFIG_IP_ROUTE_FWMARK
2368 .fwmark = oldflp->fl4_fwmark
2371 .iif = loopback_dev.ifindex,
2372 .oif = oldflp->oif };
2373 struct fib_result res;
2375 struct net_device *dev_out = NULL;
2381 #ifdef CONFIG_IP_MULTIPLE_TABLES
2385 if (oldflp->fl4_src) {
2387 if (MULTICAST(oldflp->fl4_src) ||
2388 BADCLASS(oldflp->fl4_src) ||
2389 ZERONET(oldflp->fl4_src))
2392 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2393 dev_out = ip_dev_find(oldflp->fl4_src);
2394 if (dev_out == NULL)
2397 /* I removed the check for oif == dev_out->oif here.
2398 It was wrong for two reasons:
2399 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2400 assigned to multiple interfaces.
2401 2. Moreover, we are allowed to send packets with the saddr
2402 of another iface. --ANK
2405 if (oldflp->oif == 0
2406 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2407 /* Special hack: the user can direct multicasts
2408 and limited broadcast via the necessary interface
2409 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2410 This hack is not just for fun, it allows
2411 vic, vat and friends to work.
2412 They bind the socket to loopback, set the ttl to zero
2413 and expect that it will work.
2414 From the viewpoint of the routing cache they are broken,
2415 because we are not allowed to build a multicast path
2416 with a loopback source addr (look, the routing cache
2417 cannot know that the ttl is zero, so the packet
2418 will not leave this host and the route is valid).
2419 Luckily, this hack is a good workaround.
2422 fl.oif = dev_out->ifindex;
2432 dev_out = dev_get_by_index(oldflp->oif);
2434 if (dev_out == NULL)
2437 /* RACE: Check return value of inet_select_addr instead. */
2438 if (__in_dev_get_rtnl(dev_out) == NULL) {
2440 goto out; /* Wrong error code */
2443 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2445 fl.fl4_src = inet_select_addr(dev_out, 0,
2450 if (MULTICAST(oldflp->fl4_dst))
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2453 else if (!oldflp->fl4_dst)
2454 fl.fl4_src = inet_select_addr(dev_out, 0,
2460 fl.fl4_dst = fl.fl4_src;
2462 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2465 dev_out = &loopback_dev;
2467 fl.oif = loopback_dev.ifindex;
2468 res.type = RTN_LOCAL;
2469 flags |= RTCF_LOCAL;
2473 if (fib_lookup(&fl, &res)) {
2476 /* Apparently, the routing tables are wrong. Assume
2477 that the destination is on-link.
2480 Because we are allowed to send to an iface
2481 even if it has NO routes and NO assigned
2482 addresses. When oif is specified, the routing
2483 tables are looked up with only one purpose:
2484 to catch whether the destination is gatewayed, rather than
2485 direct. Moreover, if MSG_DONTROUTE is set,
2486 we send the packet, ignoring both routing tables
2487 and ifaddr state. --ANK
2490 We could do this even if oif is unknown
2491 (IPv6 likely would), but we do not.
2494 if (fl.fl4_src == 0)
2495 fl.fl4_src = inet_select_addr(dev_out, 0,
2497 res.type = RTN_UNICAST;
2507 if (res.type == RTN_LOCAL) {
2509 fl.fl4_src = fl.fl4_dst;
2512 dev_out = &loopback_dev;
2514 fl.oif = dev_out->ifindex;
2516 fib_info_put(res.fi);
2518 flags |= RTCF_LOCAL;
2522 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2523 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2524 fib_select_multipath(&fl, &res);
2527 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2528 fib_select_default(&fl, &res);
2531 fl.fl4_src = FIB_RES_PREFSRC(res);
2535 dev_out = FIB_RES_DEV(res);
2537 fl.oif = dev_out->ifindex;
2541 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2551 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2556 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2559 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2560 rth = rcu_dereference(rth->u.rt_next)) {
2561 if (rth->fl.fl4_dst == flp->fl4_dst &&
2562 rth->fl.fl4_src == flp->fl4_src &&
2564 rth->fl.oif == flp->oif &&
2565 #ifdef CONFIG_IP_ROUTE_FWMARK
2566 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2568 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2569 (IPTOS_RT_MASK | RTO_ONLINK))) {
2571 /* check for multipath routes and choose one if
2574 if (multipath_select_route(flp, rth, rp)) {
2575 dst_hold(&(*rp)->u.dst);
2576 RT_CACHE_STAT_INC(out_hit);
2577 rcu_read_unlock_bh();
2581 rth->u.dst.lastuse = jiffies;
2582 dst_hold(&rth->u.dst);
2584 RT_CACHE_STAT_INC(out_hit);
2585 rcu_read_unlock_bh();
2589 RT_CACHE_STAT_INC(out_hlist_search);
2591 rcu_read_unlock_bh();
2593 return ip_route_output_slow(rp, flp);
2596 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2598 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602 if ((err = __ip_route_output_key(rp, flp)) != 0)
2607 flp->fl4_src = (*rp)->rt_src;
2609 flp->fl4_dst = (*rp)->rt_dst;
2610 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2616 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2618 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2620 return ip_route_output_flow(rp, flp, NULL, 0);
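/*
 * Illustrative usage sketch (not part of the original file): how a typical
 * caller resolves an output route with the flow-key API above and releases
 * it when done.  The function name is hypothetical.
 */
static inline int route_to_peer_sketch(u32 daddr, u32 saddr, u8 tos)
{
	struct rtable *rt;
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr,
						 .tos = RT_TOS(tos) } } };
	int err = ip_route_output_key(&rt, &fl);

	if (err)
		return err;
	/* ... use rt->u.dst as skb->dst, read rt->rt_src, etc. ... */
	ip_rt_put(rt);
	return 0;
}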
2623 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2624 int nowait, unsigned int flags)
2626 struct rtable *rt = (struct rtable*)skb->dst;
2628 struct nlmsghdr *nlh;
2629 unsigned char *b = skb->tail;
2630 struct rta_cacheinfo ci;
2631 #ifdef CONFIG_IP_MROUTE
2632 struct rtattr *eptr;
2634 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2635 r = NLMSG_DATA(nlh);
2636 r->rtm_family = AF_INET;
2637 r->rtm_dst_len = 32;
2639 r->rtm_tos = rt->fl.fl4_tos;
2640 r->rtm_table = RT_TABLE_MAIN;
2641 r->rtm_type = rt->rt_type;
2642 r->rtm_scope = RT_SCOPE_UNIVERSE;
2643 r->rtm_protocol = RTPROT_UNSPEC;
2644 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2645 if (rt->rt_flags & RTCF_NOTIFY)
2646 r->rtm_flags |= RTM_F_NOTIFY;
2647 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2648 if (rt->fl.fl4_src) {
2649 r->rtm_src_len = 32;
2650 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2653 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2654 #ifdef CONFIG_NET_CLS_ROUTE
2655 if (rt->u.dst.tclassid)
2656 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2658 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2659 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2660 __u32 alg = rt->rt_multipath_alg;
2662 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2666 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2667 else if (rt->rt_src != rt->fl.fl4_src)
2668 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2669 if (rt->rt_dst != rt->rt_gateway)
2670 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2671 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2672 goto rtattr_failure;
2673 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2674 ci.rta_used = rt->u.dst.__use;
2675 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2676 if (rt->u.dst.expires)
2677 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2680 ci.rta_error = rt->u.dst.error;
2681 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2683 ci.rta_id = rt->peer->ip_id_count;
2684 if (rt->peer->tcp_ts_stamp) {
2685 ci.rta_ts = rt->peer->tcp_ts;
2686 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2689 #ifdef CONFIG_IP_MROUTE
2690 eptr = (struct rtattr*)skb->tail;
2692 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2694 #ifdef CONFIG_IP_MROUTE
2695 u32 dst = rt->rt_dst;
2697 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2698 ipv4_devconf.mc_forwarding) {
2699 int err = ipmr_get_route(skb, r, nowait);
2706 if (err == -EMSGSIZE)
2708 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2713 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2716 nlh->nlmsg_len = skb->tail - b;
2721 skb_trim(skb, b - skb->data);
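/*
 * rt_fill_info() above serializes one cached route as an rtnetlink message:
 * an rtmsg header followed by RTA_DST, RTA_SRC, RTA_OIF, RTA_PREFSRC,
 * RTA_GATEWAY, the metrics block and RTA_CACHEINFO (plus RTA_IIF for input
 * routes).  NLMSG_NEW()/RTA_PUT() jump to the nlmsg_failure/rtattr_failure
 * labels when the skb runs out of room, and the message is trimmed back to
 * its original tail.
 */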
2725 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2727 struct rtattr **rta = arg;
2728 struct rtmsg *rtm = NLMSG_DATA(nlh);
2729 struct rtable *rt = NULL;
2734 struct sk_buff *skb;
2736 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2740 /* Reserve room for dummy headers; this skb can pass
2741 through a good chunk of the routing engine.
2743 skb->mac.raw = skb->nh.raw = skb->data;
2745 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2746 skb->nh.iph->protocol = IPPROTO_ICMP;
2747 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2749 if (rta[RTA_SRC - 1])
2750 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 if (rta[RTA_DST - 1])
2752 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 if (rta[RTA_IIF - 1])
2754 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2757 struct net_device *dev = __dev_get_by_index(iif);
2761 skb->protocol = htons(ETH_P_IP);
2764 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2766 rt = (struct rtable*)skb->dst;
2767 if (!err && rt->u.dst.error)
2768 err = -rt->u.dst.error;
2770 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2772 .tos = rtm->rtm_tos } } };
2774 if (rta[RTA_OIF - 1])
2775 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2777 err = ip_route_output_key(&rt, &fl);
2782 skb->dst = &rt->u.dst;
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789 RTM_NEWROUTE, 0, 0);
2797 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
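/*
 * inet_rtm_getroute() above answers an RTM_GETROUTE request: with RTA_IIF it
 * replays input routing on a dummy skb via ip_route_input(), otherwise it
 * performs an output lookup with ip_route_output_key(), encodes the result
 * with rt_fill_info() and unicasts it back to the requesting pid.
 *
 * Illustrative user-space sketch, not part of the original file: building
 * such a request with standard rtnetlink macros and the glibc socket API.
 * The function name and buffer sizing are assumptions; the RTM_NEWROUTE reply
 * would be read from the same socket and walked with RTA_OK()/RTA_NEXT().
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_ask_route(__u32 dst_be)	/* destination, network order */
{
	struct {
		struct nlmsghdr	nlh;
		struct rtmsg	rtm;
		char		attrs[RTA_LENGTH(4)];
	} req;
	struct rtattr *rta;
	int fd, err;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;

	/* Append RTA_DST, exactly as rta[RTA_DST - 1] is consumed above. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(4);
	memcpy(RTA_DATA(rta), &dst_be, 4);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	err = send(fd, &req, req.nlh.nlmsg_len, 0);
	close(fd);
	return err < 0 ? -1 : 0;
}
#endif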
2807 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2814 s_idx = idx = cb->args[1];
2815 for (h = 0; h <= rt_hash_mask; h++) {
2816 if (h < s_h) continue;
2820 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 rt = rcu_dereference(rt->u.rt_next), idx++) {
2824 skb->dst = dst_clone(&rt->u.dst);
2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827 1, NLM_F_MULTI) <= 0) {
2828 dst_release(xchg(&skb->dst, NULL));
2829 rcu_read_unlock_bh();
2832 dst_release(xchg(&skb->dst, NULL));
2834 rcu_read_unlock_bh();
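/*
 * ip_rt_dump() above walks every hash chain under rcu_read_lock_bh() and
 * emits each entry with NLM_F_MULTI.  cb->args[] records the bucket and index
 * already delivered, so a dump interrupted by a full skb resumes where it
 * stopped on the next callback.
 */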
2843 void ip_rt_multicast_event(struct in_device *in_dev)
2848 #ifdef CONFIG_SYSCTL
2849 static int flush_delay;
2851 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 struct file *filp, void __user *buffer,
2853 size_t *lenp, loff_t *ppos)
2856 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 rt_cache_flush(flush_delay);
2864 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2867 void __user *oldval,
2868 size_t __user *oldlenp,
2869 void __user *newval,
2874 if (newlen != sizeof(int))
2876 if (get_user(delay, (int __user *)newval))
2878 rt_cache_flush(delay);
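/*
 * Both handlers above implement /proc/sys/net/ipv4/route/flush (and its
 * sysctl(2) counterpart): the written integer is the delay in seconds handed
 * to rt_cache_flush(), so writing 0 flushes the cache immediately; a negative
 * value is expected to make rt_cache_flush() fall back to the min_delay
 * tunable below.
 */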
2882 ctl_table ipv4_route_table[] = {
2884 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2885 .procname = "flush",
2886 .data = &flush_delay,
2887 .maxlen = sizeof(int),
2889 .proc_handler = &ipv4_sysctl_rtcache_flush,
2890 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2893 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2894 .procname = "min_delay",
2895 .data = &ip_rt_min_delay,
2896 .maxlen = sizeof(int),
2898 .proc_handler = &proc_dointvec_jiffies,
2899 .strategy = &sysctl_jiffies,
2902 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2903 .procname = "max_delay",
2904 .data = &ip_rt_max_delay,
2905 .maxlen = sizeof(int),
2907 .proc_handler = &proc_dointvec_jiffies,
2908 .strategy = &sysctl_jiffies,
2911 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2912 .procname = "gc_thresh",
2913 .data = &ipv4_dst_ops.gc_thresh,
2914 .maxlen = sizeof(int),
2916 .proc_handler = &proc_dointvec,
2919 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2920 .procname = "max_size",
2921 .data = &ip_rt_max_size,
2922 .maxlen = sizeof(int),
2924 .proc_handler = &proc_dointvec,
2927 /* Deprecated. Use gc_min_interval_ms */
2929 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 .procname = "gc_min_interval",
2931 .data = &ip_rt_gc_min_interval,
2932 .maxlen = sizeof(int),
2934 .proc_handler = &proc_dointvec_jiffies,
2935 .strategy = &sysctl_jiffies,
2938 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 .procname = "gc_min_interval_ms",
2940 .data = &ip_rt_gc_min_interval,
2941 .maxlen = sizeof(int),
2943 .proc_handler = &proc_dointvec_ms_jiffies,
2944 .strategy = &sysctl_ms_jiffies,
2947 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2948 .procname = "gc_timeout",
2949 .data = &ip_rt_gc_timeout,
2950 .maxlen = sizeof(int),
2952 .proc_handler = &proc_dointvec_jiffies,
2953 .strategy = &sysctl_jiffies,
2956 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2957 .procname = "gc_interval",
2958 .data = &ip_rt_gc_interval,
2959 .maxlen = sizeof(int),
2961 .proc_handler = &proc_dointvec_jiffies,
2962 .strategy = &sysctl_jiffies,
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 .procname = "redirect_load",
2967 .data = &ip_rt_redirect_load,
2968 .maxlen = sizeof(int),
2970 .proc_handler = &proc_dointvec,
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 .procname = "redirect_number",
2975 .data = &ip_rt_redirect_number,
2976 .maxlen = sizeof(int),
2978 .proc_handler = &proc_dointvec,
2981 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 .procname = "redirect_silence",
2983 .data = &ip_rt_redirect_silence,
2984 .maxlen = sizeof(int),
2986 .proc_handler = &proc_dointvec,
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2990 .procname = "error_cost",
2991 .data = &ip_rt_error_cost,
2992 .maxlen = sizeof(int),
2994 .proc_handler = &proc_dointvec,
2997 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2998 .procname = "error_burst",
2999 .data = &ip_rt_error_burst,
3000 .maxlen = sizeof(int),
3002 .proc_handler = &proc_dointvec,
3005 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3006 .procname = "gc_elasticity",
3007 .data = &ip_rt_gc_elasticity,
3008 .maxlen = sizeof(int),
3010 .proc_handler = &proc_dointvec,
3013 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3014 .procname = "mtu_expires",
3015 .data = &ip_rt_mtu_expires,
3016 .maxlen = sizeof(int),
3018 .proc_handler = &proc_dointvec_jiffies,
3019 .strategy = &sysctl_jiffies,
3022 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3023 .procname = "min_pmtu",
3024 .data = &ip_rt_min_pmtu,
3025 .maxlen = sizeof(int),
3027 .proc_handler = &proc_dointvec,
3030 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3031 .procname = "min_adv_mss",
3032 .data = &ip_rt_min_advmss,
3033 .maxlen = sizeof(int),
3035 .proc_handler = &proc_dointvec,
3038 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 .procname = "secret_interval",
3040 .data = &ip_rt_secret_interval,
3041 .maxlen = sizeof(int),
3043 .proc_handler = &proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies,
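/*
 * All of the entries above live under /proc/sys/net/ipv4/route/.  The most
 * commonly tuned: gc_thresh and max_size bound the route cache, the gc_*
 * knobs pace the garbage collector, redirect_* and error_* rate-limit ICMP
 * generation, min_pmtu/mtu_expires govern path-MTU clamping, and
 * secret_interval sets how often the cache hash secret is renewed.
 */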
3050 #ifdef CONFIG_NET_CLS_ROUTE
3051 struct ip_rt_acct *ip_rt_acct;
3053 /* This code sucks. But you should have seen it before! --RR */
3055 /* IP route accounting ptr for this logical cpu number. */
3056 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3058 #ifdef CONFIG_PROC_FS
3059 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 int length, int *eof, void *data)
3064 if ((offset & 3) || (length & 3))
3067 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3072 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 length = sizeof(struct ip_rt_acct) * 256 - offset;
3077 offset /= sizeof(u32);
3080 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 u32 *dst = (u32 *) buffer;
3083 /* Copy first cpu. */
3085 memcpy(dst, src, length);
3087 /* Add the other cpus in, one int at a time */
3088 for_each_possible_cpu(i) {
3091 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3093 for (j = 0; j < length/4; j++)
3099 #endif /* CONFIG_PROC_FS */
3100 #endif /* CONFIG_NET_CLS_ROUTE */
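/*
 * ip_rt_acct_read() above folds the per-CPU accounting pages into a single
 * table for /proc/net/rt_acct: the first CPU's slice is copied wholesale and
 * every other CPU is added in one u32 at a time.  No locking is taken; the
 * counters are updated concurrently and slightly stale sums are tolerated for
 * this statistics interface.
 */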
3102 static __initdata unsigned long rhash_entries;
3103 static int __init set_rhash_entries(char *str)
3107 rhash_entries = simple_strtoul(str, &str, 0);
3110 __setup("rhash_entries=", set_rhash_entries);
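/*
 * "rhash_entries=N" on the kernel command line overrides the automatic sizing
 * of the route-cache hash table; the value is handed to
 * alloc_large_system_hash() in ip_rt_init() below.
 */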
3112 int __init ip_rt_init(void)
3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 (jiffies ^ (jiffies >> 7)));
3119 #ifdef CONFIG_NET_CLS_ROUTE
3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3125 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3127 panic("IP: failed to allocate ip_rt_acct\n");
3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 sizeof(struct rtable),
3134 0, SLAB_HWCACHE_ALIGN,
3137 if (!ipv4_dst_ops.kmem_cachep)
3138 panic("IP: failed to allocate ip_dst_cache\n");
3140 rt_hash_table = (struct rt_hash_bucket *)
3141 alloc_large_system_hash("IP route cache",
3142 sizeof(struct rt_hash_bucket),
3144 (num_physpages >= 128 * 1024) ?
3150 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3151 rt_hash_lock_init();
3153 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3154 ip_rt_max_size = (rt_hash_mask + 1) * 16;
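/*
 * gc_thresh defaults to one cached route per hash bucket and max_size to
 * sixteen times that: a machine that got 2^17 buckets starts garbage
 * collection at 131072 entries and hard-caps the cache at 2097152.
 */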
3159 init_timer(&rt_flush_timer);
3160 rt_flush_timer.function = rt_run_flush;
3161 init_timer(&rt_periodic_timer);
3162 rt_periodic_timer.function = rt_check_expire;
3163 init_timer(&rt_secret_timer);
3164 rt_secret_timer.function = rt_secret_rebuild;
3166 /* All the timers started at system startup tend
3167 to synchronize. Perturb them a bit.
3169 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 add_timer(&rt_periodic_timer);
3173 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3174 ip_rt_secret_interval;
3175 add_timer(&rt_secret_timer);
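/*
 * Three timers drive cache maintenance: rt_periodic_timer runs
 * rt_check_expire() to age out stale entries, rt_secret_timer runs
 * rt_secret_rebuild() to pick a fresh rt_hash_rnd (limiting hash-collision
 * attacks), and rt_flush_timer performs deferred flushes.  The first two are
 * armed with a random offset so timers started at boot do not fire in
 * lock-step.
 */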
3177 #ifdef CONFIG_PROC_FS
3179 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3180 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3181 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3185 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 #ifdef CONFIG_NET_CLS_ROUTE
3188 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3198 EXPORT_SYMBOL(__ip_select_ident);
3199 EXPORT_SYMBOL(ip_route_input);
3200 EXPORT_SYMBOL(ip_route_output_key);