1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
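/*
 * All of the intervals above are in jiffies, so they scale with HZ.
 * For example, ip_rt_redirect_load is HZ/50, i.e. roughly 20ms,
 * ip_rt_redirect_silence is (HZ/50) << 10, i.e. roughly 20 seconds,
 * and ip_rt_mtu_expires is 10 minutes.  ip_rt_min_pmtu is 552 bytes:
 * a 512 byte payload plus 20 bytes each of IP and TCP header.
 */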
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straight forward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries;
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
206 struct rt_hash_bucket {
207         struct rtable   *chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210         defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ        256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ       4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ       2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ       1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ       512
227 # else
228 #  define RT_HASH_LOCK_SZ       256
229 # endif
230 #endif
231
232 static spinlock_t       *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234
235 static __init void rt_hash_lock_init(void)
236 {
237         int i;
238
239         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240                         GFP_KERNEL);
241         if (!rt_hash_locks)
242                 panic("IP: failed to allocate rt_hash_locks\n");
243
244         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245                 spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
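/*
 * Note that rt_genid is used as the jhash seed above, so bumping it in
 * rt_cache_invalidate() both makes existing entries fail the genid check
 * and redistributes newly created entries across different buckets.
 */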
273
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         int bucket;
277         int genid;
278 };
279
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 rcu_read_lock_bh();
286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (r->rt_genid == st->genid)
289                                 return r;
290                         r = rcu_dereference(r->u.dst.rt_next);
291                 }
292                 rcu_read_unlock_bh();
293         }
294         return r;
295 }
296
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
298 {
299         r = r->u.dst.rt_next;
300         while (!r) {
301                 rcu_read_unlock_bh();
302                 if (--st->bucket < 0)
303                         break;
304                 rcu_read_lock_bh();
305                 r = rt_hash_table[st->bucket].chain;
306         }
307         return rcu_dereference(r);
308 }
309
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
311 {
312         struct rtable *r = rt_cache_get_first(st);
313
314         if (r)
315                 while (pos && (r = rt_cache_get_next(st, r))) {
316                         if (r->rt_genid != st->genid)
317                                 continue;
318                         --pos;
319                 }
320         return pos ? NULL : r;
321 }
322
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326
327         if (*pos)
328                 return rt_cache_get_idx(st, *pos - 1);
329         st->genid = atomic_read(&rt_genid);
330         return SEQ_START_TOKEN;
331 }
332
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
334 {
335         struct rtable *r;
336         struct rt_cache_iter_state *st = seq->private;
337
338         if (v == SEQ_START_TOKEN)
339                 r = rt_cache_get_first(st);
340         else
341                 r = rt_cache_get_next(st, v);
342         ++*pos;
343         return r;
344 }
345
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
347 {
348         if (v && v != SEQ_START_TOKEN)
349                 rcu_read_unlock_bh();
350 }
351
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
353 {
354         if (v == SEQ_START_TOKEN)
355                 seq_printf(seq, "%-127s\n",
356                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358                            "HHUptod\tSpecDst");
359         else {
360                 struct rtable *r = v;
361                 char temp[256];
362
363                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365                         r->u.dst.dev ? r->u.dst.dev->name : "*",
366                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
369                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371                         dst_metric(&r->u.dst, RTAX_WINDOW),
372                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
374                         r->fl.fl4_tos,
375                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377                                        dev_queue_xmit) : 0,
378                         r->rt_spec_dst);
379                 seq_printf(seq, "%-127s\n", temp);
380         }
381         return 0;
382 }
383
384 static const struct seq_operations rt_cache_seq_ops = {
385         .start  = rt_cache_seq_start,
386         .next   = rt_cache_seq_next,
387         .stop   = rt_cache_seq_stop,
388         .show   = rt_cache_seq_show,
389 };
390
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
392 {
393         return seq_open_private(file, &rt_cache_seq_ops,
394                         sizeof(struct rt_cache_iter_state));
395 }
396
397 static const struct file_operations rt_cache_seq_fops = {
398         .owner   = THIS_MODULE,
399         .open    = rt_cache_seq_open,
400         .read    = seq_read,
401         .llseek  = seq_lseek,
402         .release = seq_release_private,
403 };
404
405
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 {
408         int cpu;
409
410         if (*pos == 0)
411                 return SEQ_START_TOKEN;
412
413         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return &per_cpu(rt_cache_stat, cpu);
418         }
419         return NULL;
420 }
421
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 {
424         int cpu;
425
426         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427                 if (!cpu_possible(cpu))
428                         continue;
429                 *pos = cpu+1;
430                 return &per_cpu(rt_cache_stat, cpu);
431         }
432         return NULL;
433
434 }
435
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
437 {
438
439 }
440
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 {
443         struct rt_cache_stat *st = v;
444
445         if (v == SEQ_START_TOKEN) {
446                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447                 return 0;
448         }
449
450         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
451                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452                    atomic_read(&ipv4_dst_ops.entries),
453                    st->in_hit,
454                    st->in_slow_tot,
455                    st->in_slow_mc,
456                    st->in_no_route,
457                    st->in_brd,
458                    st->in_martian_dst,
459                    st->in_martian_src,
460
461                    st->out_hit,
462                    st->out_slow_tot,
463                    st->out_slow_mc,
464
465                    st->gc_total,
466                    st->gc_ignored,
467                    st->gc_goal_miss,
468                    st->gc_dst_overflow,
469                    st->in_hlist_search,
470                    st->out_hlist_search
471                 );
472         return 0;
473 }
474
475 static const struct seq_operations rt_cpu_seq_ops = {
476         .start  = rt_cpu_seq_start,
477         .next   = rt_cpu_seq_next,
478         .stop   = rt_cpu_seq_stop,
479         .show   = rt_cpu_seq_show,
480 };
481
482
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 {
485         return seq_open(file, &rt_cpu_seq_ops);
486 }
487
488 static const struct file_operations rt_cpu_seq_fops = {
489         .owner   = THIS_MODULE,
490         .open    = rt_cpu_seq_open,
491         .read    = seq_read,
492         .llseek  = seq_lseek,
493         .release = seq_release,
494 };
495
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498                            int length, int *eof, void *data)
499 {
500         unsigned int i;
501
502         if ((offset & 3) || (length & 3))
503                 return -EIO;
504
505         if (offset >= sizeof(struct ip_rt_acct) * 256) {
506                 *eof = 1;
507                 return 0;
508         }
509
510         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511                 length = sizeof(struct ip_rt_acct) * 256 - offset;
512                 *eof = 1;
513         }
514
515         offset /= sizeof(u32);
516
517         if (length > 0) {
518                 u32 *dst = (u32 *) buffer;
519
520                 *start = buffer;
521                 memset(dst, 0, length);
522
523                 for_each_possible_cpu(i) {
524                         unsigned int j;
525                         u32 *src;
526
527                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528                         for (j = 0; j < length/4; j++)
529                                 dst[j] += src[j];
530                 }
531         }
532         return length;
533 }
534 #endif
535
536 static __init int ip_rt_proc_init(struct net *net)
537 {
538         struct proc_dir_entry *pde;
539
540         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541                         &rt_cache_seq_fops);
542         if (!pde)
543                 goto err1;
544
545         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
546         if (!pde)
547                 goto err2;
548
549         pde->proc_fops = &rt_cpu_seq_fops;
550
551 #ifdef CONFIG_NET_CLS_ROUTE
552         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
553                         ip_rt_acct_read, NULL);
554         if (!pde)
555                 goto err3;
556 #endif
557         return 0;
558
559 #ifdef CONFIG_NET_CLS_ROUTE
560 err3:
561         remove_proc_entry("rt_cache", net->proc_net_stat);
562 #endif
563 err2:
564         remove_proc_entry("rt_cache", net->proc_net);
565 err1:
566         return -ENOMEM;
567 }
568 #else
569 static inline int ip_rt_proc_init(struct net *net)
570 {
571         return 0;
572 }
573 #endif /* CONFIG_PROC_FS */
574
575 static __inline__ void rt_free(struct rtable *rt)
576 {
577         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
578 }
579
580 static __inline__ void rt_drop(struct rtable *rt)
581 {
582         ip_rt_put(rt);
583         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
584 }
585
586 static __inline__ int rt_fast_clean(struct rtable *rth)
587 {
588         /* Kill broadcast/multicast entries very aggressively if they
589            collide in the hash table with more useful entries */
590         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
591                 rth->fl.iif && rth->u.dst.rt_next;
592 }
593
594 static __inline__ int rt_valuable(struct rtable *rth)
595 {
596         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
597                 rth->u.dst.expires;
598 }
599
600 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
601 {
602         unsigned long age;
603         int ret = 0;
604
605         if (atomic_read(&rth->u.dst.__refcnt))
606                 goto out;
607
608         ret = 1;
609         if (rth->u.dst.expires &&
610             time_after_eq(jiffies, rth->u.dst.expires))
611                 goto out;
612
613         age = jiffies - rth->u.dst.lastuse;
614         ret = 0;
615         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
616             (age <= tmo2 && rt_valuable(rth)))
617                 goto out;
618         ret = 1;
619 out:    return ret;
620 }
621
622 /* Bits of score are:
623  * 31: very valuable
624  * 30: not quite useless
625  * 29..0: usage counter
626  */
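/*
 * A rough example: an unreferenced entry that was used one second ago
 * scores higher than one idle for a minute, a "valuable" entry (bit 31)
 * always beats a non-valuable one, and rt_intern_hash() evicts the
 * lowest-scoring candidate when a chain grows too long.
 */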
627 static inline u32 rt_score(struct rtable *rt)
628 {
629         u32 score = jiffies - rt->u.dst.lastuse;
630
631         score = ~score & ~(3<<30);
632
633         if (rt_valuable(rt))
634                 score |= (1<<31);
635
636         if (!rt->fl.iif ||
637             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
638                 score |= (1<<30);
639
640         return score;
641 }
642
643 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
644 {
645         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
646                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
647                 (fl1->mark ^ fl2->mark) |
648                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
649                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
650                 (fl1->oif ^ fl2->oif) |
651                 (fl1->iif ^ fl2->iif)) == 0;
652 }
653
654 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
655 {
656         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
657 }
658
659 /*
660  * Perform a full scan of the hash table and free all entries.
661  * Can be called from a softirq or from process context.
662  * In the latter case, we reschedule if necessary.
663  */
664 static void rt_do_flush(int process_context)
665 {
666         unsigned int i;
667         struct rtable *rth, *next;
668
669         for (i = 0; i <= rt_hash_mask; i++) {
670                 if (process_context && need_resched())
671                         cond_resched();
672                 rth = rt_hash_table[i].chain;
673                 if (!rth)
674                         continue;
675
676                 spin_lock_bh(rt_hash_lock_addr(i));
677                 rth = rt_hash_table[i].chain;
678                 rt_hash_table[i].chain = NULL;
679                 spin_unlock_bh(rt_hash_lock_addr(i));
680
681                 for (; rth; rth = next) {
682                         next = rth->u.dst.rt_next;
683                         rt_free(rth);
684                 }
685         }
686 }
687
688 static void rt_check_expire(void)
689 {
690         static unsigned int rover;
691         unsigned int i = rover, goal;
692         struct rtable *rth, **rthp;
693         u64 mult;
694
695         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
696         if (ip_rt_gc_timeout > 1)
697                 do_div(mult, ip_rt_gc_timeout);
698         goal = (unsigned int)mult;
699         if (goal > rt_hash_mask)
700                 goal = rt_hash_mask + 1;
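        /*
         * "goal" is the number of buckets to scan on this pass: with the
         * defaults (gc_interval = 60s, gc_timeout = 300s) each run covers
         * about a fifth of the table, so the whole hash is swept roughly
         * once per ip_rt_gc_timeout.
         */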
701         for (; goal > 0; goal--) {
702                 unsigned long tmo = ip_rt_gc_timeout;
703
704                 i = (i + 1) & rt_hash_mask;
705                 rthp = &rt_hash_table[i].chain;
706
707                 if (need_resched())
708                         cond_resched();
709
710                 if (*rthp == NULL)
711                         continue;
712                 spin_lock_bh(rt_hash_lock_addr(i));
713                 while ((rth = *rthp) != NULL) {
714                         if (rth->rt_genid != atomic_read(&rt_genid)) {
715                                 *rthp = rth->u.dst.rt_next;
716                                 rt_free(rth);
717                                 continue;
718                         }
719                         if (rth->u.dst.expires) {
720                                 /* Entry is expired even if it is in use */
721                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
722                                         tmo >>= 1;
723                                         rthp = &rth->u.dst.rt_next;
724                                         continue;
725                                 }
726                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
727                                 tmo >>= 1;
728                                 rthp = &rth->u.dst.rt_next;
729                                 continue;
730                         }
731
732                         /* Clean up aged-off entries. */
733                         *rthp = rth->u.dst.rt_next;
734                         rt_free(rth);
735                 }
736                 spin_unlock_bh(rt_hash_lock_addr(i));
737         }
738         rover = i;
739 }
740
741 /*
742  * rt_worker_func() is run in process context.
743  * We call rt_check_expire() to scan part of the hash table.
744  */
745 static void rt_worker_func(struct work_struct *work)
746 {
747         rt_check_expire();
748         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
749 }
750
751 /*
752  * Perturbation of rt_genid by a small quantity [1..256].
753  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
754  * many times (2^24) without reusing a recent rt_genid value.
755  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
756  */
757 static void rt_cache_invalidate(void)
758 {
759         unsigned char shuffle;
760
761         get_random_bytes(&shuffle, sizeof(shuffle));
762         atomic_add(shuffle + 1U, &rt_genid);
763 }
764
765 /*
766  * delay < 0  : invalidate cache (fast : entries will be deleted later)
767  * delay >= 0 : invalidate & flush cache (can be long)
768  */
769 void rt_cache_flush(int delay)
770 {
771         rt_cache_invalidate();
772         if (delay >= 0)
773                 rt_do_flush(!in_softirq());
774 }
775
776 /*
777  * We change rt_genid and let gc do the cleanup
778  */
779 static void rt_secret_rebuild(unsigned long dummy)
780 {
781         rt_cache_invalidate();
782         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
783 }
784
785 /*
786    Short description of GC goals.
787
788    We want to build an algorithm which keeps the routing cache
789    at some equilibrium point, where the number of aged-off entries
790    is kept approximately equal to the number of newly generated ones.
791
792    The current expiration strength is the variable "expire".
793    We try to adjust it dynamically, so that when the network
794    is idle "expire" is large enough to keep enough warm entries,
795    and when load increases it is reduced to limit the cache size.
796  */
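/*
 * As a worked example with the default ip_rt_gc_elasticity of 8 and a
 * hash table of 2^rt_hash_log buckets: rt_garbage_collect() first computes
 * goal = entries - (8 << rt_hash_log), so it only gets an aggressive goal
 * once the cache averages more than 8 entries per bucket; below that the
 * goal is measured against "equilibrium" (at least gc_thresh) instead.
 */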
797
798 static int rt_garbage_collect(struct dst_ops *ops)
799 {
800         static unsigned long expire = RT_GC_TIMEOUT;
801         static unsigned long last_gc;
802         static int rover;
803         static int equilibrium;
804         struct rtable *rth, **rthp;
805         unsigned long now = jiffies;
806         int goal;
807
808         /*
809          * Garbage collection is pretty expensive,
810          * so do not run it too frequently.
811          */
812
813         RT_CACHE_STAT_INC(gc_total);
814
815         if (now - last_gc < ip_rt_gc_min_interval &&
816             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
817                 RT_CACHE_STAT_INC(gc_ignored);
818                 goto out;
819         }
820
821         /* Calculate the number of entries which we want to expire now. */
822         goal = atomic_read(&ipv4_dst_ops.entries) -
823                 (ip_rt_gc_elasticity << rt_hash_log);
824         if (goal <= 0) {
825                 if (equilibrium < ipv4_dst_ops.gc_thresh)
826                         equilibrium = ipv4_dst_ops.gc_thresh;
827                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
828                 if (goal > 0) {
829                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
830                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
831                 }
832         } else {
833                 /* We are in a dangerous area. Try to reduce the cache really
834                  * aggressively.
835                  */
836                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
837                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
838         }
839
840         if (now - last_gc >= ip_rt_gc_min_interval)
841                 last_gc = now;
842
843         if (goal <= 0) {
844                 equilibrium += goal;
845                 goto work_done;
846         }
847
848         do {
849                 int i, k;
850
851                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
852                         unsigned long tmo = expire;
853
854                         k = (k + 1) & rt_hash_mask;
855                         rthp = &rt_hash_table[k].chain;
856                         spin_lock_bh(rt_hash_lock_addr(k));
857                         while ((rth = *rthp) != NULL) {
858                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
859                                         !rt_may_expire(rth, tmo, expire)) {
860                                         tmo >>= 1;
861                                         rthp = &rth->u.dst.rt_next;
862                                         continue;
863                                 }
864                                 *rthp = rth->u.dst.rt_next;
865                                 rt_free(rth);
866                                 goal--;
867                         }
868                         spin_unlock_bh(rt_hash_lock_addr(k));
869                         if (goal <= 0)
870                                 break;
871                 }
872                 rover = k;
873
874                 if (goal <= 0)
875                         goto work_done;
876
877                 /* The goal is not achieved. We stop the process if:
878
879                    - expire has been reduced to zero (otherwise, expire is halved).
880                    - the table is not full.
881                    - we are called from softirq context.
882                    - the jiffies check is just a fallback/debug loop breaker;
883                      we will not spin here for a long time in any case.
884                  */
885
886                 RT_CACHE_STAT_INC(gc_goal_miss);
887
888                 if (expire == 0)
889                         break;
890
891                 expire >>= 1;
892 #if RT_CACHE_DEBUG >= 2
893                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
894                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
895 #endif
896
897                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
898                         goto out;
899         } while (!in_softirq() && time_before_eq(jiffies, now));
900
901         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
902                 goto out;
903         if (net_ratelimit())
904                 printk(KERN_WARNING "dst cache overflow\n");
905         RT_CACHE_STAT_INC(gc_dst_overflow);
906         return 1;
907
908 work_done:
909         expire += ip_rt_gc_min_interval;
910         if (expire > ip_rt_gc_timeout ||
911             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
912                 expire = ip_rt_gc_timeout;
913 #if RT_CACHE_DEBUG >= 2
914         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
915                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
916 #endif
917 out:    return 0;
918 }
919
920 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
921 {
922         struct rtable   *rth, **rthp;
923         unsigned long   now;
924         struct rtable *cand, **candp;
925         u32             min_score;
926         int             chain_length;
927         int attempts = !in_softirq();
928
929 restart:
930         chain_length = 0;
931         min_score = ~(u32)0;
932         cand = NULL;
933         candp = NULL;
934         now = jiffies;
935
936         rthp = &rt_hash_table[hash].chain;
937
938         spin_lock_bh(rt_hash_lock_addr(hash));
939         while ((rth = *rthp) != NULL) {
940                 if (rth->rt_genid != atomic_read(&rt_genid)) {
941                         *rthp = rth->u.dst.rt_next;
942                         rt_free(rth);
943                         continue;
944                 }
945                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
946                         /* Put it first */
947                         *rthp = rth->u.dst.rt_next;
948                         /*
949                          * Since lookup is lockfree, the deletion
950                          * must be visible to another weakly ordered CPU before
951                          * the insertion at the start of the hash chain.
952                          */
953                         rcu_assign_pointer(rth->u.dst.rt_next,
954                                            rt_hash_table[hash].chain);
955                         /*
956                          * Since lookup is lockfree, the update writes
957                          * must be ordered for consistency on SMP.
958                          */
959                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
960
961                         dst_use(&rth->u.dst, now);
962                         spin_unlock_bh(rt_hash_lock_addr(hash));
963
964                         rt_drop(rt);
965                         *rp = rth;
966                         return 0;
967                 }
968
969                 if (!atomic_read(&rth->u.dst.__refcnt)) {
970                         u32 score = rt_score(rth);
971
972                         if (score <= min_score) {
973                                 cand = rth;
974                                 candp = rthp;
975                                 min_score = score;
976                         }
977                 }
978
979                 chain_length++;
980
981                 rthp = &rth->u.dst.rt_next;
982         }
983
984         if (cand) {
985                 /* ip_rt_gc_elasticity used to be the average chain length;
986                  * when it is exceeded, gc becomes really aggressive.
987                  *
988                  * The second limit is less certain. At the moment it allows
989                  * only 2 entries per bucket. We will see.
990                  */
991                 if (chain_length > ip_rt_gc_elasticity) {
992                         *candp = cand->u.dst.rt_next;
993                         rt_free(cand);
994                 }
995         }
996
997         /* Try to bind the route to an arp entry only if it is an output
998            route or on the unicast forwarding path.
999          */
1000         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1001                 int err = arp_bind_neighbour(&rt->u.dst);
1002                 if (err) {
1003                         spin_unlock_bh(rt_hash_lock_addr(hash));
1004
1005                         if (err != -ENOBUFS) {
1006                                 rt_drop(rt);
1007                                 return err;
1008                         }
1009
1010                         /* Neighbour tables are full and nothing
1011                            can be released. Try to shrink the route cache,
1012                            as it most likely holds some neighbour records.
1013                          */
1014                         if (attempts-- > 0) {
1015                                 int saved_elasticity = ip_rt_gc_elasticity;
1016                                 int saved_int = ip_rt_gc_min_interval;
1017                                 ip_rt_gc_elasticity     = 1;
1018                                 ip_rt_gc_min_interval   = 0;
1019                                 rt_garbage_collect(&ipv4_dst_ops);
1020                                 ip_rt_gc_min_interval   = saved_int;
1021                                 ip_rt_gc_elasticity     = saved_elasticity;
1022                                 goto restart;
1023                         }
1024
1025                         if (net_ratelimit())
1026                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1027                         rt_drop(rt);
1028                         return -ENOBUFS;
1029                 }
1030         }
1031
1032         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1033 #if RT_CACHE_DEBUG >= 2
1034         if (rt->u.dst.rt_next) {
1035                 struct rtable *trt;
1036                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1037                        NIPQUAD(rt->rt_dst));
1038                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1039                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1040                 printk("\n");
1041         }
1042 #endif
1043         rt_hash_table[hash].chain = rt;
1044         spin_unlock_bh(rt_hash_lock_addr(hash));
1045         *rp = rt;
1046         return 0;
1047 }
1048
1049 void rt_bind_peer(struct rtable *rt, int create)
1050 {
1051         static DEFINE_SPINLOCK(rt_peer_lock);
1052         struct inet_peer *peer;
1053
1054         peer = inet_getpeer(rt->rt_dst, create);
1055
1056         spin_lock_bh(&rt_peer_lock);
1057         if (rt->peer == NULL) {
1058                 rt->peer = peer;
1059                 peer = NULL;
1060         }
1061         spin_unlock_bh(&rt_peer_lock);
1062         if (peer)
1063                 inet_putpeer(peer);
1064 }
1065
1066 /*
1067  * Peer allocation may fail only in serious out-of-memory conditions.  However
1068  * we can still generate some output.
1069  * Random ID selection looks a bit dangerous because we have no chance of
1070  * selecting an ID that is unique within a reasonable period of time.
1071  * But a broken packet identifier may be better than no packet at all.
1072  */
1073 static void ip_select_fb_ident(struct iphdr *iph)
1074 {
1075         static DEFINE_SPINLOCK(ip_fb_id_lock);
1076         static u32 ip_fallback_id;
1077         u32 salt;
1078
1079         spin_lock_bh(&ip_fb_id_lock);
1080         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1081         iph->id = htons(salt & 0xFFFF);
1082         ip_fallback_id = salt;
1083         spin_unlock_bh(&ip_fb_id_lock);
1084 }
1085
1086 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1087 {
1088         struct rtable *rt = (struct rtable *) dst;
1089
1090         if (rt) {
1091                 if (rt->peer == NULL)
1092                         rt_bind_peer(rt, 1);
1093
1094                 /* If a peer is attached to the destination, it is never detached,
1095                    so we need not grab a lock to dereference it.
1096                  */
1097                 if (rt->peer) {
1098                         iph->id = htons(inet_getid(rt->peer, more));
1099                         return;
1100                 }
1101         } else
1102                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1103                        __builtin_return_address(0));
1104
1105         ip_select_fb_ident(iph);
1106 }
1107
1108 static void rt_del(unsigned hash, struct rtable *rt)
1109 {
1110         struct rtable **rthp, *aux;
1111
1112         rthp = &rt_hash_table[hash].chain;
1113         spin_lock_bh(rt_hash_lock_addr(hash));
1114         ip_rt_put(rt);
1115         while ((aux = *rthp) != NULL) {
1116                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1117                         *rthp = aux->u.dst.rt_next;
1118                         rt_free(aux);
1119                         continue;
1120                 }
1121                 rthp = &aux->u.dst.rt_next;
1122         }
1123         spin_unlock_bh(rt_hash_lock_addr(hash));
1124 }
1125
1126 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1127                     __be32 saddr, struct net_device *dev)
1128 {
1129         int i, k;
1130         struct in_device *in_dev = in_dev_get(dev);
1131         struct rtable *rth, **rthp;
1132         __be32  skeys[2] = { saddr, 0 };
1133         int  ikeys[2] = { dev->ifindex, 0 };
1134         struct netevent_redirect netevent;
1135
1136         if (!in_dev)
1137                 return;
1138
1139         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1140             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1141             || ipv4_is_zeronet(new_gw))
1142                 goto reject_redirect;
1143
1144         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1145                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1146                         goto reject_redirect;
1147                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1148                         goto reject_redirect;
1149         } else {
1150                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1151                         goto reject_redirect;
1152         }
1153
1154         for (i = 0; i < 2; i++) {
1155                 for (k = 0; k < 2; k++) {
1156                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1157
1158                         rthp=&rt_hash_table[hash].chain;
1159
1160                         rcu_read_lock();
1161                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1162                                 struct rtable *rt;
1163
1164                                 if (rth->fl.fl4_dst != daddr ||
1165                                     rth->fl.fl4_src != skeys[i] ||
1166                                     rth->fl.oif != ikeys[k] ||
1167                                     rth->fl.iif != 0 ||
1168                                     rth->rt_genid != atomic_read(&rt_genid)) {
1169                                         rthp = &rth->u.dst.rt_next;
1170                                         continue;
1171                                 }
1172
1173                                 if (rth->rt_dst != daddr ||
1174                                     rth->rt_src != saddr ||
1175                                     rth->u.dst.error ||
1176                                     rth->rt_gateway != old_gw ||
1177                                     rth->u.dst.dev != dev)
1178                                         break;
1179
1180                                 dst_hold(&rth->u.dst);
1181                                 rcu_read_unlock();
1182
1183                                 rt = dst_alloc(&ipv4_dst_ops);
1184                                 if (rt == NULL) {
1185                                         ip_rt_put(rth);
1186                                         in_dev_put(in_dev);
1187                                         return;
1188                                 }
1189
1190                                 /* Copy all the information. */
1191                                 *rt = *rth;
1192                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193                                 rt->u.dst.__use         = 1;
1194                                 atomic_set(&rt->u.dst.__refcnt, 1);
1195                                 rt->u.dst.child         = NULL;
1196                                 if (rt->u.dst.dev)
1197                                         dev_hold(rt->u.dst.dev);
1198                                 if (rt->idev)
1199                                         in_dev_hold(rt->idev);
1200                                 rt->u.dst.obsolete      = 0;
1201                                 rt->u.dst.lastuse       = jiffies;
1202                                 rt->u.dst.path          = &rt->u.dst;
1203                                 rt->u.dst.neighbour     = NULL;
1204                                 rt->u.dst.hh            = NULL;
1205                                 rt->u.dst.xfrm          = NULL;
1206                                 rt->rt_genid            = atomic_read(&rt_genid);
1207                                 rt->rt_flags            |= RTCF_REDIRECTED;
1208
1209                                 /* Gateway is different ... */
1210                                 rt->rt_gateway          = new_gw;
1211
1212                                 /* Redirect received -> path was valid */
1213                                 dst_confirm(&rth->u.dst);
1214
1215                                 if (rt->peer)
1216                                         atomic_inc(&rt->peer->refcnt);
1217
1218                                 if (arp_bind_neighbour(&rt->u.dst) ||
1219                                     !(rt->u.dst.neighbour->nud_state &
1220                                             NUD_VALID)) {
1221                                         if (rt->u.dst.neighbour)
1222                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1223                                         ip_rt_put(rth);
1224                                         rt_drop(rt);
1225                                         goto do_next;
1226                                 }
1227
1228                                 netevent.old = &rth->u.dst;
1229                                 netevent.new = &rt->u.dst;
1230                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1231                                                         &netevent);
1232
1233                                 rt_del(hash, rth);
1234                                 if (!rt_intern_hash(hash, rt, &rt))
1235                                         ip_rt_put(rt);
1236                                 goto do_next;
1237                         }
1238                         rcu_read_unlock();
1239                 do_next:
1240                         ;
1241                 }
1242         }
1243         in_dev_put(in_dev);
1244         return;
1245
1246 reject_redirect:
1247 #ifdef CONFIG_IP_ROUTE_VERBOSE
1248         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250                         "%u.%u.%u.%u ignored.\n"
1251                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253                        NIPQUAD(saddr), NIPQUAD(daddr));
1254 #endif
1255         in_dev_put(in_dev);
1256 }
1257
1258 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259 {
1260         struct rtable *rt = (struct rtable*)dst;
1261         struct dst_entry *ret = dst;
1262
1263         if (rt) {
1264                 if (dst->obsolete) {
1265                         ip_rt_put(rt);
1266                         ret = NULL;
1267                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268                            rt->u.dst.expires) {
1269                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270                                                 rt->fl.oif);
1271 #if RT_CACHE_DEBUG >= 1
1272                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1273                                           "%u.%u.%u.%u/%02x dropped\n",
1274                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275 #endif
1276                         rt_del(hash, rt);
1277                         ret = NULL;
1278                 }
1279         }
1280         return ret;
1281 }
1282
1283 /*
1284  * Algorithm:
1285  *      1. The first ip_rt_redirect_number redirects are sent
1286  *         with exponential backoff, then we stop sending them at all,
1287  *         assuming that the host ignores our redirects.
1288  *      2. If we did not see packets requiring redirects
1289  *         during ip_rt_redirect_silence, we assume that the host
1290  *         forgot the redirected route and start to send redirects again.
1291  *
1292  * This algorithm is much cheaper and more intelligent than dumb load limiting
1293  * in icmp.c.
1294  *
1295  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297  */
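/*
 * With the defaults this works out as follows: each successive redirect is
 * only sent once ip_rt_redirect_load << rate_tokens jiffies have passed
 * since the last one, so with HZ/50 (about 20ms) the gaps roughly double:
 * 40ms, 80ms, 160ms, ...  After ip_rt_redirect_number (9) redirects we go
 * silent, and only start again once ip_rt_redirect_silence (about 20s)
 * has passed without packets that would have triggered a redirect.
 */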
1298
1299 void ip_rt_send_redirect(struct sk_buff *skb)
1300 {
1301         struct rtable *rt = (struct rtable*)skb->dst;
1302         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303
1304         if (!in_dev)
1305                 return;
1306
1307         if (!IN_DEV_TX_REDIRECTS(in_dev))
1308                 goto out;
1309
1310         /* No redirected packets during ip_rt_redirect_silence;
1311          * reset the algorithm.
1312          */
1313         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314                 rt->u.dst.rate_tokens = 0;
1315
1316         /* Too many ignored redirects; do not send anything and
1317          * set u.dst.rate_last to the time of the last seen redirected packet.
1318          */
1319         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320                 rt->u.dst.rate_last = jiffies;
1321                 goto out;
1322         }
1323
1324         /* Check for load limit; set rate_last to the latest sent
1325          * redirect.
1326          */
1327         if (rt->u.dst.rate_tokens == 0 ||
1328             time_after(jiffies,
1329                        (rt->u.dst.rate_last +
1330                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332                 rt->u.dst.rate_last = jiffies;
1333                 ++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337                     net_ratelimit())
1338                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1341                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343         }
1344 out:
1345         in_dev_put(in_dev);
1346 }
1347
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350         struct rtable *rt = (struct rtable*)skb->dst;
1351         unsigned long now;
1352         int code;
1353
1354         switch (rt->u.dst.error) {
1355                 case EINVAL:
1356                 default:
1357                         goto out;
1358                 case EHOSTUNREACH:
1359                         code = ICMP_HOST_UNREACH;
1360                         break;
1361                 case ENETUNREACH:
1362                         code = ICMP_NET_UNREACH;
1363                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1364                         break;
1365                 case EACCES:
1366                         code = ICMP_PKT_FILTERED;
1367                         break;
1368         }
1369
1370         now = jiffies;
1371         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1374         rt->u.dst.rate_last = now;
1375         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1378         }
1379
1380 out:    kfree_skb(skb);
1381         return 0;
1382 }
1383
1384 /*
1385  *      The last two values are not from the RFC but
1386  *      are needed for AMPRnet AX.25 paths.
1387  */
1388
1389 static const unsigned short mtu_plateau[] =
1390 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1391
1392 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1393 {
1394         int i;
1395
1396         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1397                 if (old_mtu > mtu_plateau[i])
1398                         return mtu_plateau[i];
1399         return 68;
1400 }
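/*
 * guess_mtu() simply picks the next plateau below the old MTU, per the
 * RFC 1191 table above: e.g. an old_mtu of 1500 yields 1492, 1006 yields
 * 576, and anything at or below 128 falls through to 68, the minimum
 * IPv4 MTU.
 */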
1401
1402 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1403                                  unsigned short new_mtu)
1404 {
1405         int i;
1406         unsigned short old_mtu = ntohs(iph->tot_len);
1407         struct rtable *rth;
1408         __be32  skeys[2] = { iph->saddr, 0, };
1409         __be32  daddr = iph->daddr;
1410         unsigned short est_mtu = 0;
1411
1412         if (ipv4_config.no_pmtu_disc)
1413                 return 0;
1414
1415         for (i = 0; i < 2; i++) {
1416                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1417
1418                 rcu_read_lock();
1419                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1420                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1421                         if (rth->fl.fl4_dst == daddr &&
1422                             rth->fl.fl4_src == skeys[i] &&
1423                             rth->rt_dst  == daddr &&
1424                             rth->rt_src  == iph->saddr &&
1425                             rth->fl.iif == 0 &&
1426                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1427                             rth->u.dst.dev->nd_net == net &&
1428                             rth->rt_genid == atomic_read(&rt_genid)) {
1429                                 unsigned short mtu = new_mtu;
1430
1431                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1432
1433                                         /* BSD 4.2 compatibility hack :-( */
1434                                         if (mtu == 0 &&
1435                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1436                                             old_mtu >= 68 + (iph->ihl << 2))
1437                                                 old_mtu -= iph->ihl << 2;
1438
1439                                         mtu = guess_mtu(old_mtu);
1440                                 }
1441                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1442                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1443                                                 dst_confirm(&rth->u.dst);
1444                                                 if (mtu < ip_rt_min_pmtu) {
1445                                                         mtu = ip_rt_min_pmtu;
1446                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1447                                                                 (1 << RTAX_MTU);
1448                                                 }
1449                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1450                                                 dst_set_expires(&rth->u.dst,
1451                                                         ip_rt_mtu_expires);
1452                                         }
1453                                         est_mtu = mtu;
1454                                 }
1455                         }
1456                 }
1457                 rcu_read_unlock();
1458         }
1459         return est_mtu ? : new_mtu;
1460 }
1461
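/*
 * dst_ops update_pmtu handler: only ever lowers the cached MTU metric
 * (clamping to ip_rt_min_pmtu and locking it if necessary), arms the
 * expiry timer and notifies netevent listeners of the change.
 */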
1462 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1463 {
1464         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1465             !(dst_metric_locked(dst, RTAX_MTU))) {
1466                 if (mtu < ip_rt_min_pmtu) {
1467                         mtu = ip_rt_min_pmtu;
1468                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1469                 }
1470                 dst->metrics[RTAX_MTU-1] = mtu;
1471                 dst_set_expires(dst, ip_rt_mtu_expires);
1472                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1473         }
1474 }
1475
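/*
 * Always returns NULL: cached IPv4 routes are not revalidated in place
 * once they have been obsoleted.
 */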
1476 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1477 {
1478         return NULL;
1479 }
1480
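/*
 * Final teardown of a cache entry: drop the inet_peer and in_device
 * references it holds.
 */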
1481 static void ipv4_dst_destroy(struct dst_entry *dst)
1482 {
1483         struct rtable *rt = (struct rtable *) dst;
1484         struct inet_peer *peer = rt->peer;
1485         struct in_device *idev = rt->idev;
1486
1487         if (peer) {
1488                 rt->peer = NULL;
1489                 inet_putpeer(peer);
1490         }
1491
1492         if (idev) {
1493                 rt->idev = NULL;
1494                 in_dev_put(idev);
1495         }
1496 }
1497
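/*
 * The underlying device is going away: repoint the entry's in_device
 * reference at the loopback device of its namespace so the dst stays
 * sane until it is eventually freed.
 */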
1498 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1499                             int how)
1500 {
1501         struct rtable *rt = (struct rtable *) dst;
1502         struct in_device *idev = rt->idev;
1503         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1504                 struct in_device *loopback_idev =
1505                         in_dev_get(dev->nd_net->loopback_dev);
1506                 if (loopback_idev) {
1507                         rt->idev = loopback_idev;
1508                         in_dev_put(idev);
1509                 }
1510         }
1511 }
1512
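/*
 * Link failure handler: tell the sender the host is unreachable and
 * expire the offending cache entry immediately.
 */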
1513 static void ipv4_link_failure(struct sk_buff *skb)
1514 {
1515         struct rtable *rt;
1516
1517         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1518
1519         rt = (struct rtable *) skb->dst;
1520         if (rt)
1521                 dst_set_expires(&rt->u.dst, 0);
1522 }
1523
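/*
 * Catch-all output handler for routes that must never be used to
 * transmit (e.g. the output hook of pure input routes): log and drop.
 */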
1524 static int ip_rt_bug(struct sk_buff *skb)
1525 {
1526         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1527                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1528                 skb->dev ? skb->dev->name : "?");
1529         kfree_skb(skb);
1530         return 0;
1531 }
1532
1533 /*
1534    We do not cache the source address of the outgoing interface,
1535    because it is used only by the IP RR, TS and SRR options,
1536    so it is out of the fast path.
1537
1538    BTW remember: "addr" is allowed to be unaligned
1539    in IP options!
1540  */
1541
1542 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1543 {
1544         __be32 src;
1545         struct fib_result res;
1546
1547         if (rt->fl.iif == 0)
1548                 src = rt->rt_src;
1549         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1550                 src = FIB_RES_PREFSRC(res);
1551                 fib_res_put(&res);
1552         } else
1553                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1554                                         RT_SCOPE_UNIVERSE);
1555         memcpy(addr, &src, 4);
1556 }
1557
1558 #ifdef CONFIG_NET_CLS_ROUTE
1559 static void set_class_tag(struct rtable *rt, u32 tag)
1560 {
1561         if (!(rt->u.dst.tclassid & 0xFFFF))
1562                 rt->u.dst.tclassid |= tag & 0xFFFF;
1563         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1564                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1565 }
1566 #endif
1567
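/*
 * Fill in the nexthop-derived fields of a new cache entry from the FIB
 * result: gateway, metrics (with sane defaults for MTU, hoplimit and
 * advmss) and, when configured, the routing classifier tag.
 */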
1568 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1569 {
1570         struct fib_info *fi = res->fi;
1571
1572         if (fi) {
1573                 if (FIB_RES_GW(*res) &&
1574                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1575                         rt->rt_gateway = FIB_RES_GW(*res);
1576                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1577                        sizeof(rt->u.dst.metrics));
1578                 if (fi->fib_mtu == 0) {
1579                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1580                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1581                             rt->rt_gateway != rt->rt_dst &&
1582                             rt->u.dst.dev->mtu > 576)
1583                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1584                 }
1585 #ifdef CONFIG_NET_CLS_ROUTE
1586                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1587 #endif
1588         } else
1589                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1590
1591         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1592                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1593         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1594                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1595         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1596                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1597                                        ip_rt_min_advmss);
1598         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1599                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1600
1601 #ifdef CONFIG_NET_CLS_ROUTE
1602 #ifdef CONFIG_IP_MULTIPLE_TABLES
1603         set_class_tag(rt, fib_rules_tclass(res));
1604 #endif
1605         set_class_tag(rt, itag);
1606 #endif
1607         rt->rt_type = res->type;
1608 }
1609
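/*
 * Build and cache an input route for a multicast packet.  "our" says
 * whether the packet is addressed to a group we have joined and should
 * also be delivered locally.
 */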
1610 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1611                                 u8 tos, struct net_device *dev, int our)
1612 {
1613         unsigned hash;
1614         struct rtable *rth;
1615         __be32 spec_dst;
1616         struct in_device *in_dev = in_dev_get(dev);
1617         u32 itag = 0;
1618
1619         /* Primary sanity checks. */
1620
1621         if (in_dev == NULL)
1622                 return -EINVAL;
1623
1624         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1625             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1626                 goto e_inval;
1627
1628         if (ipv4_is_zeronet(saddr)) {
1629                 if (!ipv4_is_local_multicast(daddr))
1630                         goto e_inval;
1631                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1632         } else if (fib_validate_source(saddr, 0, tos, 0,
1633                                         dev, &spec_dst, &itag) < 0)
1634                 goto e_inval;
1635
1636         rth = dst_alloc(&ipv4_dst_ops);
1637         if (!rth)
1638                 goto e_nobufs;
1639
1640         rth->u.dst.output= ip_rt_bug;
1641
1642         atomic_set(&rth->u.dst.__refcnt, 1);
1643         rth->u.dst.flags= DST_HOST;
1644         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1645                 rth->u.dst.flags |= DST_NOPOLICY;
1646         rth->fl.fl4_dst = daddr;
1647         rth->rt_dst     = daddr;
1648         rth->fl.fl4_tos = tos;
1649         rth->fl.mark    = skb->mark;
1650         rth->fl.fl4_src = saddr;
1651         rth->rt_src     = saddr;
1652 #ifdef CONFIG_NET_CLS_ROUTE
1653         rth->u.dst.tclassid = itag;
1654 #endif
1655         rth->rt_iif     =
1656         rth->fl.iif     = dev->ifindex;
1657         rth->u.dst.dev  = init_net.loopback_dev;
1658         dev_hold(rth->u.dst.dev);
1659         rth->idev       = in_dev_get(rth->u.dst.dev);
1660         rth->fl.oif     = 0;
1661         rth->rt_gateway = daddr;
1662         rth->rt_spec_dst= spec_dst;
1663         rth->rt_genid   = atomic_read(&rt_genid);
1664         rth->rt_flags   = RTCF_MULTICAST;
1665         rth->rt_type    = RTN_MULTICAST;
1666         if (our) {
1667                 rth->u.dst.input= ip_local_deliver;
1668                 rth->rt_flags |= RTCF_LOCAL;
1669         }
1670
1671 #ifdef CONFIG_IP_MROUTE
1672         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1673                 rth->u.dst.input = ip_mr_input;
1674 #endif
1675         RT_CACHE_STAT_INC(in_slow_mc);
1676
1677         in_dev_put(in_dev);
1678         hash = rt_hash(daddr, saddr, dev->ifindex);
1679         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1680
1681 e_nobufs:
1682         in_dev_put(in_dev);
1683         return -ENOBUFS;
1684
1685 e_inval:
1686         in_dev_put(in_dev);
1687         return -EINVAL;
1688 }
1689
1690
1691 static void ip_handle_martian_source(struct net_device *dev,
1692                                      struct in_device *in_dev,
1693                                      struct sk_buff *skb,
1694                                      __be32 daddr,
1695                                      __be32 saddr)
1696 {
1697         RT_CACHE_STAT_INC(in_martian_src);
1698 #ifdef CONFIG_IP_ROUTE_VERBOSE
1699         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1700                 /*
1701                  *      RFC 1812 recommendation: if the source is martian,
1702                  *      the only hint we have is the MAC header.
1703                  */
1704                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1705                         "%u.%u.%u.%u, on dev %s\n",
1706                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1707                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1708                         int i;
1709                         const unsigned char *p = skb_mac_header(skb);
1710                         printk(KERN_WARNING "ll header: ");
1711                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1712                                 printk("%02x", *p);
1713                                 if (i < (dev->hard_header_len - 1))
1714                                         printk(":");
1715                         }
1716                         printk("\n");
1717                 }
1718         }
1719 #endif
1720 }
1721
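/*
 * Construct (but do not hash) a forwarding cache entry for an input
 * route: validate the source address, decide whether a redirect should
 * be issued and wire up the forwarding input/output handlers.
 */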
1722 static inline int __mkroute_input(struct sk_buff *skb,
1723                                   struct fib_result* res,
1724                                   struct in_device *in_dev,
1725                                   __be32 daddr, __be32 saddr, u32 tos,
1726                                   struct rtable **result)
1727 {
1728
1729         struct rtable *rth;
1730         int err;
1731         struct in_device *out_dev;
1732         unsigned flags = 0;
1733         __be32 spec_dst;
1734         u32 itag;
1735
1736         /* get a working reference to the output device */
1737         out_dev = in_dev_get(FIB_RES_DEV(*res));
1738         if (out_dev == NULL) {
1739                 if (net_ratelimit())
1740                         printk(KERN_CRIT "Bug in ip_route_input" \
1741                                "_slow(). Please, report\n");
1742                 return -EINVAL;
1743         }
1744
1745
1746         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1747                                   in_dev->dev, &spec_dst, &itag);
1748         if (err < 0) {
1749                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1750                                          saddr);
1751
1752                 err = -EINVAL;
1753                 goto cleanup;
1754         }
1755
1756         if (err)
1757                 flags |= RTCF_DIRECTSRC;
1758
1759         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1760             (IN_DEV_SHARED_MEDIA(out_dev) ||
1761              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1762                 flags |= RTCF_DOREDIRECT;
1763
1764         if (skb->protocol != htons(ETH_P_IP)) {
1765                 /* Not IP (i.e. ARP). Do not create a route if it is
1766                  * invalid for proxy ARP. DNAT routes are always valid.
1767                  */
1768                 if (out_dev == in_dev) {
1769                         err = -EINVAL;
1770                         goto cleanup;
1771                 }
1772         }
1773
1774
1775         rth = dst_alloc(&ipv4_dst_ops);
1776         if (!rth) {
1777                 err = -ENOBUFS;
1778                 goto cleanup;
1779         }
1780
1781         atomic_set(&rth->u.dst.__refcnt, 1);
1782         rth->u.dst.flags= DST_HOST;
1783         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1784                 rth->u.dst.flags |= DST_NOPOLICY;
1785         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1786                 rth->u.dst.flags |= DST_NOXFRM;
1787         rth->fl.fl4_dst = daddr;
1788         rth->rt_dst     = daddr;
1789         rth->fl.fl4_tos = tos;
1790         rth->fl.mark    = skb->mark;
1791         rth->fl.fl4_src = saddr;
1792         rth->rt_src     = saddr;
1793         rth->rt_gateway = daddr;
1794         rth->rt_iif     =
1795                 rth->fl.iif     = in_dev->dev->ifindex;
1796         rth->u.dst.dev  = (out_dev)->dev;
1797         dev_hold(rth->u.dst.dev);
1798         rth->idev       = in_dev_get(rth->u.dst.dev);
1799         rth->fl.oif     = 0;
1800         rth->rt_spec_dst= spec_dst;
1801
1802         rth->u.dst.input = ip_forward;
1803         rth->u.dst.output = ip_output;
1804         rth->rt_genid = atomic_read(&rt_genid);
1805
1806         rt_set_nexthop(rth, res, itag);
1807
1808         rth->rt_flags = flags;
1809
1810         *result = rth;
1811         err = 0;
1812  cleanup:
1813         /* release the working reference to the output device */
1814         in_dev_put(out_dev);
1815         return err;
1816 }
1817
1818 static inline int ip_mkroute_input(struct sk_buff *skb,
1819                                    struct fib_result* res,
1820                                    const struct flowi *fl,
1821                                    struct in_device *in_dev,
1822                                    __be32 daddr, __be32 saddr, u32 tos)
1823 {
1824         struct rtable* rth = NULL;
1825         int err;
1826         unsigned hash;
1827
1828 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1829         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1830                 fib_select_multipath(fl, res);
1831 #endif
1832
1833         /* create a routing cache entry */
1834         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1835         if (err)
1836                 return err;
1837
1838         /* put it into the cache */
1839         hash = rt_hash(daddr, saddr, fl->iif);
1840         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1841 }
1842
1843 /*
1844  *      NOTE. We drop all packets that have local source
1845  *      addresses, because every properly looped back packet
1846  *      must already have the correct destination attached by the output routine.
1847  *
1848  *      This approach solves two big problems:
1849  *      1. Non-simplex devices are handled properly.
1850  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1851  */
1852
1853 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1854                                u8 tos, struct net_device *dev)
1855 {
1856         struct fib_result res;
1857         struct in_device *in_dev = in_dev_get(dev);
1858         struct flowi fl = { .nl_u = { .ip4_u =
1859                                       { .daddr = daddr,
1860                                         .saddr = saddr,
1861                                         .tos = tos,
1862                                         .scope = RT_SCOPE_UNIVERSE,
1863                                       } },
1864                             .mark = skb->mark,
1865                             .iif = dev->ifindex };
1866         unsigned        flags = 0;
1867         u32             itag = 0;
1868         struct rtable * rth;
1869         unsigned        hash;
1870         __be32          spec_dst;
1871         int             err = -EINVAL;
1872         int             free_res = 0;
1873         struct net    * net = dev->nd_net;
1874
1875         /* IP on this device is disabled. */
1876
1877         if (!in_dev)
1878                 goto out;
1879
1880         /* Check for the most weird martians, which cannot be detected
1881            by fib_lookup.
1882          */
1883
1884         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1885             ipv4_is_loopback(saddr))
1886                 goto martian_source;
1887
1888         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1889                 goto brd_input;
1890
1891         /* Accept zero addresses only for limited broadcast;
1892          * I do not even know whether to fix it or not. Waiting for complaints :-)
1893          */
1894         if (ipv4_is_zeronet(saddr))
1895                 goto martian_source;
1896
1897         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1898             ipv4_is_loopback(daddr))
1899                 goto martian_destination;
1900
1901         /*
1902          *      Now we are ready to route the packet.
1903          */
1904         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1905                 if (!IN_DEV_FORWARD(in_dev))
1906                         goto e_hostunreach;
1907                 goto no_route;
1908         }
1909         free_res = 1;
1910
1911         RT_CACHE_STAT_INC(in_slow_tot);
1912
1913         if (res.type == RTN_BROADCAST)
1914                 goto brd_input;
1915
1916         if (res.type == RTN_LOCAL) {
1917                 int result;
1918                 result = fib_validate_source(saddr, daddr, tos,
1919                                              net->loopback_dev->ifindex,
1920                                              dev, &spec_dst, &itag);
1921                 if (result < 0)
1922                         goto martian_source;
1923                 if (result)
1924                         flags |= RTCF_DIRECTSRC;
1925                 spec_dst = daddr;
1926                 goto local_input;
1927         }
1928
1929         if (!IN_DEV_FORWARD(in_dev))
1930                 goto e_hostunreach;
1931         if (res.type != RTN_UNICAST)
1932                 goto martian_destination;
1933
1934         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1935 done:
1936         in_dev_put(in_dev);
1937         if (free_res)
1938                 fib_res_put(&res);
1939 out:    return err;
1940
1941 brd_input:
1942         if (skb->protocol != htons(ETH_P_IP))
1943                 goto e_inval;
1944
1945         if (ipv4_is_zeronet(saddr))
1946                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1947         else {
1948                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1949                                           &itag);
1950                 if (err < 0)
1951                         goto martian_source;
1952                 if (err)
1953                         flags |= RTCF_DIRECTSRC;
1954         }
1955         flags |= RTCF_BROADCAST;
1956         res.type = RTN_BROADCAST;
1957         RT_CACHE_STAT_INC(in_brd);
1958
1959 local_input:
1960         rth = dst_alloc(&ipv4_dst_ops);
1961         if (!rth)
1962                 goto e_nobufs;
1963
1964         rth->u.dst.output= ip_rt_bug;
1965         rth->rt_genid = atomic_read(&rt_genid);
1966
1967         atomic_set(&rth->u.dst.__refcnt, 1);
1968         rth->u.dst.flags= DST_HOST;
1969         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1970                 rth->u.dst.flags |= DST_NOPOLICY;
1971         rth->fl.fl4_dst = daddr;
1972         rth->rt_dst     = daddr;
1973         rth->fl.fl4_tos = tos;
1974         rth->fl.mark    = skb->mark;
1975         rth->fl.fl4_src = saddr;
1976         rth->rt_src     = saddr;
1977 #ifdef CONFIG_NET_CLS_ROUTE
1978         rth->u.dst.tclassid = itag;
1979 #endif
1980         rth->rt_iif     =
1981         rth->fl.iif     = dev->ifindex;
1982         rth->u.dst.dev  = net->loopback_dev;
1983         dev_hold(rth->u.dst.dev);
1984         rth->idev       = in_dev_get(rth->u.dst.dev);
1985         rth->rt_gateway = daddr;
1986         rth->rt_spec_dst= spec_dst;
1987         rth->u.dst.input= ip_local_deliver;
1988         rth->rt_flags   = flags|RTCF_LOCAL;
1989         if (res.type == RTN_UNREACHABLE) {
1990                 rth->u.dst.input= ip_error;
1991                 rth->u.dst.error= -err;
1992                 rth->rt_flags   &= ~RTCF_LOCAL;
1993         }
1994         rth->rt_type    = res.type;
1995         hash = rt_hash(daddr, saddr, fl.iif);
1996         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1997         goto done;
1998
1999 no_route:
2000         RT_CACHE_STAT_INC(in_no_route);
2001         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2002         res.type = RTN_UNREACHABLE;
2003         if (err == -ESRCH)
2004                 err = -ENETUNREACH;
2005         goto local_input;
2006
2007         /*
2008          *      Do not cache martian addresses: they should be logged (RFC1812)
2009          */
2010 martian_destination:
2011         RT_CACHE_STAT_INC(in_martian_dst);
2012 #ifdef CONFIG_IP_ROUTE_VERBOSE
2013         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2014                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2015                         "%u.%u.%u.%u, dev %s\n",
2016                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2017 #endif
2018
2019 e_hostunreach:
2020         err = -EHOSTUNREACH;
2021         goto done;
2022
2023 e_inval:
2024         err = -EINVAL;
2025         goto done;
2026
2027 e_nobufs:
2028         err = -ENOBUFS;
2029         goto done;
2030
2031 martian_source:
2032         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2033         goto e_inval;
2034 }
2035
2036 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2037                    u8 tos, struct net_device *dev)
2038 {
2039         struct rtable * rth;
2040         unsigned        hash;
2041         int iif = dev->ifindex;
2042         struct net *net;
2043
2044         net = dev->nd_net;
2045         tos &= IPTOS_RT_MASK;
2046         hash = rt_hash(daddr, saddr, iif);
2047
2048         rcu_read_lock();
2049         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2050              rth = rcu_dereference(rth->u.dst.rt_next)) {
2051                 if (rth->fl.fl4_dst == daddr &&
2052                     rth->fl.fl4_src == saddr &&
2053                     rth->fl.iif == iif &&
2054                     rth->fl.oif == 0 &&
2055                     rth->fl.mark == skb->mark &&
2056                     rth->fl.fl4_tos == tos &&
2057                     rth->u.dst.dev->nd_net == net &&
2058                     rth->rt_genid == atomic_read(&rt_genid)) {
2059                         dst_use(&rth->u.dst, jiffies);
2060                         RT_CACHE_STAT_INC(in_hit);
2061                         rcu_read_unlock();
2062                         skb->dst = (struct dst_entry*)rth;
2063                         return 0;
2064                 }
2065                 RT_CACHE_STAT_INC(in_hlist_search);
2066         }
2067         rcu_read_unlock();
2068
2069         /* Multicast recognition logic is moved from the route cache to here.
2070            The problem was that too many Ethernet cards have broken/missing
2071            hardware multicast filters :-( As a result, a host on a multicast
2072            network acquires a lot of useless route cache entries, e.g. from
2073            SDR messages from all over the world. Now we try to get rid of them.
2074            Really, provided the software IP multicast filter is organized
2075            reasonably (at least, hashed), it does not result in a slowdown
2076            compared with route cache reject entries.
2077            Note that multicast routers are not affected, because a
2078            route cache entry is created eventually.
2079          */
2080         if (ipv4_is_multicast(daddr)) {
2081                 struct in_device *in_dev;
2082
2083                 rcu_read_lock();
2084                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2085                         int our = ip_check_mc(in_dev, daddr, saddr,
2086                                 ip_hdr(skb)->protocol);
2087                         if (our
2088 #ifdef CONFIG_IP_MROUTE
2089                             || (!ipv4_is_local_multicast(daddr) &&
2090                                 IN_DEV_MFORWARD(in_dev))
2091 #endif
2092                             ) {
2093                                 rcu_read_unlock();
2094                                 return ip_route_input_mc(skb, daddr, saddr,
2095                                                          tos, dev, our);
2096                         }
2097                 }
2098                 rcu_read_unlock();
2099                 return -EINVAL;
2100         }
2101         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2102 }
2103
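/*
 * Construct (but do not hash) a cache entry for an output route,
 * classifying it as local, broadcast, multicast or unicast and setting
 * the matching output handler.
 */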
2104 static inline int __mkroute_output(struct rtable **result,
2105                                    struct fib_result* res,
2106                                    const struct flowi *fl,
2107                                    const struct flowi *oldflp,
2108                                    struct net_device *dev_out,
2109                                    unsigned flags)
2110 {
2111         struct rtable *rth;
2112         struct in_device *in_dev;
2113         u32 tos = RT_FL_TOS(oldflp);
2114         int err = 0;
2115
2116         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2117                 return -EINVAL;
2118
2119         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2120                 res->type = RTN_BROADCAST;
2121         else if (ipv4_is_multicast(fl->fl4_dst))
2122                 res->type = RTN_MULTICAST;
2123         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2124                 return -EINVAL;
2125
2126         if (dev_out->flags & IFF_LOOPBACK)
2127                 flags |= RTCF_LOCAL;
2128
2129         /* get a working reference to the inet device */
2130         in_dev = in_dev_get(dev_out);
2131         if (!in_dev)
2132                 return -EINVAL;
2133
2134         if (res->type == RTN_BROADCAST) {
2135                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2136                 if (res->fi) {
2137                         fib_info_put(res->fi);
2138                         res->fi = NULL;
2139                 }
2140         } else if (res->type == RTN_MULTICAST) {
2141                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2142                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2143                                  oldflp->proto))
2144                         flags &= ~RTCF_LOCAL;
2145                 /* If a multicast route does not exist, use the
2146                    default one, but do not gateway in this case.
2147                    Yes, it is a hack.
2148                  */
2149                 if (res->fi && res->prefixlen < 4) {
2150                         fib_info_put(res->fi);
2151                         res->fi = NULL;
2152                 }
2153         }
2154
2155
2156         rth = dst_alloc(&ipv4_dst_ops);
2157         if (!rth) {
2158                 err = -ENOBUFS;
2159                 goto cleanup;
2160         }
2161
2162         atomic_set(&rth->u.dst.__refcnt, 1);
2163         rth->u.dst.flags= DST_HOST;
2164         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2165                 rth->u.dst.flags |= DST_NOXFRM;
2166         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2167                 rth->u.dst.flags |= DST_NOPOLICY;
2168
2169         rth->fl.fl4_dst = oldflp->fl4_dst;
2170         rth->fl.fl4_tos = tos;
2171         rth->fl.fl4_src = oldflp->fl4_src;
2172         rth->fl.oif     = oldflp->oif;
2173         rth->fl.mark    = oldflp->mark;
2174         rth->rt_dst     = fl->fl4_dst;
2175         rth->rt_src     = fl->fl4_src;
2176         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2177         /* get references to the devices that are to be held by the routing
2178            cache entry */
2179         rth->u.dst.dev  = dev_out;
2180         dev_hold(dev_out);
2181         rth->idev       = in_dev_get(dev_out);
2182         rth->rt_gateway = fl->fl4_dst;
2183         rth->rt_spec_dst= fl->fl4_src;
2184
2185         rth->u.dst.output=ip_output;
2186         rth->rt_genid = atomic_read(&rt_genid);
2187
2188         RT_CACHE_STAT_INC(out_slow_tot);
2189
2190         if (flags & RTCF_LOCAL) {
2191                 rth->u.dst.input = ip_local_deliver;
2192                 rth->rt_spec_dst = fl->fl4_dst;
2193         }
2194         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2195                 rth->rt_spec_dst = fl->fl4_src;
2196                 if (flags & RTCF_LOCAL &&
2197                     !(dev_out->flags & IFF_LOOPBACK)) {
2198                         rth->u.dst.output = ip_mc_output;
2199                         RT_CACHE_STAT_INC(out_slow_mc);
2200                 }
2201 #ifdef CONFIG_IP_MROUTE
2202                 if (res->type == RTN_MULTICAST) {
2203                         if (IN_DEV_MFORWARD(in_dev) &&
2204                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2205                                 rth->u.dst.input = ip_mr_input;
2206                                 rth->u.dst.output = ip_mc_output;
2207                         }
2208                 }
2209 #endif
2210         }
2211
2212         rt_set_nexthop(rth, res, 0);
2213
2214         rth->rt_flags = flags;
2215
2216         *result = rth;
2217  cleanup:
2218         /* release the working reference to the inet device */
2219         in_dev_put(in_dev);
2220
2221         return err;
2222 }
2223
2224 static inline int ip_mkroute_output(struct rtable **rp,
2225                                     struct fib_result* res,
2226                                     const struct flowi *fl,
2227                                     const struct flowi *oldflp,
2228                                     struct net_device *dev_out,
2229                                     unsigned flags)
2230 {
2231         struct rtable *rth = NULL;
2232         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2233         unsigned hash;
2234         if (err == 0) {
2235                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2236                 err = rt_intern_hash(hash, rth, rp);
2237         }
2238
2239         return err;
2240 }
2241
2242 /*
2243  * Major route resolver routine.
2244  */
2245
2246 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2247                                 const struct flowi *oldflp)
2248 {
2249         u32 tos = RT_FL_TOS(oldflp);
2250         struct flowi fl = { .nl_u = { .ip4_u =
2251                                       { .daddr = oldflp->fl4_dst,
2252                                         .saddr = oldflp->fl4_src,
2253                                         .tos = tos & IPTOS_RT_MASK,
2254                                         .scope = ((tos & RTO_ONLINK) ?
2255                                                   RT_SCOPE_LINK :
2256                                                   RT_SCOPE_UNIVERSE),
2257                                       } },
2258                             .mark = oldflp->mark,
2259                             .iif = net->loopback_dev->ifindex,
2260                             .oif = oldflp->oif };
2261         struct fib_result res;
2262         unsigned flags = 0;
2263         struct net_device *dev_out = NULL;
2264         int free_res = 0;
2265         int err;
2266
2267
2268         res.fi          = NULL;
2269 #ifdef CONFIG_IP_MULTIPLE_TABLES
2270         res.r           = NULL;
2271 #endif
2272
2273         if (oldflp->fl4_src) {
2274                 err = -EINVAL;
2275                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2276                     ipv4_is_lbcast(oldflp->fl4_src) ||
2277                     ipv4_is_zeronet(oldflp->fl4_src))
2278                         goto out;
2279
2280                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2281                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2282                 if (dev_out == NULL)
2283                         goto out;
2284
2285                 /* I removed the check for oif == dev_out->oif here.
2286                    It was wrong for two reasons:
2287                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2288                       is assigned to multiple interfaces.
2289                    2. Moreover, we are allowed to send packets with the saddr
2290                       of another iface. --ANK
2291                  */
2292
2293                 if (oldflp->oif == 0
2294                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2295                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2296                         /* Special hack: the user can direct multicasts
2297                            and limited broadcast via the necessary interface
2298                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2299                            This hack is not just for fun, it allows
2300                            vic, vat and friends to work.
2301                            They bind a socket to loopback, set the ttl to zero
2302                            and expect that it will work.
2303                            From the viewpoint of the routing cache they are broken,
2304                            because we are not allowed to build a multicast path
2305                            with a loopback source addr (look, the routing cache
2306                            cannot know that the ttl is zero, so the packet
2307                            will not leave this host and the route is valid).
2308                            Luckily, this hack is a good workaround.
2309                          */
2310
2311                         fl.oif = dev_out->ifindex;
2312                         goto make_route;
2313                 }
2314                 if (dev_out)
2315                         dev_put(dev_out);
2316                 dev_out = NULL;
2317         }
2318
2319
2320         if (oldflp->oif) {
2321                 dev_out = dev_get_by_index(net, oldflp->oif);
2322                 err = -ENODEV;
2323                 if (dev_out == NULL)
2324                         goto out;
2325
2326                 /* RACE: Check return value of inet_select_addr instead. */
2327                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2328                         dev_put(dev_out);
2329                         goto out;       /* Wrong error code */
2330                 }
2331
2332                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2333                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2334                         if (!fl.fl4_src)
2335                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2336                                                               RT_SCOPE_LINK);
2337                         goto make_route;
2338                 }
2339                 if (!fl.fl4_src) {
2340                         if (ipv4_is_multicast(oldflp->fl4_dst))
2341                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2342                                                               fl.fl4_scope);
2343                         else if (!oldflp->fl4_dst)
2344                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2345                                                               RT_SCOPE_HOST);
2346                 }
2347         }
2348
2349         if (!fl.fl4_dst) {
2350                 fl.fl4_dst = fl.fl4_src;
2351                 if (!fl.fl4_dst)
2352                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2353                 if (dev_out)
2354                         dev_put(dev_out);
2355                 dev_out = net->loopback_dev;
2356                 dev_hold(dev_out);
2357                 fl.oif = net->loopback_dev->ifindex;
2358                 res.type = RTN_LOCAL;
2359                 flags |= RTCF_LOCAL;
2360                 goto make_route;
2361         }
2362
2363         if (fib_lookup(net, &fl, &res)) {
2364                 res.fi = NULL;
2365                 if (oldflp->oif) {
2366                         /* Apparently, the routing tables are wrong. Assume
2367                            that the destination is on-link.
2368
2369                            WHY? DW.
2370                            Because we are allowed to send to an iface
2371                            even if it has NO routes and NO assigned
2372                            addresses. When oif is specified, the routing
2373                            tables are looked up with only one purpose:
2374                            to catch whether the destination is gatewayed, rather than
2375                            direct. Moreover, if MSG_DONTROUTE is set,
2376                            we send the packet, ignoring both routing tables
2377                            and ifaddr state. --ANK
2378
2379
2380                            We could do this even if oif is unknown
2381                            (likely IPv6 does), but we do not.
2382                          */
2383
2384                         if (fl.fl4_src == 0)
2385                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2386                                                               RT_SCOPE_LINK);
2387                         res.type = RTN_UNICAST;
2388                         goto make_route;
2389                 }
2390                 if (dev_out)
2391                         dev_put(dev_out);
2392                 err = -ENETUNREACH;
2393                 goto out;
2394         }
2395         free_res = 1;
2396
2397         if (res.type == RTN_LOCAL) {
2398                 if (!fl.fl4_src)
2399                         fl.fl4_src = fl.fl4_dst;
2400                 if (dev_out)
2401                         dev_put(dev_out);
2402                 dev_out = net->loopback_dev;
2403                 dev_hold(dev_out);
2404                 fl.oif = dev_out->ifindex;
2405                 if (res.fi)
2406                         fib_info_put(res.fi);
2407                 res.fi = NULL;
2408                 flags |= RTCF_LOCAL;
2409                 goto make_route;
2410         }
2411
2412 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2413         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2414                 fib_select_multipath(&fl, &res);
2415         else
2416 #endif
2417         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2418                 fib_select_default(net, &fl, &res);
2419
2420         if (!fl.fl4_src)
2421                 fl.fl4_src = FIB_RES_PREFSRC(res);
2422
2423         if (dev_out)
2424                 dev_put(dev_out);
2425         dev_out = FIB_RES_DEV(res);
2426         dev_hold(dev_out);
2427         fl.oif = dev_out->ifindex;
2428
2429
2430 make_route:
2431         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2432
2433
2434         if (free_res)
2435                 fib_res_put(&res);
2436         if (dev_out)
2437                 dev_put(dev_out);
2438 out:    return err;
2439 }
2440
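/*
 * Output route lookup: try the routing cache first and fall back to
 * ip_route_output_slow() on a miss.
 */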
2441 int __ip_route_output_key(struct net *net, struct rtable **rp,
2442                           const struct flowi *flp)
2443 {
2444         unsigned hash;
2445         struct rtable *rth;
2446
2447         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2448
2449         rcu_read_lock_bh();
2450         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2451                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2452                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2453                     rth->fl.fl4_src == flp->fl4_src &&
2454                     rth->fl.iif == 0 &&
2455                     rth->fl.oif == flp->oif &&
2456                     rth->fl.mark == flp->mark &&
2457                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2458                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2459                     rth->u.dst.dev->nd_net == net &&
2460                     rth->rt_genid == atomic_read(&rt_genid)) {
2461                         dst_use(&rth->u.dst, jiffies);
2462                         RT_CACHE_STAT_INC(out_hit);
2463                         rcu_read_unlock_bh();
2464                         *rp = rth;
2465                         return 0;
2466                 }
2467                 RT_CACHE_STAT_INC(out_hlist_search);
2468         }
2469         rcu_read_unlock_bh();
2470
2471         return ip_route_output_slow(net, rp, flp);
2472 }
2473
2474 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2475
2476 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2477 {
2478 }
2479
2480 static struct dst_ops ipv4_dst_blackhole_ops = {
2481         .family                 =       AF_INET,
2482         .protocol               =       __constant_htons(ETH_P_IP),
2483         .destroy                =       ipv4_dst_destroy,
2484         .check                  =       ipv4_dst_check,
2485         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2486         .entry_size             =       sizeof(struct rtable),
2487         .entries                =       ATOMIC_INIT(0),
2488 };
2489
2490
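/*
 * Replace *rp with a "blackhole" copy of the route whose input and
 * output handlers silently discard packets; used below when the XFRM
 * lookup returns -EREMOTE.
 */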
2491 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2492 {
2493         struct rtable *ort = *rp;
2494         struct rtable *rt = (struct rtable *)
2495                 dst_alloc(&ipv4_dst_blackhole_ops);
2496
2497         if (rt) {
2498                 struct dst_entry *new = &rt->u.dst;
2499
2500                 atomic_set(&new->__refcnt, 1);
2501                 new->__use = 1;
2502                 new->input = dst_discard;
2503                 new->output = dst_discard;
2504                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2505
2506                 new->dev = ort->u.dst.dev;
2507                 if (new->dev)
2508                         dev_hold(new->dev);
2509
2510                 rt->fl = ort->fl;
2511
2512                 rt->idev = ort->idev;
2513                 if (rt->idev)
2514                         in_dev_hold(rt->idev);
2515                 rt->rt_genid = atomic_read(&rt_genid);
2516                 rt->rt_flags = ort->rt_flags;
2517                 rt->rt_type = ort->rt_type;
2518                 rt->rt_dst = ort->rt_dst;
2519                 rt->rt_src = ort->rt_src;
2520                 rt->rt_iif = ort->rt_iif;
2521                 rt->rt_gateway = ort->rt_gateway;
2522                 rt->rt_spec_dst = ort->rt_spec_dst;
2523                 rt->peer = ort->peer;
2524                 if (rt->peer)
2525                         atomic_inc(&rt->peer->refcnt);
2526
2527                 dst_free(new);
2528         }
2529
2530         dst_release(&(*rp)->u.dst);
2531         *rp = rt;
2532         return (rt ? 0 : -ENOMEM);
2533 }
2534
2535 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2536                          struct sock *sk, int flags)
2537 {
2538         int err;
2539
2540         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2541                 return err;
2542
2543         if (flp->proto) {
2544                 if (!flp->fl4_src)
2545                         flp->fl4_src = (*rp)->rt_src;
2546                 if (!flp->fl4_dst)
2547                         flp->fl4_dst = (*rp)->rt_dst;
2548                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2549                                     flags ? XFRM_LOOKUP_WAIT : 0);
2550                 if (err == -EREMOTE)
2551                         err = ipv4_dst_blackhole(rp, flp, sk);
2552
2553                 return err;
2554         }
2555
2556         return 0;
2557 }
2558
2559 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2560
2561 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2562 {
2563         return ip_route_output_flow(net, rp, flp, NULL, 0);
2564 }
2565
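/*
 * Translate a routing cache entry into an RTM_NEWROUTE netlink message.
 */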
2566 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2567                         int nowait, unsigned int flags)
2568 {
2569         struct rtable *rt = (struct rtable*)skb->dst;
2570         struct rtmsg *r;
2571         struct nlmsghdr *nlh;
2572         long expires;
2573         u32 id = 0, ts = 0, tsage = 0, error;
2574
2575         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2576         if (nlh == NULL)
2577                 return -EMSGSIZE;
2578
2579         r = nlmsg_data(nlh);
2580         r->rtm_family    = AF_INET;
2581         r->rtm_dst_len  = 32;
2582         r->rtm_src_len  = 0;
2583         r->rtm_tos      = rt->fl.fl4_tos;
2584         r->rtm_table    = RT_TABLE_MAIN;
2585         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2586         r->rtm_type     = rt->rt_type;
2587         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2588         r->rtm_protocol = RTPROT_UNSPEC;
2589         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2590         if (rt->rt_flags & RTCF_NOTIFY)
2591                 r->rtm_flags |= RTM_F_NOTIFY;
2592
2593         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2594
2595         if (rt->fl.fl4_src) {
2596                 r->rtm_src_len = 32;
2597                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2598         }
2599         if (rt->u.dst.dev)
2600                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2601 #ifdef CONFIG_NET_CLS_ROUTE
2602         if (rt->u.dst.tclassid)
2603                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2604 #endif
2605         if (rt->fl.iif)
2606                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2607         else if (rt->rt_src != rt->fl.fl4_src)
2608                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2609
2610         if (rt->rt_dst != rt->rt_gateway)
2611                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2612
2613         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2614                 goto nla_put_failure;
2615
2616         error = rt->u.dst.error;
2617         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2618         if (rt->peer) {
2619                 id = rt->peer->ip_id_count;
2620                 if (rt->peer->tcp_ts_stamp) {
2621                         ts = rt->peer->tcp_ts;
2622                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2623                 }
2624         }
2625
2626         if (rt->fl.iif) {
2627 #ifdef CONFIG_IP_MROUTE
2628                 __be32 dst = rt->rt_dst;
2629
2630                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2631                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2632                         int err = ipmr_get_route(skb, r, nowait);
2633                         if (err <= 0) {
2634                                 if (!nowait) {
2635                                         if (err == 0)
2636                                                 return 0;
2637                                         goto nla_put_failure;
2638                                 } else {
2639                                         if (err == -EMSGSIZE)
2640                                                 goto nla_put_failure;
2641                                         error = err;
2642                                 }
2643                         }
2644                 } else
2645 #endif
2646                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2647         }
2648
2649         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2650                                expires, error) < 0)
2651                 goto nla_put_failure;
2652
2653         return nlmsg_end(skb, nlh);
2654
2655 nla_put_failure:
2656         nlmsg_cancel(skb, nlh);
2657         return -EMSGSIZE;
2658 }
2659
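/*
 * RTM_GETROUTE handler: resolve the requested route (through the input
 * path when an input interface is given, through the output path
 * otherwise) and report the result back over netlink.
 */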
2660 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2661 {
2662         struct net *net = in_skb->sk->sk_net;
2663         struct rtmsg *rtm;
2664         struct nlattr *tb[RTA_MAX+1];
2665         struct rtable *rt = NULL;
2666         __be32 dst = 0;
2667         __be32 src = 0;
2668         u32 iif;
2669         int err;
2670         struct sk_buff *skb;
2671
2672         if (net != &init_net)
2673                 return -EINVAL;
2674
2675         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2676         if (err < 0)
2677                 goto errout;
2678
2679         rtm = nlmsg_data(nlh);
2680
2681         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2682         if (skb == NULL) {
2683                 err = -ENOBUFS;
2684                 goto errout;
2685         }
2686
2687         /* Reserve room for dummy headers; this skb can pass
2688            through a good chunk of the routing engine.
2689          */
2690         skb_reset_mac_header(skb);
2691         skb_reset_network_header(skb);
2692
2693         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2694         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2695         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2696
2697         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2698         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2699         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2700
2701         if (iif) {
2702                 struct net_device *dev;
2703
2704                 dev = __dev_get_by_index(&init_net, iif);
2705                 if (dev == NULL) {
2706                         err = -ENODEV;
2707                         goto errout_free;
2708                 }
2709
2710                 skb->protocol   = htons(ETH_P_IP);
2711                 skb->dev        = dev;
2712                 local_bh_disable();
2713                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2714                 local_bh_enable();
2715
2716                 rt = (struct rtable*) skb->dst;
2717                 if (err == 0 && rt->u.dst.error)
2718                         err = -rt->u.dst.error;
2719         } else {
2720                 struct flowi fl = {
2721                         .nl_u = {
2722                                 .ip4_u = {
2723                                         .daddr = dst,
2724                                         .saddr = src,
2725                                         .tos = rtm->rtm_tos,
2726                                 },
2727                         },
2728                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2729                 };
2730                 err = ip_route_output_key(&init_net, &rt, &fl);
2731         }
2732
2733         if (err)
2734                 goto errout_free;
2735
2736         skb->dst = &rt->u.dst;
2737         if (rtm->rtm_flags & RTM_F_NOTIFY)
2738                 rt->rt_flags |= RTCF_NOTIFY;
2739
2740         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2741                                 RTM_NEWROUTE, 0, 0);
2742         if (err <= 0)
2743                 goto errout_free;
2744
2745         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2746 errout:
2747         return err;
2748
2749 errout_free:
2750         kfree_skb(skb);
2751         goto errout;
2752 }
2753
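/*
 * Netlink dump of the whole routing cache, resuming from the hash
 * bucket and chain index recorded in cb->args[].
 */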
2754 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2755 {
2756         struct rtable *rt;
2757         int h, s_h;
2758         int idx, s_idx;
2759
2760         s_h = cb->args[0];
2761         if (s_h < 0)
2762                 s_h = 0;
2763         s_idx = idx = cb->args[1];
2764         for (h = s_h; h <= rt_hash_mask; h++) {
2765                 rcu_read_lock_bh();
2766                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2767                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2768                         if (idx < s_idx)
2769                                 continue;
2770                         if (rt->rt_genid != atomic_read(&rt_genid))
2771                                 continue;
2772                         skb->dst = dst_clone(&rt->u.dst);
2773                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2774                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2775                                          1, NLM_F_MULTI) <= 0) {
2776                                 dst_release(xchg(&skb->dst, NULL));
2777                                 rcu_read_unlock_bh();
2778                                 goto done;
2779                         }
2780                         dst_release(xchg(&skb->dst, NULL));
2781                 }
2782                 rcu_read_unlock_bh();
2783                 s_idx = 0;
2784         }
2785
2786 done:
2787         cb->args[0] = h;
2788         cb->args[1] = idx;
2789         return skb->len;
2790 }
2791
2792 void ip_rt_multicast_event(struct in_device *in_dev)
2793 {
2794         rt_cache_flush(0);
2795 }
2796
2797 #ifdef CONFIG_SYSCTL
2798 static int flush_delay;
2799
2800 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2801                                         struct file *filp, void __user *buffer,
2802                                         size_t *lenp, loff_t *ppos)
2803 {
2804         if (write) {
2805                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2806                 rt_cache_flush(flush_delay);
2807                 return 0;
2808         }
2809
2810         return -EINVAL;
2811 }
2812
2813 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2814                                                 int __user *name,
2815                                                 int nlen,
2816                                                 void __user *oldval,
2817                                                 size_t __user *oldlenp,
2818                                                 void __user *newval,
2819                                                 size_t newlen)
2820 {
2821         int delay;
2822         if (newlen != sizeof(int))
2823                 return -EINVAL;
2824         if (get_user(delay, (int __user *)newval))
2825                 return -EFAULT;
2826         rt_cache_flush(delay);
2827         return 0;
2828 }
2829
2830 ctl_table ipv4_route_table[] = {
2831         {
2832                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2833                 .procname       = "flush",
2834                 .data           = &flush_delay,
2835                 .maxlen         = sizeof(int),
2836                 .mode           = 0200,
2837                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2838                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2839         },
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2842                 .procname       = "gc_thresh",
2843                 .data           = &ipv4_dst_ops.gc_thresh,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0644,
2846                 .proc_handler   = &proc_dointvec,
2847         },
2848         {
2849                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2850                 .procname       = "max_size",
2851                 .data           = &ip_rt_max_size,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = &proc_dointvec,
2855         },
2856         {
2857                 /*  Deprecated. Use gc_min_interval_ms */
2858
2859                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2860                 .procname       = "gc_min_interval",
2861                 .data           = &ip_rt_gc_min_interval,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = &proc_dointvec_jiffies,
2865                 .strategy       = &sysctl_jiffies,
2866         },
2867         {
2868                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2869                 .procname       = "gc_min_interval_ms",
2870                 .data           = &ip_rt_gc_min_interval,
2871                 .maxlen         = sizeof(int),
2872                 .mode           = 0644,
2873                 .proc_handler   = &proc_dointvec_ms_jiffies,
2874                 .strategy       = &sysctl_ms_jiffies,
2875         },
2876         {
2877                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2878                 .procname       = "gc_timeout",
2879                 .data           = &ip_rt_gc_timeout,
2880                 .maxlen         = sizeof(int),
2881                 .mode           = 0644,
2882                 .proc_handler   = &proc_dointvec_jiffies,
2883                 .strategy       = &sysctl_jiffies,
2884         },
2885         {
2886                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2887                 .procname       = "gc_interval",
2888                 .data           = &ip_rt_gc_interval,
2889                 .maxlen         = sizeof(int),
2890                 .mode           = 0644,
2891                 .proc_handler   = &proc_dointvec_jiffies,
2892                 .strategy       = &sysctl_jiffies,
2893         },
2894         {
2895                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2896                 .procname       = "redirect_load",
2897                 .data           = &ip_rt_redirect_load,
2898                 .maxlen         = sizeof(int),
2899                 .mode           = 0644,
2900                 .proc_handler   = &proc_dointvec,
2901         },
2902         {
2903                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2904                 .procname       = "redirect_number",
2905                 .data           = &ip_rt_redirect_number,
2906                 .maxlen         = sizeof(int),
2907                 .mode           = 0644,
2908                 .proc_handler   = &proc_dointvec,
2909         },
2910         {
2911                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2912                 .procname       = "redirect_silence",
2913                 .data           = &ip_rt_redirect_silence,
2914                 .maxlen         = sizeof(int),
2915                 .mode           = 0644,
2916                 .proc_handler   = &proc_dointvec,
2917         },
2918         {
2919                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2920                 .procname       = "error_cost",
2921                 .data           = &ip_rt_error_cost,
2922                 .maxlen         = sizeof(int),
2923                 .mode           = 0644,
2924                 .proc_handler   = &proc_dointvec,
2925         },
2926         {
2927                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2928                 .procname       = "error_burst",
2929                 .data           = &ip_rt_error_burst,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = &proc_dointvec,
2933         },
2934         {
2935                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2936                 .procname       = "gc_elasticity",
2937                 .data           = &ip_rt_gc_elasticity,
2938                 .maxlen         = sizeof(int),
2939                 .mode           = 0644,
2940                 .proc_handler   = &proc_dointvec,
2941         },
2942         {
2943                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2944                 .procname       = "mtu_expires",
2945                 .data           = &ip_rt_mtu_expires,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = &proc_dointvec_jiffies,
2949                 .strategy       = &sysctl_jiffies,
2950         },
2951         {
2952                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2953                 .procname       = "min_pmtu",
2954                 .data           = &ip_rt_min_pmtu,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = &proc_dointvec,
2958         },
2959         {
2960                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2961                 .procname       = "min_adv_mss",
2962                 .data           = &ip_rt_min_advmss,
2963                 .maxlen         = sizeof(int),
2964                 .mode           = 0644,
2965                 .proc_handler   = &proc_dointvec,
2966         },
2967         {
2968                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2969                 .procname       = "secret_interval",
2970                 .data           = &ip_rt_secret_interval,
2971                 .maxlen         = sizeof(int),
2972                 .mode           = 0644,
2973                 .proc_handler   = &proc_dointvec_jiffies,
2974                 .strategy       = &sysctl_jiffies,
2975         },
2976         { .ctl_name = 0 }
2977 };
2978 #endif
2979
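/*
 * Per-cpu accounting array (256 slots per cpu) used when the route
 * classifier (CONFIG_NET_CLS_ROUTE) is built in; allocated in
 * ip_rt_init() below.
 */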
2980 #ifdef CONFIG_NET_CLS_ROUTE
2981 struct ip_rt_acct *ip_rt_acct __read_mostly;
2982 #endif /* CONFIG_NET_CLS_ROUTE */
2983
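/*
 * "rhash_entries=" kernel boot parameter: lets the administrator override
 * the route cache hash table size that is otherwise derived from the
 * amount of memory in ip_rt_init(), e.g. booting with rhash_entries=32768.
 */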
2984 static __initdata unsigned long rhash_entries;
2985 static int __init set_rhash_entries(char *str)
2986 {
2987         if (!str)
2988                 return 0;
2989         rhash_entries = simple_strtoul(str, &str, 0);
2990         return 1;
2991 }
2992 __setup("rhash_entries=", set_rhash_entries);
2993
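/*
 * Boot-time initialisation of the IPv4 routing code: seed the cache
 * generation id, create the dst slab cache, allocate and zero the route
 * cache hash table (which also determines gc_thresh and max_size), hook
 * up devinet and the FIB, start the periodic GC work and the secret
 * rebuild timer, and finally register the /proc files and the
 * RTM_GETROUTE netlink handler.
 */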
2994 int __init ip_rt_init(void)
2995 {
2996         int rc = 0;
2997
2998         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
2999                              (jiffies ^ (jiffies >> 7))));
3000
3001 #ifdef CONFIG_NET_CLS_ROUTE
3002         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3003         if (!ip_rt_acct)
3004                 panic("IP: failed to allocate ip_rt_acct\n");
3005 #endif
3006
3007         ipv4_dst_ops.kmem_cachep =
3008                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3009                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3010
3011         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3012
3013         rt_hash_table = (struct rt_hash_bucket *)
3014                 alloc_large_system_hash("IP route cache",
3015                                         sizeof(struct rt_hash_bucket),
3016                                         rhash_entries,
3017                                         (num_physpages >= 128 * 1024) ?
3018                                         15 : 17,
3019                                         0,
3020                                         &rt_hash_log,
3021                                         &rt_hash_mask,
3022                                         0);
3023         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3024         rt_hash_lock_init();
3025
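	/* GC starts once the cache holds one entry per bucket; hard cap at
	   16 entries per bucket. */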
3026         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3027         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3028
3029         devinet_init();
3030         ip_fib_init();
3031
3032         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3033
3034         /* All the timers started at system startup tend to
3035            synchronize. Perturb this one a bit.
3036          */
3037         schedule_delayed_work(&expires_work,
3038                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3039
3040         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3041                 ip_rt_secret_interval;
3042         add_timer(&rt_secret_timer);
3043
3044         if (ip_rt_proc_init(&init_net))
3045                 printk(KERN_ERR "Unable to create route proc files\n");
3046 #ifdef CONFIG_XFRM
3047         xfrm_init();
3048         xfrm4_init();
3049 #endif
3050         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3051
3052         return rc;
3053 }
3054
3055 EXPORT_SYMBOL(__ip_select_ident);
3056 EXPORT_SYMBOL(ip_route_input);
3057 EXPORT_SYMBOL(ip_route_output_key);