[IPV4]: Switch users of ipv4_devconf(_all) to use the pernet one
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Mask a flow key's fl4_tos down to the IPTOS_RT_MASK and RTO_ONLINK bits.
 * The argument is parenthesised so that any pointer-valued expression can
 * be passed, not just a plain identifier.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
#define IP_MAX_MTU      0xFFF0

/* Default for ip_rt_gc_timeout and initial "expire" value of the GC. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Route cache tunables.  All time values are in jiffies.
 * Variables without a note are not referenced in this part of the file.
 */
/* Delay used by rt_cache_flush() when the caller passes a negative delay. */
static int ip_rt_min_delay              = 2 * HZ;
/* Added to "now" by rt_cache_flush() to form rt_deadline. */
static int ip_rt_max_delay              = 10 * HZ;
/* Entry count compared against by rt_garbage_collect(). */
static int ip_rt_max_size;
/* Age limit handed to rt_may_expire() by rt_check_expire(). */
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
/* Period with which rt_worker_func() re-schedules itself. */
static int ip_rt_gc_interval            = 60 * HZ;
/* rt_garbage_collect() is ignored if called again sooner than this
 * (unless the cache has reached ip_rt_max_size). */
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
/* Shifted left by rt_hash_log to get the entry count above which
 * rt_garbage_collect() switches to its aggressive goal. */
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
/* Re-arm period of rt_secret_timer (see rt_secret_rebuild()). */
static int ip_rt_secret_interval        = 10 * 60 * HZ;
/* Set by rt_secret_rebuild(); tells rt_worker_func() to do a full flush. */
static int ip_rt_flush_expected;
/* Deadline (jiffies) of a pending delayed flush; 0 when none is pending. */
static unsigned long rt_deadline;
138
/* printk() wrapper that logs at KERN_DEBUG level. */
#define RTprint(a...)   printk(KERN_DEBUG a)

/* Timer armed by rt_cache_flush() for delayed flushes. */
static struct timer_list rt_flush_timer;
static void rt_worker_func(struct work_struct *work);
/* Delayed work item whose handler is rt_worker_func(). */
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
/* Timer re-armed by rt_secret_rebuild() every ip_rt_secret_interval. */
static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
/* Callbacks referenced by ipv4_dst_ops below; defined later in the file. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


/*
 * IPv4 operations table for the generic destination cache.  Its "entries"
 * counter and "gc_thresh" field are read by the GC and /proc code in this
 * file; each cached object has the size of a struct rtable.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .local_out =            ip_local_out,
        .entry_size =           sizeof(struct rtable),
};
173
/* Expands to the TC_PRIO_<class> constant for the given class name. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.
 * NOTE(review): presumably indexed by the 4-bit TOS field of the IP header;
 * the indexing code is not in this part of the file -- confirm at the users.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
194
195
196 /*
197  * Route cache.
198  */
199
/* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
/* One bucket of the route cache hash table: head of a chain of entries. */
struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.  (With lockdep spinlock_t is quite big, so the table is
 * kept small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
/*
 * Address of the spinlock guarding hash slot "slot".  Slots are folded onto
 * the lock table by masking with RT_HASH_LOCK_SZ - 1.  The whole expansion
 * is parenthesised so it stays a single operand wherever it is used.
 */
# define rt_hash_lock_addr(slot) \
        (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the table of RT_HASH_LOCK_SZ bucket spinlocks.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
                                GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
258
/* The route cache: an array of rt_hash_mask + 1 buckets. */
static struct rt_hash_bucket    *rt_hash_table;
/* ANDed with a hash value to obtain a bucket index. */
static unsigned                 rt_hash_mask;
/* Used as a shift count when scaling by table size
 * (presumably log2 of the bucket count -- set outside this excerpt). */
static unsigned int             rt_hash_log;
/* Random seed mixed into rt_hash_code(); regenerated on flush/rebuild. */
static unsigned int             rt_hash_rnd;

/* Per-CPU route cache statistics, exported by the rt_cpu_seq_* code. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one counter of this CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
277 #define rt_hash(daddr, saddr, idx) \
278         rt_hash_code((__force u32)(__be32)(daddr),\
279                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
280
281 #ifdef CONFIG_PROC_FS
/*
 * Private seq_file state for the rt_cache listing: the hash bucket the
 * iterator is currently in (buckets are walked from rt_hash_mask down to 0).
 */
struct rt_cache_iter_state {
        int bucket;
};
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
/* seq_file iterator callbacks for the route cache listing. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
/*
 * File operations of the route cache listing; registered under the name
 * "rt_cache" by ip_rt_proc_init().  seq_release_private pairs with the
 * seq_open_private() call in rt_cache_seq_open().
 */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
/* seq_file ->stop: nothing to release, the start/next callbacks take no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
/* seq_file iterator callbacks for the per-CPU route cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
478
479
/* open() handler: attach the rt_cpu_seq_ops iterator to the file. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
484
/*
 * File operations of the per-CPU statistics file; installed on the
 * "rt_cache" entry under net->proc_net_stat by ip_rt_proc_init().
 */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(struct net *net)
{
        return 0;
}
570 #endif /* CONFIG_PROC_FS */
571
/*
 * Hand a cache entry to call_rcu_bh() so that dst_rcu_free() runs on it
 * later, rather than freeing it on the spot.
 */
static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
576
/*
 * Drop one reference on the entry via ip_rt_put() and then queue it for
 * RCU-deferred freeing, exactly as rt_free() does.
 */
static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        rt_free(rt);
}
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggresively, if they
586            collide in hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
651 /*
652  * Perform a full scan of hash table and free all entries.
653  * Can be called by a softirq or a process.
654  * In the later case, we want to be reschedule if necessary
655  */
656 static void rt_do_flush(int process_context)
657 {
658         unsigned int i;
659         struct rtable *rth, *next;
660
661         for (i = 0; i <= rt_hash_mask; i++) {
662                 if (process_context && need_resched())
663                         cond_resched();
664                 rth = rt_hash_table[i].chain;
665                 if (!rth)
666                         continue;
667
668                 spin_lock_bh(rt_hash_lock_addr(i));
669                 rth = rt_hash_table[i].chain;
670                 rt_hash_table[i].chain = NULL;
671                 spin_unlock_bh(rt_hash_lock_addr(i));
672
673                 for (; rth; rth = next) {
674                         next = rth->u.dst.rt_next;
675                         rt_free(rth);
676                 }
677         }
678 }
679
680 static void rt_check_expire(void)
681 {
682         static unsigned int rover;
683         unsigned int i = rover, goal;
684         struct rtable *rth, **rthp;
685         u64 mult;
686
687         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
688         if (ip_rt_gc_timeout > 1)
689                 do_div(mult, ip_rt_gc_timeout);
690         goal = (unsigned int)mult;
691         if (goal > rt_hash_mask)
692                 goal = rt_hash_mask + 1;
693         for (; goal > 0; goal--) {
694                 unsigned long tmo = ip_rt_gc_timeout;
695
696                 i = (i + 1) & rt_hash_mask;
697                 rthp = &rt_hash_table[i].chain;
698
699                 if (need_resched())
700                         cond_resched();
701
702                 if (*rthp == NULL)
703                         continue;
704                 spin_lock_bh(rt_hash_lock_addr(i));
705                 while ((rth = *rthp) != NULL) {
706                         if (rth->u.dst.expires) {
707                                 /* Entry is expired even if it is in use */
708                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
709                                         tmo >>= 1;
710                                         rthp = &rth->u.dst.rt_next;
711                                         continue;
712                                 }
713                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
714                                 tmo >>= 1;
715                                 rthp = &rth->u.dst.rt_next;
716                                 continue;
717                         }
718
719                         /* Cleanup aged off entries. */
720                         *rthp = rth->u.dst.rt_next;
721                         rt_free(rth);
722                 }
723                 spin_unlock_bh(rt_hash_lock_addr(i));
724         }
725         rover = i;
726 }
727
728 /*
729  * rt_worker_func() is run in process context.
730  * If a whole flush was scheduled, it is done.
731  * Else, we call rt_check_expire() to scan part of the hash table
732  */
733 static void rt_worker_func(struct work_struct *work)
734 {
735         if (ip_rt_flush_expected) {
736                 ip_rt_flush_expected = 0;
737                 rt_do_flush(1);
738         } else
739                 rt_check_expire();
740         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
741 }
742
743 /* This can run from both BH and non-BH contexts, the latter
744  * in the case of a forced flush event.
745  */
/*
 * Flush the whole route cache now.  process_context is passed through to
 * rt_do_flush() and tells it whether it may reschedule.
 */
static void rt_run_flush(unsigned long process_context)
{
        /* No delayed flush is pending any more. */
        rt_deadline = 0;

        /* Pick a new hash seed before emptying the table. */
        get_random_bytes(&rt_hash_rnd, 4);

        rt_do_flush(process_context);
}
754
/* Serialises the timer/deadline bookkeeping done in rt_cache_flush(). */
static DEFINE_SPINLOCK(rt_flush_lock);

/*
 * Request a flush of the route cache.
 *
 * @delay: time in jiffies after which the flush should happen.  A negative
 *         value selects ip_rt_min_delay; zero flushes immediately.
 *
 * A non-immediate request arms rt_flush_timer.  The first such request also
 * records rt_deadline = now + ip_rt_max_delay, and later requests cannot
 * push the flush beyond that deadline.
 */
void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        /* True when not called from softirq context. */
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        /* The timer is cancelled unconditionally; it is re-armed below
         * unless the flush ends up being done right away. */
        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, extend the timer
                   to "delay", otherwise fire it at the deadline time.
                 */

                /* From process context, once less than
                 * max_delay - min_delay remains, flush immediately. */
                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                /* Immediate flush: done outside rt_flush_lock. */
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(user_mode);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
796
797 /*
798  * We change rt_hash_rnd and ask next rt_worker_func() invocation
799  * to perform a flush in process context
800  */
/* Timer-style callback (the argument is unused); re-arms rt_secret_timer. */
static void rt_secret_rebuild(unsigned long dummy)
{
        /* New hash seed: existing entries will no longer be found. */
        get_random_bytes(&rt_hash_rnd, 4);
        /* Make the next rt_worker_func() run do a full flush ... */
        ip_rt_flush_expected = 1;
        /* ... and bring that run forward to HZ/10 from now. */
        cancel_delayed_work(&expires_work);
        schedule_delayed_work(&expires_work, HZ/10);
        mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}
809
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an
   equilibrium point, where the number of entries aged off
   stays approximately equal to the number of newly created ones.

   The current expiration strength is the variable "expire".
   We adjust it dynamically: while networking is idle, "expire"
   is large enough to keep a good number of warm entries, and
   when the load increases it is reduced to limit the cache size.
 */
822
823 static int rt_garbage_collect(void)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in dangerous area. Try to reduce cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (!rt_may_expire(rth, tmo, expire)) {
884                                         tmo >>= 1;
885                                         rthp = &rth->u.dst.rt_next;
886                                         continue;
887                                 }
888                                 *rthp = rth->u.dst.rt_next;
889                                 rt_free(rth);
890                                 goal--;
891                         }
892                         spin_unlock_bh(rt_hash_lock_addr(k));
893                         if (goal <= 0)
894                                 break;
895                 }
896                 rover = k;
897
898                 if (goal <= 0)
899                         goto work_done;
900
901                 /* Goal is not achieved. We stop process if:
902
903                    - if expire reduced to zero. Otherwise, expire is halfed.
904                    - if table is not full.
905                    - if we are called from interrupt.
906                    - jiffies check is just fallback/debug loop breaker.
907                      We will not spin here for long time in any case.
908                  */
909
910                 RT_CACHE_STAT_INC(gc_goal_miss);
911
912                 if (expire == 0)
913                         break;
914
915                 expire >>= 1;
916 #if RT_CACHE_DEBUG >= 2
917                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
918                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
919 #endif
920
921                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
922                         goto out;
923         } while (!in_softirq() && time_before_eq(jiffies, now));
924
925         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926                 goto out;
927         if (net_ratelimit())
928                 printk(KERN_WARNING "dst cache overflow\n");
929         RT_CACHE_STAT_INC(gc_dst_overflow);
930         return 1;
931
932 work_done:
933         expire += ip_rt_gc_min_interval;
934         if (expire > ip_rt_gc_timeout ||
935             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
936                 expire = ip_rt_gc_timeout;
937 #if RT_CACHE_DEBUG >= 2
938         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
939                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
940 #endif
941 out:    return 0;
942 }
943
944 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
945 {
946         struct rtable   *rth, **rthp;
947         unsigned long   now;
948         struct rtable *cand, **candp;
949         u32             min_score;
950         int             chain_length;
951         int attempts = !in_softirq();
952
953 restart:
954         chain_length = 0;
955         min_score = ~(u32)0;
956         cand = NULL;
957         candp = NULL;
958         now = jiffies;
959
960         rthp = &rt_hash_table[hash].chain;
961
962         spin_lock_bh(rt_hash_lock_addr(hash));
963         while ((rth = *rthp) != NULL) {
964                 if (compare_keys(&rth->fl, &rt->fl)) {
965                         /* Put it first */
966                         *rthp = rth->u.dst.rt_next;
967                         /*
968                          * Since lookup is lockfree, the deletion
969                          * must be visible to another weakly ordered CPU before
970                          * the insertion at the start of the hash chain.
971                          */
972                         rcu_assign_pointer(rth->u.dst.rt_next,
973                                            rt_hash_table[hash].chain);
974                         /*
975                          * Since lookup is lockfree, the update writes
976                          * must be ordered for consistency on SMP.
977                          */
978                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
979
980                         dst_use(&rth->u.dst, now);
981                         spin_unlock_bh(rt_hash_lock_addr(hash));
982
983                         rt_drop(rt);
984                         *rp = rth;
985                         return 0;
986                 }
987
988                 if (!atomic_read(&rth->u.dst.__refcnt)) {
989                         u32 score = rt_score(rth);
990
991                         if (score <= min_score) {
992                                 cand = rth;
993                                 candp = rthp;
994                                 min_score = score;
995                         }
996                 }
997
998                 chain_length++;
999
1000                 rthp = &rth->u.dst.rt_next;
1001         }
1002
1003         if (cand) {
1004                 /* ip_rt_gc_elasticity used to be average length of chain
1005                  * length, when exceeded gc becomes really aggressive.
1006                  *
1007                  * The second limit is less certain. At the moment it allows
1008                  * only 2 entries per bucket. We will see.
1009                  */
1010                 if (chain_length > ip_rt_gc_elasticity) {
1011                         *candp = cand->u.dst.rt_next;
1012                         rt_free(cand);
1013                 }
1014         }
1015
1016         /* Try to bind route to arp only if it is output
1017            route or unicast forwarding path.
1018          */
1019         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1020                 int err = arp_bind_neighbour(&rt->u.dst);
1021                 if (err) {
1022                         spin_unlock_bh(rt_hash_lock_addr(hash));
1023
1024                         if (err != -ENOBUFS) {
1025                                 rt_drop(rt);
1026                                 return err;
1027                         }
1028
1029                         /* Neighbour tables are full and nothing
1030                            can be released. Try to shrink route cache,
1031                            it is most likely it holds some neighbour records.
1032                          */
1033                         if (attempts-- > 0) {
1034                                 int saved_elasticity = ip_rt_gc_elasticity;
1035                                 int saved_int = ip_rt_gc_min_interval;
1036                                 ip_rt_gc_elasticity     = 1;
1037                                 ip_rt_gc_min_interval   = 0;
1038                                 rt_garbage_collect();
1039                                 ip_rt_gc_min_interval   = saved_int;
1040                                 ip_rt_gc_elasticity     = saved_elasticity;
1041                                 goto restart;
1042                         }
1043
1044                         if (net_ratelimit())
1045                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1046                         rt_drop(rt);
1047                         return -ENOBUFS;
1048                 }
1049         }
1050
1051         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1052 #if RT_CACHE_DEBUG >= 2
1053         if (rt->u.dst.rt_next) {
1054                 struct rtable *trt;
1055                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1056                        NIPQUAD(rt->rt_dst));
1057                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1058                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1059                 printk("\n");
1060         }
1061 #endif
1062         rt_hash_table[hash].chain = rt;
1063         spin_unlock_bh(rt_hash_lock_addr(hash));
1064         *rp = rt;
1065         return 0;
1066 }
1067
1068 void rt_bind_peer(struct rtable *rt, int create)
1069 {
1070         static DEFINE_SPINLOCK(rt_peer_lock);
1071         struct inet_peer *peer;
1072
1073         peer = inet_getpeer(rt->rt_dst, create);
1074
1075         spin_lock_bh(&rt_peer_lock);
1076         if (rt->peer == NULL) {
1077                 rt->peer = peer;
1078                 peer = NULL;
1079         }
1080         spin_unlock_bh(&rt_peer_lock);
1081         if (peer)
1082                 inet_putpeer(peer);
1083 }
1084
1085 /*
1086  * Peer allocation may fail only in serious out-of-memory conditions.  However
1087  * we still can generate some output.
1088  * Random ID selection looks a bit dangerous because we have no chances to
1089  * select ID being unique in a reasonable period of time.
1090  * But broken packet identifier may be better than no packet at all.
1091  */
1092 static void ip_select_fb_ident(struct iphdr *iph)
1093 {
1094         static DEFINE_SPINLOCK(ip_fb_id_lock);
1095         static u32 ip_fallback_id;
1096         u32 salt;
1097
1098         spin_lock_bh(&ip_fb_id_lock);
1099         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1100         iph->id = htons(salt & 0xFFFF);
1101         ip_fallback_id = salt;
1102         spin_unlock_bh(&ip_fb_id_lock);
1103 }
1104
1105 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1106 {
1107         struct rtable *rt = (struct rtable *) dst;
1108
1109         if (rt) {
1110                 if (rt->peer == NULL)
1111                         rt_bind_peer(rt, 1);
1112
1113                 /* If peer is attached to destination, it is never detached,
1114                    so that we need not to grab a lock to dereference it.
1115                  */
1116                 if (rt->peer) {
1117                         iph->id = htons(inet_getid(rt->peer, more));
1118                         return;
1119                 }
1120         } else
1121                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1122                        __builtin_return_address(0));
1123
1124         ip_select_fb_ident(iph);
1125 }
1126
1127 static void rt_del(unsigned hash, struct rtable *rt)
1128 {
1129         struct rtable **rthp;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         ip_rt_put(rt);
1133         for (rthp = &rt_hash_table[hash].chain; *rthp;
1134              rthp = &(*rthp)->u.dst.rt_next)
1135                 if (*rthp == rt) {
1136                         *rthp = rt->u.dst.rt_next;
1137                         rt_free(rt);
1138                         break;
1139                 }
1140         spin_unlock_bh(rt_hash_lock_addr(hash));
1141 }
1142
1143 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1144                     __be32 saddr, struct net_device *dev)
1145 {
1146         int i, k;
1147         struct in_device *in_dev = in_dev_get(dev);
1148         struct rtable *rth, **rthp;
1149         __be32  skeys[2] = { saddr, 0 };
1150         int  ikeys[2] = { dev->ifindex, 0 };
1151         struct netevent_redirect netevent;
1152
1153         if (!in_dev)
1154                 return;
1155
1156         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1157             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1158                 goto reject_redirect;
1159
1160         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1161                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1162                         goto reject_redirect;
1163                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1164                         goto reject_redirect;
1165         } else {
1166                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1167                         goto reject_redirect;
1168         }
1169
1170         for (i = 0; i < 2; i++) {
1171                 for (k = 0; k < 2; k++) {
1172                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1173
1174                         rthp=&rt_hash_table[hash].chain;
1175
1176                         rcu_read_lock();
1177                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1178                                 struct rtable *rt;
1179
1180                                 if (rth->fl.fl4_dst != daddr ||
1181                                     rth->fl.fl4_src != skeys[i] ||
1182                                     rth->fl.oif != ikeys[k] ||
1183                                     rth->fl.iif != 0) {
1184                                         rthp = &rth->u.dst.rt_next;
1185                                         continue;
1186                                 }
1187
1188                                 if (rth->rt_dst != daddr ||
1189                                     rth->rt_src != saddr ||
1190                                     rth->u.dst.error ||
1191                                     rth->rt_gateway != old_gw ||
1192                                     rth->u.dst.dev != dev)
1193                                         break;
1194
1195                                 dst_hold(&rth->u.dst);
1196                                 rcu_read_unlock();
1197
1198                                 rt = dst_alloc(&ipv4_dst_ops);
1199                                 if (rt == NULL) {
1200                                         ip_rt_put(rth);
1201                                         in_dev_put(in_dev);
1202                                         return;
1203                                 }
1204
1205                                 /* Copy all the information. */
1206                                 *rt = *rth;
1207                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1208                                 rt->u.dst.__use         = 1;
1209                                 atomic_set(&rt->u.dst.__refcnt, 1);
1210                                 rt->u.dst.child         = NULL;
1211                                 if (rt->u.dst.dev)
1212                                         dev_hold(rt->u.dst.dev);
1213                                 if (rt->idev)
1214                                         in_dev_hold(rt->idev);
1215                                 rt->u.dst.obsolete      = 0;
1216                                 rt->u.dst.lastuse       = jiffies;
1217                                 rt->u.dst.path          = &rt->u.dst;
1218                                 rt->u.dst.neighbour     = NULL;
1219                                 rt->u.dst.hh            = NULL;
1220                                 rt->u.dst.xfrm          = NULL;
1221
1222                                 rt->rt_flags            |= RTCF_REDIRECTED;
1223
1224                                 /* Gateway is different ... */
1225                                 rt->rt_gateway          = new_gw;
1226
1227                                 /* Redirect received -> path was valid */
1228                                 dst_confirm(&rth->u.dst);
1229
1230                                 if (rt->peer)
1231                                         atomic_inc(&rt->peer->refcnt);
1232
1233                                 if (arp_bind_neighbour(&rt->u.dst) ||
1234                                     !(rt->u.dst.neighbour->nud_state &
1235                                             NUD_VALID)) {
1236                                         if (rt->u.dst.neighbour)
1237                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1238                                         ip_rt_put(rth);
1239                                         rt_drop(rt);
1240                                         goto do_next;
1241                                 }
1242
1243                                 netevent.old = &rth->u.dst;
1244                                 netevent.new = &rt->u.dst;
1245                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1246                                                         &netevent);
1247
1248                                 rt_del(hash, rth);
1249                                 if (!rt_intern_hash(hash, rt, &rt))
1250                                         ip_rt_put(rt);
1251                                 goto do_next;
1252                         }
1253                         rcu_read_unlock();
1254                 do_next:
1255                         ;
1256                 }
1257         }
1258         in_dev_put(in_dev);
1259         return;
1260
1261 reject_redirect:
1262 #ifdef CONFIG_IP_ROUTE_VERBOSE
1263         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1264                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1265                         "%u.%u.%u.%u ignored.\n"
1266                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1267                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1268                        NIPQUAD(saddr), NIPQUAD(daddr));
1269 #endif
1270         in_dev_put(in_dev);
1271 }
1272
1273 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1274 {
1275         struct rtable *rt = (struct rtable*)dst;
1276         struct dst_entry *ret = dst;
1277
1278         if (rt) {
1279                 if (dst->obsolete) {
1280                         ip_rt_put(rt);
1281                         ret = NULL;
1282                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1283                            rt->u.dst.expires) {
1284                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1285                                                 rt->fl.oif);
1286 #if RT_CACHE_DEBUG >= 1
1287                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1288                                           "%u.%u.%u.%u/%02x dropped\n",
1289                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1290 #endif
1291                         rt_del(hash, rt);
1292                         ret = NULL;
1293                 }
1294         }
1295         return ret;
1296 }
1297
1298 /*
1299  * Algorithm:
1300  *      1. The first ip_rt_redirect_number redirects are sent
1301  *         with exponential backoff, then we stop sending them at all,
1302  *         assuming that the host ignores our redirects.
1303  *      2. If we did not see packets requiring redirects
1304  *         during ip_rt_redirect_silence, we assume that the host
1305  *         forgot redirected route and start to send redirects again.
1306  *
1307  * This algorithm is much cheaper and more intelligent than dumb load limiting
1308  * in icmp.c.
1309  *
1310  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1311  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1312  */
1313
1314 void ip_rt_send_redirect(struct sk_buff *skb)
1315 {
1316         struct rtable *rt = (struct rtable*)skb->dst;
1317         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1318
1319         if (!in_dev)
1320                 return;
1321
1322         if (!IN_DEV_TX_REDIRECTS(in_dev))
1323                 goto out;
1324
1325         /* No redirected packets during ip_rt_redirect_silence;
1326          * reset the algorithm.
1327          */
1328         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1329                 rt->u.dst.rate_tokens = 0;
1330
1331         /* Too many ignored redirects; do not send anything
1332          * set u.dst.rate_last to the last seen redirected packet.
1333          */
1334         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1335                 rt->u.dst.rate_last = jiffies;
1336                 goto out;
1337         }
1338
1339         /* Check for load limit; set rate_last to the latest sent
1340          * redirect.
1341          */
1342         if (rt->u.dst.rate_tokens == 0 ||
1343             time_after(jiffies,
1344                        (rt->u.dst.rate_last +
1345                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1346                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1347                 rt->u.dst.rate_last = jiffies;
1348                 ++rt->u.dst.rate_tokens;
1349 #ifdef CONFIG_IP_ROUTE_VERBOSE
1350                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1351                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1352                     net_ratelimit())
1353                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1354                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1355                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1356                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1357 #endif
1358         }
1359 out:
1360         in_dev_put(in_dev);
1361 }
1362
1363 static int ip_error(struct sk_buff *skb)
1364 {
1365         struct rtable *rt = (struct rtable*)skb->dst;
1366         unsigned long now;
1367         int code;
1368
1369         switch (rt->u.dst.error) {
1370                 case EINVAL:
1371                 default:
1372                         goto out;
1373                 case EHOSTUNREACH:
1374                         code = ICMP_HOST_UNREACH;
1375                         break;
1376                 case ENETUNREACH:
1377                         code = ICMP_NET_UNREACH;
1378                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1379                         break;
1380                 case EACCES:
1381                         code = ICMP_PKT_FILTERED;
1382                         break;
1383         }
1384
1385         now = jiffies;
1386         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1387         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1388                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1389         rt->u.dst.rate_last = now;
1390         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1391                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1392                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1393         }
1394
1395 out:    kfree_skb(skb);
1396         return 0;
1397 }
1398
/*
 *	MTU plateau table, in descending order.  The last two values are
 *	not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *const end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1416
1417 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1418 {
1419         int i;
1420         unsigned short old_mtu = ntohs(iph->tot_len);
1421         struct rtable *rth;
1422         __be32  skeys[2] = { iph->saddr, 0, };
1423         __be32  daddr = iph->daddr;
1424         unsigned short est_mtu = 0;
1425
1426         if (ipv4_config.no_pmtu_disc)
1427                 return 0;
1428
1429         for (i = 0; i < 2; i++) {
1430                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1431
1432                 rcu_read_lock();
1433                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1434                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1435                         if (rth->fl.fl4_dst == daddr &&
1436                             rth->fl.fl4_src == skeys[i] &&
1437                             rth->rt_dst  == daddr &&
1438                             rth->rt_src  == iph->saddr &&
1439                             rth->fl.iif == 0 &&
1440                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1441                                 unsigned short mtu = new_mtu;
1442
1443                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1444
1445                                         /* BSD 4.2 compatibility hack :-( */
1446                                         if (mtu == 0 &&
1447                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1448                                             old_mtu >= 68 + (iph->ihl << 2))
1449                                                 old_mtu -= iph->ihl << 2;
1450
1451                                         mtu = guess_mtu(old_mtu);
1452                                 }
1453                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1454                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1455                                                 dst_confirm(&rth->u.dst);
1456                                                 if (mtu < ip_rt_min_pmtu) {
1457                                                         mtu = ip_rt_min_pmtu;
1458                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1459                                                                 (1 << RTAX_MTU);
1460                                                 }
1461                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1462                                                 dst_set_expires(&rth->u.dst,
1463                                                         ip_rt_mtu_expires);
1464                                         }
1465                                         est_mtu = mtu;
1466                                 }
1467                         }
1468                 }
1469                 rcu_read_unlock();
1470         }
1471         return est_mtu ? : new_mtu;
1472 }
1473
1474 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1475 {
1476         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1477             !(dst_metric_locked(dst, RTAX_MTU))) {
1478                 if (mtu < ip_rt_min_pmtu) {
1479                         mtu = ip_rt_min_pmtu;
1480                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1481                 }
1482                 dst->metrics[RTAX_MTU-1] = mtu;
1483                 dst_set_expires(dst, ip_rt_mtu_expires);
1484                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1485         }
1486 }
1487
1488 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1489 {
1490         return NULL;
1491 }
1492
1493 static void ipv4_dst_destroy(struct dst_entry *dst)
1494 {
1495         struct rtable *rt = (struct rtable *) dst;
1496         struct inet_peer *peer = rt->peer;
1497         struct in_device *idev = rt->idev;
1498
1499         if (peer) {
1500                 rt->peer = NULL;
1501                 inet_putpeer(peer);
1502         }
1503
1504         if (idev) {
1505                 rt->idev = NULL;
1506                 in_dev_put(idev);
1507         }
1508 }
1509
1510 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1511                             int how)
1512 {
1513         struct rtable *rt = (struct rtable *) dst;
1514         struct in_device *idev = rt->idev;
1515         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1516                 struct in_device *loopback_idev =
1517                         in_dev_get(dev->nd_net->loopback_dev);
1518                 if (loopback_idev) {
1519                         rt->idev = loopback_idev;
1520                         in_dev_put(idev);
1521                 }
1522         }
1523 }
1524
1525 static void ipv4_link_failure(struct sk_buff *skb)
1526 {
1527         struct rtable *rt;
1528
1529         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1530
1531         rt = (struct rtable *) skb->dst;
1532         if (rt)
1533                 dst_set_expires(&rt->u.dst, 0);
1534 }
1535
1536 static int ip_rt_bug(struct sk_buff *skb)
1537 {
1538         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1539                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1540                 skb->dev ? skb->dev->name : "?");
1541         kfree_skb(skb);
1542         return 0;
1543 }
1544
1545 /*
1546    We do not cache source address of outgoing interface,
1547    because it is used only by IP RR, TS and SRR options,
1548    so that it out of fast path.
1549
1550    BTW remember: "addr" is allowed to be not aligned
1551    in IP options!
1552  */
1553
1554 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1555 {
1556         __be32 src;
1557         struct fib_result res;
1558
1559         if (rt->fl.iif == 0)
1560                 src = rt->rt_src;
1561         else if (fib_lookup(&rt->fl, &res) == 0) {
1562                 src = FIB_RES_PREFSRC(res);
1563                 fib_res_put(&res);
1564         } else
1565                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1566                                         RT_SCOPE_UNIVERSE);
1567         memcpy(addr, &src, 4);
1568 }
1569
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently: a half is taken from @tag only while
 * it is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;

	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1579
1580 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1581 {
1582         struct fib_info *fi = res->fi;
1583
1584         if (fi) {
1585                 if (FIB_RES_GW(*res) &&
1586                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1587                         rt->rt_gateway = FIB_RES_GW(*res);
1588                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1589                        sizeof(rt->u.dst.metrics));
1590                 if (fi->fib_mtu == 0) {
1591                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1592                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1593                             rt->rt_gateway != rt->rt_dst &&
1594                             rt->u.dst.dev->mtu > 576)
1595                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1596                 }
1597 #ifdef CONFIG_NET_CLS_ROUTE
1598                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1599 #endif
1600         } else
1601                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1602
1603         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1604                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1605         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1606                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1607         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1608                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1609                                        ip_rt_min_advmss);
1610         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1611                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1612
1613 #ifdef CONFIG_NET_CLS_ROUTE
1614 #ifdef CONFIG_IP_MULTIPLE_TABLES
1615         set_class_tag(rt, fib_rules_tclass(res));
1616 #endif
1617         set_class_tag(rt, itag);
1618 #endif
1619         rt->rt_type = res->type;
1620 }
1621
1622 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1623                                 u8 tos, struct net_device *dev, int our)
1624 {
1625         unsigned hash;
1626         struct rtable *rth;
1627         __be32 spec_dst;
1628         struct in_device *in_dev = in_dev_get(dev);
1629         u32 itag = 0;
1630
1631         /* Primary sanity checks. */
1632
1633         if (in_dev == NULL)
1634                 return -EINVAL;
1635
1636         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1637             skb->protocol != htons(ETH_P_IP))
1638                 goto e_inval;
1639
1640         if (ZERONET(saddr)) {
1641                 if (!LOCAL_MCAST(daddr))
1642                         goto e_inval;
1643                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1644         } else if (fib_validate_source(saddr, 0, tos, 0,
1645                                         dev, &spec_dst, &itag) < 0)
1646                 goto e_inval;
1647
1648         rth = dst_alloc(&ipv4_dst_ops);
1649         if (!rth)
1650                 goto e_nobufs;
1651
1652         rth->u.dst.output= ip_rt_bug;
1653
1654         atomic_set(&rth->u.dst.__refcnt, 1);
1655         rth->u.dst.flags= DST_HOST;
1656         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1657                 rth->u.dst.flags |= DST_NOPOLICY;
1658         rth->fl.fl4_dst = daddr;
1659         rth->rt_dst     = daddr;
1660         rth->fl.fl4_tos = tos;
1661         rth->fl.mark    = skb->mark;
1662         rth->fl.fl4_src = saddr;
1663         rth->rt_src     = saddr;
1664 #ifdef CONFIG_NET_CLS_ROUTE
1665         rth->u.dst.tclassid = itag;
1666 #endif
1667         rth->rt_iif     =
1668         rth->fl.iif     = dev->ifindex;
1669         rth->u.dst.dev  = init_net.loopback_dev;
1670         dev_hold(rth->u.dst.dev);
1671         rth->idev       = in_dev_get(rth->u.dst.dev);
1672         rth->fl.oif     = 0;
1673         rth->rt_gateway = daddr;
1674         rth->rt_spec_dst= spec_dst;
1675         rth->rt_type    = RTN_MULTICAST;
1676         rth->rt_flags   = RTCF_MULTICAST;
1677         if (our) {
1678                 rth->u.dst.input= ip_local_deliver;
1679                 rth->rt_flags |= RTCF_LOCAL;
1680         }
1681
1682 #ifdef CONFIG_IP_MROUTE
1683         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1684                 rth->u.dst.input = ip_mr_input;
1685 #endif
1686         RT_CACHE_STAT_INC(in_slow_mc);
1687
1688         in_dev_put(in_dev);
1689         hash = rt_hash(daddr, saddr, dev->ifindex);
1690         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1691
1692 e_nobufs:
1693         in_dev_put(in_dev);
1694         return -ENOBUFS;
1695
1696 e_inval:
1697         in_dev_put(in_dev);
1698         return -EINVAL;
1699 }
1700
1701
1702 static void ip_handle_martian_source(struct net_device *dev,
1703                                      struct in_device *in_dev,
1704                                      struct sk_buff *skb,
1705                                      __be32 daddr,
1706                                      __be32 saddr)
1707 {
1708         RT_CACHE_STAT_INC(in_martian_src);
1709 #ifdef CONFIG_IP_ROUTE_VERBOSE
1710         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1711                 /*
1712                  *      RFC1812 recommendation, if source is martian,
1713                  *      the only hint is MAC header.
1714                  */
1715                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1716                         "%u.%u.%u.%u, on dev %s\n",
1717                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1718                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1719                         int i;
1720                         const unsigned char *p = skb_mac_header(skb);
1721                         printk(KERN_WARNING "ll header: ");
1722                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1723                                 printk("%02x", *p);
1724                                 if (i < (dev->hard_header_len - 1))
1725                                         printk(":");
1726                         }
1727                         printk("\n");
1728                 }
1729         }
1730 #endif
1731 }
1732
1733 static inline int __mkroute_input(struct sk_buff *skb,
1734                                   struct fib_result* res,
1735                                   struct in_device *in_dev,
1736                                   __be32 daddr, __be32 saddr, u32 tos,
1737                                   struct rtable **result)
1738 {
1739
1740         struct rtable *rth;
1741         int err;
1742         struct in_device *out_dev;
1743         unsigned flags = 0;
1744         __be32 spec_dst;
1745         u32 itag;
1746
1747         /* get a working reference to the output device */
1748         out_dev = in_dev_get(FIB_RES_DEV(*res));
1749         if (out_dev == NULL) {
1750                 if (net_ratelimit())
1751                         printk(KERN_CRIT "Bug in ip_route_input" \
1752                                "_slow(). Please, report\n");
1753                 return -EINVAL;
1754         }
1755
1756
1757         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1758                                   in_dev->dev, &spec_dst, &itag);
1759         if (err < 0) {
1760                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1761                                          saddr);
1762
1763                 err = -EINVAL;
1764                 goto cleanup;
1765         }
1766
1767         if (err)
1768                 flags |= RTCF_DIRECTSRC;
1769
1770         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1771             (IN_DEV_SHARED_MEDIA(out_dev) ||
1772              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1773                 flags |= RTCF_DOREDIRECT;
1774
1775         if (skb->protocol != htons(ETH_P_IP)) {
1776                 /* Not IP (i.e. ARP). Do not create route, if it is
1777                  * invalid for proxy arp. DNAT routes are always valid.
1778                  */
1779                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1780                         err = -EINVAL;
1781                         goto cleanup;
1782                 }
1783         }
1784
1785
1786         rth = dst_alloc(&ipv4_dst_ops);
1787         if (!rth) {
1788                 err = -ENOBUFS;
1789                 goto cleanup;
1790         }
1791
1792         atomic_set(&rth->u.dst.__refcnt, 1);
1793         rth->u.dst.flags= DST_HOST;
1794         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1795                 rth->u.dst.flags |= DST_NOPOLICY;
1796         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1797                 rth->u.dst.flags |= DST_NOXFRM;
1798         rth->fl.fl4_dst = daddr;
1799         rth->rt_dst     = daddr;
1800         rth->fl.fl4_tos = tos;
1801         rth->fl.mark    = skb->mark;
1802         rth->fl.fl4_src = saddr;
1803         rth->rt_src     = saddr;
1804         rth->rt_gateway = daddr;
1805         rth->rt_iif     =
1806                 rth->fl.iif     = in_dev->dev->ifindex;
1807         rth->u.dst.dev  = (out_dev)->dev;
1808         dev_hold(rth->u.dst.dev);
1809         rth->idev       = in_dev_get(rth->u.dst.dev);
1810         rth->fl.oif     = 0;
1811         rth->rt_spec_dst= spec_dst;
1812
1813         rth->u.dst.input = ip_forward;
1814         rth->u.dst.output = ip_output;
1815
1816         rt_set_nexthop(rth, res, itag);
1817
1818         rth->rt_flags = flags;
1819
1820         *result = rth;
1821         err = 0;
1822  cleanup:
1823         /* release the working reference to the output device */
1824         in_dev_put(out_dev);
1825         return err;
1826 }
1827
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829                                    struct fib_result* res,
1830                                    const struct flowi *fl,
1831                                    struct in_device *in_dev,
1832                                    __be32 daddr, __be32 saddr, u32 tos)
1833 {
1834         struct rtable* rth = NULL;
1835         int err;
1836         unsigned hash;
1837
1838 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1839         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1840                 fib_select_multipath(fl, res);
1841 #endif
1842
1843         /* create a routing cache entry */
1844         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1845         if (err)
1846                 return err;
1847
1848         /* put it into the cache */
1849         hash = rt_hash(daddr, saddr, fl->iif);
1850         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1851 }
1852
1853 /*
1854  *      NOTE. We drop all the packets that has local source
1855  *      addresses, because every properly looped back packet
1856  *      must have correct destination already attached by output routine.
1857  *
1858  *      Such approach solves two big problems:
1859  *      1. Not simplex devices are handled properly.
1860  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1861  */
1862
1863 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1864                                u8 tos, struct net_device *dev)
1865 {
1866         struct fib_result res;
1867         struct in_device *in_dev = in_dev_get(dev);
1868         struct flowi fl = { .nl_u = { .ip4_u =
1869                                       { .daddr = daddr,
1870                                         .saddr = saddr,
1871                                         .tos = tos,
1872                                         .scope = RT_SCOPE_UNIVERSE,
1873                                       } },
1874                             .mark = skb->mark,
1875                             .iif = dev->ifindex };
1876         unsigned        flags = 0;
1877         u32             itag = 0;
1878         struct rtable * rth;
1879         unsigned        hash;
1880         __be32          spec_dst;
1881         int             err = -EINVAL;
1882         int             free_res = 0;
1883
1884         /* IP on this device is disabled. */
1885
1886         if (!in_dev)
1887                 goto out;
1888
1889         /* Check for the most weird martians, which can be not detected
1890            by fib_lookup.
1891          */
1892
1893         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1894                 goto martian_source;
1895
1896         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1897                 goto brd_input;
1898
1899         /* Accept zero addresses only to limited broadcast;
1900          * I even do not know to fix it or not. Waiting for complains :-)
1901          */
1902         if (ZERONET(saddr))
1903                 goto martian_source;
1904
1905         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1906                 goto martian_destination;
1907
1908         /*
1909          *      Now we are ready to route packet.
1910          */
1911         if ((err = fib_lookup(&fl, &res)) != 0) {
1912                 if (!IN_DEV_FORWARD(in_dev))
1913                         goto e_hostunreach;
1914                 goto no_route;
1915         }
1916         free_res = 1;
1917
1918         RT_CACHE_STAT_INC(in_slow_tot);
1919
1920         if (res.type == RTN_BROADCAST)
1921                 goto brd_input;
1922
1923         if (res.type == RTN_LOCAL) {
1924                 int result;
1925                 result = fib_validate_source(saddr, daddr, tos,
1926                                              init_net.loopback_dev->ifindex,
1927                                              dev, &spec_dst, &itag);
1928                 if (result < 0)
1929                         goto martian_source;
1930                 if (result)
1931                         flags |= RTCF_DIRECTSRC;
1932                 spec_dst = daddr;
1933                 goto local_input;
1934         }
1935
1936         if (!IN_DEV_FORWARD(in_dev))
1937                 goto e_hostunreach;
1938         if (res.type != RTN_UNICAST)
1939                 goto martian_destination;
1940
1941         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1942 done:
1943         in_dev_put(in_dev);
1944         if (free_res)
1945                 fib_res_put(&res);
1946 out:    return err;
1947
1948 brd_input:
1949         if (skb->protocol != htons(ETH_P_IP))
1950                 goto e_inval;
1951
1952         if (ZERONET(saddr))
1953                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1954         else {
1955                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1956                                           &itag);
1957                 if (err < 0)
1958                         goto martian_source;
1959                 if (err)
1960                         flags |= RTCF_DIRECTSRC;
1961         }
1962         flags |= RTCF_BROADCAST;
1963         res.type = RTN_BROADCAST;
1964         RT_CACHE_STAT_INC(in_brd);
1965
1966 local_input:
1967         rth = dst_alloc(&ipv4_dst_ops);
1968         if (!rth)
1969                 goto e_nobufs;
1970
1971         rth->u.dst.output= ip_rt_bug;
1972
1973         atomic_set(&rth->u.dst.__refcnt, 1);
1974         rth->u.dst.flags= DST_HOST;
1975         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1976                 rth->u.dst.flags |= DST_NOPOLICY;
1977         rth->fl.fl4_dst = daddr;
1978         rth->rt_dst     = daddr;
1979         rth->fl.fl4_tos = tos;
1980         rth->fl.mark    = skb->mark;
1981         rth->fl.fl4_src = saddr;
1982         rth->rt_src     = saddr;
1983 #ifdef CONFIG_NET_CLS_ROUTE
1984         rth->u.dst.tclassid = itag;
1985 #endif
1986         rth->rt_iif     =
1987         rth->fl.iif     = dev->ifindex;
1988         rth->u.dst.dev  = init_net.loopback_dev;
1989         dev_hold(rth->u.dst.dev);
1990         rth->idev       = in_dev_get(rth->u.dst.dev);
1991         rth->rt_gateway = daddr;
1992         rth->rt_spec_dst= spec_dst;
1993         rth->u.dst.input= ip_local_deliver;
1994         rth->rt_flags   = flags|RTCF_LOCAL;
1995         if (res.type == RTN_UNREACHABLE) {
1996                 rth->u.dst.input= ip_error;
1997                 rth->u.dst.error= -err;
1998                 rth->rt_flags   &= ~RTCF_LOCAL;
1999         }
2000         rth->rt_type    = res.type;
2001         hash = rt_hash(daddr, saddr, fl.iif);
2002         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2003         goto done;
2004
2005 no_route:
2006         RT_CACHE_STAT_INC(in_no_route);
2007         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2008         res.type = RTN_UNREACHABLE;
2009         if (err == -ESRCH)
2010                 err = -ENETUNREACH;
2011         goto local_input;
2012
2013         /*
2014          *      Do not cache martian addresses: they should be logged (RFC1812)
2015          */
2016 martian_destination:
2017         RT_CACHE_STAT_INC(in_martian_dst);
2018 #ifdef CONFIG_IP_ROUTE_VERBOSE
2019         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2020                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2021                         "%u.%u.%u.%u, dev %s\n",
2022                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2023 #endif
2024
2025 e_hostunreach:
2026         err = -EHOSTUNREACH;
2027         goto done;
2028
2029 e_inval:
2030         err = -EINVAL;
2031         goto done;
2032
2033 e_nobufs:
2034         err = -ENOBUFS;
2035         goto done;
2036
2037 martian_source:
2038         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2039         goto e_inval;
2040 }
2041
2042 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2043                    u8 tos, struct net_device *dev)
2044 {
2045         struct rtable * rth;
2046         unsigned        hash;
2047         int iif = dev->ifindex;
2048
2049         tos &= IPTOS_RT_MASK;
2050         hash = rt_hash(daddr, saddr, iif);
2051
2052         rcu_read_lock();
2053         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2054              rth = rcu_dereference(rth->u.dst.rt_next)) {
2055                 if (rth->fl.fl4_dst == daddr &&
2056                     rth->fl.fl4_src == saddr &&
2057                     rth->fl.iif == iif &&
2058                     rth->fl.oif == 0 &&
2059                     rth->fl.mark == skb->mark &&
2060                     rth->fl.fl4_tos == tos) {
2061                         dst_use(&rth->u.dst, jiffies);
2062                         RT_CACHE_STAT_INC(in_hit);
2063                         rcu_read_unlock();
2064                         skb->dst = (struct dst_entry*)rth;
2065                         return 0;
2066                 }
2067                 RT_CACHE_STAT_INC(in_hlist_search);
2068         }
2069         rcu_read_unlock();
2070
2071         /* Multicast recognition logic is moved from route cache to here.
2072            The problem was that too many Ethernet cards have broken/missing
2073            hardware multicast filters :-( As result the host on multicasting
2074            network acquires a lot of useless route cache entries, sort of
2075            SDR messages from all the world. Now we try to get rid of them.
2076            Really, provided software IP multicast filter is organized
2077            reasonably (at least, hashed), it does not result in a slowdown
2078            comparing with route cache reject entries.
2079            Note, that multicast routers are not affected, because
2080            route cache entry is created eventually.
2081          */
2082         if (MULTICAST(daddr)) {
2083                 struct in_device *in_dev;
2084
2085                 rcu_read_lock();
2086                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2087                         int our = ip_check_mc(in_dev, daddr, saddr,
2088                                 ip_hdr(skb)->protocol);
2089                         if (our
2090 #ifdef CONFIG_IP_MROUTE
2091                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2092 #endif
2093                             ) {
2094                                 rcu_read_unlock();
2095                                 return ip_route_input_mc(skb, daddr, saddr,
2096                                                          tos, dev, our);
2097                         }
2098                 }
2099                 rcu_read_unlock();
2100                 return -EINVAL;
2101         }
2102         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2103 }
2104
2105 static inline int __mkroute_output(struct rtable **result,
2106                                    struct fib_result* res,
2107                                    const struct flowi *fl,
2108                                    const struct flowi *oldflp,
2109                                    struct net_device *dev_out,
2110                                    unsigned flags)
2111 {
2112         struct rtable *rth;
2113         struct in_device *in_dev;
2114         u32 tos = RT_FL_TOS(oldflp);
2115         int err = 0;
2116
2117         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2118                 return -EINVAL;
2119
2120         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2121                 res->type = RTN_BROADCAST;
2122         else if (MULTICAST(fl->fl4_dst))
2123                 res->type = RTN_MULTICAST;
2124         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2125                 return -EINVAL;
2126
2127         if (dev_out->flags & IFF_LOOPBACK)
2128                 flags |= RTCF_LOCAL;
2129
2130         /* get work reference to inet device */
2131         in_dev = in_dev_get(dev_out);
2132         if (!in_dev)
2133                 return -EINVAL;
2134
2135         if (res->type == RTN_BROADCAST) {
2136                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2137                 if (res->fi) {
2138                         fib_info_put(res->fi);
2139                         res->fi = NULL;
2140                 }
2141         } else if (res->type == RTN_MULTICAST) {
2142                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2143                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2144                                  oldflp->proto))
2145                         flags &= ~RTCF_LOCAL;
2146                 /* If multicast route do not exist use
2147                    default one, but do not gateway in this case.
2148                    Yes, it is hack.
2149                  */
2150                 if (res->fi && res->prefixlen < 4) {
2151                         fib_info_put(res->fi);
2152                         res->fi = NULL;
2153                 }
2154         }
2155
2156
2157         rth = dst_alloc(&ipv4_dst_ops);
2158         if (!rth) {
2159                 err = -ENOBUFS;
2160                 goto cleanup;
2161         }
2162
2163         atomic_set(&rth->u.dst.__refcnt, 1);
2164         rth->u.dst.flags= DST_HOST;
2165         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2166                 rth->u.dst.flags |= DST_NOXFRM;
2167         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2168                 rth->u.dst.flags |= DST_NOPOLICY;
2169
2170         rth->fl.fl4_dst = oldflp->fl4_dst;
2171         rth->fl.fl4_tos = tos;
2172         rth->fl.fl4_src = oldflp->fl4_src;
2173         rth->fl.oif     = oldflp->oif;
2174         rth->fl.mark    = oldflp->mark;
2175         rth->rt_dst     = fl->fl4_dst;
2176         rth->rt_src     = fl->fl4_src;
2177         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2178         /* get references to the devices that are to be hold by the routing
2179            cache entry */
2180         rth->u.dst.dev  = dev_out;
2181         dev_hold(dev_out);
2182         rth->idev       = in_dev_get(dev_out);
2183         rth->rt_gateway = fl->fl4_dst;
2184         rth->rt_spec_dst= fl->fl4_src;
2185
2186         rth->u.dst.output=ip_output;
2187
2188         RT_CACHE_STAT_INC(out_slow_tot);
2189
2190         if (flags & RTCF_LOCAL) {
2191                 rth->u.dst.input = ip_local_deliver;
2192                 rth->rt_spec_dst = fl->fl4_dst;
2193         }
2194         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2195                 rth->rt_spec_dst = fl->fl4_src;
2196                 if (flags & RTCF_LOCAL &&
2197                     !(dev_out->flags & IFF_LOOPBACK)) {
2198                         rth->u.dst.output = ip_mc_output;
2199                         RT_CACHE_STAT_INC(out_slow_mc);
2200                 }
2201 #ifdef CONFIG_IP_MROUTE
2202                 if (res->type == RTN_MULTICAST) {
2203                         if (IN_DEV_MFORWARD(in_dev) &&
2204                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2205                                 rth->u.dst.input = ip_mr_input;
2206                                 rth->u.dst.output = ip_mc_output;
2207                         }
2208                 }
2209 #endif
2210         }
2211
2212         rt_set_nexthop(rth, res, 0);
2213
2214         rth->rt_flags = flags;
2215
2216         *result = rth;
2217  cleanup:
2218         /* release work reference to inet device */
2219         in_dev_put(in_dev);
2220
2221         return err;
2222 }
2223
2224 static inline int ip_mkroute_output(struct rtable **rp,
2225                                     struct fib_result* res,
2226                                     const struct flowi *fl,
2227                                     const struct flowi *oldflp,
2228                                     struct net_device *dev_out,
2229                                     unsigned flags)
2230 {
2231         struct rtable *rth = NULL;
2232         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2233         unsigned hash;
2234         if (err == 0) {
2235                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2236                 err = rt_intern_hash(hash, rth, rp);
2237         }
2238
2239         return err;
2240 }
2241
2242 /*
2243  * Major route resolver routine.
2244  */
2245
2246 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2247 {
2248         u32 tos = RT_FL_TOS(oldflp);
2249         struct flowi fl = { .nl_u = { .ip4_u =
2250                                       { .daddr = oldflp->fl4_dst,
2251                                         .saddr = oldflp->fl4_src,
2252                                         .tos = tos & IPTOS_RT_MASK,
2253                                         .scope = ((tos & RTO_ONLINK) ?
2254                                                   RT_SCOPE_LINK :
2255                                                   RT_SCOPE_UNIVERSE),
2256                                       } },
2257                             .mark = oldflp->mark,
2258                             .iif = init_net.loopback_dev->ifindex,
2259                             .oif = oldflp->oif };
2260         struct fib_result res;
2261         unsigned flags = 0;
2262         struct net_device *dev_out = NULL;
2263         int free_res = 0;
2264         int err;
2265
2266
2267         res.fi          = NULL;
2268 #ifdef CONFIG_IP_MULTIPLE_TABLES
2269         res.r           = NULL;
2270 #endif
2271
2272         if (oldflp->fl4_src) {
2273                 err = -EINVAL;
2274                 if (MULTICAST(oldflp->fl4_src) ||
2275                     BADCLASS(oldflp->fl4_src) ||
2276                     ZERONET(oldflp->fl4_src))
2277                         goto out;
2278
2279                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2280                 dev_out = ip_dev_find(oldflp->fl4_src);
2281                 if (dev_out == NULL)
2282                         goto out;
2283
2284                 /* I removed check for oif == dev_out->oif here.
2285                    It was wrong for two reasons:
2286                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2287                       assigned to multiple interfaces.
2288                    2. Moreover, we are allowed to send packets with saddr
2289                       of another iface. --ANK
2290                  */
2291
2292                 if (oldflp->oif == 0
2293                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2294                         /* Special hack: user can direct multicasts
2295                            and limited broadcast via necessary interface
2296                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2297                            This hack is not just for fun, it allows
2298                            vic,vat and friends to work.
2299                            They bind socket to loopback, set ttl to zero
2300                            and expect that it will work.
2301                            From the viewpoint of routing cache they are broken,
2302                            because we are not allowed to build multicast path
2303                            with loopback source addr (look, routing cache
2304                            cannot know, that ttl is zero, so that packet
2305                            will not leave this host and route is valid).
2306                            Luckily, this hack is good workaround.
2307                          */
2308
2309                         fl.oif = dev_out->ifindex;
2310                         goto make_route;
2311                 }
2312                 if (dev_out)
2313                         dev_put(dev_out);
2314                 dev_out = NULL;
2315         }
2316
2317
2318         if (oldflp->oif) {
2319                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2320                 err = -ENODEV;
2321                 if (dev_out == NULL)
2322                         goto out;
2323
2324                 /* RACE: Check return value of inet_select_addr instead. */
2325                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2326                         dev_put(dev_out);
2327                         goto out;       /* Wrong error code */
2328                 }
2329
2330                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2331                         if (!fl.fl4_src)
2332                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2333                                                               RT_SCOPE_LINK);
2334                         goto make_route;
2335                 }
2336                 if (!fl.fl4_src) {
2337                         if (MULTICAST(oldflp->fl4_dst))
2338                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2339                                                               fl.fl4_scope);
2340                         else if (!oldflp->fl4_dst)
2341                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2342                                                               RT_SCOPE_HOST);
2343                 }
2344         }
2345
2346         if (!fl.fl4_dst) {
2347                 fl.fl4_dst = fl.fl4_src;
2348                 if (!fl.fl4_dst)
2349                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2350                 if (dev_out)
2351                         dev_put(dev_out);
2352                 dev_out = init_net.loopback_dev;
2353                 dev_hold(dev_out);
2354                 fl.oif = init_net.loopback_dev->ifindex;
2355                 res.type = RTN_LOCAL;
2356                 flags |= RTCF_LOCAL;
2357                 goto make_route;
2358         }
2359
2360         if (fib_lookup(&fl, &res)) {
2361                 res.fi = NULL;
2362                 if (oldflp->oif) {
2363                         /* Apparently, routing tables are wrong. Assume,
2364                            that the destination is on link.
2365
2366                            WHY? DW.
2367                            Because we are allowed to send to iface
2368                            even if it has NO routes and NO assigned
2369                            addresses. When oif is specified, routing
2370                            tables are looked up with only one purpose:
2371                            to catch if destination is gatewayed, rather than
2372                            direct. Moreover, if MSG_DONTROUTE is set,
2373                            we send packet, ignoring both routing tables
2374                            and ifaddr state. --ANK
2375
2376
2377                            We could make it even if oif is unknown,
2378                            likely IPv6, but we do not.
2379                          */
2380
2381                         if (fl.fl4_src == 0)
2382                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2383                                                               RT_SCOPE_LINK);
2384                         res.type = RTN_UNICAST;
2385                         goto make_route;
2386                 }
2387                 if (dev_out)
2388                         dev_put(dev_out);
2389                 err = -ENETUNREACH;
2390                 goto out;
2391         }
2392         free_res = 1;
2393
2394         if (res.type == RTN_LOCAL) {
2395                 if (!fl.fl4_src)
2396                         fl.fl4_src = fl.fl4_dst;
2397                 if (dev_out)
2398                         dev_put(dev_out);
2399                 dev_out = init_net.loopback_dev;
2400                 dev_hold(dev_out);
2401                 fl.oif = dev_out->ifindex;
2402                 if (res.fi)
2403                         fib_info_put(res.fi);
2404                 res.fi = NULL;
2405                 flags |= RTCF_LOCAL;
2406                 goto make_route;
2407         }
2408
2409 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2410         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2411                 fib_select_multipath(&fl, &res);
2412         else
2413 #endif
2414         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2415                 fib_select_default(&fl, &res);
2416
2417         if (!fl.fl4_src)
2418                 fl.fl4_src = FIB_RES_PREFSRC(res);
2419
2420         if (dev_out)
2421                 dev_put(dev_out);
2422         dev_out = FIB_RES_DEV(res);
2423         dev_hold(dev_out);
2424         fl.oif = dev_out->ifindex;
2425
2426
2427 make_route:
2428         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2429
2430
2431         if (free_res)
2432                 fib_res_put(&res);
2433         if (dev_out)
2434                 dev_put(dev_out);
2435 out:    return err;
2436 }
2437
2438 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2439 {
2440         unsigned hash;
2441         struct rtable *rth;
2442
2443         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2444
2445         rcu_read_lock_bh();
2446         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2448                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2449                     rth->fl.fl4_src == flp->fl4_src &&
2450                     rth->fl.iif == 0 &&
2451                     rth->fl.oif == flp->oif &&
2452                     rth->fl.mark == flp->mark &&
2453                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2454                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2455                         dst_use(&rth->u.dst, jiffies);
2456                         RT_CACHE_STAT_INC(out_hit);
2457                         rcu_read_unlock_bh();
2458                         *rp = rth;
2459                         return 0;
2460                 }
2461                 RT_CACHE_STAT_INC(out_hlist_search);
2462         }
2463         rcu_read_unlock_bh();
2464
2465         return ip_route_output_slow(rp, flp);
2466 }
2467
2468 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2469
2470 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2471 {
2472 }
2473
2474 static struct dst_ops ipv4_dst_blackhole_ops = {
2475         .family                 =       AF_INET,
2476         .protocol               =       __constant_htons(ETH_P_IP),
2477         .destroy                =       ipv4_dst_destroy,
2478         .check                  =       ipv4_dst_check,
2479         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2480         .entry_size             =       sizeof(struct rtable),
2481 };
2482
2483
2484 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2485 {
2486         struct rtable *ort = *rp;
2487         struct rtable *rt = (struct rtable *)
2488                 dst_alloc(&ipv4_dst_blackhole_ops);
2489
2490         if (rt) {
2491                 struct dst_entry *new = &rt->u.dst;
2492
2493                 atomic_set(&new->__refcnt, 1);
2494                 new->__use = 1;
2495                 new->input = dst_discard;
2496                 new->output = dst_discard;
2497                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2498
2499                 new->dev = ort->u.dst.dev;
2500                 if (new->dev)
2501                         dev_hold(new->dev);
2502
2503                 rt->fl = ort->fl;
2504
2505                 rt->idev = ort->idev;
2506                 if (rt->idev)
2507                         in_dev_hold(rt->idev);
2508                 rt->rt_flags = ort->rt_flags;
2509                 rt->rt_type = ort->rt_type;
2510                 rt->rt_dst = ort->rt_dst;
2511                 rt->rt_src = ort->rt_src;
2512                 rt->rt_iif = ort->rt_iif;
2513                 rt->rt_gateway = ort->rt_gateway;
2514                 rt->rt_spec_dst = ort->rt_spec_dst;
2515                 rt->peer = ort->peer;
2516                 if (rt->peer)
2517                         atomic_inc(&rt->peer->refcnt);
2518
2519                 dst_free(new);
2520         }
2521
2522         dst_release(&(*rp)->u.dst);
2523         *rp = rt;
2524         return (rt ? 0 : -ENOMEM);
2525 }
2526
2527 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2528 {
2529         int err;
2530
2531         if ((err = __ip_route_output_key(rp, flp)) != 0)
2532                 return err;
2533
2534         if (flp->proto) {
2535                 if (!flp->fl4_src)
2536                         flp->fl4_src = (*rp)->rt_src;
2537                 if (!flp->fl4_dst)
2538                         flp->fl4_dst = (*rp)->rt_dst;
2539                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2540                                     flags ? XFRM_LOOKUP_WAIT : 0);
2541                 if (err == -EREMOTE)
2542                         err = ipv4_dst_blackhole(rp, flp, sk);
2543
2544                 return err;
2545         }
2546
2547         return 0;
2548 }
2549
2550 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2551
2552 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2553 {
2554         return ip_route_output_flow(rp, flp, NULL, 0);
2555 }
2556
2557 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2558                         int nowait, unsigned int flags)
2559 {
2560         struct rtable *rt = (struct rtable*)skb->dst;
2561         struct rtmsg *r;
2562         struct nlmsghdr *nlh;
2563         long expires;
2564         u32 id = 0, ts = 0, tsage = 0, error;
2565
2566         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2567         if (nlh == NULL)
2568                 return -EMSGSIZE;
2569
2570         r = nlmsg_data(nlh);
2571         r->rtm_family    = AF_INET;
2572         r->rtm_dst_len  = 32;
2573         r->rtm_src_len  = 0;
2574         r->rtm_tos      = rt->fl.fl4_tos;
2575         r->rtm_table    = RT_TABLE_MAIN;
2576         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2577         r->rtm_type     = rt->rt_type;
2578         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2579         r->rtm_protocol = RTPROT_UNSPEC;
2580         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2581         if (rt->rt_flags & RTCF_NOTIFY)
2582                 r->rtm_flags |= RTM_F_NOTIFY;
2583
2584         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2585
2586         if (rt->fl.fl4_src) {
2587                 r->rtm_src_len = 32;
2588                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2589         }
2590         if (rt->u.dst.dev)
2591                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2592 #ifdef CONFIG_NET_CLS_ROUTE
2593         if (rt->u.dst.tclassid)
2594                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2595 #endif
2596         if (rt->fl.iif)
2597                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2598         else if (rt->rt_src != rt->fl.fl4_src)
2599                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2600
2601         if (rt->rt_dst != rt->rt_gateway)
2602                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2603
2604         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2605                 goto nla_put_failure;
2606
2607         error = rt->u.dst.error;
2608         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2609         if (rt->peer) {
2610                 id = rt->peer->ip_id_count;
2611                 if (rt->peer->tcp_ts_stamp) {
2612                         ts = rt->peer->tcp_ts;
2613                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2614                 }
2615         }
2616
2617         if (rt->fl.iif) {
2618 #ifdef CONFIG_IP_MROUTE
2619                 __be32 dst = rt->rt_dst;
2620
2621                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2622                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2623                         int err = ipmr_get_route(skb, r, nowait);
2624                         if (err <= 0) {
2625                                 if (!nowait) {
2626                                         if (err == 0)
2627                                                 return 0;
2628                                         goto nla_put_failure;
2629                                 } else {
2630                                         if (err == -EMSGSIZE)
2631                                                 goto nla_put_failure;
2632                                         error = err;
2633                                 }
2634                         }
2635                 } else
2636 #endif
2637                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2638         }
2639
2640         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2641                                expires, error) < 0)
2642                 goto nla_put_failure;
2643
2644         return nlmsg_end(skb, nlh);
2645
2646 nla_put_failure:
2647         nlmsg_cancel(skb, nlh);
2648         return -EMSGSIZE;
2649 }
2650
2651 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2652 {
2653         struct net *net = in_skb->sk->sk_net;
2654         struct rtmsg *rtm;
2655         struct nlattr *tb[RTA_MAX+1];
2656         struct rtable *rt = NULL;
2657         __be32 dst = 0;
2658         __be32 src = 0;
2659         u32 iif;
2660         int err;
2661         struct sk_buff *skb;
2662
2663         if (net != &init_net)
2664                 return -EINVAL;
2665
2666         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2667         if (err < 0)
2668                 goto errout;
2669
2670         rtm = nlmsg_data(nlh);
2671
2672         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2673         if (skb == NULL) {
2674                 err = -ENOBUFS;
2675                 goto errout;
2676         }
2677
2678         /* Reserve room for dummy headers, this skb can pass
2679            through good chunk of routing engine.
2680          */
2681         skb_reset_mac_header(skb);
2682         skb_reset_network_header(skb);
2683
2684         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2685         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2686         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2687
2688         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2689         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2690         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2691
2692         if (iif) {
2693                 struct net_device *dev;
2694
2695                 dev = __dev_get_by_index(&init_net, iif);
2696                 if (dev == NULL) {
2697                         err = -ENODEV;
2698                         goto errout_free;
2699                 }
2700
2701                 skb->protocol   = htons(ETH_P_IP);
2702                 skb->dev        = dev;
2703                 local_bh_disable();
2704                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2705                 local_bh_enable();
2706
2707                 rt = (struct rtable*) skb->dst;
2708                 if (err == 0 && rt->u.dst.error)
2709                         err = -rt->u.dst.error;
2710         } else {
2711                 struct flowi fl = {
2712                         .nl_u = {
2713                                 .ip4_u = {
2714                                         .daddr = dst,
2715                                         .saddr = src,
2716                                         .tos = rtm->rtm_tos,
2717                                 },
2718                         },
2719                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2720                 };
2721                 err = ip_route_output_key(&rt, &fl);
2722         }
2723
2724         if (err)
2725                 goto errout_free;
2726
2727         skb->dst = &rt->u.dst;
2728         if (rtm->rtm_flags & RTM_F_NOTIFY)
2729                 rt->rt_flags |= RTCF_NOTIFY;
2730
2731         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2732                                 RTM_NEWROUTE, 0, 0);
2733         if (err <= 0)
2734                 goto errout_free;
2735
2736         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2737 errout:
2738         return err;
2739
2740 errout_free:
2741         kfree_skb(skb);
2742         goto errout;
2743 }
2744
2745 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2746 {
2747         struct rtable *rt;
2748         int h, s_h;
2749         int idx, s_idx;
2750
2751         s_h = cb->args[0];
2752         if (s_h < 0)
2753                 s_h = 0;
2754         s_idx = idx = cb->args[1];
2755         for (h = s_h; h <= rt_hash_mask; h++) {
2756                 rcu_read_lock_bh();
2757                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2758                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2759                         if (idx < s_idx)
2760                                 continue;
2761                         skb->dst = dst_clone(&rt->u.dst);
2762                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2763                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2764                                          1, NLM_F_MULTI) <= 0) {
2765                                 dst_release(xchg(&skb->dst, NULL));
2766                                 rcu_read_unlock_bh();
2767                                 goto done;
2768                         }
2769                         dst_release(xchg(&skb->dst, NULL));
2770                 }
2771                 rcu_read_unlock_bh();
2772                 s_idx = 0;
2773         }
2774
2775 done:
2776         cb->args[0] = h;
2777         cb->args[1] = idx;
2778         return skb->len;
2779 }
2780
/*
 * Multicast event hook: calls rt_cache_flush() with a delay argument of 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(/* delay */ 0);
}
2785
2786 #ifdef CONFIG_SYSCTL
2787 static int flush_delay;
2788
2789 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2790                                         struct file *filp, void __user *buffer,
2791                                         size_t *lenp, loff_t *ppos)
2792 {
2793         if (write) {
2794                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2795                 rt_cache_flush(flush_delay);
2796                 return 0;
2797         }
2798
2799         return -EINVAL;
2800 }
2801
2802 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2803                                                 int __user *name,
2804                                                 int nlen,
2805                                                 void __user *oldval,
2806                                                 size_t __user *oldlenp,
2807                                                 void __user *newval,
2808                                                 size_t newlen)
2809 {
2810         int delay;
2811         if (newlen != sizeof(int))
2812                 return -EINVAL;
2813         if (get_user(delay, (int __user *)newval))
2814                 return -EFAULT;
2815         rt_cache_flush(delay);
2816         return 0;
2817 }
2818
2819 ctl_table ipv4_route_table[] = {
2820         {
2821                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2822                 .procname       = "flush",
2823                 .data           = &flush_delay,
2824                 .maxlen         = sizeof(int),
2825                 .mode           = 0200,
2826                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2827                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2828         },
2829         {
2830                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2831                 .procname       = "min_delay",
2832                 .data           = &ip_rt_min_delay,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = &proc_dointvec_jiffies,
2836                 .strategy       = &sysctl_jiffies,
2837         },
2838         {
2839                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2840                 .procname       = "max_delay",
2841                 .data           = &ip_rt_max_delay,
2842                 .maxlen         = sizeof(int),
2843                 .mode           = 0644,
2844                 .proc_handler   = &proc_dointvec_jiffies,
2845                 .strategy       = &sysctl_jiffies,
2846         },
2847         {
2848                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2849                 .procname       = "gc_thresh",
2850                 .data           = &ipv4_dst_ops.gc_thresh,
2851                 .maxlen         = sizeof(int),
2852                 .mode           = 0644,
2853                 .proc_handler   = &proc_dointvec,
2854         },
2855         {
2856                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2857                 .procname       = "max_size",
2858                 .data           = &ip_rt_max_size,
2859                 .maxlen         = sizeof(int),
2860                 .mode           = 0644,
2861                 .proc_handler   = &proc_dointvec,
2862         },
2863         {
2864                 /*  Deprecated. Use gc_min_interval_ms */
2865
2866                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2867                 .procname       = "gc_min_interval",
2868                 .data           = &ip_rt_gc_min_interval,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = &proc_dointvec_jiffies,
2872                 .strategy       = &sysctl_jiffies,
2873         },
2874         {
2875                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2876                 .procname       = "gc_min_interval_ms",
2877                 .data           = &ip_rt_gc_min_interval,
2878                 .maxlen         = sizeof(int),
2879                 .mode           = 0644,
2880                 .proc_handler   = &proc_dointvec_ms_jiffies,
2881                 .strategy       = &sysctl_ms_jiffies,
2882         },
2883         {
2884                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2885                 .procname       = "gc_timeout",
2886                 .data           = &ip_rt_gc_timeout,
2887                 .maxlen         = sizeof(int),
2888                 .mode           = 0644,
2889                 .proc_handler   = &proc_dointvec_jiffies,
2890                 .strategy       = &sysctl_jiffies,
2891         },
2892         {
2893                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2894                 .procname       = "gc_interval",
2895                 .data           = &ip_rt_gc_interval,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0644,
2898                 .proc_handler   = &proc_dointvec_jiffies,
2899                 .strategy       = &sysctl_jiffies,
2900         },
2901         {
2902                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2903                 .procname       = "redirect_load",
2904                 .data           = &ip_rt_redirect_load,
2905                 .maxlen         = sizeof(int),
2906                 .mode           = 0644,
2907                 .proc_handler   = &proc_dointvec,
2908         },
2909         {
2910                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2911                 .procname       = "redirect_number",
2912                 .data           = &ip_rt_redirect_number,
2913                 .maxlen         = sizeof(int),
2914                 .mode           = 0644,
2915                 .proc_handler   = &proc_dointvec,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2919                 .procname       = "redirect_silence",
2920                 .data           = &ip_rt_redirect_silence,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec,
2924         },
2925         {
2926                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2927                 .procname       = "error_cost",
2928                 .data           = &ip_rt_error_cost,
2929                 .maxlen         = sizeof(int),
2930                 .mode           = 0644,
2931                 .proc_handler   = &proc_dointvec,
2932         },
2933         {
2934                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2935                 .procname       = "error_burst",
2936                 .data           = &ip_rt_error_burst,
2937                 .maxlen         = sizeof(int),
2938                 .mode           = 0644,
2939                 .proc_handler   = &proc_dointvec,
2940         },
2941         {
2942                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2943                 .procname       = "gc_elasticity",
2944                 .data           = &ip_rt_gc_elasticity,
2945                 .maxlen         = sizeof(int),
2946                 .mode           = 0644,
2947                 .proc_handler   = &proc_dointvec,
2948         },
2949         {
2950                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2951                 .procname       = "mtu_expires",
2952                 .data           = &ip_rt_mtu_expires,
2953                 .maxlen         = sizeof(int),
2954                 .mode           = 0644,
2955                 .proc_handler   = &proc_dointvec_jiffies,
2956                 .strategy       = &sysctl_jiffies,
2957         },
2958         {
2959                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2960                 .procname       = "min_pmtu",
2961                 .data           = &ip_rt_min_pmtu,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = &proc_dointvec,
2965         },
2966         {
2967                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2968                 .procname       = "min_adv_mss",
2969                 .data           = &ip_rt_min_advmss,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = &proc_dointvec,
2973         },
2974         {
2975                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2976                 .procname       = "secret_interval",
2977                 .data           = &ip_rt_secret_interval,
2978                 .maxlen         = sizeof(int),
2979                 .mode           = 0644,
2980                 .proc_handler   = &proc_dointvec_jiffies,
2981                 .strategy       = &sysctl_jiffies,
2982         },
2983         { .ctl_name = 0 }
2984 };
2985 #endif
2986
#ifdef CONFIG_NET_CLS_ROUTE
/* Route accounting area; allocated with __alloc_percpu() in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
2990
2991 static __initdata unsigned long rhash_entries;
2992 static int __init set_rhash_entries(char *str)
2993 {
2994         if (!str)
2995                 return 0;
2996         rhash_entries = simple_strtoul(str, &str, 0);
2997         return 1;
2998 }
2999 __setup("rhash_entries=", set_rhash_entries);
3000
3001 int __init ip_rt_init(void)
3002 {
3003         int rc = 0;
3004
3005         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3006                              (jiffies ^ (jiffies >> 7)));
3007
3008 #ifdef CONFIG_NET_CLS_ROUTE
3009         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3010         if (!ip_rt_acct)
3011                 panic("IP: failed to allocate ip_rt_acct\n");
3012 #endif
3013
3014         ipv4_dst_ops.kmem_cachep =
3015                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3016                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3017
3018         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3019
3020         rt_hash_table = (struct rt_hash_bucket *)
3021                 alloc_large_system_hash("IP route cache",
3022                                         sizeof(struct rt_hash_bucket),
3023                                         rhash_entries,
3024                                         (num_physpages >= 128 * 1024) ?
3025                                         15 : 17,
3026                                         0,
3027                                         &rt_hash_log,
3028                                         &rt_hash_mask,
3029                                         0);
3030         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3031         rt_hash_lock_init();
3032
3033         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3034         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3035
3036         devinet_init();
3037         ip_fib_init();
3038
3039         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3040         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3041
3042         /* All the timers, started at system startup tend
3043            to synchronize. Perturb it a bit.
3044          */
3045         schedule_delayed_work(&expires_work,
3046                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3047
3048         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3049                 ip_rt_secret_interval;
3050         add_timer(&rt_secret_timer);
3051
3052         if (ip_rt_proc_init(&init_net))
3053                 printk(KERN_ERR "Unable to create route proc files\n");
3054 #ifdef CONFIG_XFRM
3055         xfrm_init();
3056         xfrm4_init();
3057 #endif
3058         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3059
3060         return rc;
3061 }
3062
3063 EXPORT_SYMBOL(__ip_select_ident);
3064 EXPORT_SYMBOL(ip_route_input);
3065 EXPORT_SYMBOL(ip_route_output_key);