[IPV4]: Remove bogus goto-s from ip_route_input_slow
[linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * RT_FL_TOS - extract the TOS bits used for routing from a flow key.
 * @oldflp: pointer to the flow whose fl4_tos field is read.
 *
 * Keeps only the IPTOS_RT_MASK bits plus the RTO_ONLINK flag.  The argument
 * is parenthesized so that any pointer expression can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_min_delay              = 2 * HZ;
120 static int ip_rt_max_delay              = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval            = 60 * HZ;
124 static int ip_rt_gc_min_interval        = HZ / 2;
125 static int ip_rt_redirect_number        = 9;
126 static int ip_rt_redirect_load          = HZ / 50;
127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost             = HZ;
129 static int ip_rt_error_burst            = 5 * HZ;
130 static int ip_rt_gc_elasticity          = 8;
131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
133 static int ip_rt_min_advmss             = 256;
134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136
137 #define RTprint(a...)   printk(KERN_DEBUG a)
138
139 static struct timer_list rt_flush_timer;
140 static void rt_check_expire(struct work_struct *work);
141 static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
142 static struct timer_list rt_secret_timer;
143
144 /*
145  *      Interface to generic destination cache.
146  */
147
148 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
149 static void              ipv4_dst_destroy(struct dst_entry *dst);
150 static void              ipv4_dst_ifdown(struct dst_entry *dst,
151                                          struct net_device *dev, int how);
152 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
153 static void              ipv4_link_failure(struct sk_buff *skb);
154 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
155 static int rt_garbage_collect(void);
156
157
158 static struct dst_ops ipv4_dst_ops = {
159         .family =               AF_INET,
160         .protocol =             __constant_htons(ETH_P_IP),
161         .gc =                   rt_garbage_collect,
162         .check =                ipv4_dst_check,
163         .destroy =              ipv4_dst_destroy,
164         .ifdown =               ipv4_dst_ifdown,
165         .negative_advice =      ipv4_negative_advice,
166         .link_failure =         ipv4_link_failure,
167         .update_pmtu =          ip_rt_update_pmtu,
168         .entry_size =           sizeof(struct rtable),
169 };
170
171 #define ECN_OR_COST(class)      TC_PRIO_##class
172
173 const __u8 ip_tos2prio[16] = {
174         TC_PRIO_BESTEFFORT,
175         ECN_OR_COST(FILLER),
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(BESTEFFORT),
178         TC_PRIO_BULK,
179         ECN_OR_COST(BULK),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_INTERACTIVE,
183         ECN_OR_COST(INTERACTIVE),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE_BULK,
187         ECN_OR_COST(INTERACTIVE_BULK),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK)
190 };
191
192
193 /*
194  * Route cache.
195  */
196
197 /* The locking scheme is rather straightforward:
198  *
199  * 1) Read-Copy Update protects the buckets of the central route hash.
200  * 2) Only writers remove entries, and they hold the lock
201  *    as they look at rtable reference counts.
202  * 3) Only readers acquire references to rtable entries,
203  *    they do so with atomic increments and with the
204  *    lock held.
205  */
206
/* One slot of the route cache hash table: head of a chain of rtables. */
struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep spinlock_t is quite big, so keep the size down there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;

/*
 * Address of the spinlock guarding hash slot @slot.  The expansion is fully
 * parenthesized so it can be used inside any larger expression.
 */
# define rt_hash_lock_addr(slot) \
                (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialize the lock table; panics when the allocation fails.
 * Wrapped in do { } while (0) so that "rt_hash_lock_init();" behaves as a
 * single statement, including inside an unbraced if/else.
 */
# define rt_hash_lock_init()    do { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                } while (0)
#else
/* No lock table in this configuration. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
246
247 static struct rt_hash_bucket    *rt_hash_table;
248 static unsigned                 rt_hash_mask;
249 static unsigned int             rt_hash_log;
250 static unsigned int             rt_hash_rnd;
251
252 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
253 #define RT_CACHE_STAT_INC(field) \
254         (__raw_get_cpu_var(rt_cache_stat).field++)
255
256 static int rt_intern_hash(unsigned hash, struct rtable *rth,
257                                 struct rtable **res);
258
259 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 {
261         return (jhash_2words(daddr, saddr, rt_hash_rnd)
262                 & rt_hash_mask);
263 }
264
265 #define rt_hash(daddr, saddr, idx) \
266         rt_hash_code((__force u32)(__be32)(daddr),\
267                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
268
269 #ifdef CONFIG_PROC_FS
270 struct rt_cache_iter_state {
271         int bucket;
272 };
273
274 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 {
276         struct rtable *r = NULL;
277         struct rt_cache_iter_state *st = seq->private;
278
279         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280                 rcu_read_lock_bh();
281                 r = rt_hash_table[st->bucket].chain;
282                 if (r)
283                         break;
284                 rcu_read_unlock_bh();
285         }
286         return r;
287 }
288
289 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 {
291         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292
293         r = r->u.dst.rt_next;
294         while (!r) {
295                 rcu_read_unlock_bh();
296                 if (--st->bucket < 0)
297                         break;
298                 rcu_read_lock_bh();
299                 r = rt_hash_table[st->bucket].chain;
300         }
301         return r;
302 }
303
304 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 {
306         struct rtable *r = rt_cache_get_first(seq);
307
308         if (r)
309                 while (pos && (r = rt_cache_get_next(seq, r)))
310                         --pos;
311         return pos ? NULL : r;
312 }
313
314 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 {
316         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 }
318
319 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 {
321         struct rtable *r = NULL;
322
323         if (v == SEQ_START_TOKEN)
324                 r = rt_cache_get_first(seq);
325         else
326                 r = rt_cache_get_next(seq, v);
327         ++*pos;
328         return r;
329 }
330
331 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 {
333         if (v && v != SEQ_START_TOKEN)
334                 rcu_read_unlock_bh();
335 }
336
337 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 {
339         if (v == SEQ_START_TOKEN)
340                 seq_printf(seq, "%-127s\n",
341                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
342                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343                            "HHUptod\tSpecDst");
344         else {
345                 struct rtable *r = v;
346                 char temp[256];
347
348                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
349                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
350                         r->u.dst.dev ? r->u.dst.dev->name : "*",
351                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
352                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
353                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
354                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
355                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
356                         dst_metric(&r->u.dst, RTAX_WINDOW),
357                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
358                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
359                         r->fl.fl4_tos,
360                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
361                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362                                        dev_queue_xmit) : 0,
363                         r->rt_spec_dst);
364                 seq_printf(seq, "%-127s\n", temp);
365         }
366         return 0;
367 }
368
369 static const struct seq_operations rt_cache_seq_ops = {
370         .start  = rt_cache_seq_start,
371         .next   = rt_cache_seq_next,
372         .stop   = rt_cache_seq_stop,
373         .show   = rt_cache_seq_show,
374 };
375
376 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 {
378         return seq_open_private(file, &rt_cache_seq_ops,
379                         sizeof(struct rt_cache_iter_state));
380 }
381
382 static const struct file_operations rt_cache_seq_fops = {
383         .owner   = THIS_MODULE,
384         .open    = rt_cache_seq_open,
385         .read    = seq_read,
386         .llseek  = seq_lseek,
387         .release = seq_release_private,
388 };
389
390
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393         int cpu;
394
395         if (*pos == 0)
396                 return SEQ_START_TOKEN;
397
398         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399                 if (!cpu_possible(cpu))
400                         continue;
401                 *pos = cpu+1;
402                 return &per_cpu(rt_cache_stat, cpu);
403         }
404         return NULL;
405 }
406
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408 {
409         int cpu;
410
411         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412                 if (!cpu_possible(cpu))
413                         continue;
414                 *pos = cpu+1;
415                 return &per_cpu(rt_cache_stat, cpu);
416         }
417         return NULL;
418
419 }
420
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
425
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427 {
428         struct rt_cache_stat *st = v;
429
430         if (v == SEQ_START_TOKEN) {
431                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432                 return 0;
433         }
434
435         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
436                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437                    atomic_read(&ipv4_dst_ops.entries),
438                    st->in_hit,
439                    st->in_slow_tot,
440                    st->in_slow_mc,
441                    st->in_no_route,
442                    st->in_brd,
443                    st->in_martian_dst,
444                    st->in_martian_src,
445
446                    st->out_hit,
447                    st->out_slow_tot,
448                    st->out_slow_mc,
449
450                    st->gc_total,
451                    st->gc_ignored,
452                    st->gc_goal_miss,
453                    st->gc_dst_overflow,
454                    st->in_hlist_search,
455                    st->out_hlist_search
456                 );
457         return 0;
458 }
459
460 static const struct seq_operations rt_cpu_seq_ops = {
461         .start  = rt_cpu_seq_start,
462         .next   = rt_cpu_seq_next,
463         .stop   = rt_cpu_seq_stop,
464         .show   = rt_cpu_seq_show,
465 };
466
467
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469 {
470         return seq_open(file, &rt_cpu_seq_ops);
471 }
472
473 static const struct file_operations rt_cpu_seq_fops = {
474         .owner   = THIS_MODULE,
475         .open    = rt_cpu_seq_open,
476         .read    = seq_read,
477         .llseek  = seq_lseek,
478         .release = seq_release,
479 };
480
481 #endif /* CONFIG_PROC_FS */
482
483 static __inline__ void rt_free(struct rtable *rt)
484 {
485         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
486 }
487
/* Put the caller's reference on @rt, then queue it for RCU freeing. */
static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        rt_free(rt);
}
493
494 static __inline__ int rt_fast_clean(struct rtable *rth)
495 {
496         /* Kill broadcast/multicast entries very aggresively, if they
497            collide in hash table with more useful entries */
498         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
499                 rth->fl.iif && rth->u.dst.rt_next;
500 }
501
502 static __inline__ int rt_valuable(struct rtable *rth)
503 {
504         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
505                 rth->u.dst.expires;
506 }
507
508 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
509 {
510         unsigned long age;
511         int ret = 0;
512
513         if (atomic_read(&rth->u.dst.__refcnt))
514                 goto out;
515
516         ret = 1;
517         if (rth->u.dst.expires &&
518             time_after_eq(jiffies, rth->u.dst.expires))
519                 goto out;
520
521         age = jiffies - rth->u.dst.lastuse;
522         ret = 0;
523         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
524             (age <= tmo2 && rt_valuable(rth)))
525                 goto out;
526         ret = 1;
527 out:    return ret;
528 }
529
530 /* Bits of score are:
531  * 31: very valuable
532  * 30: not quite useless
533  * 29..0: usage counter
534  */
535 static inline u32 rt_score(struct rtable *rt)
536 {
537         u32 score = jiffies - rt->u.dst.lastuse;
538
539         score = ~score & ~(3<<30);
540
541         if (rt_valuable(rt))
542                 score |= (1<<31);
543
544         if (!rt->fl.iif ||
545             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
546                 score |= (1<<30);
547
548         return score;
549 }
550
551 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
552 {
553         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
554                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
555                 (fl1->mark ^ fl2->mark) |
556                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
557                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
558                 (fl1->oif ^ fl2->oif) |
559                 (fl1->iif ^ fl2->iif)) == 0;
560 }
561
562 static void rt_check_expire(struct work_struct *work)
563 {
564         static unsigned int rover;
565         unsigned int i = rover, goal;
566         struct rtable *rth, **rthp;
567         u64 mult;
568
569         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
570         if (ip_rt_gc_timeout > 1)
571                 do_div(mult, ip_rt_gc_timeout);
572         goal = (unsigned int)mult;
573         if (goal > rt_hash_mask)
574                 goal = rt_hash_mask + 1;
575         for (; goal > 0; goal--) {
576                 unsigned long tmo = ip_rt_gc_timeout;
577
578                 i = (i + 1) & rt_hash_mask;
579                 rthp = &rt_hash_table[i].chain;
580
581                 if (*rthp == NULL)
582                         continue;
583                 spin_lock_bh(rt_hash_lock_addr(i));
584                 while ((rth = *rthp) != NULL) {
585                         if (rth->u.dst.expires) {
586                                 /* Entry is expired even if it is in use */
587                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
588                                         tmo >>= 1;
589                                         rthp = &rth->u.dst.rt_next;
590                                         continue;
591                                 }
592                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
593                                 tmo >>= 1;
594                                 rthp = &rth->u.dst.rt_next;
595                                 continue;
596                         }
597
598                         /* Cleanup aged off entries. */
599                         *rthp = rth->u.dst.rt_next;
600                         rt_free(rth);
601                 }
602                 spin_unlock_bh(rt_hash_lock_addr(i));
603         }
604         rover = i;
605         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
606 }
607
608 /* This can run from both BH and non-BH contexts, the latter
609  * in the case of a forced flush event.
610  */
611 static void rt_run_flush(unsigned long dummy)
612 {
613         int i;
614         struct rtable *rth, *next;
615
616         rt_deadline = 0;
617
618         get_random_bytes(&rt_hash_rnd, 4);
619
620         for (i = rt_hash_mask; i >= 0; i--) {
621                 spin_lock_bh(rt_hash_lock_addr(i));
622                 rth = rt_hash_table[i].chain;
623                 if (rth)
624                         rt_hash_table[i].chain = NULL;
625                 spin_unlock_bh(rt_hash_lock_addr(i));
626
627                 for (; rth; rth = next) {
628                         next = rth->u.dst.rt_next;
629                         rt_free(rth);
630                 }
631         }
632 }
633
634 static DEFINE_SPINLOCK(rt_flush_lock);
635
636 void rt_cache_flush(int delay)
637 {
638         unsigned long now = jiffies;
639         int user_mode = !in_softirq();
640
641         if (delay < 0)
642                 delay = ip_rt_min_delay;
643
644         spin_lock_bh(&rt_flush_lock);
645
646         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
647                 long tmo = (long)(rt_deadline - now);
648
649                 /* If flush timer is already running
650                    and flush request is not immediate (delay > 0):
651
652                    if deadline is not achieved, prolongate timer to "delay",
653                    otherwise fire it at deadline time.
654                  */
655
656                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
657                         tmo = 0;
658
659                 if (delay > tmo)
660                         delay = tmo;
661         }
662
663         if (delay <= 0) {
664                 spin_unlock_bh(&rt_flush_lock);
665                 rt_run_flush(0);
666                 return;
667         }
668
669         if (rt_deadline == 0)
670                 rt_deadline = now + ip_rt_max_delay;
671
672         mod_timer(&rt_flush_timer, now+delay);
673         spin_unlock_bh(&rt_flush_lock);
674 }
675
676 static void rt_secret_rebuild(unsigned long dummy)
677 {
678         unsigned long now = jiffies;
679
680         rt_cache_flush(0);
681         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
682 }
683
684 /*
685    Short description of GC goals.
686
687    We want an algorithm that keeps the routing cache at an
688    equilibrium point, where the number of aged-off entries
689    stays approximately equal to the number of newly generated ones.
690
691    The current expiration strength is the variable "expire".
692    We try to adjust it dynamically, so that when networking
693    is idle "expire" is large enough to keep enough warm entries,
694    and when load increases it shrinks to limit the cache size.
695  */
696
697 static int rt_garbage_collect(void)
698 {
699         static unsigned long expire = RT_GC_TIMEOUT;
700         static unsigned long last_gc;
701         static int rover;
702         static int equilibrium;
703         struct rtable *rth, **rthp;
704         unsigned long now = jiffies;
705         int goal;
706
707         /*
708          * Garbage collection is pretty expensive,
709          * do not make it too frequently.
710          */
711
712         RT_CACHE_STAT_INC(gc_total);
713
714         if (now - last_gc < ip_rt_gc_min_interval &&
715             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
716                 RT_CACHE_STAT_INC(gc_ignored);
717                 goto out;
718         }
719
720         /* Calculate number of entries, which we want to expire now. */
721         goal = atomic_read(&ipv4_dst_ops.entries) -
722                 (ip_rt_gc_elasticity << rt_hash_log);
723         if (goal <= 0) {
724                 if (equilibrium < ipv4_dst_ops.gc_thresh)
725                         equilibrium = ipv4_dst_ops.gc_thresh;
726                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
727                 if (goal > 0) {
728                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
729                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
730                 }
731         } else {
732                 /* We are in dangerous area. Try to reduce cache really
733                  * aggressively.
734                  */
735                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
736                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
737         }
738
739         if (now - last_gc >= ip_rt_gc_min_interval)
740                 last_gc = now;
741
742         if (goal <= 0) {
743                 equilibrium += goal;
744                 goto work_done;
745         }
746
747         do {
748                 int i, k;
749
750                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
751                         unsigned long tmo = expire;
752
753                         k = (k + 1) & rt_hash_mask;
754                         rthp = &rt_hash_table[k].chain;
755                         spin_lock_bh(rt_hash_lock_addr(k));
756                         while ((rth = *rthp) != NULL) {
757                                 if (!rt_may_expire(rth, tmo, expire)) {
758                                         tmo >>= 1;
759                                         rthp = &rth->u.dst.rt_next;
760                                         continue;
761                                 }
762                                 *rthp = rth->u.dst.rt_next;
763                                 rt_free(rth);
764                                 goal--;
765                         }
766                         spin_unlock_bh(rt_hash_lock_addr(k));
767                         if (goal <= 0)
768                                 break;
769                 }
770                 rover = k;
771
772                 if (goal <= 0)
773                         goto work_done;
774
775                 /* Goal is not achieved. We stop process if:
776
777                    - if expire reduced to zero. Otherwise, expire is halfed.
778                    - if table is not full.
779                    - if we are called from interrupt.
780                    - jiffies check is just fallback/debug loop breaker.
781                      We will not spin here for long time in any case.
782                  */
783
784                 RT_CACHE_STAT_INC(gc_goal_miss);
785
786                 if (expire == 0)
787                         break;
788
789                 expire >>= 1;
790 #if RT_CACHE_DEBUG >= 2
791                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
792                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
793 #endif
794
795                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
796                         goto out;
797         } while (!in_softirq() && time_before_eq(jiffies, now));
798
799         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
800                 goto out;
801         if (net_ratelimit())
802                 printk(KERN_WARNING "dst cache overflow\n");
803         RT_CACHE_STAT_INC(gc_dst_overflow);
804         return 1;
805
806 work_done:
807         expire += ip_rt_gc_min_interval;
808         if (expire > ip_rt_gc_timeout ||
809             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
810                 expire = ip_rt_gc_timeout;
811 #if RT_CACHE_DEBUG >= 2
812         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
813                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
814 #endif
815 out:    return 0;
816 }
817
818 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
819 {
820         struct rtable   *rth, **rthp;
821         unsigned long   now;
822         struct rtable *cand, **candp;
823         u32             min_score;
824         int             chain_length;
825         int attempts = !in_softirq();
826
827 restart:
828         chain_length = 0;
829         min_score = ~(u32)0;
830         cand = NULL;
831         candp = NULL;
832         now = jiffies;
833
834         rthp = &rt_hash_table[hash].chain;
835
836         spin_lock_bh(rt_hash_lock_addr(hash));
837         while ((rth = *rthp) != NULL) {
838                 if (compare_keys(&rth->fl, &rt->fl)) {
839                         /* Put it first */
840                         *rthp = rth->u.dst.rt_next;
841                         /*
842                          * Since lookup is lockfree, the deletion
843                          * must be visible to another weakly ordered CPU before
844                          * the insertion at the start of the hash chain.
845                          */
846                         rcu_assign_pointer(rth->u.dst.rt_next,
847                                            rt_hash_table[hash].chain);
848                         /*
849                          * Since lookup is lockfree, the update writes
850                          * must be ordered for consistency on SMP.
851                          */
852                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
853
854                         rth->u.dst.__use++;
855                         dst_hold(&rth->u.dst);
856                         rth->u.dst.lastuse = now;
857                         spin_unlock_bh(rt_hash_lock_addr(hash));
858
859                         rt_drop(rt);
860                         *rp = rth;
861                         return 0;
862                 }
863
864                 if (!atomic_read(&rth->u.dst.__refcnt)) {
865                         u32 score = rt_score(rth);
866
867                         if (score <= min_score) {
868                                 cand = rth;
869                                 candp = rthp;
870                                 min_score = score;
871                         }
872                 }
873
874                 chain_length++;
875
876                 rthp = &rth->u.dst.rt_next;
877         }
878
879         if (cand) {
880                 /* ip_rt_gc_elasticity used to be average length of chain
881                  * length, when exceeded gc becomes really aggressive.
882                  *
883                  * The second limit is less certain. At the moment it allows
884                  * only 2 entries per bucket. We will see.
885                  */
886                 if (chain_length > ip_rt_gc_elasticity) {
887                         *candp = cand->u.dst.rt_next;
888                         rt_free(cand);
889                 }
890         }
891
892         /* Try to bind route to arp only if it is output
893            route or unicast forwarding path.
894          */
895         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
896                 int err = arp_bind_neighbour(&rt->u.dst);
897                 if (err) {
898                         spin_unlock_bh(rt_hash_lock_addr(hash));
899
900                         if (err != -ENOBUFS) {
901                                 rt_drop(rt);
902                                 return err;
903                         }
904
905                         /* Neighbour tables are full and nothing
906                            can be released. Try to shrink route cache,
907                            it is most likely it holds some neighbour records.
908                          */
909                         if (attempts-- > 0) {
910                                 int saved_elasticity = ip_rt_gc_elasticity;
911                                 int saved_int = ip_rt_gc_min_interval;
912                                 ip_rt_gc_elasticity     = 1;
913                                 ip_rt_gc_min_interval   = 0;
914                                 rt_garbage_collect();
915                                 ip_rt_gc_min_interval   = saved_int;
916                                 ip_rt_gc_elasticity     = saved_elasticity;
917                                 goto restart;
918                         }
919
920                         if (net_ratelimit())
921                                 printk(KERN_WARNING "Neighbour table overflow.\n");
922                         rt_drop(rt);
923                         return -ENOBUFS;
924                 }
925         }
926
927         rt->u.dst.rt_next = rt_hash_table[hash].chain;
928 #if RT_CACHE_DEBUG >= 2
929         if (rt->u.dst.rt_next) {
930                 struct rtable *trt;
931                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
932                        NIPQUAD(rt->rt_dst));
933                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
934                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
935                 printk("\n");
936         }
937 #endif
938         rt_hash_table[hash].chain = rt;
939         spin_unlock_bh(rt_hash_lock_addr(hash));
940         *rp = rt;
941         return 0;
942 }
943
944 void rt_bind_peer(struct rtable *rt, int create)
945 {
946         static DEFINE_SPINLOCK(rt_peer_lock);
947         struct inet_peer *peer;
948
949         peer = inet_getpeer(rt->rt_dst, create);
950
951         spin_lock_bh(&rt_peer_lock);
952         if (rt->peer == NULL) {
953                 rt->peer = peer;
954                 peer = NULL;
955         }
956         spin_unlock_bh(&rt_peer_lock);
957         if (peer)
958                 inet_putpeer(peer);
959 }
960
961 /*
962  * Peer allocation may fail only in serious out-of-memory conditions.  However
963  * we still can generate some output.
964  * Random ID selection looks a bit dangerous because we have no chances to
965  * select ID being unique in a reasonable period of time.
966  * But broken packet identifier may be better than no packet at all.
967  */
968 static void ip_select_fb_ident(struct iphdr *iph)
969 {
970         static DEFINE_SPINLOCK(ip_fb_id_lock);
971         static u32 ip_fallback_id;
972         u32 salt;
973
974         spin_lock_bh(&ip_fb_id_lock);
975         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
976         iph->id = htons(salt & 0xFFFF);
977         ip_fallback_id = salt;
978         spin_unlock_bh(&ip_fb_id_lock);
979 }
980
981 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
982 {
983         struct rtable *rt = (struct rtable *) dst;
984
985         if (rt) {
986                 if (rt->peer == NULL)
987                         rt_bind_peer(rt, 1);
988
989                 /* If peer is attached to destination, it is never detached,
990                    so that we need not to grab a lock to dereference it.
991                  */
992                 if (rt->peer) {
993                         iph->id = htons(inet_getid(rt->peer, more));
994                         return;
995                 }
996         } else
997                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
998                        __builtin_return_address(0));
999
1000         ip_select_fb_ident(iph);
1001 }
1002
1003 static void rt_del(unsigned hash, struct rtable *rt)
1004 {
1005         struct rtable **rthp;
1006
1007         spin_lock_bh(rt_hash_lock_addr(hash));
1008         ip_rt_put(rt);
1009         for (rthp = &rt_hash_table[hash].chain; *rthp;
1010              rthp = &(*rthp)->u.dst.rt_next)
1011                 if (*rthp == rt) {
1012                         *rthp = rt->u.dst.rt_next;
1013                         rt_free(rt);
1014                         break;
1015                 }
1016         spin_unlock_bh(rt_hash_lock_addr(hash));
1017 }
1018
1019 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1020                     __be32 saddr, struct net_device *dev)
1021 {
1022         int i, k;
1023         struct in_device *in_dev = in_dev_get(dev);
1024         struct rtable *rth, **rthp;
1025         __be32  skeys[2] = { saddr, 0 };
1026         int  ikeys[2] = { dev->ifindex, 0 };
1027         struct netevent_redirect netevent;
1028
1029         if (!in_dev)
1030                 return;
1031
1032         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1033             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1034                 goto reject_redirect;
1035
1036         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1037                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1038                         goto reject_redirect;
1039                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1040                         goto reject_redirect;
1041         } else {
1042                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1043                         goto reject_redirect;
1044         }
1045
1046         for (i = 0; i < 2; i++) {
1047                 for (k = 0; k < 2; k++) {
1048                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1049
1050                         rthp=&rt_hash_table[hash].chain;
1051
1052                         rcu_read_lock();
1053                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1054                                 struct rtable *rt;
1055
1056                                 if (rth->fl.fl4_dst != daddr ||
1057                                     rth->fl.fl4_src != skeys[i] ||
1058                                     rth->fl.oif != ikeys[k] ||
1059                                     rth->fl.iif != 0) {
1060                                         rthp = &rth->u.dst.rt_next;
1061                                         continue;
1062                                 }
1063
1064                                 if (rth->rt_dst != daddr ||
1065                                     rth->rt_src != saddr ||
1066                                     rth->u.dst.error ||
1067                                     rth->rt_gateway != old_gw ||
1068                                     rth->u.dst.dev != dev)
1069                                         break;
1070
1071                                 dst_hold(&rth->u.dst);
1072                                 rcu_read_unlock();
1073
1074                                 rt = dst_alloc(&ipv4_dst_ops);
1075                                 if (rt == NULL) {
1076                                         ip_rt_put(rth);
1077                                         in_dev_put(in_dev);
1078                                         return;
1079                                 }
1080
1081                                 /* Copy all the information. */
1082                                 *rt = *rth;
1083                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1084                                 rt->u.dst.__use         = 1;
1085                                 atomic_set(&rt->u.dst.__refcnt, 1);
1086                                 rt->u.dst.child         = NULL;
1087                                 if (rt->u.dst.dev)
1088                                         dev_hold(rt->u.dst.dev);
1089                                 if (rt->idev)
1090                                         in_dev_hold(rt->idev);
1091                                 rt->u.dst.obsolete      = 0;
1092                                 rt->u.dst.lastuse       = jiffies;
1093                                 rt->u.dst.path          = &rt->u.dst;
1094                                 rt->u.dst.neighbour     = NULL;
1095                                 rt->u.dst.hh            = NULL;
1096                                 rt->u.dst.xfrm          = NULL;
1097
1098                                 rt->rt_flags            |= RTCF_REDIRECTED;
1099
1100                                 /* Gateway is different ... */
1101                                 rt->rt_gateway          = new_gw;
1102
1103                                 /* Redirect received -> path was valid */
1104                                 dst_confirm(&rth->u.dst);
1105
1106                                 if (rt->peer)
1107                                         atomic_inc(&rt->peer->refcnt);
1108
1109                                 if (arp_bind_neighbour(&rt->u.dst) ||
1110                                     !(rt->u.dst.neighbour->nud_state &
1111                                             NUD_VALID)) {
1112                                         if (rt->u.dst.neighbour)
1113                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1114                                         ip_rt_put(rth);
1115                                         rt_drop(rt);
1116                                         goto do_next;
1117                                 }
1118
1119                                 netevent.old = &rth->u.dst;
1120                                 netevent.new = &rt->u.dst;
1121                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1122                                                         &netevent);
1123
1124                                 rt_del(hash, rth);
1125                                 if (!rt_intern_hash(hash, rt, &rt))
1126                                         ip_rt_put(rt);
1127                                 goto do_next;
1128                         }
1129                         rcu_read_unlock();
1130                 do_next:
1131                         ;
1132                 }
1133         }
1134         in_dev_put(in_dev);
1135         return;
1136
1137 reject_redirect:
1138 #ifdef CONFIG_IP_ROUTE_VERBOSE
1139         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1140                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1141                         "%u.%u.%u.%u ignored.\n"
1142                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1143                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1144                        NIPQUAD(saddr), NIPQUAD(daddr));
1145 #endif
1146         in_dev_put(in_dev);
1147 }
1148
1149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1150 {
1151         struct rtable *rt = (struct rtable*)dst;
1152         struct dst_entry *ret = dst;
1153
1154         if (rt) {
1155                 if (dst->obsolete) {
1156                         ip_rt_put(rt);
1157                         ret = NULL;
1158                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1159                            rt->u.dst.expires) {
1160                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1161                                                 rt->fl.oif);
1162 #if RT_CACHE_DEBUG >= 1
1163                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1164                                           "%u.%u.%u.%u/%02x dropped\n",
1165                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1166 #endif
1167                         rt_del(hash, rt);
1168                         ret = NULL;
1169                 }
1170         }
1171         return ret;
1172 }
1173
1174 /*
1175  * Algorithm:
1176  *      1. The first ip_rt_redirect_number redirects are sent
1177  *         with exponential backoff, then we stop sending them at all,
1178  *         assuming that the host ignores our redirects.
1179  *      2. If we did not see packets requiring redirects
1180  *         during ip_rt_redirect_silence, we assume that the host
1181  *         forgot redirected route and start to send redirects again.
1182  *
1183  * This algorithm is much cheaper and more intelligent than dumb load limiting
1184  * in icmp.c.
1185  *
1186  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1187  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1188  */
1189
1190 void ip_rt_send_redirect(struct sk_buff *skb)
1191 {
1192         struct rtable *rt = (struct rtable*)skb->dst;
1193         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1194
1195         if (!in_dev)
1196                 return;
1197
1198         if (!IN_DEV_TX_REDIRECTS(in_dev))
1199                 goto out;
1200
1201         /* No redirected packets during ip_rt_redirect_silence;
1202          * reset the algorithm.
1203          */
1204         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1205                 rt->u.dst.rate_tokens = 0;
1206
1207         /* Too many ignored redirects; do not send anything
1208          * set u.dst.rate_last to the last seen redirected packet.
1209          */
1210         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1211                 rt->u.dst.rate_last = jiffies;
1212                 goto out;
1213         }
1214
1215         /* Check for load limit; set rate_last to the latest sent
1216          * redirect.
1217          */
1218         if (rt->u.dst.rate_tokens == 0 ||
1219             time_after(jiffies,
1220                        (rt->u.dst.rate_last +
1221                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1222                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1223                 rt->u.dst.rate_last = jiffies;
1224                 ++rt->u.dst.rate_tokens;
1225 #ifdef CONFIG_IP_ROUTE_VERBOSE
1226                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1227                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1228                     net_ratelimit())
1229                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1230                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1231                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1232                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1233 #endif
1234         }
1235 out:
1236         in_dev_put(in_dev);
1237 }
1238
1239 static int ip_error(struct sk_buff *skb)
1240 {
1241         struct rtable *rt = (struct rtable*)skb->dst;
1242         unsigned long now;
1243         int code;
1244
1245         switch (rt->u.dst.error) {
1246                 case EINVAL:
1247                 default:
1248                         goto out;
1249                 case EHOSTUNREACH:
1250                         code = ICMP_HOST_UNREACH;
1251                         break;
1252                 case ENETUNREACH:
1253                         code = ICMP_NET_UNREACH;
1254                         break;
1255                 case EACCES:
1256                         code = ICMP_PKT_FILTERED;
1257                         break;
1258         }
1259
1260         now = jiffies;
1261         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1262         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1263                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1264         rt->u.dst.rate_last = now;
1265         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1266                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1267                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1268         }
1269
1270 out:    kfree_skb(skb);
1271         return 0;
1272 }
1273
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/* Return the largest plateau strictly below @old_mtu, or 68 if there is none. */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau +
			sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1291
1292 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1293 {
1294         int i;
1295         unsigned short old_mtu = ntohs(iph->tot_len);
1296         struct rtable *rth;
1297         __be32  skeys[2] = { iph->saddr, 0, };
1298         __be32  daddr = iph->daddr;
1299         unsigned short est_mtu = 0;
1300
1301         if (ipv4_config.no_pmtu_disc)
1302                 return 0;
1303
1304         for (i = 0; i < 2; i++) {
1305                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1306
1307                 rcu_read_lock();
1308                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1309                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1310                         if (rth->fl.fl4_dst == daddr &&
1311                             rth->fl.fl4_src == skeys[i] &&
1312                             rth->rt_dst  == daddr &&
1313                             rth->rt_src  == iph->saddr &&
1314                             rth->fl.iif == 0 &&
1315                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1316                                 unsigned short mtu = new_mtu;
1317
1318                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1319
1320                                         /* BSD 4.2 compatibility hack :-( */
1321                                         if (mtu == 0 &&
1322                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1323                                             old_mtu >= 68 + (iph->ihl << 2))
1324                                                 old_mtu -= iph->ihl << 2;
1325
1326                                         mtu = guess_mtu(old_mtu);
1327                                 }
1328                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1329                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1330                                                 dst_confirm(&rth->u.dst);
1331                                                 if (mtu < ip_rt_min_pmtu) {
1332                                                         mtu = ip_rt_min_pmtu;
1333                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1334                                                                 (1 << RTAX_MTU);
1335                                                 }
1336                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1337                                                 dst_set_expires(&rth->u.dst,
1338                                                         ip_rt_mtu_expires);
1339                                         }
1340                                         est_mtu = mtu;
1341                                 }
1342                         }
1343                 }
1344                 rcu_read_unlock();
1345         }
1346         return est_mtu ? : new_mtu;
1347 }
1348
1349 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1350 {
1351         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1352             !(dst_metric_locked(dst, RTAX_MTU))) {
1353                 if (mtu < ip_rt_min_pmtu) {
1354                         mtu = ip_rt_min_pmtu;
1355                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1356                 }
1357                 dst->metrics[RTAX_MTU-1] = mtu;
1358                 dst_set_expires(dst, ip_rt_mtu_expires);
1359                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1360         }
1361 }
1362
1363 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1364 {
1365         return NULL;
1366 }
1367
1368 static void ipv4_dst_destroy(struct dst_entry *dst)
1369 {
1370         struct rtable *rt = (struct rtable *) dst;
1371         struct inet_peer *peer = rt->peer;
1372         struct in_device *idev = rt->idev;
1373
1374         if (peer) {
1375                 rt->peer = NULL;
1376                 inet_putpeer(peer);
1377         }
1378
1379         if (idev) {
1380                 rt->idev = NULL;
1381                 in_dev_put(idev);
1382         }
1383 }
1384
1385 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1386                             int how)
1387 {
1388         struct rtable *rt = (struct rtable *) dst;
1389         struct in_device *idev = rt->idev;
1390         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1391                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1392                 if (loopback_idev) {
1393                         rt->idev = loopback_idev;
1394                         in_dev_put(idev);
1395                 }
1396         }
1397 }
1398
1399 static void ipv4_link_failure(struct sk_buff *skb)
1400 {
1401         struct rtable *rt;
1402
1403         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1404
1405         rt = (struct rtable *) skb->dst;
1406         if (rt)
1407                 dst_set_expires(&rt->u.dst, 0);
1408 }
1409
1410 static int ip_rt_bug(struct sk_buff *skb)
1411 {
1412         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1413                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1414                 skb->dev ? skb->dev->name : "?");
1415         kfree_skb(skb);
1416         return 0;
1417 }
1418
1419 /*
1420    We do not cache source address of outgoing interface,
1421    because it is used only by IP RR, TS and SRR options,
1422    so that it out of fast path.
1423
1424    BTW remember: "addr" is allowed to be not aligned
1425    in IP options!
1426  */
1427
1428 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1429 {
1430         __be32 src;
1431         struct fib_result res;
1432
1433         if (rt->fl.iif == 0)
1434                 src = rt->rt_src;
1435         else if (fib_lookup(&rt->fl, &res) == 0) {
1436                 src = FIB_RES_PREFSRC(res);
1437                 fib_res_put(&res);
1438         } else
1439                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1440                                         RT_SCOPE_UNIVERSE);
1441         memcpy(addr, &src, 4);
1442 }
1443
1444 #ifdef CONFIG_NET_CLS_ROUTE
1445 static void set_class_tag(struct rtable *rt, u32 tag)
1446 {
1447         if (!(rt->u.dst.tclassid & 0xFFFF))
1448                 rt->u.dst.tclassid |= tag & 0xFFFF;
1449         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1450                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1451 }
1452 #endif
1453
1454 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1455 {
1456         struct fib_info *fi = res->fi;
1457
1458         if (fi) {
1459                 if (FIB_RES_GW(*res) &&
1460                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1461                         rt->rt_gateway = FIB_RES_GW(*res);
1462                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1463                        sizeof(rt->u.dst.metrics));
1464                 if (fi->fib_mtu == 0) {
1465                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1466                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1467                             rt->rt_gateway != rt->rt_dst &&
1468                             rt->u.dst.dev->mtu > 576)
1469                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1470                 }
1471 #ifdef CONFIG_NET_CLS_ROUTE
1472                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1473 #endif
1474         } else
1475                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1476
1477         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1478                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1479         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1480                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1481         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1482                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1483                                        ip_rt_min_advmss);
1484         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1485                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1486
1487 #ifdef CONFIG_NET_CLS_ROUTE
1488 #ifdef CONFIG_IP_MULTIPLE_TABLES
1489         set_class_tag(rt, fib_rules_tclass(res));
1490 #endif
1491         set_class_tag(rt, itag);
1492 #endif
1493         rt->rt_type = res->type;
1494 }
1495
1496 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1497                                 u8 tos, struct net_device *dev, int our)
1498 {
1499         unsigned hash;
1500         struct rtable *rth;
1501         __be32 spec_dst;
1502         struct in_device *in_dev = in_dev_get(dev);
1503         u32 itag = 0;
1504
1505         /* Primary sanity checks. */
1506
1507         if (in_dev == NULL)
1508                 return -EINVAL;
1509
1510         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1511             skb->protocol != htons(ETH_P_IP))
1512                 goto e_inval;
1513
1514         if (ZERONET(saddr)) {
1515                 if (!LOCAL_MCAST(daddr))
1516                         goto e_inval;
1517                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1518         } else if (fib_validate_source(saddr, 0, tos, 0,
1519                                         dev, &spec_dst, &itag) < 0)
1520                 goto e_inval;
1521
1522         rth = dst_alloc(&ipv4_dst_ops);
1523         if (!rth)
1524                 goto e_nobufs;
1525
1526         rth->u.dst.output= ip_rt_bug;
1527
1528         atomic_set(&rth->u.dst.__refcnt, 1);
1529         rth->u.dst.flags= DST_HOST;
1530         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1531                 rth->u.dst.flags |= DST_NOPOLICY;
1532         rth->fl.fl4_dst = daddr;
1533         rth->rt_dst     = daddr;
1534         rth->fl.fl4_tos = tos;
1535         rth->fl.mark    = skb->mark;
1536         rth->fl.fl4_src = saddr;
1537         rth->rt_src     = saddr;
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539         rth->u.dst.tclassid = itag;
1540 #endif
1541         rth->rt_iif     =
1542         rth->fl.iif     = dev->ifindex;
1543         rth->u.dst.dev  = init_net.loopback_dev;
1544         dev_hold(rth->u.dst.dev);
1545         rth->idev       = in_dev_get(rth->u.dst.dev);
1546         rth->fl.oif     = 0;
1547         rth->rt_gateway = daddr;
1548         rth->rt_spec_dst= spec_dst;
1549         rth->rt_type    = RTN_MULTICAST;
1550         rth->rt_flags   = RTCF_MULTICAST;
1551         if (our) {
1552                 rth->u.dst.input= ip_local_deliver;
1553                 rth->rt_flags |= RTCF_LOCAL;
1554         }
1555
1556 #ifdef CONFIG_IP_MROUTE
1557         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1558                 rth->u.dst.input = ip_mr_input;
1559 #endif
1560         RT_CACHE_STAT_INC(in_slow_mc);
1561
1562         in_dev_put(in_dev);
1563         hash = rt_hash(daddr, saddr, dev->ifindex);
1564         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1565
1566 e_nobufs:
1567         in_dev_put(in_dev);
1568         return -ENOBUFS;
1569
1570 e_inval:
1571         in_dev_put(in_dev);
1572         return -EINVAL;
1573 }
1574
1575
1576 static void ip_handle_martian_source(struct net_device *dev,
1577                                      struct in_device *in_dev,
1578                                      struct sk_buff *skb,
1579                                      __be32 daddr,
1580                                      __be32 saddr)
1581 {
1582         RT_CACHE_STAT_INC(in_martian_src);
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1585                 /*
1586                  *      RFC1812 recommendation, if source is martian,
1587                  *      the only hint is MAC header.
1588                  */
1589                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1590                         "%u.%u.%u.%u, on dev %s\n",
1591                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1592                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1593                         int i;
1594                         const unsigned char *p = skb_mac_header(skb);
1595                         printk(KERN_WARNING "ll header: ");
1596                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1597                                 printk("%02x", *p);
1598                                 if (i < (dev->hard_header_len - 1))
1599                                         printk(":");
1600                         }
1601                         printk("\n");
1602                 }
1603         }
1604 #endif
1605 }
1606
1607 static inline int __mkroute_input(struct sk_buff *skb,
1608                                   struct fib_result* res,
1609                                   struct in_device *in_dev,
1610                                   __be32 daddr, __be32 saddr, u32 tos,
1611                                   struct rtable **result)
1612 {
1613
1614         struct rtable *rth;
1615         int err;
1616         struct in_device *out_dev;
1617         unsigned flags = 0;
1618         __be32 spec_dst;
1619         u32 itag;
1620
1621         /* get a working reference to the output device */
1622         out_dev = in_dev_get(FIB_RES_DEV(*res));
1623         if (out_dev == NULL) {
1624                 if (net_ratelimit())
1625                         printk(KERN_CRIT "Bug in ip_route_input" \
1626                                "_slow(). Please, report\n");
1627                 return -EINVAL;
1628         }
1629
1630
1631         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1632                                   in_dev->dev, &spec_dst, &itag);
1633         if (err < 0) {
1634                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1635                                          saddr);
1636
1637                 err = -EINVAL;
1638                 goto cleanup;
1639         }
1640
1641         if (err)
1642                 flags |= RTCF_DIRECTSRC;
1643
1644         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1645             (IN_DEV_SHARED_MEDIA(out_dev) ||
1646              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1647                 flags |= RTCF_DOREDIRECT;
1648
1649         if (skb->protocol != htons(ETH_P_IP)) {
1650                 /* Not IP (i.e. ARP). Do not create route, if it is
1651                  * invalid for proxy arp. DNAT routes are always valid.
1652                  */
1653                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1654                         err = -EINVAL;
1655                         goto cleanup;
1656                 }
1657         }
1658
1659
1660         rth = dst_alloc(&ipv4_dst_ops);
1661         if (!rth) {
1662                 err = -ENOBUFS;
1663                 goto cleanup;
1664         }
1665
1666         atomic_set(&rth->u.dst.__refcnt, 1);
1667         rth->u.dst.flags= DST_HOST;
1668         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1669                 rth->u.dst.flags |= DST_NOPOLICY;
1670         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1671                 rth->u.dst.flags |= DST_NOXFRM;
1672         rth->fl.fl4_dst = daddr;
1673         rth->rt_dst     = daddr;
1674         rth->fl.fl4_tos = tos;
1675         rth->fl.mark    = skb->mark;
1676         rth->fl.fl4_src = saddr;
1677         rth->rt_src     = saddr;
1678         rth->rt_gateway = daddr;
1679         rth->rt_iif     =
1680                 rth->fl.iif     = in_dev->dev->ifindex;
1681         rth->u.dst.dev  = (out_dev)->dev;
1682         dev_hold(rth->u.dst.dev);
1683         rth->idev       = in_dev_get(rth->u.dst.dev);
1684         rth->fl.oif     = 0;
1685         rth->rt_spec_dst= spec_dst;
1686
1687         rth->u.dst.input = ip_forward;
1688         rth->u.dst.output = ip_output;
1689
1690         rt_set_nexthop(rth, res, itag);
1691
1692         rth->rt_flags = flags;
1693
1694         *result = rth;
1695         err = 0;
1696  cleanup:
1697         /* release the working reference to the output device */
1698         in_dev_put(out_dev);
1699         return err;
1700 }
1701
1702 static inline int ip_mkroute_input(struct sk_buff *skb,
1703                                    struct fib_result* res,
1704                                    const struct flowi *fl,
1705                                    struct in_device *in_dev,
1706                                    __be32 daddr, __be32 saddr, u32 tos)
1707 {
1708         struct rtable* rth = NULL;
1709         int err;
1710         unsigned hash;
1711
1712 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1713         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1714                 fib_select_multipath(fl, res);
1715 #endif
1716
1717         /* create a routing cache entry */
1718         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1719         if (err)
1720                 return err;
1721
1722         /* put it into the cache */
1723         hash = rt_hash(daddr, saddr, fl->iif);
1724         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1725 }
1726
/*
 *	NOTE. We drop all packets that have local source addresses,
 *	because every properly looped-back packet must already have the
 *	correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1736
1737 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1738                                u8 tos, struct net_device *dev)
1739 {
1740         struct fib_result res;
1741         struct in_device *in_dev = in_dev_get(dev);
1742         struct flowi fl = { .nl_u = { .ip4_u =
1743                                       { .daddr = daddr,
1744                                         .saddr = saddr,
1745                                         .tos = tos,
1746                                         .scope = RT_SCOPE_UNIVERSE,
1747                                       } },
1748                             .mark = skb->mark,
1749                             .iif = dev->ifindex };
1750         unsigned        flags = 0;
1751         u32             itag = 0;
1752         struct rtable * rth;
1753         unsigned        hash;
1754         __be32          spec_dst;
1755         int             err = -EINVAL;
1756         int             free_res = 0;
1757
1758         /* IP on this device is disabled. */
1759
1760         if (!in_dev)
1761                 goto out;
1762
1763         /* Check for the most weird martians, which can be not detected
1764            by fib_lookup.
1765          */
1766
1767         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1768                 goto martian_source;
1769
1770         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1771                 goto brd_input;
1772
1773         /* Accept zero addresses only to limited broadcast;
1774          * I even do not know to fix it or not. Waiting for complains :-)
1775          */
1776         if (ZERONET(saddr))
1777                 goto martian_source;
1778
1779         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1780                 goto martian_destination;
1781
1782         /*
1783          *      Now we are ready to route packet.
1784          */
1785         if ((err = fib_lookup(&fl, &res)) != 0) {
1786                 if (!IN_DEV_FORWARD(in_dev))
1787                         goto e_hostunreach;
1788                 goto no_route;
1789         }
1790         free_res = 1;
1791
1792         RT_CACHE_STAT_INC(in_slow_tot);
1793
1794         if (res.type == RTN_BROADCAST)
1795                 goto brd_input;
1796
1797         if (res.type == RTN_LOCAL) {
1798                 int result;
1799                 result = fib_validate_source(saddr, daddr, tos,
1800                                              init_net.loopback_dev->ifindex,
1801                                              dev, &spec_dst, &itag);
1802                 if (result < 0)
1803                         goto martian_source;
1804                 if (result)
1805                         flags |= RTCF_DIRECTSRC;
1806                 spec_dst = daddr;
1807                 goto local_input;
1808         }
1809
1810         if (!IN_DEV_FORWARD(in_dev))
1811                 goto e_hostunreach;
1812         if (res.type != RTN_UNICAST)
1813                 goto martian_destination;
1814
1815         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1816 done:
1817         in_dev_put(in_dev);
1818         if (free_res)
1819                 fib_res_put(&res);
1820 out:    return err;
1821
1822 brd_input:
1823         if (skb->protocol != htons(ETH_P_IP))
1824                 goto e_inval;
1825
1826         if (ZERONET(saddr))
1827                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1828         else {
1829                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1830                                           &itag);
1831                 if (err < 0)
1832                         goto martian_source;
1833                 if (err)
1834                         flags |= RTCF_DIRECTSRC;
1835         }
1836         flags |= RTCF_BROADCAST;
1837         res.type = RTN_BROADCAST;
1838         RT_CACHE_STAT_INC(in_brd);
1839
1840 local_input:
1841         rth = dst_alloc(&ipv4_dst_ops);
1842         if (!rth)
1843                 goto e_nobufs;
1844
1845         rth->u.dst.output= ip_rt_bug;
1846
1847         atomic_set(&rth->u.dst.__refcnt, 1);
1848         rth->u.dst.flags= DST_HOST;
1849         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1850                 rth->u.dst.flags |= DST_NOPOLICY;
1851         rth->fl.fl4_dst = daddr;
1852         rth->rt_dst     = daddr;
1853         rth->fl.fl4_tos = tos;
1854         rth->fl.mark    = skb->mark;
1855         rth->fl.fl4_src = saddr;
1856         rth->rt_src     = saddr;
1857 #ifdef CONFIG_NET_CLS_ROUTE
1858         rth->u.dst.tclassid = itag;
1859 #endif
1860         rth->rt_iif     =
1861         rth->fl.iif     = dev->ifindex;
1862         rth->u.dst.dev  = init_net.loopback_dev;
1863         dev_hold(rth->u.dst.dev);
1864         rth->idev       = in_dev_get(rth->u.dst.dev);
1865         rth->rt_gateway = daddr;
1866         rth->rt_spec_dst= spec_dst;
1867         rth->u.dst.input= ip_local_deliver;
1868         rth->rt_flags   = flags|RTCF_LOCAL;
1869         if (res.type == RTN_UNREACHABLE) {
1870                 rth->u.dst.input= ip_error;
1871                 rth->u.dst.error= -err;
1872                 rth->rt_flags   &= ~RTCF_LOCAL;
1873         }
1874         rth->rt_type    = res.type;
1875         hash = rt_hash(daddr, saddr, fl.iif);
1876         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1877         goto done;
1878
1879 no_route:
1880         RT_CACHE_STAT_INC(in_no_route);
1881         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1882         res.type = RTN_UNREACHABLE;
1883         goto local_input;
1884
1885         /*
1886          *      Do not cache martian addresses: they should be logged (RFC1812)
1887          */
1888 martian_destination:
1889         RT_CACHE_STAT_INC(in_martian_dst);
1890 #ifdef CONFIG_IP_ROUTE_VERBOSE
1891         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1892                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1893                         "%u.%u.%u.%u, dev %s\n",
1894                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1895 #endif
1896
1897 e_hostunreach:
1898         err = -EHOSTUNREACH;
1899         goto done;
1900
1901 e_inval:
1902         err = -EINVAL;
1903         goto done;
1904
1905 e_nobufs:
1906         err = -ENOBUFS;
1907         goto done;
1908
1909 martian_source:
1910         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1911         goto e_inval;
1912 }
1913
1914 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1915                    u8 tos, struct net_device *dev)
1916 {
1917         struct rtable * rth;
1918         unsigned        hash;
1919         int iif = dev->ifindex;
1920
1921         tos &= IPTOS_RT_MASK;
1922         hash = rt_hash(daddr, saddr, iif);
1923
1924         rcu_read_lock();
1925         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1926              rth = rcu_dereference(rth->u.dst.rt_next)) {
1927                 if (rth->fl.fl4_dst == daddr &&
1928                     rth->fl.fl4_src == saddr &&
1929                     rth->fl.iif == iif &&
1930                     rth->fl.oif == 0 &&
1931                     rth->fl.mark == skb->mark &&
1932                     rth->fl.fl4_tos == tos) {
1933                         rth->u.dst.lastuse = jiffies;
1934                         dst_hold(&rth->u.dst);
1935                         rth->u.dst.__use++;
1936                         RT_CACHE_STAT_INC(in_hit);
1937                         rcu_read_unlock();
1938                         skb->dst = (struct dst_entry*)rth;
1939                         return 0;
1940                 }
1941                 RT_CACHE_STAT_INC(in_hlist_search);
1942         }
1943         rcu_read_unlock();
1944
1945         /* Multicast recognition logic is moved from route cache to here.
1946            The problem was that too many Ethernet cards have broken/missing
1947            hardware multicast filters :-( As result the host on multicasting
1948            network acquires a lot of useless route cache entries, sort of
1949            SDR messages from all the world. Now we try to get rid of them.
1950            Really, provided software IP multicast filter is organized
1951            reasonably (at least, hashed), it does not result in a slowdown
1952            comparing with route cache reject entries.
1953            Note, that multicast routers are not affected, because
1954            route cache entry is created eventually.
1955          */
1956         if (MULTICAST(daddr)) {
1957                 struct in_device *in_dev;
1958
1959                 rcu_read_lock();
1960                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1961                         int our = ip_check_mc(in_dev, daddr, saddr,
1962                                 ip_hdr(skb)->protocol);
1963                         if (our
1964 #ifdef CONFIG_IP_MROUTE
1965                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1966 #endif
1967                             ) {
1968                                 rcu_read_unlock();
1969                                 return ip_route_input_mc(skb, daddr, saddr,
1970                                                          tos, dev, our);
1971                         }
1972                 }
1973                 rcu_read_unlock();
1974                 return -EINVAL;
1975         }
1976         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1977 }
1978
1979 static inline int __mkroute_output(struct rtable **result,
1980                                    struct fib_result* res,
1981                                    const struct flowi *fl,
1982                                    const struct flowi *oldflp,
1983                                    struct net_device *dev_out,
1984                                    unsigned flags)
1985 {
1986         struct rtable *rth;
1987         struct in_device *in_dev;
1988         u32 tos = RT_FL_TOS(oldflp);
1989         int err = 0;
1990
1991         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1992                 return -EINVAL;
1993
1994         if (fl->fl4_dst == htonl(0xFFFFFFFF))
1995                 res->type = RTN_BROADCAST;
1996         else if (MULTICAST(fl->fl4_dst))
1997                 res->type = RTN_MULTICAST;
1998         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
1999                 return -EINVAL;
2000
2001         if (dev_out->flags & IFF_LOOPBACK)
2002                 flags |= RTCF_LOCAL;
2003
2004         /* get work reference to inet device */
2005         in_dev = in_dev_get(dev_out);
2006         if (!in_dev)
2007                 return -EINVAL;
2008
2009         if (res->type == RTN_BROADCAST) {
2010                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2011                 if (res->fi) {
2012                         fib_info_put(res->fi);
2013                         res->fi = NULL;
2014                 }
2015         } else if (res->type == RTN_MULTICAST) {
2016                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2017                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2018                                  oldflp->proto))
2019                         flags &= ~RTCF_LOCAL;
2020                 /* If multicast route do not exist use
2021                    default one, but do not gateway in this case.
2022                    Yes, it is hack.
2023                  */
2024                 if (res->fi && res->prefixlen < 4) {
2025                         fib_info_put(res->fi);
2026                         res->fi = NULL;
2027                 }
2028         }
2029
2030
2031         rth = dst_alloc(&ipv4_dst_ops);
2032         if (!rth) {
2033                 err = -ENOBUFS;
2034                 goto cleanup;
2035         }
2036
2037         atomic_set(&rth->u.dst.__refcnt, 1);
2038         rth->u.dst.flags= DST_HOST;
2039         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2040                 rth->u.dst.flags |= DST_NOXFRM;
2041         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2042                 rth->u.dst.flags |= DST_NOPOLICY;
2043
2044         rth->fl.fl4_dst = oldflp->fl4_dst;
2045         rth->fl.fl4_tos = tos;
2046         rth->fl.fl4_src = oldflp->fl4_src;
2047         rth->fl.oif     = oldflp->oif;
2048         rth->fl.mark    = oldflp->mark;
2049         rth->rt_dst     = fl->fl4_dst;
2050         rth->rt_src     = fl->fl4_src;
2051         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2052         /* get references to the devices that are to be hold by the routing
2053            cache entry */
2054         rth->u.dst.dev  = dev_out;
2055         dev_hold(dev_out);
2056         rth->idev       = in_dev_get(dev_out);
2057         rth->rt_gateway = fl->fl4_dst;
2058         rth->rt_spec_dst= fl->fl4_src;
2059
2060         rth->u.dst.output=ip_output;
2061
2062         RT_CACHE_STAT_INC(out_slow_tot);
2063
2064         if (flags & RTCF_LOCAL) {
2065                 rth->u.dst.input = ip_local_deliver;
2066                 rth->rt_spec_dst = fl->fl4_dst;
2067         }
2068         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2069                 rth->rt_spec_dst = fl->fl4_src;
2070                 if (flags & RTCF_LOCAL &&
2071                     !(dev_out->flags & IFF_LOOPBACK)) {
2072                         rth->u.dst.output = ip_mc_output;
2073                         RT_CACHE_STAT_INC(out_slow_mc);
2074                 }
2075 #ifdef CONFIG_IP_MROUTE
2076                 if (res->type == RTN_MULTICAST) {
2077                         if (IN_DEV_MFORWARD(in_dev) &&
2078                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2079                                 rth->u.dst.input = ip_mr_input;
2080                                 rth->u.dst.output = ip_mc_output;
2081                         }
2082                 }
2083 #endif
2084         }
2085
2086         rt_set_nexthop(rth, res, 0);
2087
2088         rth->rt_flags = flags;
2089
2090         *result = rth;
2091  cleanup:
2092         /* release work reference to inet device */
2093         in_dev_put(in_dev);
2094
2095         return err;
2096 }
2097
2098 static inline int ip_mkroute_output(struct rtable **rp,
2099                                     struct fib_result* res,
2100                                     const struct flowi *fl,
2101                                     const struct flowi *oldflp,
2102                                     struct net_device *dev_out,
2103                                     unsigned flags)
2104 {
2105         struct rtable *rth = NULL;
2106         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2107         unsigned hash;
2108         if (err == 0) {
2109                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2110                 err = rt_intern_hash(hash, rth, rp);
2111         }
2112
2113         return err;
2114 }
2115
2116 /*
2117  * Major route resolver routine.
2118  */
2119
2120 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2121 {
2122         u32 tos = RT_FL_TOS(oldflp);
2123         struct flowi fl = { .nl_u = { .ip4_u =
2124                                       { .daddr = oldflp->fl4_dst,
2125                                         .saddr = oldflp->fl4_src,
2126                                         .tos = tos & IPTOS_RT_MASK,
2127                                         .scope = ((tos & RTO_ONLINK) ?
2128                                                   RT_SCOPE_LINK :
2129                                                   RT_SCOPE_UNIVERSE),
2130                                       } },
2131                             .mark = oldflp->mark,
2132                             .iif = init_net.loopback_dev->ifindex,
2133                             .oif = oldflp->oif };
2134         struct fib_result res;
2135         unsigned flags = 0;
2136         struct net_device *dev_out = NULL;
2137         int free_res = 0;
2138         int err;
2139
2140
2141         res.fi          = NULL;
2142 #ifdef CONFIG_IP_MULTIPLE_TABLES
2143         res.r           = NULL;
2144 #endif
2145
2146         if (oldflp->fl4_src) {
2147                 err = -EINVAL;
2148                 if (MULTICAST(oldflp->fl4_src) ||
2149                     BADCLASS(oldflp->fl4_src) ||
2150                     ZERONET(oldflp->fl4_src))
2151                         goto out;
2152
2153                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2154                 dev_out = ip_dev_find(oldflp->fl4_src);
2155                 if (dev_out == NULL)
2156                         goto out;
2157
2158                 /* I removed check for oif == dev_out->oif here.
2159                    It was wrong for two reasons:
2160                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2161                       assigned to multiple interfaces.
2162                    2. Moreover, we are allowed to send packets with saddr
2163                       of another iface. --ANK
2164                  */
2165
2166                 if (oldflp->oif == 0
2167                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2168                         /* Special hack: user can direct multicasts
2169                            and limited broadcast via necessary interface
2170                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2171                            This hack is not just for fun, it allows
2172                            vic,vat and friends to work.
2173                            They bind socket to loopback, set ttl to zero
2174                            and expect that it will work.
2175                            From the viewpoint of routing cache they are broken,
2176                            because we are not allowed to build multicast path
2177                            with loopback source addr (look, routing cache
2178                            cannot know, that ttl is zero, so that packet
2179                            will not leave this host and route is valid).
2180                            Luckily, this hack is good workaround.
2181                          */
2182
2183                         fl.oif = dev_out->ifindex;
2184                         goto make_route;
2185                 }
2186                 if (dev_out)
2187                         dev_put(dev_out);
2188                 dev_out = NULL;
2189         }
2190
2191
2192         if (oldflp->oif) {
2193                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2194                 err = -ENODEV;
2195                 if (dev_out == NULL)
2196                         goto out;
2197
2198                 /* RACE: Check return value of inet_select_addr instead. */
2199                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2200                         dev_put(dev_out);
2201                         goto out;       /* Wrong error code */
2202                 }
2203
2204                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2205                         if (!fl.fl4_src)
2206                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2207                                                               RT_SCOPE_LINK);
2208                         goto make_route;
2209                 }
2210                 if (!fl.fl4_src) {
2211                         if (MULTICAST(oldflp->fl4_dst))
2212                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2213                                                               fl.fl4_scope);
2214                         else if (!oldflp->fl4_dst)
2215                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2216                                                               RT_SCOPE_HOST);
2217                 }
2218         }
2219
2220         if (!fl.fl4_dst) {
2221                 fl.fl4_dst = fl.fl4_src;
2222                 if (!fl.fl4_dst)
2223                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2224                 if (dev_out)
2225                         dev_put(dev_out);
2226                 dev_out = init_net.loopback_dev;
2227                 dev_hold(dev_out);
2228                 fl.oif = init_net.loopback_dev->ifindex;
2229                 res.type = RTN_LOCAL;
2230                 flags |= RTCF_LOCAL;
2231                 goto make_route;
2232         }
2233
2234         if (fib_lookup(&fl, &res)) {
2235                 res.fi = NULL;
2236                 if (oldflp->oif) {
2237                         /* Apparently, routing tables are wrong. Assume,
2238                            that the destination is on link.
2239
2240                            WHY? DW.
2241                            Because we are allowed to send to iface
2242                            even if it has NO routes and NO assigned
2243                            addresses. When oif is specified, routing
2244                            tables are looked up with only one purpose:
2245                            to catch if destination is gatewayed, rather than
2246                            direct. Moreover, if MSG_DONTROUTE is set,
2247                            we send packet, ignoring both routing tables
2248                            and ifaddr state. --ANK
2249
2250
2251                            We could make it even if oif is unknown,
2252                            likely IPv6, but we do not.
2253                          */
2254
2255                         if (fl.fl4_src == 0)
2256                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2257                                                               RT_SCOPE_LINK);
2258                         res.type = RTN_UNICAST;
2259                         goto make_route;
2260                 }
2261                 if (dev_out)
2262                         dev_put(dev_out);
2263                 err = -ENETUNREACH;
2264                 goto out;
2265         }
2266         free_res = 1;
2267
2268         if (res.type == RTN_LOCAL) {
2269                 if (!fl.fl4_src)
2270                         fl.fl4_src = fl.fl4_dst;
2271                 if (dev_out)
2272                         dev_put(dev_out);
2273                 dev_out = init_net.loopback_dev;
2274                 dev_hold(dev_out);
2275                 fl.oif = dev_out->ifindex;
2276                 if (res.fi)
2277                         fib_info_put(res.fi);
2278                 res.fi = NULL;
2279                 flags |= RTCF_LOCAL;
2280                 goto make_route;
2281         }
2282
2283 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2284         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2285                 fib_select_multipath(&fl, &res);
2286         else
2287 #endif
2288         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2289                 fib_select_default(&fl, &res);
2290
2291         if (!fl.fl4_src)
2292                 fl.fl4_src = FIB_RES_PREFSRC(res);
2293
2294         if (dev_out)
2295                 dev_put(dev_out);
2296         dev_out = FIB_RES_DEV(res);
2297         dev_hold(dev_out);
2298         fl.oif = dev_out->ifindex;
2299
2300
2301 make_route:
2302         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2303
2304
2305         if (free_res)
2306                 fib_res_put(&res);
2307         if (dev_out)
2308                 dev_put(dev_out);
2309 out:    return err;
2310 }
2311
2312 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2313 {
2314         unsigned hash;
2315         struct rtable *rth;
2316
2317         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2318
2319         rcu_read_lock_bh();
2320         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2321                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2322                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2323                     rth->fl.fl4_src == flp->fl4_src &&
2324                     rth->fl.iif == 0 &&
2325                     rth->fl.oif == flp->oif &&
2326                     rth->fl.mark == flp->mark &&
2327                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2328                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2329                         rth->u.dst.lastuse = jiffies;
2330                         dst_hold(&rth->u.dst);
2331                         rth->u.dst.__use++;
2332                         RT_CACHE_STAT_INC(out_hit);
2333                         rcu_read_unlock_bh();
2334                         *rp = rth;
2335                         return 0;
2336                 }
2337                 RT_CACHE_STAT_INC(out_hlist_search);
2338         }
2339         rcu_read_unlock_bh();
2340
2341         return ip_route_output_slow(rp, flp);
2342 }
2343
2344 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2345
2346 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2347 {
2348 }
2349
2350 static struct dst_ops ipv4_dst_blackhole_ops = {
2351         .family                 =       AF_INET,
2352         .protocol               =       __constant_htons(ETH_P_IP),
2353         .destroy                =       ipv4_dst_destroy,
2354         .check                  =       ipv4_dst_check,
2355         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2356         .entry_size             =       sizeof(struct rtable),
2357 };
2358
2359
/*
 *	Input/output handler of a blackhole route: the packet is freed and
 *	success (0) is reported.
 */
static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
2365
2366 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2367 {
2368         struct rtable *ort = *rp;
2369         struct rtable *rt = (struct rtable *)
2370                 dst_alloc(&ipv4_dst_blackhole_ops);
2371
2372         if (rt) {
2373                 struct dst_entry *new = &rt->u.dst;
2374
2375                 atomic_set(&new->__refcnt, 1);
2376                 new->__use = 1;
2377                 new->input = ipv4_blackhole_output;
2378                 new->output = ipv4_blackhole_output;
2379                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2380
2381                 new->dev = ort->u.dst.dev;
2382                 if (new->dev)
2383                         dev_hold(new->dev);
2384
2385                 rt->fl = ort->fl;
2386
2387                 rt->idev = ort->idev;
2388                 if (rt->idev)
2389                         in_dev_hold(rt->idev);
2390                 rt->rt_flags = ort->rt_flags;
2391                 rt->rt_type = ort->rt_type;
2392                 rt->rt_dst = ort->rt_dst;
2393                 rt->rt_src = ort->rt_src;
2394                 rt->rt_iif = ort->rt_iif;
2395                 rt->rt_gateway = ort->rt_gateway;
2396                 rt->rt_spec_dst = ort->rt_spec_dst;
2397                 rt->peer = ort->peer;
2398                 if (rt->peer)
2399                         atomic_inc(&rt->peer->refcnt);
2400
2401                 dst_free(new);
2402         }
2403
2404         dst_release(&(*rp)->u.dst);
2405         *rp = rt;
2406         return (rt ? 0 : -ENOMEM);
2407 }
2408
2409 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2410 {
2411         int err;
2412
2413         if ((err = __ip_route_output_key(rp, flp)) != 0)
2414                 return err;
2415
2416         if (flp->proto) {
2417                 if (!flp->fl4_src)
2418                         flp->fl4_src = (*rp)->rt_src;
2419                 if (!flp->fl4_dst)
2420                         flp->fl4_dst = (*rp)->rt_dst;
2421                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2422                 if (err == -EREMOTE)
2423                         err = ipv4_dst_blackhole(rp, flp, sk);
2424
2425                 return err;
2426         }
2427
2428         return 0;
2429 }
2430
2431 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2432
2433 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2434 {
2435         return ip_route_output_flow(rp, flp, NULL, 0);
2436 }
2437
2438 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2439                         int nowait, unsigned int flags)
2440 {
2441         struct rtable *rt = (struct rtable*)skb->dst;
2442         struct rtmsg *r;
2443         struct nlmsghdr *nlh;
2444         long expires;
2445         u32 id = 0, ts = 0, tsage = 0, error;
2446
2447         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2448         if (nlh == NULL)
2449                 return -EMSGSIZE;
2450
2451         r = nlmsg_data(nlh);
2452         r->rtm_family    = AF_INET;
2453         r->rtm_dst_len  = 32;
2454         r->rtm_src_len  = 0;
2455         r->rtm_tos      = rt->fl.fl4_tos;
2456         r->rtm_table    = RT_TABLE_MAIN;
2457         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2458         r->rtm_type     = rt->rt_type;
2459         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2460         r->rtm_protocol = RTPROT_UNSPEC;
2461         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2462         if (rt->rt_flags & RTCF_NOTIFY)
2463                 r->rtm_flags |= RTM_F_NOTIFY;
2464
2465         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2466
2467         if (rt->fl.fl4_src) {
2468                 r->rtm_src_len = 32;
2469                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2470         }
2471         if (rt->u.dst.dev)
2472                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2473 #ifdef CONFIG_NET_CLS_ROUTE
2474         if (rt->u.dst.tclassid)
2475                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2476 #endif
2477         if (rt->fl.iif)
2478                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2479         else if (rt->rt_src != rt->fl.fl4_src)
2480                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2481
2482         if (rt->rt_dst != rt->rt_gateway)
2483                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2484
2485         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2486                 goto nla_put_failure;
2487
2488         error = rt->u.dst.error;
2489         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2490         if (rt->peer) {
2491                 id = rt->peer->ip_id_count;
2492                 if (rt->peer->tcp_ts_stamp) {
2493                         ts = rt->peer->tcp_ts;
2494                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2495                 }
2496         }
2497
2498         if (rt->fl.iif) {
2499 #ifdef CONFIG_IP_MROUTE
2500                 __be32 dst = rt->rt_dst;
2501
2502                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2503                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2504                         int err = ipmr_get_route(skb, r, nowait);
2505                         if (err <= 0) {
2506                                 if (!nowait) {
2507                                         if (err == 0)
2508                                                 return 0;
2509                                         goto nla_put_failure;
2510                                 } else {
2511                                         if (err == -EMSGSIZE)
2512                                                 goto nla_put_failure;
2513                                         error = err;
2514                                 }
2515                         }
2516                 } else
2517 #endif
2518                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2519         }
2520
2521         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2522                                expires, error) < 0)
2523                 goto nla_put_failure;
2524
2525         return nlmsg_end(skb, nlh);
2526
2527 nla_put_failure:
2528         nlmsg_cancel(skb, nlh);
2529         return -EMSGSIZE;
2530 }
2531
2532 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2533 {
2534         struct rtmsg *rtm;
2535         struct nlattr *tb[RTA_MAX+1];
2536         struct rtable *rt = NULL;
2537         __be32 dst = 0;
2538         __be32 src = 0;
2539         u32 iif;
2540         int err;
2541         struct sk_buff *skb;
2542
2543         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2544         if (err < 0)
2545                 goto errout;
2546
2547         rtm = nlmsg_data(nlh);
2548
2549         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550         if (skb == NULL) {
2551                 err = -ENOBUFS;
2552                 goto errout;
2553         }
2554
2555         /* Reserve room for dummy headers, this skb can pass
2556            through good chunk of routing engine.
2557          */
2558         skb_reset_mac_header(skb);
2559         skb_reset_network_header(skb);
2560
2561         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2562         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2563         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2564
2565         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2566         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2567         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2568
2569         if (iif) {
2570                 struct net_device *dev;
2571
2572                 dev = __dev_get_by_index(&init_net, iif);
2573                 if (dev == NULL) {
2574                         err = -ENODEV;
2575                         goto errout_free;
2576                 }
2577
2578                 skb->protocol   = htons(ETH_P_IP);
2579                 skb->dev        = dev;
2580                 local_bh_disable();
2581                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2582                 local_bh_enable();
2583
2584                 rt = (struct rtable*) skb->dst;
2585                 if (err == 0 && rt->u.dst.error)
2586                         err = -rt->u.dst.error;
2587         } else {
2588                 struct flowi fl = {
2589                         .nl_u = {
2590                                 .ip4_u = {
2591                                         .daddr = dst,
2592                                         .saddr = src,
2593                                         .tos = rtm->rtm_tos,
2594                                 },
2595                         },
2596                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2597                 };
2598                 err = ip_route_output_key(&rt, &fl);
2599         }
2600
2601         if (err)
2602                 goto errout_free;
2603
2604         skb->dst = &rt->u.dst;
2605         if (rtm->rtm_flags & RTM_F_NOTIFY)
2606                 rt->rt_flags |= RTCF_NOTIFY;
2607
2608         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2609                                 RTM_NEWROUTE, 0, 0);
2610         if (err <= 0)
2611                 goto errout_free;
2612
2613         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2614 errout:
2615         return err;
2616
2617 errout_free:
2618         kfree_skb(skb);
2619         goto errout;
2620 }
2621
2622 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2623 {
2624         struct rtable *rt;
2625         int h, s_h;
2626         int idx, s_idx;
2627
2628         s_h = cb->args[0];
2629         s_idx = idx = cb->args[1];
2630         for (h = 0; h <= rt_hash_mask; h++) {
2631                 if (h < s_h) continue;
2632                 if (h > s_h)
2633                         s_idx = 0;
2634                 rcu_read_lock_bh();
2635                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2636                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2637                         if (idx < s_idx)
2638                                 continue;
2639                         skb->dst = dst_clone(&rt->u.dst);
2640                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2641                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2642                                          1, NLM_F_MULTI) <= 0) {
2643                                 dst_release(xchg(&skb->dst, NULL));
2644                                 rcu_read_unlock_bh();
2645                                 goto done;
2646                         }
2647                         dst_release(xchg(&skb->dst, NULL));
2648                 }
2649                 rcu_read_unlock_bh();
2650         }
2651
2652 done:
2653         cb->args[0] = h;
2654         cb->args[1] = idx;
2655         return skb->len;
2656 }
2657
/*
 * Multicast event hook: flushes the routing cache by calling
 * rt_cache_flush() with a delay argument of 0.  in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}
2662
2663 #ifdef CONFIG_SYSCTL
2664 static int flush_delay;
2665
2666 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2667                                         struct file *filp, void __user *buffer,
2668                                         size_t *lenp, loff_t *ppos)
2669 {
2670         if (write) {
2671                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2672                 rt_cache_flush(flush_delay);
2673                 return 0;
2674         }
2675
2676         return -EINVAL;
2677 }
2678
2679 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2680                                                 int __user *name,
2681                                                 int nlen,
2682                                                 void __user *oldval,
2683                                                 size_t __user *oldlenp,
2684                                                 void __user *newval,
2685                                                 size_t newlen)
2686 {
2687         int delay;
2688         if (newlen != sizeof(int))
2689                 return -EINVAL;
2690         if (get_user(delay, (int __user *)newval))
2691                 return -EFAULT;
2692         rt_cache_flush(delay);
2693         return 0;
2694 }
2695
2696 ctl_table ipv4_route_table[] = {
2697         {
2698                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2699                 .procname       = "flush",
2700                 .data           = &flush_delay,
2701                 .maxlen         = sizeof(int),
2702                 .mode           = 0200,
2703                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2704                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2705         },
2706         {
2707                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2708                 .procname       = "min_delay",
2709                 .data           = &ip_rt_min_delay,
2710                 .maxlen         = sizeof(int),
2711                 .mode           = 0644,
2712                 .proc_handler   = &proc_dointvec_jiffies,
2713                 .strategy       = &sysctl_jiffies,
2714         },
2715         {
2716                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2717                 .procname       = "max_delay",
2718                 .data           = &ip_rt_max_delay,
2719                 .maxlen         = sizeof(int),
2720                 .mode           = 0644,
2721                 .proc_handler   = &proc_dointvec_jiffies,
2722                 .strategy       = &sysctl_jiffies,
2723         },
2724         {
2725                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2726                 .procname       = "gc_thresh",
2727                 .data           = &ipv4_dst_ops.gc_thresh,
2728                 .maxlen         = sizeof(int),
2729                 .mode           = 0644,
2730                 .proc_handler   = &proc_dointvec,
2731         },
2732         {
2733                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2734                 .procname       = "max_size",
2735                 .data           = &ip_rt_max_size,
2736                 .maxlen         = sizeof(int),
2737                 .mode           = 0644,
2738                 .proc_handler   = &proc_dointvec,
2739         },
2740         {
2741                 /*  Deprecated. Use gc_min_interval_ms */
2742
2743                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2744                 .procname       = "gc_min_interval",
2745                 .data           = &ip_rt_gc_min_interval,
2746                 .maxlen         = sizeof(int),
2747                 .mode           = 0644,
2748                 .proc_handler   = &proc_dointvec_jiffies,
2749                 .strategy       = &sysctl_jiffies,
2750         },
2751         {
2752                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2753                 .procname       = "gc_min_interval_ms",
2754                 .data           = &ip_rt_gc_min_interval,
2755                 .maxlen         = sizeof(int),
2756                 .mode           = 0644,
2757                 .proc_handler   = &proc_dointvec_ms_jiffies,
2758                 .strategy       = &sysctl_ms_jiffies,
2759         },
2760         {
2761                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2762                 .procname       = "gc_timeout",
2763                 .data           = &ip_rt_gc_timeout,
2764                 .maxlen         = sizeof(int),
2765                 .mode           = 0644,
2766                 .proc_handler   = &proc_dointvec_jiffies,
2767                 .strategy       = &sysctl_jiffies,
2768         },
2769         {
2770                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2771                 .procname       = "gc_interval",
2772                 .data           = &ip_rt_gc_interval,
2773                 .maxlen         = sizeof(int),
2774                 .mode           = 0644,
2775                 .proc_handler   = &proc_dointvec_jiffies,
2776                 .strategy       = &sysctl_jiffies,
2777         },
2778         {
2779                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2780                 .procname       = "redirect_load",
2781                 .data           = &ip_rt_redirect_load,
2782                 .maxlen         = sizeof(int),
2783                 .mode           = 0644,
2784                 .proc_handler   = &proc_dointvec,
2785         },
2786         {
2787                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2788                 .procname       = "redirect_number",
2789                 .data           = &ip_rt_redirect_number,
2790                 .maxlen         = sizeof(int),
2791                 .mode           = 0644,
2792                 .proc_handler   = &proc_dointvec,
2793         },
2794         {
2795                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2796                 .procname       = "redirect_silence",
2797                 .data           = &ip_rt_redirect_silence,
2798                 .maxlen         = sizeof(int),
2799                 .mode           = 0644,
2800                 .proc_handler   = &proc_dointvec,
2801         },
2802         {
2803                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2804                 .procname       = "error_cost",
2805                 .data           = &ip_rt_error_cost,
2806                 .maxlen         = sizeof(int),
2807                 .mode           = 0644,
2808                 .proc_handler   = &proc_dointvec,
2809         },
2810         {
2811                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2812                 .procname       = "error_burst",
2813                 .data           = &ip_rt_error_burst,
2814                 .maxlen         = sizeof(int),
2815                 .mode           = 0644,
2816                 .proc_handler   = &proc_dointvec,
2817         },
2818         {
2819                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2820                 .procname       = "gc_elasticity",
2821                 .data           = &ip_rt_gc_elasticity,
2822                 .maxlen         = sizeof(int),
2823                 .mode           = 0644,
2824                 .proc_handler   = &proc_dointvec,
2825         },
2826         {
2827                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2828                 .procname       = "mtu_expires",
2829                 .data           = &ip_rt_mtu_expires,
2830                 .maxlen         = sizeof(int),
2831                 .mode           = 0644,
2832                 .proc_handler   = &proc_dointvec_jiffies,
2833                 .strategy       = &sysctl_jiffies,
2834         },
2835         {
2836                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2837                 .procname       = "min_pmtu",
2838                 .data           = &ip_rt_min_pmtu,
2839                 .maxlen         = sizeof(int),
2840                 .mode           = 0644,
2841                 .proc_handler   = &proc_dointvec,
2842         },
2843         {
2844                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2845                 .procname       = "min_adv_mss",
2846                 .data           = &ip_rt_min_advmss,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = &proc_dointvec,
2850         },
2851         {
2852                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2853                 .procname       = "secret_interval",
2854                 .data           = &ip_rt_secret_interval,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = &proc_dointvec_jiffies,
2858                 .strategy       = &sysctl_jiffies,
2859         },
2860         { .ctl_name = 0 }
2861 };
2862 #endif
2863
2864 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.  Each cpu owns a
 * run of 256 consecutive entries.  The argument is parenthesised so that
 * compound expressions such as IP_RT_ACCT_CPU(a + b) select the intended
 * run instead of being split by operator precedence.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2871
2872 #ifdef CONFIG_PROC_FS
2873 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2874                            int length, int *eof, void *data)
2875 {
2876         unsigned int i;
2877
2878         if ((offset & 3) || (length & 3))
2879                 return -EIO;
2880
2881         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2882                 *eof = 1;
2883                 return 0;
2884         }
2885
2886         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2887                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2888                 *eof = 1;
2889         }
2890
2891         offset /= sizeof(u32);
2892
2893         if (length > 0) {
2894                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2895                 u32 *dst = (u32 *) buffer;
2896
2897                 /* Copy first cpu. */
2898                 *start = buffer;
2899                 memcpy(dst, src, length);
2900
2901                 /* Add the other cpus in, one int at a time */
2902                 for_each_possible_cpu(i) {
2903                         unsigned int j;
2904
2905                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2906
2907                         for (j = 0; j < length/4; j++)
2908                                 dst[j] += src[j];
2909                 }
2910         }
2911         return length;
2912 }
2913 #endif /* CONFIG_PROC_FS */
2914 #endif /* CONFIG_NET_CLS_ROUTE */
2915
2916 static __initdata unsigned long rhash_entries;
2917 static int __init set_rhash_entries(char *str)
2918 {
2919         if (!str)
2920                 return 0;
2921         rhash_entries = simple_strtoul(str, &str, 0);
2922         return 1;
2923 }
2924 __setup("rhash_entries=", set_rhash_entries);
2925
2926 int __init ip_rt_init(void)
2927 {
2928         int rc = 0;
2929
2930         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2931                              (jiffies ^ (jiffies >> 7)));
2932
2933 #ifdef CONFIG_NET_CLS_ROUTE
2934         {
2935         int order;
2936         for (order = 0;
2937              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2938                 /* NOTHING */;
2939         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2940         if (!ip_rt_acct)
2941                 panic("IP: failed to allocate ip_rt_acct\n");
2942         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2943         }
2944 #endif
2945
2946         ipv4_dst_ops.kmem_cachep =
2947                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2948                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2949
2950         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2951
2952         rt_hash_table = (struct rt_hash_bucket *)
2953                 alloc_large_system_hash("IP route cache",
2954                                         sizeof(struct rt_hash_bucket),
2955                                         rhash_entries,
2956                                         (num_physpages >= 128 * 1024) ?
2957                                         15 : 17,
2958                                         0,
2959                                         &rt_hash_log,
2960                                         &rt_hash_mask,
2961                                         0);
2962         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2963         rt_hash_lock_init();
2964
2965         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2966         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2967
2968         devinet_init();
2969         ip_fib_init();
2970
2971         init_timer(&rt_flush_timer);
2972         rt_flush_timer.function = rt_run_flush;
2973         init_timer(&rt_secret_timer);
2974         rt_secret_timer.function = rt_secret_rebuild;
2975
2976         /* All the timers, started at system startup tend
2977            to synchronize. Perturb it a bit.
2978          */
2979         schedule_delayed_work(&expires_work,
2980                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2981
2982         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2983                 ip_rt_secret_interval;
2984         add_timer(&rt_secret_timer);
2985
2986 #ifdef CONFIG_PROC_FS
2987         {
2988         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2989         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2990             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2991                                              init_net.proc_net_stat))) {
2992                 return -ENOMEM;
2993         }
2994         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2995         }
2996 #ifdef CONFIG_NET_CLS_ROUTE
2997         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
2998 #endif
2999 #endif
3000 #ifdef CONFIG_XFRM
3001         xfrm_init();
3002         xfrm4_init();
3003 #endif
3004         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3005
3006         return rc;
3007 }
3008
3009 EXPORT_SYMBOL(__ip_select_ident);
3010 EXPORT_SYMBOL(ip_route_input);
3011 EXPORT_SYMBOL(ip_route_output_key);