net/ipv4/fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
46 #include <net/netlink.h>
47 #include <net/nexthop.h>
48
49 #include "fib_lookup.h"
50
51 #define FSprintk(a...)
52
53 static DEFINE_SPINLOCK(fib_info_lock);
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
57 static unsigned int fib_info_cnt;
58
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
62
63 #ifdef CONFIG_IP_ROUTE_MULTIPATH
64
65 static DEFINE_SPINLOCK(fib_multipath_lock);
66
67 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
68 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
69
70 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
71 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
72
73 #else /* CONFIG_IP_ROUTE_MULTIPATH */
74
75 /* Hope that gcc will optimize this to get rid of the dummy loop */
76
77 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
78 for (nhsel=0; nhsel < 1; nhsel++)
79
80 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
81 for (nhsel=0; nhsel < 1; nhsel++)
82
83 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
84
85 #define endfor_nexthops(fi) }
86
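/*
 * A minimal usage sketch of the iterator macros above (mirroring
 * free_fib_info() below): the opening macro starts a block that
 * endfor_nexthops() closes, so the two must always bracket the loop body.
 *
 *	change_nexthops(fi) {
 *		if (nh->nh_dev)
 *			dev_put(nh->nh_dev);
 *		nh->nh_dev = NULL;
 *	} endfor_nexthops(fi);
 */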
87
88 static const struct
89 {
90         int     error;
91         u8      scope;
92 } fib_props[RTN_MAX + 1] = {
93         {
94                 .error  = 0,
95                 .scope  = RT_SCOPE_NOWHERE,
96         },      /* RTN_UNSPEC */
97         {
98                 .error  = 0,
99                 .scope  = RT_SCOPE_UNIVERSE,
100         },      /* RTN_UNICAST */
101         {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },      /* RTN_LOCAL */
105         {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },      /* RTN_BROADCAST */
109         {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },      /* RTN_ANYCAST */
113         {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },      /* RTN_MULTICAST */
117         {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },      /* RTN_BLACKHOLE */
121         {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },      /* RTN_UNREACHABLE */
125         {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },      /* RTN_PROHIBIT */
129         {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },      /* RTN_THROW */
133         {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },      /* RTN_NAT */
137         {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },      /* RTN_XRESOLVE */
141 };
142
143
144 /* Release a nexthop info record */
145
146 void free_fib_info(struct fib_info *fi)
147 {
148         if (fi->fib_dead == 0) {
149                 printk("Freeing alive fib_info %p\n", fi);
150                 return;
151         }
152         change_nexthops(fi) {
153                 if (nh->nh_dev)
154                         dev_put(nh->nh_dev);
155                 nh->nh_dev = NULL;
156         } endfor_nexthops(fi);
157         fib_info_cnt--;
158         kfree(fi);
159 }
160
161 void fib_release_info(struct fib_info *fi)
162 {
163         spin_lock_bh(&fib_info_lock);
164         if (fi && --fi->fib_treeref == 0) {
165                 hlist_del(&fi->fib_hash);
166                 if (fi->fib_prefsrc)
167                         hlist_del(&fi->fib_lhash);
168                 change_nexthops(fi) {
169                         if (!nh->nh_dev)
170                                 continue;
171                         hlist_del(&nh->nh_hash);
172                 } endfor_nexthops(fi)
173                 fi->fib_dead = 1;
174                 fib_info_put(fi);
175         }
176         spin_unlock_bh(&fib_info_lock);
177 }
178
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 {
181         const struct fib_nh *onh = ofi->fib_nh;
182
183         for_nexthops(fi) {
184                 if (nh->nh_oif != onh->nh_oif ||
185                     nh->nh_gw  != onh->nh_gw ||
186                     nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188                     nh->nh_weight != onh->nh_weight ||
189 #endif
190 #ifdef CONFIG_NET_CLS_ROUTE
191                     nh->nh_tclassid != onh->nh_tclassid ||
192 #endif
193                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194                         return -1;
195                 onh++;
196         } endfor_nexthops(fi);
197         return 0;
198 }
199
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 {
202         unsigned int mask = (fib_hash_size - 1);
203         unsigned int val = fi->fib_nhs;
204
205         val ^= fi->fib_protocol;
206         val ^= (__force u32)fi->fib_prefsrc;
207         val ^= fi->fib_priority;
208
209         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
210 }
211
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 {
214         struct hlist_head *head;
215         struct hlist_node *node;
216         struct fib_info *fi;
217         unsigned int hash;
218
219         hash = fib_info_hashfn(nfi);
220         head = &fib_info_hash[hash];
221
222         hlist_for_each_entry(fi, node, head, fib_hash) {
223                 if (fi->fib_nhs != nfi->fib_nhs)
224                         continue;
225                 if (nfi->fib_protocol == fi->fib_protocol &&
226                     nfi->fib_prefsrc == fi->fib_prefsrc &&
227                     nfi->fib_priority == fi->fib_priority &&
228                     memcmp(nfi->fib_metrics, fi->fib_metrics,
229                            sizeof(fi->fib_metrics)) == 0 &&
230                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
232                         return fi;
233         }
234
235         return NULL;
236 }
237
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 {
240         unsigned int mask = DEVINDEX_HASHSIZE - 1;
241
242         return (val ^
243                 (val >> DEVINDEX_HASHBITS) ^
244                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 }
246
247 /* Check that the gateway is already configured.
248    Used only by the redirect acceptance routine.
249  */
250
251 int ip_fib_check_default(__be32 gw, struct net_device *dev)
252 {
253         struct hlist_head *head;
254         struct hlist_node *node;
255         struct fib_nh *nh;
256         unsigned int hash;
257
258         spin_lock(&fib_info_lock);
259
260         hash = fib_devindex_hashfn(dev->ifindex);
261         head = &fib_info_devhash[hash];
262         hlist_for_each_entry(nh, node, head, nh_hash) {
263                 if (nh->nh_dev == dev &&
264                     nh->nh_gw == gw &&
265                     !(nh->nh_flags&RTNH_F_DEAD)) {
266                         spin_unlock(&fib_info_lock);
267                         return 0;
268                 }
269         }
270
271         spin_unlock(&fib_info_lock);
272
273         return -1;
274 }
275
276 static inline size_t fib_nlmsg_size(struct fib_info *fi)
277 {
278         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
279                          + nla_total_size(4) /* RTA_TABLE */
280                          + nla_total_size(4) /* RTA_DST */
281                          + nla_total_size(4) /* RTA_PRIORITY */
282                          + nla_total_size(4); /* RTA_PREFSRC */
283
284         /* space for nested metrics */
285         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
286
287         if (fi->fib_nhs) {
288                 /* Also handles the special case fib_nhs == 1 */
289
290                 /* each nexthop is packed in an attribute */
291                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
292
293                 /* may contain flow and gateway attribute */
294                 nhsize += 2 * nla_total_size(4);
295
296                 /* all nexthops are packed in a nested attribute */
297                 payload += nla_total_size(fi->fib_nhs * nhsize);
298         }
299
300         return payload;
301 }
302
303 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
304                int dst_len, u32 tb_id, struct nl_info *info,
305                unsigned int nlm_flags)
306 {
307         struct sk_buff *skb;
308         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
309         int err = -ENOBUFS;
310
311         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
312         if (skb == NULL)
313                 goto errout;
314
315         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
316                             fa->fa_type, fa->fa_scope, key, dst_len,
317                             fa->fa_tos, fa->fa_info, nlm_flags);
318         if (err < 0) {
319                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
320                 WARN_ON(err == -EMSGSIZE);
321                 kfree_skb(skb);
322                 goto errout;
323         }
324         err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
325                           info->nlh, GFP_KERNEL);
326 errout:
327         if (err < 0)
328                 rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
329 }
330
331 /* Return the first fib alias matching TOS with
332  * priority less than or equal to PRIO.
333  */
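/*
 * A small worked example, assuming the alias list is kept sorted by
 * decreasing TOS and, within one TOS, by increasing priority: given the
 * list (tos=16, prio=1), (tos=16, prio=10), (tos=0, prio=5), a lookup
 * with tos=16, prio=10 returns the second entry, while tos=16, prio=20
 * skips both tos=16 entries and returns the tos=0 entry.
 */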
334 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
335 {
336         if (fah) {
337                 struct fib_alias *fa;
338                 list_for_each_entry(fa, fah, fa_list) {
339                         if (fa->fa_tos > tos)
340                                 continue;
341                         if (fa->fa_info->fib_priority >= prio ||
342                             fa->fa_tos < tos)
343                                 return fa;
344                 }
345         }
346         return NULL;
347 }
348
349 int fib_detect_death(struct fib_info *fi, int order,
350                      struct fib_info **last_resort, int *last_idx, int *dflt)
351 {
352         struct neighbour *n;
353         int state = NUD_NONE;
354
355         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
356         if (n) {
357                 state = n->nud_state;
358                 neigh_release(n);
359         }
360         if (state==NUD_REACHABLE)
361                 return 0;
362         if ((state&NUD_VALID) && order != *dflt)
363                 return 0;
364         if ((state&NUD_VALID) ||
365             (*last_idx<0 && order > *dflt)) {
366                 *last_resort = fi;
367                 *last_idx = order;
368         }
369         return 1;
370 }
371
372 #ifdef CONFIG_IP_ROUTE_MULTIPATH
373
374 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
375 {
376         int nhs = 0;
377
378         while (rtnh_ok(rtnh, remaining)) {
379                 nhs++;
380                 rtnh = rtnh_next(rtnh, &remaining);
381         }
382
383         /* leftover implies invalid nexthop configuration, discard it */
384         return remaining > 0 ? 0 : nhs;
385 }
386
387 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
388                        int remaining, struct fib_config *cfg)
389 {
390         change_nexthops(fi) {
391                 int attrlen;
392
393                 if (!rtnh_ok(rtnh, remaining))
394                         return -EINVAL;
395
396                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
397                 nh->nh_oif = rtnh->rtnh_ifindex;
398                 nh->nh_weight = rtnh->rtnh_hops + 1;
399
400                 attrlen = rtnh_attrlen(rtnh);
401                 if (attrlen > 0) {
402                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
403
404                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
405                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
406 #ifdef CONFIG_NET_CLS_ROUTE
407                         nla = nla_find(attrs, attrlen, RTA_FLOW);
408                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
409 #endif
410                 }
411
412                 rtnh = rtnh_next(rtnh, &remaining);
413         } endfor_nexthops(fi);
414
415         return 0;
416 }
417
418 #endif
419
420 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
421 {
422 #ifdef CONFIG_IP_ROUTE_MULTIPATH
423         struct rtnexthop *rtnh;
424         int remaining;
425 #endif
426
427         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
428                 return 1;
429
430         if (cfg->fc_oif || cfg->fc_gw) {
431                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
432                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
433                         return 0;
434                 return 1;
435         }
436
437 #ifdef CONFIG_IP_ROUTE_MULTIPATH
438         if (cfg->fc_mp == NULL)
439                 return 0;
440
441         rtnh = cfg->fc_mp;
442         remaining = cfg->fc_mp_len;
443
444         for_nexthops(fi) {
445                 int attrlen;
446
447                 if (!rtnh_ok(rtnh, remaining))
448                         return -EINVAL;
449
450                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
451                         return 1;
452
453                 attrlen = rtnh_attrlen(rtnh);
454                 if (attrlen > 0) {
455                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
456
457                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
458                         if (nla && nla_get_be32(nla) != nh->nh_gw)
459                                 return 1;
460 #ifdef CONFIG_NET_CLS_ROUTE
461                         nla = nla_find(attrs, attrlen, RTA_FLOW);
462                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
463                                 return 1;
464 #endif
465                 }
466
467                 rtnh = rtnh_next(rtnh, &remaining);
468         } endfor_nexthops(fi);
469 #endif
470         return 0;
471 }
472
473
474 /*
475    Picture
476    -------
477
478    The semantics of nexthops are very messy for historical reasons.
479    We have to take into account that:
480    a) the gateway can actually be a local interface address,
481       so that a gatewayed route is direct.
482    b) the gateway must be an on-link address, possibly
483       described not by an ifaddr but by a direct route.
484    c) if both a gateway and an interface are specified, they must not
485       contradict each other.
486    d) with tunnel routes, the gateway may not be on-link.
487
488    Attempting to reconcile all of these (alas, self-contradictory) conditions
489    results in pretty ugly and hairy code with obscure logic.
490
491    I chose to generalize it instead, so that the size
492    of the code barely increases, but it becomes
493    much more general.
494    Every prefix is assigned a "scope" value: "host" is a local address,
495    "link" is a direct route,
496    [ ... "site" ... "interior" ... ]
497    and "universe" is a true gateway route with global meaning.
498
499    Every prefix refers to a set of "nexthop"s (gw, oif),
500    where the gw must have narrower scope. This recursion stops
501    when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
502    which means that the gw is forced to be on-link.
503
504    The code is still hairy, but now it is apparently logically
505    consistent and very flexible. E.g. as a by-product it allows
506    independent exterior and interior routing processes to
507    coexist in peace.
508
509    Normally it looks like the following:
510
511    {universe prefix}  -> (gw, oif) [scope link]
512                           |
513                           |-> {link prefix} -> (gw, oif) [scope local]
514                                                 |
515                                                 |-> {local prefix} (terminal node)
516  */
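/*
 * A worked example of the recursion above (addresses purely illustrative):
 * a route 10.0.0.0/8 via 192.168.1.1 has scope "universe"; its gateway
 * 192.168.1.1 must resolve through a narrower-scoped prefix, e.g. the
 * "link" scope route 192.168.1.0/24 on eth0, whose nexthop in turn
 * resolves to the "host" scope local address of eth0, terminating the
 * recursion. fib_check_nh() below performs exactly this lookup, with
 * the gateway looked up at scope cfg->fc_scope + 1.
 */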
517
518 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
519                         struct fib_nh *nh)
520 {
521         int err;
522
523         if (nh->nh_gw) {
524                 struct fib_result res;
525
526 #ifdef CONFIG_IP_ROUTE_PERVASIVE
527                 if (nh->nh_flags&RTNH_F_PERVASIVE)
528                         return 0;
529 #endif
530                 if (nh->nh_flags&RTNH_F_ONLINK) {
531                         struct net_device *dev;
532
533                         if (cfg->fc_scope >= RT_SCOPE_LINK)
534                                 return -EINVAL;
535                         if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
536                                 return -EINVAL;
537                         if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
538                                 return -ENODEV;
539                         if (!(dev->flags&IFF_UP))
540                                 return -ENETDOWN;
541                         nh->nh_dev = dev;
542                         dev_hold(dev);
543                         nh->nh_scope = RT_SCOPE_LINK;
544                         return 0;
545                 }
546                 {
547                         struct flowi fl = {
548                                 .nl_u = {
549                                         .ip4_u = {
550                                                 .daddr = nh->nh_gw,
551                                                 .scope = cfg->fc_scope + 1,
552                                         },
553                                 },
554                                 .oif = nh->nh_oif,
555                         };
556
557                         /* It is not necessary, but requires a bit of thinking */
558                         if (fl.fl4_scope < RT_SCOPE_LINK)
559                                 fl.fl4_scope = RT_SCOPE_LINK;
560                         if ((err = fib_lookup(&fl, &res)) != 0)
561                                 return err;
562                 }
563                 err = -EINVAL;
564                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
565                         goto out;
566                 nh->nh_scope = res.scope;
567                 nh->nh_oif = FIB_RES_OIF(res);
568                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
569                         goto out;
570                 dev_hold(nh->nh_dev);
571                 err = -ENETDOWN;
572                 if (!(nh->nh_dev->flags & IFF_UP))
573                         goto out;
574                 err = 0;
575 out:
576                 fib_res_put(&res);
577                 return err;
578         } else {
579                 struct in_device *in_dev;
580
581                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
582                         return -EINVAL;
583
584                 in_dev = inetdev_by_index(nh->nh_oif);
585                 if (in_dev == NULL)
586                         return -ENODEV;
587                 if (!(in_dev->dev->flags&IFF_UP)) {
588                         in_dev_put(in_dev);
589                         return -ENETDOWN;
590                 }
591                 nh->nh_dev = in_dev->dev;
592                 dev_hold(nh->nh_dev);
593                 nh->nh_scope = RT_SCOPE_HOST;
594                 in_dev_put(in_dev);
595         }
596         return 0;
597 }
598
599 static inline unsigned int fib_laddr_hashfn(__be32 val)
600 {
601         unsigned int mask = (fib_hash_size - 1);
602
603         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
604 }
605
606 static struct hlist_head *fib_hash_alloc(int bytes)
607 {
608         if (bytes <= PAGE_SIZE)
609                 return kmalloc(bytes, GFP_KERNEL);
610         else
611                 return (struct hlist_head *)
612                         __get_free_pages(GFP_KERNEL, get_order(bytes));
613 }
614
615 static void fib_hash_free(struct hlist_head *hash, int bytes)
616 {
617         if (!hash)
618                 return;
619
620         if (bytes <= PAGE_SIZE)
621                 kfree(hash);
622         else
623                 free_pages((unsigned long) hash, get_order(bytes));
624 }
625
626 static void fib_hash_move(struct hlist_head *new_info_hash,
627                           struct hlist_head *new_laddrhash,
628                           unsigned int new_size)
629 {
630         struct hlist_head *old_info_hash, *old_laddrhash;
631         unsigned int old_size = fib_hash_size;
632         unsigned int i, bytes;
633
634         spin_lock_bh(&fib_info_lock);
635         old_info_hash = fib_info_hash;
636         old_laddrhash = fib_info_laddrhash;
637         fib_hash_size = new_size;
638
639         for (i = 0; i < old_size; i++) {
640                 struct hlist_head *head = &fib_info_hash[i];
641                 struct hlist_node *node, *n;
642                 struct fib_info *fi;
643
644                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
645                         struct hlist_head *dest;
646                         unsigned int new_hash;
647
648                         hlist_del(&fi->fib_hash);
649
650                         new_hash = fib_info_hashfn(fi);
651                         dest = &new_info_hash[new_hash];
652                         hlist_add_head(&fi->fib_hash, dest);
653                 }
654         }
655         fib_info_hash = new_info_hash;
656
657         for (i = 0; i < old_size; i++) {
658                 struct hlist_head *lhead = &fib_info_laddrhash[i];
659                 struct hlist_node *node, *n;
660                 struct fib_info *fi;
661
662                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
663                         struct hlist_head *ldest;
664                         unsigned int new_hash;
665
666                         hlist_del(&fi->fib_lhash);
667
668                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
669                         ldest = &new_laddrhash[new_hash];
670                         hlist_add_head(&fi->fib_lhash, ldest);
671                 }
672         }
673         fib_info_laddrhash = new_laddrhash;
674
675         spin_unlock_bh(&fib_info_lock);
676
677         bytes = old_size * sizeof(struct hlist_head *);
678         fib_hash_free(old_info_hash, bytes);
679         fib_hash_free(old_laddrhash, bytes);
680 }
681
682 struct fib_info *fib_create_info(struct fib_config *cfg)
683 {
684         int err;
685         struct fib_info *fi = NULL;
686         struct fib_info *ofi;
687         int nhs = 1;
688
689         /* Fast check to catch the weirdest cases */
690         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
691                 goto err_inval;
692
693 #ifdef CONFIG_IP_ROUTE_MULTIPATH
694         if (cfg->fc_mp) {
695                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
696                 if (nhs == 0)
697                         goto err_inval;
698         }
699 #endif
700 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
701         if (cfg->fc_mp_alg) {
702                 if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
703                     cfg->fc_mp_alg > IP_MP_ALG_MAX)
704                         goto err_inval;
705         }
706 #endif
707
708         err = -ENOBUFS;
709         if (fib_info_cnt >= fib_hash_size) {
710                 unsigned int new_size = fib_hash_size << 1;
711                 struct hlist_head *new_info_hash;
712                 struct hlist_head *new_laddrhash;
713                 unsigned int bytes;
714
715                 if (!new_size)
716                         new_size = 1;
717                 bytes = new_size * sizeof(struct hlist_head *);
718                 new_info_hash = fib_hash_alloc(bytes);
719                 new_laddrhash = fib_hash_alloc(bytes);
720                 if (!new_info_hash || !new_laddrhash) {
721                         fib_hash_free(new_info_hash, bytes);
722                         fib_hash_free(new_laddrhash, bytes);
723                 } else {
724                         memset(new_info_hash, 0, bytes);
725                         memset(new_laddrhash, 0, bytes);
726
727                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
728                 }
729
730                 if (!fib_hash_size)
731                         goto failure;
732         }
733
734         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
735         if (fi == NULL)
736                 goto failure;
737         fib_info_cnt++;
738
739         fi->fib_protocol = cfg->fc_protocol;
740         fi->fib_flags = cfg->fc_flags;
741         fi->fib_priority = cfg->fc_priority;
742         fi->fib_prefsrc = cfg->fc_prefsrc;
743
744         fi->fib_nhs = nhs;
745         change_nexthops(fi) {
746                 nh->nh_parent = fi;
747         } endfor_nexthops(fi)
748
749         if (cfg->fc_mx) {
750                 struct nlattr *nla;
751                 int remaining;
752
753                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
754                         int type = nla->nla_type;
755
756                         if (type) {
757                                 if (type > RTAX_MAX)
758                                         goto err_inval;
759                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
760                         }
761                 }
762         }
763
764         if (cfg->fc_mp) {
765 #ifdef CONFIG_IP_ROUTE_MULTIPATH
766                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
767                 if (err != 0)
768                         goto failure;
769                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
770                         goto err_inval;
771                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
772                         goto err_inval;
773 #ifdef CONFIG_NET_CLS_ROUTE
774                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
775                         goto err_inval;
776 #endif
777 #else
778                 goto err_inval;
779 #endif
780         } else {
781                 struct fib_nh *nh = fi->fib_nh;
782
783                 nh->nh_oif = cfg->fc_oif;
784                 nh->nh_gw = cfg->fc_gw;
785                 nh->nh_flags = cfg->fc_flags;
786 #ifdef CONFIG_NET_CLS_ROUTE
787                 nh->nh_tclassid = cfg->fc_flow;
788 #endif
789 #ifdef CONFIG_IP_ROUTE_MULTIPATH
790                 nh->nh_weight = 1;
791 #endif
792         }
793
794 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
795         fi->fib_mp_alg = cfg->fc_mp_alg;
796 #endif
797
798         if (fib_props[cfg->fc_type].error) {
799                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
800                         goto err_inval;
801                 goto link_it;
802         }
803
804         if (cfg->fc_scope > RT_SCOPE_HOST)
805                 goto err_inval;
806
807         if (cfg->fc_scope == RT_SCOPE_HOST) {
808                 struct fib_nh *nh = fi->fib_nh;
809
810                 /* Local address is added. */
811                 if (nhs != 1 || nh->nh_gw)
812                         goto err_inval;
813                 nh->nh_scope = RT_SCOPE_NOWHERE;
814                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
815                 err = -ENODEV;
816                 if (nh->nh_dev == NULL)
817                         goto failure;
818         } else {
819                 change_nexthops(fi) {
820                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
821                                 goto failure;
822                 } endfor_nexthops(fi)
823         }
824
825         if (fi->fib_prefsrc) {
826                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
827                     fi->fib_prefsrc != cfg->fc_dst)
828                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
829                                 goto err_inval;
830         }
831
832 link_it:
833         if ((ofi = fib_find_info(fi)) != NULL) {
834                 fi->fib_dead = 1;
835                 free_fib_info(fi);
836                 ofi->fib_treeref++;
837                 return ofi;
838         }
839
840         fi->fib_treeref++;
841         atomic_inc(&fi->fib_clntref);
842         spin_lock_bh(&fib_info_lock);
843         hlist_add_head(&fi->fib_hash,
844                        &fib_info_hash[fib_info_hashfn(fi)]);
845         if (fi->fib_prefsrc) {
846                 struct hlist_head *head;
847
848                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
849                 hlist_add_head(&fi->fib_lhash, head);
850         }
851         change_nexthops(fi) {
852                 struct hlist_head *head;
853                 unsigned int hash;
854
855                 if (!nh->nh_dev)
856                         continue;
857                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
858                 head = &fib_info_devhash[hash];
859                 hlist_add_head(&nh->nh_hash, head);
860         } endfor_nexthops(fi)
861         spin_unlock_bh(&fib_info_lock);
862         return fi;
863
864 err_inval:
865         err = -EINVAL;
866
867 failure:
868         if (fi) {
869                 fi->fib_dead = 1;
870                 free_fib_info(fi);
871         }
872
873         return ERR_PTR(err);
874 }
875
876 /* Note! fib_semantic_match intentionally uses RCU list functions. */
877 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
878                        struct fib_result *res, __be32 zone, __be32 mask,
879                         int prefixlen)
880 {
881         struct fib_alias *fa;
882         int nh_sel = 0;
883
884         list_for_each_entry_rcu(fa, head, fa_list) {
885                 int err;
886
887                 if (fa->fa_tos &&
888                     fa->fa_tos != flp->fl4_tos)
889                         continue;
890
891                 if (fa->fa_scope < flp->fl4_scope)
892                         continue;
893
894                 fa->fa_state |= FA_S_ACCESSED;
895
896                 err = fib_props[fa->fa_type].error;
897                 if (err == 0) {
898                         struct fib_info *fi = fa->fa_info;
899
900                         if (fi->fib_flags & RTNH_F_DEAD)
901                                 continue;
902
903                         switch (fa->fa_type) {
904                         case RTN_UNICAST:
905                         case RTN_LOCAL:
906                         case RTN_BROADCAST:
907                         case RTN_ANYCAST:
908                         case RTN_MULTICAST:
909                                 for_nexthops(fi) {
910                                         if (nh->nh_flags&RTNH_F_DEAD)
911                                                 continue;
912                                         if (!flp->oif || flp->oif == nh->nh_oif)
913                                                 break;
914                                 }
915 #ifdef CONFIG_IP_ROUTE_MULTIPATH
916                                 if (nhsel < fi->fib_nhs) {
917                                         nh_sel = nhsel;
918                                         goto out_fill_res;
919                                 }
920 #else
921                                 if (nhsel < 1) {
922                                         goto out_fill_res;
923                                 }
924 #endif
925                                 endfor_nexthops(fi);
926                                 continue;
927
928                         default:
929                                 printk(KERN_DEBUG "impossible 102\n");
930                                 return -EINVAL;
931                         }
932                 }
933                 return err;
934         }
935         return 1;
936
937 out_fill_res:
938         res->prefixlen = prefixlen;
939         res->nh_sel = nh_sel;
940         res->type = fa->fa_type;
941         res->scope = fa->fa_scope;
942         res->fi = fa->fa_info;
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944         res->netmask = mask;
945         res->network = zone & inet_make_mask(prefixlen);
946 #endif
947         atomic_inc(&res->fi->fib_clntref);
948         return 0;
949 }
950
951 /* Find an appropriate source address for this destination */
952
953 __be32 __fib_res_prefsrc(struct fib_result *res)
954 {
955         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
956 }
957
958 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
959                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
960                   struct fib_info *fi, unsigned int flags)
961 {
962         struct nlmsghdr *nlh;
963         struct rtmsg *rtm;
964
965         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
966         if (nlh == NULL)
967                 return -EMSGSIZE;
968
969         rtm = nlmsg_data(nlh);
970         rtm->rtm_family = AF_INET;
971         rtm->rtm_dst_len = dst_len;
972         rtm->rtm_src_len = 0;
973         rtm->rtm_tos = tos;
974         rtm->rtm_table = tb_id;
975         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
976         rtm->rtm_type = type;
977         rtm->rtm_flags = fi->fib_flags;
978         rtm->rtm_scope = scope;
979         rtm->rtm_protocol = fi->fib_protocol;
980
981         if (rtm->rtm_dst_len)
982                 NLA_PUT_BE32(skb, RTA_DST, dst);
983
984         if (fi->fib_priority)
985                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
986
987         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
988                 goto nla_put_failure;
989
990         if (fi->fib_prefsrc)
991                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
992
993         if (fi->fib_nhs == 1) {
994                 if (fi->fib_nh->nh_gw)
995                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
996
997                 if (fi->fib_nh->nh_oif)
998                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
999 #ifdef CONFIG_NET_CLS_ROUTE
1000                 if (fi->fib_nh[0].nh_tclassid)
1001                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1002 #endif
1003         }
1004 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1005         if (fi->fib_nhs > 1) {
1006                 struct rtnexthop *rtnh;
1007                 struct nlattr *mp;
1008
1009                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1010                 if (mp == NULL)
1011                         goto nla_put_failure;
1012
1013                 for_nexthops(fi) {
1014                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1015                         if (rtnh == NULL)
1016                                 goto nla_put_failure;
1017
1018                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1019                         rtnh->rtnh_hops = nh->nh_weight - 1;
1020                         rtnh->rtnh_ifindex = nh->nh_oif;
1021
1022                         if (nh->nh_gw)
1023                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1024 #ifdef CONFIG_NET_CLS_ROUTE
1025                         if (nh->nh_tclassid)
1026                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1027 #endif
1028                         /* length of rtnetlink header + attributes */
1029                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1030                 } endfor_nexthops(fi);
1031
1032                 nla_nest_end(skb, mp);
1033         }
1034 #endif
1035         return nlmsg_end(skb, nlh);
1036
1037 nla_put_failure:
1038         nlmsg_cancel(skb, nlh);
1039         return -EMSGSIZE;
1040 }
1041
1042 /*
1043    Update the FIB if:
1044    - a local address disappeared -> we must delete all the entries
1045      referring to it.
1046    - a device went down -> we must shut down all nexthops going via it.
1047  */
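/*
 * A rough sketch of how this is typically driven from fib_frontend.c
 * (assumed caller, on device-down or address-deletion events):
 *
 *	if (fib_sync_down(local, dev, force))
 *		fib_flush();
 *
 * i.e. this function only marks entries RTNH_F_DEAD; flushing the dead
 * routes from the tables is left to the caller.
 */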
1048
1049 int fib_sync_down(__be32 local, struct net_device *dev, int force)
1050 {
1051         int ret = 0;
1052         int scope = RT_SCOPE_NOWHERE;
1053
1054         if (force)
1055                 scope = -1;
1056
1057         if (local && fib_info_laddrhash) {
1058                 unsigned int hash = fib_laddr_hashfn(local);
1059                 struct hlist_head *head = &fib_info_laddrhash[hash];
1060                 struct hlist_node *node;
1061                 struct fib_info *fi;
1062
1063                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1064                         if (fi->fib_prefsrc == local) {
1065                                 fi->fib_flags |= RTNH_F_DEAD;
1066                                 ret++;
1067                         }
1068                 }
1069         }
1070
1071         if (dev) {
1072                 struct fib_info *prev_fi = NULL;
1073                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1074                 struct hlist_head *head = &fib_info_devhash[hash];
1075                 struct hlist_node *node;
1076                 struct fib_nh *nh;
1077
1078                 hlist_for_each_entry(nh, node, head, nh_hash) {
1079                         struct fib_info *fi = nh->nh_parent;
1080                         int dead;
1081
1082                         BUG_ON(!fi->fib_nhs);
1083                         if (nh->nh_dev != dev || fi == prev_fi)
1084                                 continue;
1085                         prev_fi = fi;
1086                         dead = 0;
1087                         change_nexthops(fi) {
1088                                 if (nh->nh_flags&RTNH_F_DEAD)
1089                                         dead++;
1090                                 else if (nh->nh_dev == dev &&
1091                                          nh->nh_scope != scope) {
1092                                         nh->nh_flags |= RTNH_F_DEAD;
1093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1094                                         spin_lock_bh(&fib_multipath_lock);
1095                                         fi->fib_power -= nh->nh_power;
1096                                         nh->nh_power = 0;
1097                                         spin_unlock_bh(&fib_multipath_lock);
1098 #endif
1099                                         dead++;
1100                                 }
1101 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1102                                 if (force > 1 && nh->nh_dev == dev) {
1103                                         dead = fi->fib_nhs;
1104                                         break;
1105                                 }
1106 #endif
1107                         } endfor_nexthops(fi)
1108                         if (dead == fi->fib_nhs) {
1109                                 fi->fib_flags |= RTNH_F_DEAD;
1110                                 ret++;
1111                         }
1112                 }
1113         }
1114
1115         return ret;
1116 }
1117
1118 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1119
1120 /*
1121    A dead device goes up. We wake up its dead nexthops.
1122    This only makes sense for multipath routes.
1123  */
1124
1125 int fib_sync_up(struct net_device *dev)
1126 {
1127         struct fib_info *prev_fi;
1128         unsigned int hash;
1129         struct hlist_head *head;
1130         struct hlist_node *node;
1131         struct fib_nh *nh;
1132         int ret;
1133
1134         if (!(dev->flags&IFF_UP))
1135                 return 0;
1136
1137         prev_fi = NULL;
1138         hash = fib_devindex_hashfn(dev->ifindex);
1139         head = &fib_info_devhash[hash];
1140         ret = 0;
1141
1142         hlist_for_each_entry(nh, node, head, nh_hash) {
1143                 struct fib_info *fi = nh->nh_parent;
1144                 int alive;
1145
1146                 BUG_ON(!fi->fib_nhs);
1147                 if (nh->nh_dev != dev || fi == prev_fi)
1148                         continue;
1149
1150                 prev_fi = fi;
1151                 alive = 0;
1152                 change_nexthops(fi) {
1153                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1154                                 alive++;
1155                                 continue;
1156                         }
1157                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1158                                 continue;
1159                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1160                                 continue;
1161                         alive++;
1162                         spin_lock_bh(&fib_multipath_lock);
1163                         nh->nh_power = 0;
1164                         nh->nh_flags &= ~RTNH_F_DEAD;
1165                         spin_unlock_bh(&fib_multipath_lock);
1166                 } endfor_nexthops(fi)
1167
1168                 if (alive > 0) {
1169                         fi->fib_flags &= ~RTNH_F_DEAD;
1170                         ret++;
1171                 }
1172         }
1173
1174         return ret;
1175 }
1176
1177 /*
1178    The algorithm is suboptimal, but it provides really
1179    fair weighted route distribution.
1180  */
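/*
 * A worked example of the weighting below: with two alive nexthops of
 * weight 2 and 1, fib_power is recharged to 3 and each nh_power to its
 * weight; every selection then decrements the chosen nexthop's nh_power
 * and fib_power, so within one recharge cycle of three selections the
 * first nexthop is picked exactly twice and the second exactly once.
 */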
1181
1182 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1183 {
1184         struct fib_info *fi = res->fi;
1185         int w;
1186
1187         spin_lock_bh(&fib_multipath_lock);
1188         if (fi->fib_power <= 0) {
1189                 int power = 0;
1190                 change_nexthops(fi) {
1191                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1192                                 power += nh->nh_weight;
1193                                 nh->nh_power = nh->nh_weight;
1194                         }
1195                 } endfor_nexthops(fi);
1196                 fi->fib_power = power;
1197                 if (power <= 0) {
1198                         spin_unlock_bh(&fib_multipath_lock);
1199                         /* Race condition: route has just become dead. */
1200                         res->nh_sel = 0;
1201                         return;
1202                 }
1203         }
1204
1205
1206         /* w should be a random number in [0..fi->fib_power-1];
1207            jiffies is a pretty bad approximation.
1208          */
1209
1210         w = jiffies % fi->fib_power;
1211
1212         change_nexthops(fi) {
1213                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1214                         if ((w -= nh->nh_power) <= 0) {
1215                                 nh->nh_power--;
1216                                 fi->fib_power--;
1217                                 res->nh_sel = nhsel;
1218                                 spin_unlock_bh(&fib_multipath_lock);
1219                                 return;
1220                         }
1221                 }
1222         } endfor_nexthops(fi);
1223
1224         /* Race condition: route has just become dead. */
1225         res->nh_sel = 0;
1226         spin_unlock_bh(&fib_multipath_lock);
1227 }
1228 #endif