Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47
48 #include "fib_lookup.h"
49
/* Serialises access to the three hash tables declared below. */
50 static DEFINE_SPINLOCK(fib_info_lock);
/* fib_info records, bucket chosen by fib_info_hashfn(). */
51 static struct hlist_head *fib_info_hash;
/* fib_info records that have a fib_prefsrc, bucket chosen by fib_laddr_hashfn(). */
52 static struct hlist_head *fib_info_laddrhash;
/* Bucket count shared by fib_info_hash and fib_info_laddrhash; 0 until fib_create_info() allocates them. */
53 static unsigned int fib_hash_size;
/* Number of allocated fib_info records; fib_create_info() grows the tables once it reaches fib_hash_size. */
54 static unsigned int fib_info_cnt;
55
56 #define DEVINDEX_HASHBITS 8
57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops that have a device, bucket chosen by fib_devindex_hashfn(ifindex). */
58 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59
60 #ifdef CONFIG_IP_ROUTE_MULTIPATH
61
62 static DEFINE_SPINLOCK(fib_multipath_lock);
63
/*
 * for_nexthops(fi) <body> endfor_nexthops(fi)
 *
 * Opens a brace scope declaring `nhsel` (loop index) and `nh` (read-only
 * pointer to the current nexthop) and starts a for loop over the
 * fi->fib_nhs entries of fi->fib_nh.  The statement that follows the macro
 * is the loop body; the scope has to be closed with endfor_nexthops().
 */
64 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
65 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
66
/* Same as for_nexthops(), but `nh` is a writable pointer (const is cast away). */
67 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
68 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
69
70 #else /* CONFIG_IP_ROUTE_MULTIPATH */
71
/*
 * Without multipath support only the first nexthop is visited: the loop
 * runs exactly once with nhsel == 0 and nh == (fi)->fib_nh.
 * The hope is that gcc optimizes the dummy loop away.
 */
72 /* Hope, that gcc will optimize it to get rid of dummy loop */
73
74 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
75 for (nhsel=0; nhsel < 1; nhsel++)
76
77 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
78 for (nhsel=0; nhsel < 1; nhsel++)
79
80 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
81
/* Closes the brace scope opened by for_nexthops()/change_nexthops(). */
82 #define endfor_nexthops(fi) }
83
84
85 static const struct
86 {
87         int     error;
88         u8      scope;
89 } fib_props[RTN_MAX + 1] = {
90         {
91                 .error  = 0,
92                 .scope  = RT_SCOPE_NOWHERE,
93         },      /* RTN_UNSPEC */
94         {
95                 .error  = 0,
96                 .scope  = RT_SCOPE_UNIVERSE,
97         },      /* RTN_UNICAST */
98         {
99                 .error  = 0,
100                 .scope  = RT_SCOPE_HOST,
101         },      /* RTN_LOCAL */
102         {
103                 .error  = 0,
104                 .scope  = RT_SCOPE_LINK,
105         },      /* RTN_BROADCAST */
106         {
107                 .error  = 0,
108                 .scope  = RT_SCOPE_LINK,
109         },      /* RTN_ANYCAST */
110         {
111                 .error  = 0,
112                 .scope  = RT_SCOPE_UNIVERSE,
113         },      /* RTN_MULTICAST */
114         {
115                 .error  = -EINVAL,
116                 .scope  = RT_SCOPE_UNIVERSE,
117         },      /* RTN_BLACKHOLE */
118         {
119                 .error  = -EHOSTUNREACH,
120                 .scope  = RT_SCOPE_UNIVERSE,
121         },      /* RTN_UNREACHABLE */
122         {
123                 .error  = -EACCES,
124                 .scope  = RT_SCOPE_UNIVERSE,
125         },      /* RTN_PROHIBIT */
126         {
127                 .error  = -EAGAIN,
128                 .scope  = RT_SCOPE_UNIVERSE,
129         },      /* RTN_THROW */
130         {
131                 .error  = -EINVAL,
132                 .scope  = RT_SCOPE_NOWHERE,
133         },      /* RTN_NAT */
134         {
135                 .error  = -EINVAL,
136                 .scope  = RT_SCOPE_NOWHERE,
137         },      /* RTN_XRESOLVE */
138 };
139
140
141 /* Release a nexthop info record */
142
143 void free_fib_info(struct fib_info *fi)
144 {
145         if (fi->fib_dead == 0) {
146                 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
147                 return;
148         }
149         change_nexthops(fi) {
150                 if (nh->nh_dev)
151                         dev_put(nh->nh_dev);
152                 nh->nh_dev = NULL;
153         } endfor_nexthops(fi);
154         fib_info_cnt--;
155         release_net(fi->fib_net);
156         kfree(fi);
157 }
158
159 void fib_release_info(struct fib_info *fi)
160 {
161         spin_lock_bh(&fib_info_lock);
162         if (fi && --fi->fib_treeref == 0) {
163                 hlist_del(&fi->fib_hash);
164                 if (fi->fib_prefsrc)
165                         hlist_del(&fi->fib_lhash);
166                 change_nexthops(fi) {
167                         if (!nh->nh_dev)
168                                 continue;
169                         hlist_del(&nh->nh_hash);
170                 } endfor_nexthops(fi)
171                 fi->fib_dead = 1;
172                 fib_info_put(fi);
173         }
174         spin_unlock_bh(&fib_info_lock);
175 }
176
177 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178 {
179         const struct fib_nh *onh = ofi->fib_nh;
180
181         for_nexthops(fi) {
182                 if (nh->nh_oif != onh->nh_oif ||
183                     nh->nh_gw  != onh->nh_gw ||
184                     nh->nh_scope != onh->nh_scope ||
185 #ifdef CONFIG_IP_ROUTE_MULTIPATH
186                     nh->nh_weight != onh->nh_weight ||
187 #endif
188 #ifdef CONFIG_NET_CLS_ROUTE
189                     nh->nh_tclassid != onh->nh_tclassid ||
190 #endif
191                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
192                         return -1;
193                 onh++;
194         } endfor_nexthops(fi);
195         return 0;
196 }
197
198 static inline unsigned int fib_devindex_hashfn(unsigned int val)
199 {
200         unsigned int mask = DEVINDEX_HASHSIZE - 1;
201
202         return (val ^
203                 (val >> DEVINDEX_HASHBITS) ^
204                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
205 }
206
207 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
208 {
209         unsigned int mask = (fib_hash_size - 1);
210         unsigned int val = fi->fib_nhs;
211
212         val ^= fi->fib_protocol;
213         val ^= (__force u32)fi->fib_prefsrc;
214         val ^= fi->fib_priority;
215         for_nexthops(fi) {
216                 val ^= fib_devindex_hashfn(nh->nh_oif);
217         } endfor_nexthops(fi)
218
219         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
220 }
221
222 static struct fib_info *fib_find_info(const struct fib_info *nfi)
223 {
224         struct hlist_head *head;
225         struct hlist_node *node;
226         struct fib_info *fi;
227         unsigned int hash;
228
229         hash = fib_info_hashfn(nfi);
230         head = &fib_info_hash[hash];
231
232         hlist_for_each_entry(fi, node, head, fib_hash) {
233                 if (fi->fib_net != nfi->fib_net)
234                         continue;
235                 if (fi->fib_nhs != nfi->fib_nhs)
236                         continue;
237                 if (nfi->fib_protocol == fi->fib_protocol &&
238                     nfi->fib_prefsrc == fi->fib_prefsrc &&
239                     nfi->fib_priority == fi->fib_priority &&
240                     memcmp(nfi->fib_metrics, fi->fib_metrics,
241                            sizeof(fi->fib_metrics)) == 0 &&
242                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
243                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
244                         return fi;
245         }
246
247         return NULL;
248 }
249
250 /* Check, that the gateway is already configured.
251    Used only by redirect accept routine.
252  */
253
254 int ip_fib_check_default(__be32 gw, struct net_device *dev)
255 {
256         struct hlist_head *head;
257         struct hlist_node *node;
258         struct fib_nh *nh;
259         unsigned int hash;
260
261         spin_lock(&fib_info_lock);
262
263         hash = fib_devindex_hashfn(dev->ifindex);
264         head = &fib_info_devhash[hash];
265         hlist_for_each_entry(nh, node, head, nh_hash) {
266                 if (nh->nh_dev == dev &&
267                     nh->nh_gw == gw &&
268                     !(nh->nh_flags&RTNH_F_DEAD)) {
269                         spin_unlock(&fib_info_lock);
270                         return 0;
271                 }
272         }
273
274         spin_unlock(&fib_info_lock);
275
276         return -1;
277 }
278
279 static inline size_t fib_nlmsg_size(struct fib_info *fi)
280 {
281         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
282                          + nla_total_size(4) /* RTA_TABLE */
283                          + nla_total_size(4) /* RTA_DST */
284                          + nla_total_size(4) /* RTA_PRIORITY */
285                          + nla_total_size(4); /* RTA_PREFSRC */
286
287         /* space for nested metrics */
288         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
289
290         if (fi->fib_nhs) {
291                 /* Also handles the special case fib_nhs == 1 */
292
293                 /* each nexthop is packed in an attribute */
294                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
295
296                 /* may contain flow and gateway attribute */
297                 nhsize += 2 * nla_total_size(4);
298
299                 /* all nexthops are packed in a nested attribute */
300                 payload += nla_total_size(fi->fib_nhs * nhsize);
301         }
302
303         return payload;
304 }
305
306 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
307                int dst_len, u32 tb_id, struct nl_info *info,
308                unsigned int nlm_flags)
309 {
310         struct sk_buff *skb;
311         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
312         int err = -ENOBUFS;
313
314         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
315         if (skb == NULL)
316                 goto errout;
317
318         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
319                             fa->fa_type, fa->fa_scope, key, dst_len,
320                             fa->fa_tos, fa->fa_info, nlm_flags);
321         if (err < 0) {
322                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
323                 WARN_ON(err == -EMSGSIZE);
324                 kfree_skb(skb);
325                 goto errout;
326         }
327         err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
328                           info->nlh, GFP_KERNEL);
329 errout:
330         if (err < 0)
331                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
332 }
333
334 /* Return the first fib alias matching TOS with
335  * priority less than or equal to PRIO.
336  */
337 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
338 {
339         if (fah) {
340                 struct fib_alias *fa;
341                 list_for_each_entry(fa, fah, fa_list) {
342                         if (fa->fa_tos > tos)
343                                 continue;
344                         if (fa->fa_info->fib_priority >= prio ||
345                             fa->fa_tos < tos)
346                                 return fa;
347                 }
348         }
349         return NULL;
350 }
351
352 int fib_detect_death(struct fib_info *fi, int order,
353                      struct fib_info **last_resort, int *last_idx, int dflt)
354 {
355         struct neighbour *n;
356         int state = NUD_NONE;
357
358         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
359         if (n) {
360                 state = n->nud_state;
361                 neigh_release(n);
362         }
363         if (state==NUD_REACHABLE)
364                 return 0;
365         if ((state&NUD_VALID) && order != dflt)
366                 return 0;
367         if ((state&NUD_VALID) ||
368             (*last_idx<0 && order > dflt)) {
369                 *last_resort = fi;
370                 *last_idx = order;
371         }
372         return 1;
373 }
374
375 #ifdef CONFIG_IP_ROUTE_MULTIPATH
376
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes.
 * Returns 0 when bytes are left over after the last well-formed entry,
 * since leftover data means the nexthop configuration is invalid.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int count;

        for (count = 0; rtnh_ok(rtnh, remaining); count++)
                rtnh = rtnh_next(rtnh, &remaining);

        if (remaining > 0)
                return 0;

        return count;
}
389
390 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
391                        int remaining, struct fib_config *cfg)
392 {
393         change_nexthops(fi) {
394                 int attrlen;
395
396                 if (!rtnh_ok(rtnh, remaining))
397                         return -EINVAL;
398
399                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400                 nh->nh_oif = rtnh->rtnh_ifindex;
401                 nh->nh_weight = rtnh->rtnh_hops + 1;
402
403                 attrlen = rtnh_attrlen(rtnh);
404                 if (attrlen > 0) {
405                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
406
407                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
408                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
409 #ifdef CONFIG_NET_CLS_ROUTE
410                         nla = nla_find(attrs, attrlen, RTA_FLOW);
411                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
412 #endif
413                 }
414
415                 rtnh = rtnh_next(rtnh, &remaining);
416         } endfor_nexthops(fi);
417
418         return 0;
419 }
420
421 #endif
422
423 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
424 {
425 #ifdef CONFIG_IP_ROUTE_MULTIPATH
426         struct rtnexthop *rtnh;
427         int remaining;
428 #endif
429
430         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
431                 return 1;
432
433         if (cfg->fc_oif || cfg->fc_gw) {
434                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
435                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
436                         return 0;
437                 return 1;
438         }
439
440 #ifdef CONFIG_IP_ROUTE_MULTIPATH
441         if (cfg->fc_mp == NULL)
442                 return 0;
443
444         rtnh = cfg->fc_mp;
445         remaining = cfg->fc_mp_len;
446
447         for_nexthops(fi) {
448                 int attrlen;
449
450                 if (!rtnh_ok(rtnh, remaining))
451                         return -EINVAL;
452
453                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
454                         return 1;
455
456                 attrlen = rtnh_attrlen(rtnh);
457                 if (attrlen < 0) {
458                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
459
460                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
461                         if (nla && nla_get_be32(nla) != nh->nh_gw)
462                                 return 1;
463 #ifdef CONFIG_NET_CLS_ROUTE
464                         nla = nla_find(attrs, attrlen, RTA_FLOW);
465                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
466                                 return 1;
467 #endif
468                 }
469
470                 rtnh = rtnh_next(rtnh, &remaining);
471         } endfor_nexthops(fi);
472 #endif
473         return 0;
474 }
475
476
477 /*
478    Picture
479    -------
480
481    Semantics of nexthop are very messy for historical reasons.
482    We have to take into account, that:
483    a) gateway can be actually local interface address,
484       so that gatewayed route is direct.
485    b) gateway must be on-link address, possibly
486       described not by an ifaddr, but also by a direct route.
487    c) If both gateway and interface are specified, they should not
488       contradict.
489    d) If we use tunnel routes, gateway could be not on-link.
490
491    Attempt to reconcile all of these (alas, self-contradictory) conditions
492    results in pretty ugly and hairy code with obscure logic.
493
494    I chose to generalize it instead, so that the size
495    of code does not increase practically, but it becomes
496    much more general.
497    Every prefix is assigned a "scope" value: "host" is local address,
498    "link" is direct route,
499    [ ... "site" ... "interior" ... ]
500    and "universe" is true gateway route with global meaning.
501
502    Every prefix refers to a set of "nexthop"s (gw, oif),
503    where gw must have narrower scope. This recursion stops
504    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
505    which means that gw is forced to be on link.
506
507    Code is still hairy, but now it is apparently logically
508    consistent and very flexible. E.g. as a by-product it allows
509    independent exterior and interior routing processes to
510    co-exist in peace.
511
512    Normally it looks as follows.
513
514    {universe prefix}  -> (gw, oif) [scope link]
515                           |
516                           |-> {link prefix} -> (gw, oif) [scope local]
517                                                 |
518                                                 |-> {local prefix} (terminal node)
519  */
520
521 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
522                         struct fib_nh *nh)
523 {
524         int err;
525         struct net *net;
526
527         net = cfg->fc_nlinfo.nl_net;
528         if (nh->nh_gw) {
529                 struct fib_result res;
530
531 #ifdef CONFIG_IP_ROUTE_PERVASIVE
532                 if (nh->nh_flags&RTNH_F_PERVASIVE)
533                         return 0;
534 #endif
535                 if (nh->nh_flags&RTNH_F_ONLINK) {
536                         struct net_device *dev;
537
538                         if (cfg->fc_scope >= RT_SCOPE_LINK)
539                                 return -EINVAL;
540                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
541                                 return -EINVAL;
542                         if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
543                                 return -ENODEV;
544                         if (!(dev->flags&IFF_UP))
545                                 return -ENETDOWN;
546                         nh->nh_dev = dev;
547                         dev_hold(dev);
548                         nh->nh_scope = RT_SCOPE_LINK;
549                         return 0;
550                 }
551                 {
552                         struct flowi fl = {
553                                 .nl_u = {
554                                         .ip4_u = {
555                                                 .daddr = nh->nh_gw,
556                                                 .scope = cfg->fc_scope + 1,
557                                         },
558                                 },
559                                 .oif = nh->nh_oif,
560                         };
561
562                         /* It is not necessary, but requires a bit of thinking */
563                         if (fl.fl4_scope < RT_SCOPE_LINK)
564                                 fl.fl4_scope = RT_SCOPE_LINK;
565                         if ((err = fib_lookup(net, &fl, &res)) != 0)
566                                 return err;
567                 }
568                 err = -EINVAL;
569                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
570                         goto out;
571                 nh->nh_scope = res.scope;
572                 nh->nh_oif = FIB_RES_OIF(res);
573                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
574                         goto out;
575                 dev_hold(nh->nh_dev);
576                 err = -ENETDOWN;
577                 if (!(nh->nh_dev->flags & IFF_UP))
578                         goto out;
579                 err = 0;
580 out:
581                 fib_res_put(&res);
582                 return err;
583         } else {
584                 struct in_device *in_dev;
585
586                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
587                         return -EINVAL;
588
589                 in_dev = inetdev_by_index(net, nh->nh_oif);
590                 if (in_dev == NULL)
591                         return -ENODEV;
592                 if (!(in_dev->dev->flags&IFF_UP)) {
593                         in_dev_put(in_dev);
594                         return -ENETDOWN;
595                 }
596                 nh->nh_dev = in_dev->dev;
597                 dev_hold(nh->nh_dev);
598                 nh->nh_scope = RT_SCOPE_HOST;
599                 in_dev_put(in_dev);
600         }
601         return 0;
602 }
603
604 static inline unsigned int fib_laddr_hashfn(__be32 val)
605 {
606         unsigned int mask = (fib_hash_size - 1);
607
608         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
609 }
610
611 static struct hlist_head *fib_hash_alloc(int bytes)
612 {
613         if (bytes <= PAGE_SIZE)
614                 return kzalloc(bytes, GFP_KERNEL);
615         else
616                 return (struct hlist_head *)
617                         __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
618 }
619
620 static void fib_hash_free(struct hlist_head *hash, int bytes)
621 {
622         if (!hash)
623                 return;
624
625         if (bytes <= PAGE_SIZE)
626                 kfree(hash);
627         else
628                 free_pages((unsigned long) hash, get_order(bytes));
629 }
630
631 static void fib_hash_move(struct hlist_head *new_info_hash,
632                           struct hlist_head *new_laddrhash,
633                           unsigned int new_size)
634 {
635         struct hlist_head *old_info_hash, *old_laddrhash;
636         unsigned int old_size = fib_hash_size;
637         unsigned int i, bytes;
638
639         spin_lock_bh(&fib_info_lock);
640         old_info_hash = fib_info_hash;
641         old_laddrhash = fib_info_laddrhash;
642         fib_hash_size = new_size;
643
644         for (i = 0; i < old_size; i++) {
645                 struct hlist_head *head = &fib_info_hash[i];
646                 struct hlist_node *node, *n;
647                 struct fib_info *fi;
648
649                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
650                         struct hlist_head *dest;
651                         unsigned int new_hash;
652
653                         hlist_del(&fi->fib_hash);
654
655                         new_hash = fib_info_hashfn(fi);
656                         dest = &new_info_hash[new_hash];
657                         hlist_add_head(&fi->fib_hash, dest);
658                 }
659         }
660         fib_info_hash = new_info_hash;
661
662         for (i = 0; i < old_size; i++) {
663                 struct hlist_head *lhead = &fib_info_laddrhash[i];
664                 struct hlist_node *node, *n;
665                 struct fib_info *fi;
666
667                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
668                         struct hlist_head *ldest;
669                         unsigned int new_hash;
670
671                         hlist_del(&fi->fib_lhash);
672
673                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
674                         ldest = &new_laddrhash[new_hash];
675                         hlist_add_head(&fi->fib_lhash, ldest);
676                 }
677         }
678         fib_info_laddrhash = new_laddrhash;
679
680         spin_unlock_bh(&fib_info_lock);
681
682         bytes = old_size * sizeof(struct hlist_head *);
683         fib_hash_free(old_info_hash, bytes);
684         fib_hash_free(old_laddrhash, bytes);
685 }
686
687 struct fib_info *fib_create_info(struct fib_config *cfg)
688 {
689         int err;
690         struct fib_info *fi = NULL;
691         struct fib_info *ofi;
692         int nhs = 1;
693         struct net *net = cfg->fc_nlinfo.nl_net;
694
695         /* Fast check to catch the most weird cases */
696         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
697                 goto err_inval;
698
699 #ifdef CONFIG_IP_ROUTE_MULTIPATH
700         if (cfg->fc_mp) {
701                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
702                 if (nhs == 0)
703                         goto err_inval;
704         }
705 #endif
706
707         err = -ENOBUFS;
708         if (fib_info_cnt >= fib_hash_size) {
709                 unsigned int new_size = fib_hash_size << 1;
710                 struct hlist_head *new_info_hash;
711                 struct hlist_head *new_laddrhash;
712                 unsigned int bytes;
713
714                 if (!new_size)
715                         new_size = 1;
716                 bytes = new_size * sizeof(struct hlist_head *);
717                 new_info_hash = fib_hash_alloc(bytes);
718                 new_laddrhash = fib_hash_alloc(bytes);
719                 if (!new_info_hash || !new_laddrhash) {
720                         fib_hash_free(new_info_hash, bytes);
721                         fib_hash_free(new_laddrhash, bytes);
722                 } else
723                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
724
725                 if (!fib_hash_size)
726                         goto failure;
727         }
728
729         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
730         if (fi == NULL)
731                 goto failure;
732         fib_info_cnt++;
733
734         fi->fib_net = hold_net(net);
735         fi->fib_protocol = cfg->fc_protocol;
736         fi->fib_flags = cfg->fc_flags;
737         fi->fib_priority = cfg->fc_priority;
738         fi->fib_prefsrc = cfg->fc_prefsrc;
739
740         fi->fib_nhs = nhs;
741         change_nexthops(fi) {
742                 nh->nh_parent = fi;
743         } endfor_nexthops(fi)
744
745         if (cfg->fc_mx) {
746                 struct nlattr *nla;
747                 int remaining;
748
749                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
750                         int type = nla_type(nla);
751
752                         if (type) {
753                                 if (type > RTAX_MAX)
754                                         goto err_inval;
755                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
756                         }
757                 }
758         }
759
760         if (cfg->fc_mp) {
761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
762                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
763                 if (err != 0)
764                         goto failure;
765                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
766                         goto err_inval;
767                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
768                         goto err_inval;
769 #ifdef CONFIG_NET_CLS_ROUTE
770                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
771                         goto err_inval;
772 #endif
773 #else
774                 goto err_inval;
775 #endif
776         } else {
777                 struct fib_nh *nh = fi->fib_nh;
778
779                 nh->nh_oif = cfg->fc_oif;
780                 nh->nh_gw = cfg->fc_gw;
781                 nh->nh_flags = cfg->fc_flags;
782 #ifdef CONFIG_NET_CLS_ROUTE
783                 nh->nh_tclassid = cfg->fc_flow;
784 #endif
785 #ifdef CONFIG_IP_ROUTE_MULTIPATH
786                 nh->nh_weight = 1;
787 #endif
788         }
789
790         if (fib_props[cfg->fc_type].error) {
791                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
792                         goto err_inval;
793                 goto link_it;
794         }
795
796         if (cfg->fc_scope > RT_SCOPE_HOST)
797                 goto err_inval;
798
799         if (cfg->fc_scope == RT_SCOPE_HOST) {
800                 struct fib_nh *nh = fi->fib_nh;
801
802                 /* Local address is added. */
803                 if (nhs != 1 || nh->nh_gw)
804                         goto err_inval;
805                 nh->nh_scope = RT_SCOPE_NOWHERE;
806                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
807                 err = -ENODEV;
808                 if (nh->nh_dev == NULL)
809                         goto failure;
810         } else {
811                 change_nexthops(fi) {
812                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
813                                 goto failure;
814                 } endfor_nexthops(fi)
815         }
816
817         if (fi->fib_prefsrc) {
818                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
819                     fi->fib_prefsrc != cfg->fc_dst)
820                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
821                                 goto err_inval;
822         }
823
824 link_it:
825         if ((ofi = fib_find_info(fi)) != NULL) {
826                 fi->fib_dead = 1;
827                 free_fib_info(fi);
828                 ofi->fib_treeref++;
829                 return ofi;
830         }
831
832         fi->fib_treeref++;
833         atomic_inc(&fi->fib_clntref);
834         spin_lock_bh(&fib_info_lock);
835         hlist_add_head(&fi->fib_hash,
836                        &fib_info_hash[fib_info_hashfn(fi)]);
837         if (fi->fib_prefsrc) {
838                 struct hlist_head *head;
839
840                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
841                 hlist_add_head(&fi->fib_lhash, head);
842         }
843         change_nexthops(fi) {
844                 struct hlist_head *head;
845                 unsigned int hash;
846
847                 if (!nh->nh_dev)
848                         continue;
849                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
850                 head = &fib_info_devhash[hash];
851                 hlist_add_head(&nh->nh_hash, head);
852         } endfor_nexthops(fi)
853         spin_unlock_bh(&fib_info_lock);
854         return fi;
855
856 err_inval:
857         err = -EINVAL;
858
859 failure:
860         if (fi) {
861                 fi->fib_dead = 1;
862                 free_fib_info(fi);
863         }
864
865         return ERR_PTR(err);
866 }
867
868 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
869 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
870                        struct fib_result *res, __be32 zone, __be32 mask,
871                         int prefixlen)
872 {
873         struct fib_alias *fa;
874         int nh_sel = 0;
875
876         list_for_each_entry_rcu(fa, head, fa_list) {
877                 int err;
878
879                 if (fa->fa_tos &&
880                     fa->fa_tos != flp->fl4_tos)
881                         continue;
882
883                 if (fa->fa_scope < flp->fl4_scope)
884                         continue;
885
886                 fa->fa_state |= FA_S_ACCESSED;
887
888                 err = fib_props[fa->fa_type].error;
889                 if (err == 0) {
890                         struct fib_info *fi = fa->fa_info;
891
892                         if (fi->fib_flags & RTNH_F_DEAD)
893                                 continue;
894
895                         switch (fa->fa_type) {
896                         case RTN_UNICAST:
897                         case RTN_LOCAL:
898                         case RTN_BROADCAST:
899                         case RTN_ANYCAST:
900                         case RTN_MULTICAST:
901                                 for_nexthops(fi) {
902                                         if (nh->nh_flags&RTNH_F_DEAD)
903                                                 continue;
904                                         if (!flp->oif || flp->oif == nh->nh_oif)
905                                                 break;
906                                 }
907 #ifdef CONFIG_IP_ROUTE_MULTIPATH
908                                 if (nhsel < fi->fib_nhs) {
909                                         nh_sel = nhsel;
910                                         goto out_fill_res;
911                                 }
912 #else
913                                 if (nhsel < 1) {
914                                         goto out_fill_res;
915                                 }
916 #endif
917                                 endfor_nexthops(fi);
918                                 continue;
919
920                         default:
921                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
922                                         fa->fa_type);
923                                 return -EINVAL;
924                         }
925                 }
926                 return err;
927         }
928         return 1;
929
930 out_fill_res:
931         res->prefixlen = prefixlen;
932         res->nh_sel = nh_sel;
933         res->type = fa->fa_type;
934         res->scope = fa->fa_scope;
935         res->fi = fa->fa_info;
936         atomic_inc(&res->fi->fib_clntref);
937         return 0;
938 }
939
940 /* Find appropriate source address to this destination */
941
942 __be32 __fib_res_prefsrc(struct fib_result *res)
943 {
944         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
945 }
946
947 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
948                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
949                   struct fib_info *fi, unsigned int flags)
950 {
951         struct nlmsghdr *nlh;
952         struct rtmsg *rtm;
953
954         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
955         if (nlh == NULL)
956                 return -EMSGSIZE;
957
958         rtm = nlmsg_data(nlh);
959         rtm->rtm_family = AF_INET;
960         rtm->rtm_dst_len = dst_len;
961         rtm->rtm_src_len = 0;
962         rtm->rtm_tos = tos;
963         if (tb_id < 256)
964                 rtm->rtm_table = tb_id;
965         else
966                 rtm->rtm_table = RT_TABLE_COMPAT;
967         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
968         rtm->rtm_type = type;
969         rtm->rtm_flags = fi->fib_flags;
970         rtm->rtm_scope = scope;
971         rtm->rtm_protocol = fi->fib_protocol;
972
973         if (rtm->rtm_dst_len)
974                 NLA_PUT_BE32(skb, RTA_DST, dst);
975
976         if (fi->fib_priority)
977                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
978
979         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
980                 goto nla_put_failure;
981
982         if (fi->fib_prefsrc)
983                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
984
985         if (fi->fib_nhs == 1) {
986                 if (fi->fib_nh->nh_gw)
987                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
988
989                 if (fi->fib_nh->nh_oif)
990                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
991 #ifdef CONFIG_NET_CLS_ROUTE
992                 if (fi->fib_nh[0].nh_tclassid)
993                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
994 #endif
995         }
996 #ifdef CONFIG_IP_ROUTE_MULTIPATH
997         if (fi->fib_nhs > 1) {
998                 struct rtnexthop *rtnh;
999                 struct nlattr *mp;
1000
1001                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1002                 if (mp == NULL)
1003                         goto nla_put_failure;
1004
1005                 for_nexthops(fi) {
1006                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1007                         if (rtnh == NULL)
1008                                 goto nla_put_failure;
1009
1010                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1011                         rtnh->rtnh_hops = nh->nh_weight - 1;
1012                         rtnh->rtnh_ifindex = nh->nh_oif;
1013
1014                         if (nh->nh_gw)
1015                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1016 #ifdef CONFIG_NET_CLS_ROUTE
1017                         if (nh->nh_tclassid)
1018                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1019 #endif
1020                         /* length of rtnetlink header + attributes */
1021                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1022                 } endfor_nexthops(fi);
1023
1024                 nla_nest_end(skb, mp);
1025         }
1026 #endif
1027         return nlmsg_end(skb, nlh);
1028
1029 nla_put_failure:
1030         nlmsg_cancel(skb, nlh);
1031         return -EMSGSIZE;
1032 }
1033
1034 /*
1035    Update FIB if:
1036    - local address disappeared -> we must delete all the entries
1037      referring to it.
1038    - device went down -> we must shutdown all nexthops going via it.
1039  */
1040 int fib_sync_down_addr(struct net *net, __be32 local)
1041 {
1042         int ret = 0;
1043         unsigned int hash = fib_laddr_hashfn(local);
1044         struct hlist_head *head = &fib_info_laddrhash[hash];
1045         struct hlist_node *node;
1046         struct fib_info *fi;
1047
1048         if (fib_info_laddrhash == NULL || local == 0)
1049                 return 0;
1050
1051         hlist_for_each_entry(fi, node, head, fib_lhash) {
1052                 if (fi->fib_net != net)
1053                         continue;
1054                 if (fi->fib_prefsrc == local) {
1055                         fi->fib_flags |= RTNH_F_DEAD;
1056                         ret++;
1057                 }
1058         }
1059         return ret;
1060 }
1061
1062 int fib_sync_down_dev(struct net_device *dev, int force)
1063 {
1064         int ret = 0;
1065         int scope = RT_SCOPE_NOWHERE;
1066         struct fib_info *prev_fi = NULL;
1067         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1068         struct hlist_head *head = &fib_info_devhash[hash];
1069         struct hlist_node *node;
1070         struct fib_nh *nh;
1071
1072         if (force)
1073                 scope = -1;
1074
1075         hlist_for_each_entry(nh, node, head, nh_hash) {
1076                 struct fib_info *fi = nh->nh_parent;
1077                 int dead;
1078
1079                 BUG_ON(!fi->fib_nhs);
1080                 if (nh->nh_dev != dev || fi == prev_fi)
1081                         continue;
1082                 prev_fi = fi;
1083                 dead = 0;
1084                 change_nexthops(fi) {
1085                         if (nh->nh_flags&RTNH_F_DEAD)
1086                                 dead++;
1087                         else if (nh->nh_dev == dev &&
1088                                         nh->nh_scope != scope) {
1089                                 nh->nh_flags |= RTNH_F_DEAD;
1090 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1091                                 spin_lock_bh(&fib_multipath_lock);
1092                                 fi->fib_power -= nh->nh_power;
1093                                 nh->nh_power = 0;
1094                                 spin_unlock_bh(&fib_multipath_lock);
1095 #endif
1096                                 dead++;
1097                         }
1098 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1099                         if (force > 1 && nh->nh_dev == dev) {
1100                                 dead = fi->fib_nhs;
1101                                 break;
1102                         }
1103 #endif
1104                 } endfor_nexthops(fi)
1105                 if (dead == fi->fib_nhs) {
1106                         fi->fib_flags |= RTNH_F_DEAD;
1107                         ret++;
1108                 }
1109         }
1110
1111         return ret;
1112 }
1113
1114 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1115
1116 /*
1117    Dead device goes up. We wake up dead nexthops.
1118    It takes sense only on multipath routes.
1119  */
1120
1121 int fib_sync_up(struct net_device *dev)
1122 {
1123         struct fib_info *prev_fi;
1124         unsigned int hash;
1125         struct hlist_head *head;
1126         struct hlist_node *node;
1127         struct fib_nh *nh;
1128         int ret;
1129
1130         if (!(dev->flags&IFF_UP))
1131                 return 0;
1132
1133         prev_fi = NULL;
1134         hash = fib_devindex_hashfn(dev->ifindex);
1135         head = &fib_info_devhash[hash];
1136         ret = 0;
1137
1138         hlist_for_each_entry(nh, node, head, nh_hash) {
1139                 struct fib_info *fi = nh->nh_parent;
1140                 int alive;
1141
1142                 BUG_ON(!fi->fib_nhs);
1143                 if (nh->nh_dev != dev || fi == prev_fi)
1144                         continue;
1145
1146                 prev_fi = fi;
1147                 alive = 0;
1148                 change_nexthops(fi) {
1149                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1150                                 alive++;
1151                                 continue;
1152                         }
1153                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1154                                 continue;
1155                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1156                                 continue;
1157                         alive++;
1158                         spin_lock_bh(&fib_multipath_lock);
1159                         nh->nh_power = 0;
1160                         nh->nh_flags &= ~RTNH_F_DEAD;
1161                         spin_unlock_bh(&fib_multipath_lock);
1162                 } endfor_nexthops(fi)
1163
1164                 if (alive > 0) {
1165                         fi->fib_flags &= ~RTNH_F_DEAD;
1166                         ret++;
1167                 }
1168         }
1169
1170         return ret;
1171 }
1172
1173 /*
1174    The algorithm is suboptimal, but it provides really
1175    fair weighted route distribution.
1176  */
1177
1178 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1179 {
1180         struct fib_info *fi = res->fi;
1181         int w;
1182
1183         spin_lock_bh(&fib_multipath_lock);
1184         if (fi->fib_power <= 0) {
1185                 int power = 0;
1186                 change_nexthops(fi) {
1187                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1188                                 power += nh->nh_weight;
1189                                 nh->nh_power = nh->nh_weight;
1190                         }
1191                 } endfor_nexthops(fi);
1192                 fi->fib_power = power;
1193                 if (power <= 0) {
1194                         spin_unlock_bh(&fib_multipath_lock);
1195                         /* Race condition: route has just become dead. */
1196                         res->nh_sel = 0;
1197                         return;
1198                 }
1199         }
1200
1201
1202         /* w should be random number [0..fi->fib_power-1],
1203            it is pretty bad approximation.
1204          */
1205
1206         w = jiffies % fi->fib_power;
1207
1208         change_nexthops(fi) {
1209                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1210                         if ((w -= nh->nh_power) <= 0) {
1211                                 nh->nh_power--;
1212                                 fi->fib_power--;
1213                                 res->nh_sel = nhsel;
1214                                 spin_unlock_bh(&fib_multipath_lock);
1215                                 return;
1216                         }
1217                 }
1218         } endfor_nexthops(fi);
1219
1220         /* Race condition: route has just become dead. */
1221         res->nh_sel = 0;
1222         spin_unlock_bh(&fib_multipath_lock);
1223 }
1224 #endif