net/ipv4/fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47
48 #include "fib_lookup.h"
49
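/*
 * fib_info_hash keys fib_info structures by their contents so that
 * identical routes can share one fib_info, fib_info_laddrhash keys
 * them by preferred source address, and fib_info_devhash keys
 * nexthops by the ifindex of their output device.  Writers of all
 * three tables hold fib_info_lock.
 */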
50 static DEFINE_SPINLOCK(fib_info_lock);
51 static struct hlist_head *fib_info_hash;
52 static struct hlist_head *fib_info_laddrhash;
53 static unsigned int fib_hash_size;
54 static unsigned int fib_info_cnt;
55
56 #define DEVINDEX_HASHBITS 8
57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
58 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59
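/*
 * for_nexthops() iterates over a fib_info's nexthops read-only and
 * change_nexthops() iterates over them writable; both open a block
 * that must be closed with endfor_nexthops().  Without multipath
 * support a fib_info has exactly one nexthop.
 */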
60 #ifdef CONFIG_IP_ROUTE_MULTIPATH
61
62 static DEFINE_SPINLOCK(fib_multipath_lock);
63
64 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
65 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
66
67 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
68 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
69
70 #else /* CONFIG_IP_ROUTE_MULTIPATH */
71
72 /* Hope that gcc will optimize away the dummy loop */
73
74 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
75 for (nhsel=0; nhsel < 1; nhsel++)
76
77 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
78 for (nhsel=0; nhsel < 1; nhsel++)
79
80 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
81
82 #define endfor_nexthops(fi) }
83
84
85 static const struct
86 {
87         int     error;
88         u8      scope;
89 } fib_props[RTN_MAX + 1] = {
90         {
91                 .error  = 0,
92                 .scope  = RT_SCOPE_NOWHERE,
93         },      /* RTN_UNSPEC */
94         {
95                 .error  = 0,
96                 .scope  = RT_SCOPE_UNIVERSE,
97         },      /* RTN_UNICAST */
98         {
99                 .error  = 0,
100                 .scope  = RT_SCOPE_HOST,
101         },      /* RTN_LOCAL */
102         {
103                 .error  = 0,
104                 .scope  = RT_SCOPE_LINK,
105         },      /* RTN_BROADCAST */
106         {
107                 .error  = 0,
108                 .scope  = RT_SCOPE_LINK,
109         },      /* RTN_ANYCAST */
110         {
111                 .error  = 0,
112                 .scope  = RT_SCOPE_UNIVERSE,
113         },      /* RTN_MULTICAST */
114         {
115                 .error  = -EINVAL,
116                 .scope  = RT_SCOPE_UNIVERSE,
117         },      /* RTN_BLACKHOLE */
118         {
119                 .error  = -EHOSTUNREACH,
120                 .scope  = RT_SCOPE_UNIVERSE,
121         },      /* RTN_UNREACHABLE */
122         {
123                 .error  = -EACCES,
124                 .scope  = RT_SCOPE_UNIVERSE,
125         },      /* RTN_PROHIBIT */
126         {
127                 .error  = -EAGAIN,
128                 .scope  = RT_SCOPE_UNIVERSE,
129         },      /* RTN_THROW */
130         {
131                 .error  = -EINVAL,
132                 .scope  = RT_SCOPE_NOWHERE,
133         },      /* RTN_NAT */
134         {
135                 .error  = -EINVAL,
136                 .scope  = RT_SCOPE_NOWHERE,
137         },      /* RTN_XRESOLVE */
138 };
139
140
141 /* Release a nexthop info record */
142
143 void free_fib_info(struct fib_info *fi)
144 {
145         if (fi->fib_dead == 0) {
146                 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
147                 return;
148         }
149         change_nexthops(fi) {
150                 if (nh->nh_dev)
151                         dev_put(nh->nh_dev);
152                 nh->nh_dev = NULL;
153         } endfor_nexthops(fi);
154         fib_info_cnt--;
155         kfree(fi);
156 }
157
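/* Drop a tree reference; when the last one is gone, unhash the
 * fib_info from all tables, mark it dead and drop the hash table's
 * refcount on it.
 */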
158 void fib_release_info(struct fib_info *fi)
159 {
160         spin_lock_bh(&fib_info_lock);
161         if (fi && --fi->fib_treeref == 0) {
162                 hlist_del(&fi->fib_hash);
163                 if (fi->fib_prefsrc)
164                         hlist_del(&fi->fib_lhash);
165                 change_nexthops(fi) {
166                         if (!nh->nh_dev)
167                                 continue;
168                         hlist_del(&nh->nh_hash);
169                 } endfor_nexthops(fi)
170                 fi->fib_dead = 1;
171                 fib_info_put(fi);
172         }
173         spin_unlock_bh(&fib_info_lock);
174 }
175
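/* Compare the nexthops of two fib_infos; returns 0 if they are
 * equivalent (the RTNH_F_DEAD flag is ignored), -1 otherwise.
 */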
176 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177 {
178         const struct fib_nh *onh = ofi->fib_nh;
179
180         for_nexthops(fi) {
181                 if (nh->nh_oif != onh->nh_oif ||
182                     nh->nh_gw  != onh->nh_gw ||
183                     nh->nh_scope != onh->nh_scope ||
184 #ifdef CONFIG_IP_ROUTE_MULTIPATH
185                     nh->nh_weight != onh->nh_weight ||
186 #endif
187 #ifdef CONFIG_NET_CLS_ROUTE
188                     nh->nh_tclassid != onh->nh_tclassid ||
189 #endif
190                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
191                         return -1;
192                 onh++;
193         } endfor_nexthops(fi);
194         return 0;
195 }
196
197 static inline unsigned int fib_devindex_hashfn(unsigned int val)
198 {
199         unsigned int mask = DEVINDEX_HASHSIZE - 1;
200
201         return (val ^
202                 (val >> DEVINDEX_HASHBITS) ^
203                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
204 }
205
206 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
207 {
208         unsigned int mask = (fib_hash_size - 1);
209         unsigned int val = fi->fib_nhs;
210
211         val ^= fi->fib_protocol;
212         val ^= (__force u32)fi->fib_prefsrc;
213         val ^= fi->fib_priority;
214         for_nexthops(fi) {
215                 val ^= fib_devindex_hashfn(nh->nh_oif);
216         } endfor_nexthops(fi)
217
218         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
219 }
220
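/* Look up an existing fib_info equivalent to @nfi (same namespace,
 * protocol, prefsrc, priority, metrics, flags and nexthops) so that
 * routes with identical semantics can share one fib_info.
 */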
221 static struct fib_info *fib_find_info(const struct fib_info *nfi)
222 {
223         struct hlist_head *head;
224         struct hlist_node *node;
225         struct fib_info *fi;
226         unsigned int hash;
227
228         hash = fib_info_hashfn(nfi);
229         head = &fib_info_hash[hash];
230
231         hlist_for_each_entry(fi, node, head, fib_hash) {
232                 if (fi->fib_net != nfi->fib_net)
233                         continue;
234                 if (fi->fib_nhs != nfi->fib_nhs)
235                         continue;
236                 if (nfi->fib_protocol == fi->fib_protocol &&
237                     nfi->fib_prefsrc == fi->fib_prefsrc &&
238                     nfi->fib_priority == fi->fib_priority &&
239                     memcmp(nfi->fib_metrics, fi->fib_metrics,
240                            sizeof(fi->fib_metrics)) == 0 &&
241                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
242                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
243                         return fi;
244         }
245
246         return NULL;
247 }
248
249 /* Check that the gateway is already configured.
250    Used only by the redirect accept routine.
251  */
252
253 int ip_fib_check_default(__be32 gw, struct net_device *dev)
254 {
255         struct hlist_head *head;
256         struct hlist_node *node;
257         struct fib_nh *nh;
258         unsigned int hash;
259
260         spin_lock(&fib_info_lock);
261
262         hash = fib_devindex_hashfn(dev->ifindex);
263         head = &fib_info_devhash[hash];
264         hlist_for_each_entry(nh, node, head, nh_hash) {
265                 if (nh->nh_dev == dev &&
266                     nh->nh_gw == gw &&
267                     !(nh->nh_flags&RTNH_F_DEAD)) {
268                         spin_unlock(&fib_info_lock);
269                         return 0;
270                 }
271         }
272
273         spin_unlock(&fib_info_lock);
274
275         return -1;
276 }
277
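/* Upper bound on the netlink message size needed to dump this route;
 * used to size the skb for route notifications.
 */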
278 static inline size_t fib_nlmsg_size(struct fib_info *fi)
279 {
280         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
281                          + nla_total_size(4) /* RTA_TABLE */
282                          + nla_total_size(4) /* RTA_DST */
283                          + nla_total_size(4) /* RTA_PRIORITY */
284                          + nla_total_size(4); /* RTA_PREFSRC */
285
286         /* space for nested metrics */
287         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
288
289         if (fi->fib_nhs) {
290                 /* Also handles the special case fib_nhs == 1 */
291
292                 /* each nexthop is packed in an attribute */
293                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
294
295                 /* may contain flow and gateway attribute */
296                 nhsize += 2 * nla_total_size(4);
297
298                 /* all nexthops are packed in a nested attribute */
299                 payload += nla_total_size(fi->fib_nhs * nhsize);
300         }
301
302         return payload;
303 }
304
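/*
 * Send an RTM_NEWROUTE/RTM_DELROUTE notification describing @fa to
 * the RTNLGRP_IPV4_ROUTE netlink group.
 */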
305 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
306                int dst_len, u32 tb_id, struct nl_info *info,
307                unsigned int nlm_flags)
308 {
309         struct sk_buff *skb;
310         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
311         int err = -ENOBUFS;
312
313         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
314         if (skb == NULL)
315                 goto errout;
316
317         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
318                             fa->fa_type, fa->fa_scope, key, dst_len,
319                             fa->fa_tos, fa->fa_info, nlm_flags);
320         if (err < 0) {
321                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
322                 WARN_ON(err == -EMSGSIZE);
323                 kfree_skb(skb);
324                 goto errout;
325         }
326         err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
327                           info->nlh, GFP_KERNEL);
328 errout:
329         if (err < 0)
330                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
331 }
332
333 /* Return the first fib alias matching TOS with
334  * priority less than or equal to PRIO.
335  */
336 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
337 {
338         if (fah) {
339                 struct fib_alias *fa;
340                 list_for_each_entry(fa, fah, fa_list) {
341                         if (fa->fa_tos > tos)
342                                 continue;
343                         if (fa->fa_info->fib_priority >= prio ||
344                             fa->fa_tos < tos)
345                                 return fa;
346                 }
347         }
348         return NULL;
349 }
350
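/* Decide whether a default route's gateway looks dead by checking its
 * ARP/neighbour state, remembering the best "last resort" candidate
 * seen so far.  Returns 0 if the nexthop is usable, 1 if it should be
 * skipped.
 */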
351 int fib_detect_death(struct fib_info *fi, int order,
352                      struct fib_info **last_resort, int *last_idx, int dflt)
353 {
354         struct neighbour *n;
355         int state = NUD_NONE;
356
357         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
358         if (n) {
359                 state = n->nud_state;
360                 neigh_release(n);
361         }
362         if (state==NUD_REACHABLE)
363                 return 0;
364         if ((state&NUD_VALID) && order != dflt)
365                 return 0;
366         if ((state&NUD_VALID) ||
367             (*last_idx<0 && order > dflt)) {
368                 *last_resort = fi;
369                 *last_idx = order;
370         }
371         return 1;
372 }
373
374 #ifdef CONFIG_IP_ROUTE_MULTIPATH
375
376 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
377 {
378         int nhs = 0;
379
380         while (rtnh_ok(rtnh, remaining)) {
381                 nhs++;
382                 rtnh = rtnh_next(rtnh, &remaining);
383         }
384
385         /* leftover implies invalid nexthop configuration, discard it */
386         return remaining > 0 ? 0 : nhs;
387 }
388
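/* Fill in the nexthop array of @fi from an RTA_MULTIPATH attribute
 * stream: one struct rtnexthop per hop, optionally followed by
 * RTA_GATEWAY / RTA_FLOW attributes.
 */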
389 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
390                        int remaining, struct fib_config *cfg)
391 {
392         change_nexthops(fi) {
393                 int attrlen;
394
395                 if (!rtnh_ok(rtnh, remaining))
396                         return -EINVAL;
397
398                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
399                 nh->nh_oif = rtnh->rtnh_ifindex;
400                 nh->nh_weight = rtnh->rtnh_hops + 1;
401
402                 attrlen = rtnh_attrlen(rtnh);
403                 if (attrlen > 0) {
404                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
405
406                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
407                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
408 #ifdef CONFIG_NET_CLS_ROUTE
409                         nla = nla_find(attrs, attrlen, RTA_FLOW);
410                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
411 #endif
412                 }
413
414                 rtnh = rtnh_next(rtnh, &remaining);
415         } endfor_nexthops(fi);
416
417         return 0;
418 }
419
420 #endif
421
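/* Return 0 if the nexthop description in @cfg matches the nexthops of
 * @fi, non-zero otherwise; used when deleting or replacing routes.
 */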
422 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
423 {
424 #ifdef CONFIG_IP_ROUTE_MULTIPATH
425         struct rtnexthop *rtnh;
426         int remaining;
427 #endif
428
429         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
430                 return 1;
431
432         if (cfg->fc_oif || cfg->fc_gw) {
433                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
434                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
435                         return 0;
436                 return 1;
437         }
438
439 #ifdef CONFIG_IP_ROUTE_MULTIPATH
440         if (cfg->fc_mp == NULL)
441                 return 0;
442
443         rtnh = cfg->fc_mp;
444         remaining = cfg->fc_mp_len;
445
446         for_nexthops(fi) {
447                 int attrlen;
448
449                 if (!rtnh_ok(rtnh, remaining))
450                         return -EINVAL;
451
452                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
453                         return 1;
454
455                 attrlen = rtnh_attrlen(rtnh);
456                 if (attrlen > 0) {
457                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
458
459                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
460                         if (nla && nla_get_be32(nla) != nh->nh_gw)
461                                 return 1;
462 #ifdef CONFIG_NET_CLS_ROUTE
463                         nla = nla_find(attrs, attrlen, RTA_FLOW);
464                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
465                                 return 1;
466 #endif
467                 }
468
469                 rtnh = rtnh_next(rtnh, &remaining);
470         } endfor_nexthops(fi);
471 #endif
472         return 0;
473 }
474
475
476 /*
477    Picture
478    -------
479
480    Nexthop semantics are very messy for historical reasons.
481    We have to take into account that:
482    a) the gateway can actually be a local interface address,
483       so that a gatewayed route is direct;
484    b) the gateway must be an on-link address, possibly
485       described not by an ifaddr but by a direct route;
486    c) if both gateway and interface are specified, they must not
487       contradict each other;
488    d) if we use tunnel routes, the gateway may not be on-link.
489
490    Attempting to reconcile all of these (alas, self-contradictory) conditions
491    results in pretty ugly and hairy code with obscure logic.
492
493    I chose to generalize it instead, so that the size
494    of the code barely increases, but it becomes
495    much more general.
496    Every prefix is assigned a "scope" value: "host" is a local address,
497    "link" is a direct route,
498    [ ... "site" ... "interior" ... ]
499    and "universe" is a true gateway route with global meaning.
500
501    Every prefix refers to a set of "nexthop"s (gw, oif),
502    where the gw must have a narrower scope. This recursion stops
503    when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
504    which means that the gw is forced to be on-link.
505
506    The code is still hairy, but now it is apparently logically
507    consistent and very flexible. E.g. as a by-product it allows
508    independent exterior and interior routing processes
509    to coexist in peace.
510
511    Normally it looks like the following:
512
513    {universe prefix}  -> (gw, oif) [scope link]
514                           |
515                           |-> {link prefix} -> (gw, oif) [scope local]
516                                                 |
517                                                 |-> {local prefix} (terminal node)
518  */
519
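/* Resolve and validate one nexthop according to the scope rules
 * described above: find its output device, take a reference on it
 * and record the scope with which the gateway was reached.
 */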
520 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
521                         struct fib_nh *nh)
522 {
523         int err;
524         struct net *net;
525
526         net = cfg->fc_nlinfo.nl_net;
527         if (nh->nh_gw) {
528                 struct fib_result res;
529
530 #ifdef CONFIG_IP_ROUTE_PERVASIVE
531                 if (nh->nh_flags&RTNH_F_PERVASIVE)
532                         return 0;
533 #endif
534                 if (nh->nh_flags&RTNH_F_ONLINK) {
535                         struct net_device *dev;
536
537                         if (cfg->fc_scope >= RT_SCOPE_LINK)
538                                 return -EINVAL;
539                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
540                                 return -EINVAL;
541                         if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
542                                 return -ENODEV;
543                         if (!(dev->flags&IFF_UP))
544                                 return -ENETDOWN;
545                         nh->nh_dev = dev;
546                         dev_hold(dev);
547                         nh->nh_scope = RT_SCOPE_LINK;
548                         return 0;
549                 }
550                 {
551                         struct flowi fl = {
552                                 .nl_u = {
553                                         .ip4_u = {
554                                                 .daddr = nh->nh_gw,
555                                                 .scope = cfg->fc_scope + 1,
556                                         },
557                                 },
558                                 .oif = nh->nh_oif,
559                         };
560
561                         /* It is not necessary, but requires a bit of thinking */
562                         if (fl.fl4_scope < RT_SCOPE_LINK)
563                                 fl.fl4_scope = RT_SCOPE_LINK;
564                         if ((err = fib_lookup(net, &fl, &res)) != 0)
565                                 return err;
566                 }
567                 err = -EINVAL;
568                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
569                         goto out;
570                 nh->nh_scope = res.scope;
571                 nh->nh_oif = FIB_RES_OIF(res);
572                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
573                         goto out;
574                 dev_hold(nh->nh_dev);
575                 err = -ENETDOWN;
576                 if (!(nh->nh_dev->flags & IFF_UP))
577                         goto out;
578                 err = 0;
579 out:
580                 fib_res_put(&res);
581                 return err;
582         } else {
583                 struct in_device *in_dev;
584
585                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
586                         return -EINVAL;
587
588                 in_dev = inetdev_by_index(net, nh->nh_oif);
589                 if (in_dev == NULL)
590                         return -ENODEV;
591                 if (!(in_dev->dev->flags&IFF_UP)) {
592                         in_dev_put(in_dev);
593                         return -ENETDOWN;
594                 }
595                 nh->nh_dev = in_dev->dev;
596                 dev_hold(nh->nh_dev);
597                 nh->nh_scope = RT_SCOPE_HOST;
598                 in_dev_put(in_dev);
599         }
600         return 0;
601 }
602
603 static inline unsigned int fib_laddr_hashfn(__be32 val)
604 {
605         unsigned int mask = (fib_hash_size - 1);
606
607         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
608 }
609
610 static struct hlist_head *fib_hash_alloc(int bytes)
611 {
612         if (bytes <= PAGE_SIZE)
613                 return kzalloc(bytes, GFP_KERNEL);
614         else
615                 return (struct hlist_head *)
616                         __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
617 }
618
619 static void fib_hash_free(struct hlist_head *hash, int bytes)
620 {
621         if (!hash)
622                 return;
623
624         if (bytes <= PAGE_SIZE)
625                 kfree(hash);
626         else
627                 free_pages((unsigned long) hash, get_order(bytes));
628 }
629
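/* Rehash all fib_info and prefsrc entries into the newly allocated,
 * larger tables while holding fib_info_lock, then free the old tables.
 */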
630 static void fib_hash_move(struct hlist_head *new_info_hash,
631                           struct hlist_head *new_laddrhash,
632                           unsigned int new_size)
633 {
634         struct hlist_head *old_info_hash, *old_laddrhash;
635         unsigned int old_size = fib_hash_size;
636         unsigned int i, bytes;
637
638         spin_lock_bh(&fib_info_lock);
639         old_info_hash = fib_info_hash;
640         old_laddrhash = fib_info_laddrhash;
641         fib_hash_size = new_size;
642
643         for (i = 0; i < old_size; i++) {
644                 struct hlist_head *head = &fib_info_hash[i];
645                 struct hlist_node *node, *n;
646                 struct fib_info *fi;
647
648                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
649                         struct hlist_head *dest;
650                         unsigned int new_hash;
651
652                         hlist_del(&fi->fib_hash);
653
654                         new_hash = fib_info_hashfn(fi);
655                         dest = &new_info_hash[new_hash];
656                         hlist_add_head(&fi->fib_hash, dest);
657                 }
658         }
659         fib_info_hash = new_info_hash;
660
661         for (i = 0; i < old_size; i++) {
662                 struct hlist_head *lhead = &fib_info_laddrhash[i];
663                 struct hlist_node *node, *n;
664                 struct fib_info *fi;
665
666                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
667                         struct hlist_head *ldest;
668                         unsigned int new_hash;
669
670                         hlist_del(&fi->fib_lhash);
671
672                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
673                         ldest = &new_laddrhash[new_hash];
674                         hlist_add_head(&fi->fib_lhash, ldest);
675                 }
676         }
677         fib_info_laddrhash = new_laddrhash;
678
679         spin_unlock_bh(&fib_info_lock);
680
681         bytes = old_size * sizeof(struct hlist_head *);
682         fib_hash_free(old_info_hash, bytes);
683         fib_hash_free(old_laddrhash, bytes);
684 }
685
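/* Build a fib_info (including its nexthops) from a route request; if
 * an equivalent fib_info already exists, free the new one and reuse
 * the old, bumping its tree reference count.
 */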
686 struct fib_info *fib_create_info(struct fib_config *cfg)
687 {
688         int err;
689         struct fib_info *fi = NULL;
690         struct fib_info *ofi;
691         int nhs = 1;
692         struct net *net = cfg->fc_nlinfo.nl_net;
693
694         /* Fast check to catch the most weird cases */
695         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
696                 goto err_inval;
697
698 #ifdef CONFIG_IP_ROUTE_MULTIPATH
699         if (cfg->fc_mp) {
700                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
701                 if (nhs == 0)
702                         goto err_inval;
703         }
704 #endif
705
706         err = -ENOBUFS;
707         if (fib_info_cnt >= fib_hash_size) {
708                 unsigned int new_size = fib_hash_size << 1;
709                 struct hlist_head *new_info_hash;
710                 struct hlist_head *new_laddrhash;
711                 unsigned int bytes;
712
713                 if (!new_size)
714                         new_size = 1;
715                 bytes = new_size * sizeof(struct hlist_head *);
716                 new_info_hash = fib_hash_alloc(bytes);
717                 new_laddrhash = fib_hash_alloc(bytes);
718                 if (!new_info_hash || !new_laddrhash) {
719                         fib_hash_free(new_info_hash, bytes);
720                         fib_hash_free(new_laddrhash, bytes);
721                 } else
722                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
723
724                 if (!fib_hash_size)
725                         goto failure;
726         }
727
728         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
729         if (fi == NULL)
730                 goto failure;
731         fib_info_cnt++;
732
733         fi->fib_net = net;
734         fi->fib_protocol = cfg->fc_protocol;
735         fi->fib_flags = cfg->fc_flags;
736         fi->fib_priority = cfg->fc_priority;
737         fi->fib_prefsrc = cfg->fc_prefsrc;
738
739         fi->fib_nhs = nhs;
740         change_nexthops(fi) {
741                 nh->nh_parent = fi;
742         } endfor_nexthops(fi)
743
744         if (cfg->fc_mx) {
745                 struct nlattr *nla;
746                 int remaining;
747
748                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
749                         int type = nla_type(nla);
750
751                         if (type) {
752                                 if (type > RTAX_MAX)
753                                         goto err_inval;
754                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
755                         }
756                 }
757         }
758
759         if (cfg->fc_mp) {
760 #ifdef CONFIG_IP_ROUTE_MULTIPATH
761                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
762                 if (err != 0)
763                         goto failure;
764                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
765                         goto err_inval;
766                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
767                         goto err_inval;
768 #ifdef CONFIG_NET_CLS_ROUTE
769                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
770                         goto err_inval;
771 #endif
772 #else
773                 goto err_inval;
774 #endif
775         } else {
776                 struct fib_nh *nh = fi->fib_nh;
777
778                 nh->nh_oif = cfg->fc_oif;
779                 nh->nh_gw = cfg->fc_gw;
780                 nh->nh_flags = cfg->fc_flags;
781 #ifdef CONFIG_NET_CLS_ROUTE
782                 nh->nh_tclassid = cfg->fc_flow;
783 #endif
784 #ifdef CONFIG_IP_ROUTE_MULTIPATH
785                 nh->nh_weight = 1;
786 #endif
787         }
788
789         if (fib_props[cfg->fc_type].error) {
790                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
791                         goto err_inval;
792                 goto link_it;
793         }
794
795         if (cfg->fc_scope > RT_SCOPE_HOST)
796                 goto err_inval;
797
798         if (cfg->fc_scope == RT_SCOPE_HOST) {
799                 struct fib_nh *nh = fi->fib_nh;
800
801                 /* Local address is added. */
802                 if (nhs != 1 || nh->nh_gw)
803                         goto err_inval;
804                 nh->nh_scope = RT_SCOPE_NOWHERE;
805                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
806                 err = -ENODEV;
807                 if (nh->nh_dev == NULL)
808                         goto failure;
809         } else {
810                 change_nexthops(fi) {
811                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
812                                 goto failure;
813                 } endfor_nexthops(fi)
814         }
815
816         if (fi->fib_prefsrc) {
817                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
818                     fi->fib_prefsrc != cfg->fc_dst)
819                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
820                                 goto err_inval;
821         }
822
823 link_it:
824         if ((ofi = fib_find_info(fi)) != NULL) {
825                 fi->fib_dead = 1;
826                 free_fib_info(fi);
827                 ofi->fib_treeref++;
828                 return ofi;
829         }
830
831         fi->fib_treeref++;
832         atomic_inc(&fi->fib_clntref);
833         spin_lock_bh(&fib_info_lock);
834         hlist_add_head(&fi->fib_hash,
835                        &fib_info_hash[fib_info_hashfn(fi)]);
836         if (fi->fib_prefsrc) {
837                 struct hlist_head *head;
838
839                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
840                 hlist_add_head(&fi->fib_lhash, head);
841         }
842         change_nexthops(fi) {
843                 struct hlist_head *head;
844                 unsigned int hash;
845
846                 if (!nh->nh_dev)
847                         continue;
848                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
849                 head = &fib_info_devhash[hash];
850                 hlist_add_head(&nh->nh_hash, head);
851         } endfor_nexthops(fi)
852         spin_unlock_bh(&fib_info_lock);
853         return fi;
854
855 err_inval:
856         err = -EINVAL;
857
858 failure:
859         if (fi) {
860                 fi->fib_dead = 1;
861                 free_fib_info(fi);
862         }
863
864         return ERR_PTR(err);
865 }
866
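/* Walk the aliases on a node, pick the first one whose TOS and scope
 * match the flow and that still has a live nexthop, and fill @res
 * from it.  Returns 0 on success, the alias's error code for
 * reject-type routes, or 1 if nothing matched.
 */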
867 /* Note! fib_semantic_match intentionally uses RCU list functions. */
868 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
869                        struct fib_result *res, __be32 zone, __be32 mask,
870                         int prefixlen)
871 {
872         struct fib_alias *fa;
873         int nh_sel = 0;
874
875         list_for_each_entry_rcu(fa, head, fa_list) {
876                 int err;
877
878                 if (fa->fa_tos &&
879                     fa->fa_tos != flp->fl4_tos)
880                         continue;
881
882                 if (fa->fa_scope < flp->fl4_scope)
883                         continue;
884
885                 fa->fa_state |= FA_S_ACCESSED;
886
887                 err = fib_props[fa->fa_type].error;
888                 if (err == 0) {
889                         struct fib_info *fi = fa->fa_info;
890
891                         if (fi->fib_flags & RTNH_F_DEAD)
892                                 continue;
893
894                         switch (fa->fa_type) {
895                         case RTN_UNICAST:
896                         case RTN_LOCAL:
897                         case RTN_BROADCAST:
898                         case RTN_ANYCAST:
899                         case RTN_MULTICAST:
900                                 for_nexthops(fi) {
901                                         if (nh->nh_flags&RTNH_F_DEAD)
902                                                 continue;
903                                         if (!flp->oif || flp->oif == nh->nh_oif)
904                                                 break;
905                                 }
906 #ifdef CONFIG_IP_ROUTE_MULTIPATH
907                                 if (nhsel < fi->fib_nhs) {
908                                         nh_sel = nhsel;
909                                         goto out_fill_res;
910                                 }
911 #else
912                                 if (nhsel < 1) {
913                                         goto out_fill_res;
914                                 }
915 #endif
916                                 endfor_nexthops(fi);
917                                 continue;
918
919                         default:
920                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
921                                         fa->fa_type);
922                                 return -EINVAL;
923                         }
924                 }
925                 return err;
926         }
927         return 1;
928
929 out_fill_res:
930         res->prefixlen = prefixlen;
931         res->nh_sel = nh_sel;
932         res->type = fa->fa_type;
933         res->scope = fa->fa_scope;
934         res->fi = fa->fa_info;
935         atomic_inc(&res->fi->fib_clntref);
936         return 0;
937 }
938
939 /* Find appropriate source address to this destination */
940
941 __be32 __fib_res_prefsrc(struct fib_result *res)
942 {
943         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
944 }
945
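/* Fill a netlink RTM_NEWROUTE/RTM_DELROUTE message describing one
 * route.  Returns the final message length, or -EMSGSIZE if the skb
 * is too small.
 */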
946 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
947                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
948                   struct fib_info *fi, unsigned int flags)
949 {
950         struct nlmsghdr *nlh;
951         struct rtmsg *rtm;
952
953         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
954         if (nlh == NULL)
955                 return -EMSGSIZE;
956
957         rtm = nlmsg_data(nlh);
958         rtm->rtm_family = AF_INET;
959         rtm->rtm_dst_len = dst_len;
960         rtm->rtm_src_len = 0;
961         rtm->rtm_tos = tos;
962         rtm->rtm_table = tb_id;
963         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
964         rtm->rtm_type = type;
965         rtm->rtm_flags = fi->fib_flags;
966         rtm->rtm_scope = scope;
967         rtm->rtm_protocol = fi->fib_protocol;
968
969         if (rtm->rtm_dst_len)
970                 NLA_PUT_BE32(skb, RTA_DST, dst);
971
972         if (fi->fib_priority)
973                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
974
975         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
976                 goto nla_put_failure;
977
978         if (fi->fib_prefsrc)
979                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
980
981         if (fi->fib_nhs == 1) {
982                 if (fi->fib_nh->nh_gw)
983                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
984
985                 if (fi->fib_nh->nh_oif)
986                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
987 #ifdef CONFIG_NET_CLS_ROUTE
988                 if (fi->fib_nh[0].nh_tclassid)
989                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
990 #endif
991         }
992 #ifdef CONFIG_IP_ROUTE_MULTIPATH
993         if (fi->fib_nhs > 1) {
994                 struct rtnexthop *rtnh;
995                 struct nlattr *mp;
996
997                 mp = nla_nest_start(skb, RTA_MULTIPATH);
998                 if (mp == NULL)
999                         goto nla_put_failure;
1000
1001                 for_nexthops(fi) {
1002                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1003                         if (rtnh == NULL)
1004                                 goto nla_put_failure;
1005
1006                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1007                         rtnh->rtnh_hops = nh->nh_weight - 1;
1008                         rtnh->rtnh_ifindex = nh->nh_oif;
1009
1010                         if (nh->nh_gw)
1011                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1012 #ifdef CONFIG_NET_CLS_ROUTE
1013                         if (nh->nh_tclassid)
1014                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1015 #endif
1016                         /* length of rtnetlink header + attributes */
1017                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1018                 } endfor_nexthops(fi);
1019
1020                 nla_nest_end(skb, mp);
1021         }
1022 #endif
1023         return nlmsg_end(skb, nlh);
1024
1025 nla_put_failure:
1026         nlmsg_cancel(skb, nlh);
1027         return -EMSGSIZE;
1028 }
1029
1030 /*
1031    Update FIB if:
1032    - local address disappeared -> we must delete all the entries
1033      referring to it.
1034    - device went down -> we must shut down all nexthops going via it.
1035  */
1036 int fib_sync_down_addr(struct net *net, __be32 local)
1037 {
1038         int ret = 0;
1039         unsigned int hash = fib_laddr_hashfn(local);
1040         struct hlist_head *head = &fib_info_laddrhash[hash];
1041         struct hlist_node *node;
1042         struct fib_info *fi;
1043
1044         if (fib_info_laddrhash == NULL || local == 0)
1045                 return 0;
1046
1047         hlist_for_each_entry(fi, node, head, fib_lhash) {
1048                 if (fi->fib_net != net)
1049                         continue;
1050                 if (fi->fib_prefsrc == local) {
1051                         fi->fib_flags |= RTNH_F_DEAD;
1052                         ret++;
1053                 }
1054         }
1055         return ret;
1056 }
1057
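/* Mark nexthops that go through @dev as dead.  With @force set the
 * scope check is skipped so every nexthop on the device is taken
 * down, and with force > 1 a multipath route is killed entirely as
 * soon as one of its nexthops uses the device.  Returns the number of
 * fib_infos that became dead.
 */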
1058 int fib_sync_down_dev(struct net_device *dev, int force)
1059 {
1060         int ret = 0;
1061         int scope = RT_SCOPE_NOWHERE;
1062         struct fib_info *prev_fi = NULL;
1063         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1064         struct hlist_head *head = &fib_info_devhash[hash];
1065         struct hlist_node *node;
1066         struct fib_nh *nh;
1067
1068         if (force)
1069                 scope = -1;
1070
1071         hlist_for_each_entry(nh, node, head, nh_hash) {
1072                 struct fib_info *fi = nh->nh_parent;
1073                 int dead;
1074
1075                 BUG_ON(!fi->fib_nhs);
1076                 if (nh->nh_dev != dev || fi == prev_fi)
1077                         continue;
1078                 prev_fi = fi;
1079                 dead = 0;
1080                 change_nexthops(fi) {
1081                         if (nh->nh_flags&RTNH_F_DEAD)
1082                                 dead++;
1083                         else if (nh->nh_dev == dev &&
1084                                         nh->nh_scope != scope) {
1085                                 nh->nh_flags |= RTNH_F_DEAD;
1086 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1087                                 spin_lock_bh(&fib_multipath_lock);
1088                                 fi->fib_power -= nh->nh_power;
1089                                 nh->nh_power = 0;
1090                                 spin_unlock_bh(&fib_multipath_lock);
1091 #endif
1092                                 dead++;
1093                         }
1094 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1095                         if (force > 1 && nh->nh_dev == dev) {
1096                                 dead = fi->fib_nhs;
1097                                 break;
1098                         }
1099 #endif
1100                 } endfor_nexthops(fi)
1101                 if (dead == fi->fib_nhs) {
1102                         fi->fib_flags |= RTNH_F_DEAD;
1103                         ret++;
1104                 }
1105         }
1106
1107         return ret;
1108 }
1109
1110 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1111
1112 /*
1113    A dead device goes up. We wake up its dead nexthops.
1114    This makes sense only for multipath routes.
1115  */
1116
1117 int fib_sync_up(struct net_device *dev)
1118 {
1119         struct fib_info *prev_fi;
1120         unsigned int hash;
1121         struct hlist_head *head;
1122         struct hlist_node *node;
1123         struct fib_nh *nh;
1124         int ret;
1125
1126         if (!(dev->flags&IFF_UP))
1127                 return 0;
1128
1129         prev_fi = NULL;
1130         hash = fib_devindex_hashfn(dev->ifindex);
1131         head = &fib_info_devhash[hash];
1132         ret = 0;
1133
1134         hlist_for_each_entry(nh, node, head, nh_hash) {
1135                 struct fib_info *fi = nh->nh_parent;
1136                 int alive;
1137
1138                 BUG_ON(!fi->fib_nhs);
1139                 if (nh->nh_dev != dev || fi == prev_fi)
1140                         continue;
1141
1142                 prev_fi = fi;
1143                 alive = 0;
1144                 change_nexthops(fi) {
1145                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1146                                 alive++;
1147                                 continue;
1148                         }
1149                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1150                                 continue;
1151                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1152                                 continue;
1153                         alive++;
1154                         spin_lock_bh(&fib_multipath_lock);
1155                         nh->nh_power = 0;
1156                         nh->nh_flags &= ~RTNH_F_DEAD;
1157                         spin_unlock_bh(&fib_multipath_lock);
1158                 } endfor_nexthops(fi)
1159
1160                 if (alive > 0) {
1161                         fi->fib_flags &= ~RTNH_F_DEAD;
1162                         ret++;
1163                 }
1164         }
1165
1166         return ret;
1167 }
1168
1169 /*
1170    The algorithm is suboptimal, but it provides really
1171    fair weighted route distribution.
1172  */
1173
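/* Each live nexthop gets nh_power tokens equal to its weight; every
 * selection consumes one token from the chosen nexthop, and when the
 * route's total fib_power runs out all tokens are refilled.  Picking
 * the hop where a (pseudo-)random w in [0, fib_power) lands therefore
 * distributes traffic in proportion to the configured weights.
 */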
1174 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1175 {
1176         struct fib_info *fi = res->fi;
1177         int w;
1178
1179         spin_lock_bh(&fib_multipath_lock);
1180         if (fi->fib_power <= 0) {
1181                 int power = 0;
1182                 change_nexthops(fi) {
1183                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1184                                 power += nh->nh_weight;
1185                                 nh->nh_power = nh->nh_weight;
1186                         }
1187                 } endfor_nexthops(fi);
1188                 fi->fib_power = power;
1189                 if (power <= 0) {
1190                         spin_unlock_bh(&fib_multipath_lock);
1191                         /* Race condition: route has just become dead. */
1192                         res->nh_sel = 0;
1193                         return;
1194                 }
1195         }
1196
1197
1198         /* w should be a random number in [0..fi->fib_power-1];
1199            jiffies is a pretty bad approximation of one.
1200          */
1201
1202         w = jiffies % fi->fib_power;
1203
1204         change_nexthops(fi) {
1205                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1206                         if ((w -= nh->nh_power) <= 0) {
1207                                 nh->nh_power--;
1208                                 fi->fib_power--;
1209                                 res->nh_sel = nhsel;
1210                                 spin_unlock_bh(&fib_multipath_lock);
1211                                 return;
1212                         }
1213                 }
1214         } endfor_nexthops(fi);
1215
1216         /* Race condition: route has just become dead. */
1217         res->nh_sel = 0;
1218         spin_unlock_bh(&fib_multipath_lock);
1219 }
1220 #endif