1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
46 #include <net/netlink.h>
47 #include <net/nexthop.h>
48
49 #include "fib_lookup.h"
50
51 #define FSprintk(a...)
52
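/*
 * fib_info records are reference-counted and shared between routes.
 * fib_info_hash indexes them by (protocol, prefsrc, priority, nhs) so
 * that fib_find_info() can reuse an identical entry; fib_info_laddrhash
 * indexes them by preferred source address for fib_sync_down();
 * fib_info_devhash (below) indexes nexthops by their output device so
 * that ip_fib_check_default(), fib_sync_down() and fib_sync_up() can
 * find them.  Modifications to all three tables are serialized by
 * fib_info_lock.
 */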
53 static DEFINE_SPINLOCK(fib_info_lock);
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
57 static unsigned int fib_info_cnt;
58
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
62
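/*
 * Iteration helpers over the nexthop array of a fib_info:
 * for_nexthops() walks it read-only via "nh"/"nhsel", change_nexthops()
 * walks it with a writable "nh".  Without CONFIG_IP_ROUTE_MULTIPATH they
 * degenerate to a single pass over fib_nh[0].  Each use must be closed
 * with endfor_nexthops().
 */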
63 #ifdef CONFIG_IP_ROUTE_MULTIPATH
64
65 static DEFINE_SPINLOCK(fib_multipath_lock);
66
67 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
68 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
69
70 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
71 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
72
73 #else /* CONFIG_IP_ROUTE_MULTIPATH */
74
75 /* Hope that gcc will optimize the dummy loop away */
76
77 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
78 for (nhsel=0; nhsel < 1; nhsel++)
79
80 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
81 for (nhsel=0; nhsel < 1; nhsel++)
82
83 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
84
85 #define endfor_nexthops(fi) }
86
87
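/*
 * Per route type (RTN_*): the error a lookup returns when it hits a
 * route of this type (0 for types that resolve to a nexthop), and the
 * least specific scope such a route is allowed to be configured with.
 */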
88 static const struct 
89 {
90         int     error;
91         u8      scope;
92 } fib_props[RTA_MAX + 1] = {
93         {
94                 .error  = 0,
95                 .scope  = RT_SCOPE_NOWHERE,
96         },      /* RTN_UNSPEC */
97         {
98                 .error  = 0,
99                 .scope  = RT_SCOPE_UNIVERSE,
100         },      /* RTN_UNICAST */
101         {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },      /* RTN_LOCAL */
105         {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },      /* RTN_BROADCAST */
109         {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },      /* RTN_ANYCAST */
113         {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },      /* RTN_MULTICAST */
117         {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },      /* RTN_BLACKHOLE */
121         {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },      /* RTN_UNREACHABLE */
125         {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },      /* RTN_PROHIBIT */
129         {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },      /* RTN_THROW */
133         {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },      /* RTN_NAT */
137         {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },      /* RTN_XRESOLVE */
141 };
142
143
144 /* Release a nexthop info record */
145
146 void free_fib_info(struct fib_info *fi)
147 {
148         if (fi->fib_dead == 0) {
149                 printk("Freeing alive fib_info %p\n", fi);
150                 return;
151         }
152         change_nexthops(fi) {
153                 if (nh->nh_dev)
154                         dev_put(nh->nh_dev);
155                 nh->nh_dev = NULL;
156         } endfor_nexthops(fi);
157         fib_info_cnt--;
158         kfree(fi);
159 }
160
161 void fib_release_info(struct fib_info *fi)
162 {
163         spin_lock_bh(&fib_info_lock);
164         if (fi && --fi->fib_treeref == 0) {
165                 hlist_del(&fi->fib_hash);
166                 if (fi->fib_prefsrc)
167                         hlist_del(&fi->fib_lhash);
168                 change_nexthops(fi) {
169                         if (!nh->nh_dev)
170                                 continue;
171                         hlist_del(&nh->nh_hash);
172                 } endfor_nexthops(fi)
173                 fi->fib_dead = 1;
174                 fib_info_put(fi);
175         }
176         spin_unlock_bh(&fib_info_lock);
177 }
178
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 {
181         const struct fib_nh *onh = ofi->fib_nh;
182
183         for_nexthops(fi) {
184                 if (nh->nh_oif != onh->nh_oif ||
185                     nh->nh_gw  != onh->nh_gw ||
186                     nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188                     nh->nh_weight != onh->nh_weight ||
189 #endif
190 #ifdef CONFIG_NET_CLS_ROUTE
191                     nh->nh_tclassid != onh->nh_tclassid ||
192 #endif
193                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194                         return -1;
195                 onh++;
196         } endfor_nexthops(fi);
197         return 0;
198 }
199
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 {
202         unsigned int mask = (fib_hash_size - 1);
203         unsigned int val = fi->fib_nhs;
204
205         val ^= fi->fib_protocol;
206         val ^= (__force u32)fi->fib_prefsrc;
207         val ^= fi->fib_priority;
208
209         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
210 }
211
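/*
 * Look for an existing fib_info equivalent to nfi (same protocol,
 * prefsrc, priority, metrics, flags and nexthops) so that routes can
 * share one reference-counted record.  Returns NULL if none exists.
 */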
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 {
214         struct hlist_head *head;
215         struct hlist_node *node;
216         struct fib_info *fi;
217         unsigned int hash;
218
219         hash = fib_info_hashfn(nfi);
220         head = &fib_info_hash[hash];
221
222         hlist_for_each_entry(fi, node, head, fib_hash) {
223                 if (fi->fib_nhs != nfi->fib_nhs)
224                         continue;
225                 if (nfi->fib_protocol == fi->fib_protocol &&
226                     nfi->fib_prefsrc == fi->fib_prefsrc &&
227                     nfi->fib_priority == fi->fib_priority &&
228                     memcmp(nfi->fib_metrics, fi->fib_metrics,
229                            sizeof(fi->fib_metrics)) == 0 &&
230                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
232                         return fi;
233         }
234
235         return NULL;
236 }
237
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 {
240         unsigned int mask = DEVINDEX_HASHSIZE - 1;
241
242         return (val ^
243                 (val >> DEVINDEX_HASHBITS) ^
244                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 }
246
247 /* Check that the gateway is already configured.
248    Used only by the redirect accept routine.
249  */
250
251 int ip_fib_check_default(__be32 gw, struct net_device *dev)
252 {
253         struct hlist_head *head;
254         struct hlist_node *node;
255         struct fib_nh *nh;
256         unsigned int hash;
257
258         spin_lock(&fib_info_lock);
259
260         hash = fib_devindex_hashfn(dev->ifindex);
261         head = &fib_info_devhash[hash];
262         hlist_for_each_entry(nh, node, head, nh_hash) {
263                 if (nh->nh_dev == dev &&
264                     nh->nh_gw == gw &&
265                     !(nh->nh_flags&RTNH_F_DEAD)) {
266                         spin_unlock(&fib_info_lock);
267                         return 0;
268                 }
269         }
270
271         spin_unlock(&fib_info_lock);
272
273         return -1;
274 }
275
276 static inline size_t fib_nlmsg_size(struct fib_info *fi)
277 {
278         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
279                          + nla_total_size(4) /* RTA_TABLE */
280                          + nla_total_size(4) /* RTA_DST */
281                          + nla_total_size(4) /* RTA_PRIORITY */
282                          + nla_total_size(4); /* RTA_PREFSRC */
283
284         /* space for nested metrics */
285         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
286
287         if (fi->fib_nhs) {
288                 /* Also handles the special case fib_nhs == 1 */
289
290                 /* each nexthop is packed in an attribute */
291                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
292
293                 /* may contain flow and gateway attribute */
294                 nhsize += 2 * nla_total_size(4);
295
296                 /* all nexthops are packed in a nested attribute */
297                 payload += nla_total_size(fi->fib_nhs * nhsize);
298         }
299
300         return payload;
301 }
302
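/* Notify listeners on RTNLGRP_IPV4_ROUTE about a route add/delete. */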
303 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
304                int dst_len, u32 tb_id, struct nl_info *info)
305 {
306         struct sk_buff *skb;
307         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
308         int err = -ENOBUFS;
309
310         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
311         if (skb == NULL)
312                 goto errout;
313
314         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
315                             fa->fa_type, fa->fa_scope, key, dst_len,
316                             fa->fa_tos, fa->fa_info, 0);
317         /* failure implies BUG in fib_nlmsg_size() */
318         BUG_ON(err < 0);
319
320         err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
321                           info->nlh, GFP_KERNEL);
322 errout:
323         if (err < 0)
324                 rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
325 }
326
327 /* Return the first fib alias matching TOS with
328  * priority less than or equal to PRIO.
329  */
330 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
331 {
332         if (fah) {
333                 struct fib_alias *fa;
334                 list_for_each_entry(fa, fah, fa_list) {
335                         if (fa->fa_tos > tos)
336                                 continue;
337                         if (fa->fa_info->fib_priority >= prio ||
338                             fa->fa_tos < tos)
339                                 return fa;
340                 }
341         }
342         return NULL;
343 }
344
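/*
 * Judge whether a candidate default route looks dead by checking the
 * neighbour (ARP) state of its first gateway.  Returns 0 if the gateway
 * still looks usable; otherwise returns 1, possibly recording the route
 * as a fallback in *last_resort / *last_idx.
 */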
345 int fib_detect_death(struct fib_info *fi, int order,
346                      struct fib_info **last_resort, int *last_idx, int *dflt)
347 {
348         struct neighbour *n;
349         int state = NUD_NONE;
350
351         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
352         if (n) {
353                 state = n->nud_state;
354                 neigh_release(n);
355         }
356         if (state==NUD_REACHABLE)
357                 return 0;
358         if ((state&NUD_VALID) && order != *dflt)
359                 return 0;
360         if ((state&NUD_VALID) ||
361             (*last_idx<0 && order > *dflt)) {
362                 *last_resort = fi;
363                 *last_idx = order;
364         }
365         return 1;
366 }
367
368 #ifdef CONFIG_IP_ROUTE_MULTIPATH
369
370 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
371 {
372         int nhs = 0;
373
374         while (rtnh_ok(rtnh, remaining)) {
375                 nhs++;
376                 rtnh = rtnh_next(rtnh, &remaining);
377         }
378
379         /* leftover implies invalid nexthop configuration, discard it */
380         return remaining > 0 ? 0 : nhs;
381 }
382
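/*
 * Fill fi->fib_nh[] from an RTA_MULTIPATH payload: one struct rtnexthop
 * per nexthop, each optionally followed by nested RTA_GATEWAY / RTA_FLOW
 * attributes.
 */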
383 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
384                        int remaining, struct fib_config *cfg)
385 {
386         change_nexthops(fi) {
387                 int attrlen;
388
389                 if (!rtnh_ok(rtnh, remaining))
390                         return -EINVAL;
391
392                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
393                 nh->nh_oif = rtnh->rtnh_ifindex;
394                 nh->nh_weight = rtnh->rtnh_hops + 1;
395
396                 attrlen = rtnh_attrlen(rtnh);
397                 if (attrlen > 0) {
398                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
399
400                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
401                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
402 #ifdef CONFIG_NET_CLS_ROUTE
403                         nla = nla_find(attrs, attrlen, RTA_FLOW);
404                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
405 #endif
406                 }
407
408                 rtnh = rtnh_next(rtnh, &remaining);
409         } endfor_nexthops(fi);
410
411         return 0;
412 }
413
414 #endif
415
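/*
 * Compare the nexthop specification in a route request against an
 * existing fib_info.  Returns 0 on a match, 1 on a mismatch, or a
 * negative error for a malformed multipath specification.
 */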
416 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
417 {
418 #ifdef CONFIG_IP_ROUTE_MULTIPATH
419         struct rtnexthop *rtnh;
420         int remaining;
421 #endif
422
423         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
424                 return 1;
425
426         if (cfg->fc_oif || cfg->fc_gw) {
427                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
428                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
429                         return 0;
430                 return 1;
431         }
432
433 #ifdef CONFIG_IP_ROUTE_MULTIPATH
434         if (cfg->fc_mp == NULL)
435                 return 0;
436
437         rtnh = cfg->fc_mp;
438         remaining = cfg->fc_mp_len;
439         
440         for_nexthops(fi) {
441                 int attrlen;
442
443                 if (!rtnh_ok(rtnh, remaining))
444                         return -EINVAL;
445
446                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
447                         return 1;
448
449                 attrlen = rtnh_attrlen(rtnh);
450                         if (attrlen > 0) {
451                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
452
453                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
454                         if (nla && nla_get_be32(nla) != nh->nh_gw)
455                                 return 1;
456 #ifdef CONFIG_NET_CLS_ROUTE
457                         nla = nla_find(attrs, attrlen, RTA_FLOW);
458                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
459                                 return 1;
460 #endif
461                 }
462
463                 rtnh = rtnh_next(rtnh, &remaining);
464         } endfor_nexthops(fi);
465 #endif
466         return 0;
467 }
468
469
470 /*
471    Picture
472    -------
473
474    The semantics of nexthops are very messy for historical reasons.
475    We have to take into account that:
476    a) the gateway can actually be a local interface address,
477       so that a gatewayed route is direct.
478    b) the gateway must be an on-link address, possibly
479       described not by an ifaddr but by a direct route.
480    c) if both gateway and interface are specified, they must not
481       contradict each other.
482    d) with tunnel routes, the gateway may not be on-link.
483
484    Attempting to reconcile all of these (alas, self-contradictory)
485    conditions results in pretty ugly and hairy code with obscure logic.
486
487    I chose to generalize it instead, so that the amount
488    of code barely increases, but it becomes
489    much more general.
490    Every prefix is assigned a "scope" value: "host" is a local address,
491    "link" is a direct route,
492    [ ... "site" ... "interior" ... ]
493    and "universe" is a true gateway route with global meaning.
494
495    Every prefix refers to a set of "nexthop"s (gw, oif),
496    where each gw must have a narrower scope. This recursion stops
497    when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
498    which forces the gw to be on-link.
499
500    The code is still hairy, but now it is apparently logically
501    consistent and very flexible. E.g. as a by-product it allows
502    independent exterior and interior routing processes
503    to coexist in peace.
504
505    Normally it looks like the following.
506
507    {universe prefix}  -> (gw, oif) [scope link]
508                           |
509                           |-> {link prefix} -> (gw, oif) [scope local]
510                                                 |
511                                                 |-> {local prefix} (terminal node)
512  */
513
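/*
 * Validate and resolve one nexthop of a route being added: for a
 * gatewayed nexthop either honour RTNH_F_ONLINK or look the gateway up
 * with a narrower scope; for a direct nexthop just bind to the given
 * interface.  On success nh->nh_dev holds a device reference and
 * nh->nh_scope is set.
 */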
514 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
515                         struct fib_nh *nh)
516 {
517         int err;
518
519         if (nh->nh_gw) {
520                 struct fib_result res;
521
522 #ifdef CONFIG_IP_ROUTE_PERVASIVE
523                 if (nh->nh_flags&RTNH_F_PERVASIVE)
524                         return 0;
525 #endif
526                 if (nh->nh_flags&RTNH_F_ONLINK) {
527                         struct net_device *dev;
528
529                         if (cfg->fc_scope >= RT_SCOPE_LINK)
530                                 return -EINVAL;
531                         if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
532                                 return -EINVAL;
533                         if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
534                                 return -ENODEV;
535                         if (!(dev->flags&IFF_UP))
536                                 return -ENETDOWN;
537                         nh->nh_dev = dev;
538                         dev_hold(dev);
539                         nh->nh_scope = RT_SCOPE_LINK;
540                         return 0;
541                 }
542                 {
543                         struct flowi fl = {
544                                 .nl_u = {
545                                         .ip4_u = {
546                                                 .daddr = nh->nh_gw,
547                                                 .scope = cfg->fc_scope + 1,
548                                         },
549                                 },
550                                 .oif = nh->nh_oif,
551                         };
552
553                         /* It is not necessary, but requires a bit of thinking */
554                         if (fl.fl4_scope < RT_SCOPE_LINK)
555                                 fl.fl4_scope = RT_SCOPE_LINK;
556                         if ((err = fib_lookup(&fl, &res)) != 0)
557                                 return err;
558                 }
559                 err = -EINVAL;
560                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
561                         goto out;
562                 nh->nh_scope = res.scope;
563                 nh->nh_oif = FIB_RES_OIF(res);
564                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
565                         goto out;
566                 dev_hold(nh->nh_dev);
567                 err = -ENETDOWN;
568                 if (!(nh->nh_dev->flags & IFF_UP))
569                         goto out;
570                 err = 0;
571 out:
572                 fib_res_put(&res);
573                 return err;
574         } else {
575                 struct in_device *in_dev;
576
577                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
578                         return -EINVAL;
579
580                 in_dev = inetdev_by_index(nh->nh_oif);
581                 if (in_dev == NULL)
582                         return -ENODEV;
583                 if (!(in_dev->dev->flags&IFF_UP)) {
584                         in_dev_put(in_dev);
585                         return -ENETDOWN;
586                 }
587                 nh->nh_dev = in_dev->dev;
588                 dev_hold(nh->nh_dev);
589                 nh->nh_scope = RT_SCOPE_HOST;
590                 in_dev_put(in_dev);
591         }
592         return 0;
593 }
594
595 static inline unsigned int fib_laddr_hashfn(__be32 val)
596 {
597         unsigned int mask = (fib_hash_size - 1);
598
599         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
600 }
601
602 static struct hlist_head *fib_hash_alloc(int bytes)
603 {
604         if (bytes <= PAGE_SIZE)
605                 return kmalloc(bytes, GFP_KERNEL);
606         else
607                 return (struct hlist_head *)
608                         __get_free_pages(GFP_KERNEL, get_order(bytes));
609 }
610
611 static void fib_hash_free(struct hlist_head *hash, int bytes)
612 {
613         if (!hash)
614                 return;
615
616         if (bytes <= PAGE_SIZE)
617                 kfree(hash);
618         else
619                 free_pages((unsigned long) hash, get_order(bytes));
620 }
621
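/*
 * Rehash every fib_info from the old info/laddr tables into the newly
 * allocated (larger) ones under fib_info_lock, then free the old tables.
 */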
622 static void fib_hash_move(struct hlist_head *new_info_hash,
623                           struct hlist_head *new_laddrhash,
624                           unsigned int new_size)
625 {
626         struct hlist_head *old_info_hash, *old_laddrhash;
627         unsigned int old_size = fib_hash_size;
628         unsigned int i, bytes;
629
630         spin_lock_bh(&fib_info_lock);
631         old_info_hash = fib_info_hash;
632         old_laddrhash = fib_info_laddrhash;
633         fib_hash_size = new_size;
634
635         for (i = 0; i < old_size; i++) {
636                 struct hlist_head *head = &fib_info_hash[i];
637                 struct hlist_node *node, *n;
638                 struct fib_info *fi;
639
640                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
641                         struct hlist_head *dest;
642                         unsigned int new_hash;
643
644                         hlist_del(&fi->fib_hash);
645
646                         new_hash = fib_info_hashfn(fi);
647                         dest = &new_info_hash[new_hash];
648                         hlist_add_head(&fi->fib_hash, dest);
649                 }
650         }
651         fib_info_hash = new_info_hash;
652
653         for (i = 0; i < old_size; i++) {
654                 struct hlist_head *lhead = &fib_info_laddrhash[i];
655                 struct hlist_node *node, *n;
656                 struct fib_info *fi;
657
658                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
659                         struct hlist_head *ldest;
660                         unsigned int new_hash;
661
662                         hlist_del(&fi->fib_lhash);
663
664                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
665                         ldest = &new_laddrhash[new_hash];
666                         hlist_add_head(&fi->fib_lhash, ldest);
667                 }
668         }
669         fib_info_laddrhash = new_laddrhash;
670
671         spin_unlock_bh(&fib_info_lock);
672
673         bytes = old_size * sizeof(struct hlist_head *);
674         fib_hash_free(old_info_hash, bytes);
675         fib_hash_free(old_laddrhash, bytes);
676 }
677
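/*
 * Build a fib_info from a netlink route configuration: grow the hash
 * tables if needed, parse metrics and nexthops, validate the nexthops,
 * and either reuse an identical existing fib_info or link the new one
 * into the hash tables.
 */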
678 struct fib_info *fib_create_info(struct fib_config *cfg)
679 {
680         int err;
681         struct fib_info *fi = NULL;
682         struct fib_info *ofi;
683         int nhs = 1;
684
685         /* Fast check to catch the most weird cases */
686         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
687                 goto err_inval;
688
689 #ifdef CONFIG_IP_ROUTE_MULTIPATH
690         if (cfg->fc_mp) {
691                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
692                 if (nhs == 0)
693                         goto err_inval;
694         }
695 #endif
696 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
697         if (cfg->fc_mp_alg) {
698                 if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
699                     cfg->fc_mp_alg > IP_MP_ALG_MAX)
700                         goto err_inval;
701         }
702 #endif
703
704         err = -ENOBUFS;
705         if (fib_info_cnt >= fib_hash_size) {
706                 unsigned int new_size = fib_hash_size << 1;
707                 struct hlist_head *new_info_hash;
708                 struct hlist_head *new_laddrhash;
709                 unsigned int bytes;
710
711                 if (!new_size)
712                         new_size = 1;
713                 bytes = new_size * sizeof(struct hlist_head *);
714                 new_info_hash = fib_hash_alloc(bytes);
715                 new_laddrhash = fib_hash_alloc(bytes);
716                 if (!new_info_hash || !new_laddrhash) {
717                         fib_hash_free(new_info_hash, bytes);
718                         fib_hash_free(new_laddrhash, bytes);
719                 } else {
720                         memset(new_info_hash, 0, bytes);
721                         memset(new_laddrhash, 0, bytes);
722
723                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
724                 }
725
726                 if (!fib_hash_size)
727                         goto failure;
728         }
729
730         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
731         if (fi == NULL)
732                 goto failure;
733         fib_info_cnt++;
734
735         fi->fib_protocol = cfg->fc_protocol;
736         fi->fib_flags = cfg->fc_flags;
737         fi->fib_priority = cfg->fc_priority;
738         fi->fib_prefsrc = cfg->fc_prefsrc;
739
740         fi->fib_nhs = nhs;
741         change_nexthops(fi) {
742                 nh->nh_parent = fi;
743         } endfor_nexthops(fi)
744
745         if (cfg->fc_mx) {
746                 struct nlattr *nla;
747                 int remaining;
748
749                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
750                         int type = nla->nla_type;
751
752                         if (type) {
753                                 if (type > RTAX_MAX)
754                                         goto err_inval;
755                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
756                         }
757                 }
758         }
759
760         if (cfg->fc_mp) {
761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
762                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
763                 if (err != 0)
764                         goto failure;
765                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
766                         goto err_inval;
767                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
768                         goto err_inval;
769 #ifdef CONFIG_NET_CLS_ROUTE
770                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
771                         goto err_inval;
772 #endif
773 #else
774                 goto err_inval;
775 #endif
776         } else {
777                 struct fib_nh *nh = fi->fib_nh;
778
779                 nh->nh_oif = cfg->fc_oif;
780                 nh->nh_gw = cfg->fc_gw;
781                 nh->nh_flags = cfg->fc_flags;
782 #ifdef CONFIG_NET_CLS_ROUTE
783                 nh->nh_tclassid = cfg->fc_flow;
784 #endif
785 #ifdef CONFIG_IP_ROUTE_MULTIPATH
786                 nh->nh_weight = 1;
787 #endif
788         }
789
790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791         fi->fib_mp_alg = cfg->fc_mp_alg;
792 #endif
793
794         if (fib_props[cfg->fc_type].error) {
795                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
796                         goto err_inval;
797                 goto link_it;
798         }
799
800         if (cfg->fc_scope > RT_SCOPE_HOST)
801                 goto err_inval;
802
803         if (cfg->fc_scope == RT_SCOPE_HOST) {
804                 struct fib_nh *nh = fi->fib_nh;
805
806                 /* Local address is added. */
807                 if (nhs != 1 || nh->nh_gw)
808                         goto err_inval;
809                 nh->nh_scope = RT_SCOPE_NOWHERE;
810                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
811                 err = -ENODEV;
812                 if (nh->nh_dev == NULL)
813                         goto failure;
814         } else {
815                 change_nexthops(fi) {
816                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
817                                 goto failure;
818                 } endfor_nexthops(fi)
819         }
820
821         if (fi->fib_prefsrc) {
822                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
823                     fi->fib_prefsrc != cfg->fc_dst)
824                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
825                                 goto err_inval;
826         }
827
828 link_it:
829         if ((ofi = fib_find_info(fi)) != NULL) {
830                 fi->fib_dead = 1;
831                 free_fib_info(fi);
832                 ofi->fib_treeref++;
833                 return ofi;
834         }
835
836         fi->fib_treeref++;
837         atomic_inc(&fi->fib_clntref);
838         spin_lock_bh(&fib_info_lock);
839         hlist_add_head(&fi->fib_hash,
840                        &fib_info_hash[fib_info_hashfn(fi)]);
841         if (fi->fib_prefsrc) {
842                 struct hlist_head *head;
843
844                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
845                 hlist_add_head(&fi->fib_lhash, head);
846         }
847         change_nexthops(fi) {
848                 struct hlist_head *head;
849                 unsigned int hash;
850
851                 if (!nh->nh_dev)
852                         continue;
853                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
854                 head = &fib_info_devhash[hash];
855                 hlist_add_head(&nh->nh_hash, head);
856         } endfor_nexthops(fi)
857         spin_unlock_bh(&fib_info_lock);
858         return fi;
859
860 err_inval:
861         err = -EINVAL;
862
863 failure:
864         if (fi) {
865                 fi->fib_dead = 1;
866                 free_fib_info(fi);
867         }
868
869         return ERR_PTR(err);
870 }
871
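/*
 * Walk the aliases attached to a matched prefix: skip entries whose TOS
 * or scope does not fit the flow, pick the first alive nexthop
 * (honouring flp->oif) and fill *res.  Returns 0 on success, 1 if no
 * alias matched, or the route type's error code.
 */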
872 /* Note! fib_semantic_match intentionally uses RCU list functions. */
873 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
874                        struct fib_result *res, __be32 zone, __be32 mask,
875                         int prefixlen)
876 {
877         struct fib_alias *fa;
878         int nh_sel = 0;
879
880         list_for_each_entry_rcu(fa, head, fa_list) {
881                 int err;
882
883                 if (fa->fa_tos &&
884                     fa->fa_tos != flp->fl4_tos)
885                         continue;
886
887                 if (fa->fa_scope < flp->fl4_scope)
888                         continue;
889
890                 fa->fa_state |= FA_S_ACCESSED;
891
892                 err = fib_props[fa->fa_type].error;
893                 if (err == 0) {
894                         struct fib_info *fi = fa->fa_info;
895
896                         if (fi->fib_flags & RTNH_F_DEAD)
897                                 continue;
898
899                         switch (fa->fa_type) {
900                         case RTN_UNICAST:
901                         case RTN_LOCAL:
902                         case RTN_BROADCAST:
903                         case RTN_ANYCAST:
904                         case RTN_MULTICAST:
905                                 for_nexthops(fi) {
906                                         if (nh->nh_flags&RTNH_F_DEAD)
907                                                 continue;
908                                         if (!flp->oif || flp->oif == nh->nh_oif)
909                                                 break;
910                                 }
911 #ifdef CONFIG_IP_ROUTE_MULTIPATH
912                                 if (nhsel < fi->fib_nhs) {
913                                         nh_sel = nhsel;
914                                         goto out_fill_res;
915                                 }
916 #else
917                                 if (nhsel < 1) {
918                                         goto out_fill_res;
919                                 }
920 #endif
921                                 endfor_nexthops(fi);
922                                 continue;
923
924                         default:
925                                 printk(KERN_DEBUG "impossible 102\n");
926                                 return -EINVAL;
927                         };
928                 }
929                 return err;
930         }
931         return 1;
932
933 out_fill_res:
934         res->prefixlen = prefixlen;
935         res->nh_sel = nh_sel;
936         res->type = fa->fa_type;
937         res->scope = fa->fa_scope;
938         res->fi = fa->fa_info;
939 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
940         res->netmask = mask;
941         res->network = zone & inet_make_mask(prefixlen);
942 #endif
943         atomic_inc(&res->fi->fib_clntref);
944         return 0;
945 }
946
947 /* Find an appropriate source address for this destination */
948
949 __be32 __fib_res_prefsrc(struct fib_result *res)
950 {
951         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
952 }
953
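/*
 * Fill one routing netlink message describing this route: the rtmsg
 * header plus RTA_* attributes, with the nexthops packed into an
 * RTA_MULTIPATH nest when there is more than one.
 */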
954 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
955                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
956                   struct fib_info *fi, unsigned int flags)
957 {
958         struct nlmsghdr *nlh;
959         struct rtmsg *rtm;
960
961         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
962         if (nlh == NULL)
963                 return -ENOBUFS;
964
965         rtm = nlmsg_data(nlh);
966         rtm->rtm_family = AF_INET;
967         rtm->rtm_dst_len = dst_len;
968         rtm->rtm_src_len = 0;
969         rtm->rtm_tos = tos;
970         rtm->rtm_table = tb_id;
971         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
972         rtm->rtm_type = type;
973         rtm->rtm_flags = fi->fib_flags;
974         rtm->rtm_scope = scope;
975         rtm->rtm_protocol = fi->fib_protocol;
976
977         if (rtm->rtm_dst_len)
978                 NLA_PUT_BE32(skb, RTA_DST, dst);
979
980         if (fi->fib_priority)
981                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
982
983         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
984                 goto nla_put_failure;
985
986         if (fi->fib_prefsrc)
987                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
988
989         if (fi->fib_nhs == 1) {
990                 if (fi->fib_nh->nh_gw)
991                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
992
993                 if (fi->fib_nh->nh_oif)
994                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
995 #ifdef CONFIG_NET_CLS_ROUTE
996                 if (fi->fib_nh[0].nh_tclassid)
997                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
998 #endif
999         }
1000 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1001         if (fi->fib_nhs > 1) {
1002                 struct rtnexthop *rtnh;
1003                 struct nlattr *mp;
1004
1005                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1006                 if (mp == NULL)
1007                         goto nla_put_failure;
1008
1009                 for_nexthops(fi) {
1010                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1011                         if (rtnh == NULL)
1012                                 goto nla_put_failure;
1013
1014                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1015                         rtnh->rtnh_hops = nh->nh_weight - 1;
1016                         rtnh->rtnh_ifindex = nh->nh_oif;
1017
1018                         if (nh->nh_gw)
1019                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1020 #ifdef CONFIG_NET_CLS_ROUTE
1021                         if (nh->nh_tclassid)
1022                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1023 #endif
1024                         /* length of rtnetlink header + attributes */
1025                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1026                 } endfor_nexthops(fi);
1027
1028                 nla_nest_end(skb, mp);
1029         }
1030 #endif
1031         return nlmsg_end(skb, nlh);
1032
1033 nla_put_failure:
1034         return nlmsg_cancel(skb, nlh);
1035 }
1036
1037 /*
1038    Update FIB if:
1039    - a local address disappeared -> we must delete all the entries
1040      referring to it.
1041    - a device went down -> we must shut down all nexthops going via it.
1042  */
1043
1044 int fib_sync_down(__be32 local, struct net_device *dev, int force)
1045 {
1046         int ret = 0;
1047         int scope = RT_SCOPE_NOWHERE;
1048         
1049         if (force)
1050                 scope = -1;
1051
1052         if (local && fib_info_laddrhash) {
1053                 unsigned int hash = fib_laddr_hashfn(local);
1054                 struct hlist_head *head = &fib_info_laddrhash[hash];
1055                 struct hlist_node *node;
1056                 struct fib_info *fi;
1057
1058                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1059                         if (fi->fib_prefsrc == local) {
1060                                 fi->fib_flags |= RTNH_F_DEAD;
1061                                 ret++;
1062                         }
1063                 }
1064         }
1065
1066         if (dev) {
1067                 struct fib_info *prev_fi = NULL;
1068                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1069                 struct hlist_head *head = &fib_info_devhash[hash];
1070                 struct hlist_node *node;
1071                 struct fib_nh *nh;
1072
1073                 hlist_for_each_entry(nh, node, head, nh_hash) {
1074                         struct fib_info *fi = nh->nh_parent;
1075                         int dead;
1076
1077                         BUG_ON(!fi->fib_nhs);
1078                         if (nh->nh_dev != dev || fi == prev_fi)
1079                                 continue;
1080                         prev_fi = fi;
1081                         dead = 0;
1082                         change_nexthops(fi) {
1083                                 if (nh->nh_flags&RTNH_F_DEAD)
1084                                         dead++;
1085                                 else if (nh->nh_dev == dev &&
1086                                          nh->nh_scope != scope) {
1087                                         nh->nh_flags |= RTNH_F_DEAD;
1088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1089                                         spin_lock_bh(&fib_multipath_lock);
1090                                         fi->fib_power -= nh->nh_power;
1091                                         nh->nh_power = 0;
1092                                         spin_unlock_bh(&fib_multipath_lock);
1093 #endif
1094                                         dead++;
1095                                 }
1096 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1097                                 if (force > 1 && nh->nh_dev == dev) {
1098                                         dead = fi->fib_nhs;
1099                                         break;
1100                                 }
1101 #endif
1102                         } endfor_nexthops(fi)
1103                         if (dead == fi->fib_nhs) {
1104                                 fi->fib_flags |= RTNH_F_DEAD;
1105                                 ret++;
1106                         }
1107                 }
1108         }
1109
1110         return ret;
1111 }
1112
1113 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1114
1115 /*
1116    A dead device comes up. We wake up its dead nexthops.
1117    This makes sense only for multipath routes.
1118  */
1119
1120 int fib_sync_up(struct net_device *dev)
1121 {
1122         struct fib_info *prev_fi;
1123         unsigned int hash;
1124         struct hlist_head *head;
1125         struct hlist_node *node;
1126         struct fib_nh *nh;
1127         int ret;
1128
1129         if (!(dev->flags&IFF_UP))
1130                 return 0;
1131
1132         prev_fi = NULL;
1133         hash = fib_devindex_hashfn(dev->ifindex);
1134         head = &fib_info_devhash[hash];
1135         ret = 0;
1136
1137         hlist_for_each_entry(nh, node, head, nh_hash) {
1138                 struct fib_info *fi = nh->nh_parent;
1139                 int alive;
1140
1141                 BUG_ON(!fi->fib_nhs);
1142                 if (nh->nh_dev != dev || fi == prev_fi)
1143                         continue;
1144
1145                 prev_fi = fi;
1146                 alive = 0;
1147                 change_nexthops(fi) {
1148                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1149                                 alive++;
1150                                 continue;
1151                         }
1152                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1153                                 continue;
1154                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1155                                 continue;
1156                         alive++;
1157                         spin_lock_bh(&fib_multipath_lock);
1158                         nh->nh_power = 0;
1159                         nh->nh_flags &= ~RTNH_F_DEAD;
1160                         spin_unlock_bh(&fib_multipath_lock);
1161                 } endfor_nexthops(fi)
1162
1163                 if (alive > 0) {
1164                         fi->fib_flags &= ~RTNH_F_DEAD;
1165                         ret++;
1166                 }
1167         }
1168
1169         return ret;
1170 }
1171
1172 /*
1173    The algorithm is suboptimal, but it provides really
1174    fair weighted route distribution.
1175  */
1176
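/*
 * fib_power is the sum of the weights of the alive nexthops; when it is
 * exhausted every alive nexthop is recharged with nh_power = nh_weight.
 * A pseudo-random w in [0, fib_power) then picks a nexthop with
 * probability proportional to its remaining nh_power, and both counters
 * are decremented, so over one full cycle each nexthop is used exactly
 * nh_weight times.
 */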
1177 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1178 {
1179         struct fib_info *fi = res->fi;
1180         int w;
1181
1182         spin_lock_bh(&fib_multipath_lock);
1183         if (fi->fib_power <= 0) {
1184                 int power = 0;
1185                 change_nexthops(fi) {
1186                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1187                                 power += nh->nh_weight;
1188                                 nh->nh_power = nh->nh_weight;
1189                         }
1190                 } endfor_nexthops(fi);
1191                 fi->fib_power = power;
1192                 if (power <= 0) {
1193                         spin_unlock_bh(&fib_multipath_lock);
1194                         /* Race condition: route has just become dead. */
1195                         res->nh_sel = 0;
1196                         return;
1197                 }
1198         }
1199
1200
1201         /* w should be a random number in [0..fi->fib_power-1];
1202            jiffies is a pretty bad approximation of that.
1203          */
1204
1205         w = jiffies % fi->fib_power;
1206
1207         change_nexthops(fi) {
1208                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1209                         if ((w -= nh->nh_power) <= 0) {
1210                                 nh->nh_power--;
1211                                 fi->fib_power--;
1212                                 res->nh_sel = nhsel;
1213                                 spin_unlock_bh(&fib_multipath_lock);
1214                                 return;
1215                         }
1216                 }
1217         } endfor_nexthops(fi);
1218
1219         /* Race condition: route has just become dead. */
1220         res->nh_sel = 0;
1221         spin_unlock_bh(&fib_multipath_lock);
1222 }
1223 #endif