net/ipv4/fib_semantics.c (linux-2.6)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47
48 #include "fib_lookup.h"
49
50 #define FSprintk(a...)
51
52 static DEFINE_SPINLOCK(fib_info_lock);
53 static struct hlist_head *fib_info_hash;
54 static struct hlist_head *fib_info_laddrhash;
55 static unsigned int fib_hash_size;
56 static unsigned int fib_info_cnt;
57
58 #define DEVINDEX_HASHBITS 8
59 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
60 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
61
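/* Helpers to iterate over the nexthop array embedded in a fib_info.
   With CONFIG_IP_ROUTE_MULTIPATH they walk all fib_nhs entries; without
   it they collapse to a single iteration.  for_nexthops() provides a
   read-only cursor (nh, nhsel), change_nexthops() a writable one, and
   every use must be closed with endfor_nexthops(), e.g.:

	change_nexthops(fi) {
		nh->nh_flags |= RTNH_F_DEAD;
	} endfor_nexthops(fi)
 */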
62 #ifdef CONFIG_IP_ROUTE_MULTIPATH
63
64 static DEFINE_SPINLOCK(fib_multipath_lock);
65
66 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
67 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
68
69 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
70 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
71
72 #else /* CONFIG_IP_ROUTE_MULTIPATH */
73
74 /* Hope that gcc will optimize away the dummy loop */
75
76 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
77 for (nhsel=0; nhsel < 1; nhsel++)
78
79 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
80 for (nhsel=0; nhsel < 1; nhsel++)
81
82 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
83
84 #define endfor_nexthops(fi) }
85
86
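/* Per-route-type properties, indexed by RTN_*: the error code a lookup
   of this type returns (0 means traffic is actually forwarded or
   delivered) and the scope check applied in fib_create_info(), which
   requires cfg->fc_scope to be at least as narrow as the value here. */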
87 static const struct
88 {
89         int     error;
90         u8      scope;
91 } fib_props[RTN_MAX + 1] = {
92         {
93                 .error  = 0,
94                 .scope  = RT_SCOPE_NOWHERE,
95         },      /* RTN_UNSPEC */
96         {
97                 .error  = 0,
98                 .scope  = RT_SCOPE_UNIVERSE,
99         },      /* RTN_UNICAST */
100         {
101                 .error  = 0,
102                 .scope  = RT_SCOPE_HOST,
103         },      /* RTN_LOCAL */
104         {
105                 .error  = 0,
106                 .scope  = RT_SCOPE_LINK,
107         },      /* RTN_BROADCAST */
108         {
109                 .error  = 0,
110                 .scope  = RT_SCOPE_LINK,
111         },      /* RTN_ANYCAST */
112         {
113                 .error  = 0,
114                 .scope  = RT_SCOPE_UNIVERSE,
115         },      /* RTN_MULTICAST */
116         {
117                 .error  = -EINVAL,
118                 .scope  = RT_SCOPE_UNIVERSE,
119         },      /* RTN_BLACKHOLE */
120         {
121                 .error  = -EHOSTUNREACH,
122                 .scope  = RT_SCOPE_UNIVERSE,
123         },      /* RTN_UNREACHABLE */
124         {
125                 .error  = -EACCES,
126                 .scope  = RT_SCOPE_UNIVERSE,
127         },      /* RTN_PROHIBIT */
128         {
129                 .error  = -EAGAIN,
130                 .scope  = RT_SCOPE_UNIVERSE,
131         },      /* RTN_THROW */
132         {
133                 .error  = -EINVAL,
134                 .scope  = RT_SCOPE_NOWHERE,
135         },      /* RTN_NAT */
136         {
137                 .error  = -EINVAL,
138                 .scope  = RT_SCOPE_NOWHERE,
139         },      /* RTN_XRESOLVE */
140 };
141
142
143 /* Release a nexthop info record */
144
145 void free_fib_info(struct fib_info *fi)
146 {
147         if (fi->fib_dead == 0) {
148                 printk("Freeing alive fib_info %p\n", fi);
149                 return;
150         }
151         change_nexthops(fi) {
152                 if (nh->nh_dev)
153                         dev_put(nh->nh_dev);
154                 nh->nh_dev = NULL;
155         } endfor_nexthops(fi);
156         fib_info_cnt--;
157         kfree(fi);
158 }
159
160 void fib_release_info(struct fib_info *fi)
161 {
162         spin_lock_bh(&fib_info_lock);
163         if (fi && --fi->fib_treeref == 0) {
164                 hlist_del(&fi->fib_hash);
165                 if (fi->fib_prefsrc)
166                         hlist_del(&fi->fib_lhash);
167                 change_nexthops(fi) {
168                         if (!nh->nh_dev)
169                                 continue;
170                         hlist_del(&nh->nh_hash);
171                 } endfor_nexthops(fi)
172                 fi->fib_dead = 1;
173                 fib_info_put(fi);
174         }
175         spin_unlock_bh(&fib_info_lock);
176 }
177
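/* Compare the nexthop lists of two fib_infos with the same number of
   nexthops; returns 0 if they are equivalent (ignoring RTNH_F_DEAD),
   -1 otherwise. */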
178 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
179 {
180         const struct fib_nh *onh = ofi->fib_nh;
181
182         for_nexthops(fi) {
183                 if (nh->nh_oif != onh->nh_oif ||
184                     nh->nh_gw  != onh->nh_gw ||
185                     nh->nh_scope != onh->nh_scope ||
186 #ifdef CONFIG_IP_ROUTE_MULTIPATH
187                     nh->nh_weight != onh->nh_weight ||
188 #endif
189 #ifdef CONFIG_NET_CLS_ROUTE
190                     nh->nh_tclassid != onh->nh_tclassid ||
191 #endif
192                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
193                         return -1;
194                 onh++;
195         } endfor_nexthops(fi);
196         return 0;
197 }
198
199 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
200 {
201         unsigned int mask = (fib_hash_size - 1);
202         unsigned int val = fi->fib_nhs;
203
204         val ^= fi->fib_protocol;
205         val ^= (__force u32)fi->fib_prefsrc;
206         val ^= fi->fib_priority;
207
208         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
209 }
210
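/* Look for an already installed fib_info equivalent to nfi (same
   protocol, prefsrc, priority, metrics, flags and nexthops), so that it
   can be shared instead of inserting a duplicate. */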
211 static struct fib_info *fib_find_info(const struct fib_info *nfi)
212 {
213         struct hlist_head *head;
214         struct hlist_node *node;
215         struct fib_info *fi;
216         unsigned int hash;
217
218         hash = fib_info_hashfn(nfi);
219         head = &fib_info_hash[hash];
220
221         hlist_for_each_entry(fi, node, head, fib_hash) {
222                 if (fi->fib_nhs != nfi->fib_nhs)
223                         continue;
224                 if (nfi->fib_protocol == fi->fib_protocol &&
225                     nfi->fib_prefsrc == fi->fib_prefsrc &&
226                     nfi->fib_priority == fi->fib_priority &&
227                     memcmp(nfi->fib_metrics, fi->fib_metrics,
228                            sizeof(fi->fib_metrics)) == 0 &&
229                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
230                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
231                         return fi;
232         }
233
234         return NULL;
235 }
236
237 static inline unsigned int fib_devindex_hashfn(unsigned int val)
238 {
239         unsigned int mask = DEVINDEX_HASHSIZE - 1;
240
241         return (val ^
242                 (val >> DEVINDEX_HASHBITS) ^
243                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
244 }
245
246 /* Check that the gateway is already configured.
247    Used only by the redirect accept routine.
248  */
249
250 int ip_fib_check_default(__be32 gw, struct net_device *dev)
251 {
252         struct hlist_head *head;
253         struct hlist_node *node;
254         struct fib_nh *nh;
255         unsigned int hash;
256
257         spin_lock(&fib_info_lock);
258
259         hash = fib_devindex_hashfn(dev->ifindex);
260         head = &fib_info_devhash[hash];
261         hlist_for_each_entry(nh, node, head, nh_hash) {
262                 if (nh->nh_dev == dev &&
263                     nh->nh_gw == gw &&
264                     !(nh->nh_flags&RTNH_F_DEAD)) {
265                         spin_unlock(&fib_info_lock);
266                         return 0;
267                 }
268         }
269
270         spin_unlock(&fib_info_lock);
271
272         return -1;
273 }
274
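/* Upper bound on the netlink message size needed to dump this fib_info;
   used to size the skb allocated in rtmsg_fib(). */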
275 static inline size_t fib_nlmsg_size(struct fib_info *fi)
276 {
277         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
278                          + nla_total_size(4) /* RTA_TABLE */
279                          + nla_total_size(4) /* RTA_DST */
280                          + nla_total_size(4) /* RTA_PRIORITY */
281                          + nla_total_size(4); /* RTA_PREFSRC */
282
283         /* space for nested metrics */
284         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
285
286         if (fi->fib_nhs) {
287                 /* Also handles the special case fib_nhs == 1 */
288
289                 /* each nexthop is packed in an attribute */
290                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
291
292                 /* may contain flow and gateway attribute */
293                 nhsize += 2 * nla_total_size(4);
294
295                 /* all nexthops are packed in a nested attribute */
296                 payload += nla_total_size(fi->fib_nhs * nhsize);
297         }
298
299         return payload;
300 }
301
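/* Notify RTNLGRP_IPV4_ROUTE listeners about a route change: build an
   rtnetlink message of the given event type for the alias and send it,
   reporting the error to the group if that fails. */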
302 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
303                int dst_len, u32 tb_id, struct nl_info *info,
304                unsigned int nlm_flags)
305 {
306         struct sk_buff *skb;
307         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
308         int err = -ENOBUFS;
309
310         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
311         if (skb == NULL)
312                 goto errout;
313
314         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
315                             fa->fa_type, fa->fa_scope, key, dst_len,
316                             fa->fa_tos, fa->fa_info, nlm_flags);
317         if (err < 0) {
318                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
319                 WARN_ON(err == -EMSGSIZE);
320                 kfree_skb(skb);
321                 goto errout;
322         }
323         err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
324                           info->nlh, GFP_KERNEL);
325 errout:
326         if (err < 0)
327                 rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
328 }
329
330 /* Return the first fib alias matching TOS with
331  * priority less than or equal to PRIO.
332  */
333 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
334 {
335         if (fah) {
336                 struct fib_alias *fa;
337                 list_for_each_entry(fa, fah, fa_list) {
338                         if (fa->fa_tos > tos)
339                                 continue;
340                         if (fa->fa_info->fib_priority >= prio ||
341                             fa->fa_tos < tos)
342                                 return fa;
343                 }
344         }
345         return NULL;
346 }
347
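/* Used when selecting a default route: check the ARP cache entry of
   this route's gateway.  Returns 0 if the gateway still looks usable;
   otherwise possibly remembers the route as a last resort and
   returns 1. */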
348 int fib_detect_death(struct fib_info *fi, int order,
349                      struct fib_info **last_resort, int *last_idx, int *dflt)
350 {
351         struct neighbour *n;
352         int state = NUD_NONE;
353
354         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
355         if (n) {
356                 state = n->nud_state;
357                 neigh_release(n);
358         }
359         if (state==NUD_REACHABLE)
360                 return 0;
361         if ((state&NUD_VALID) && order != *dflt)
362                 return 0;
363         if ((state&NUD_VALID) ||
364             (*last_idx<0 && order > *dflt)) {
365                 *last_resort = fi;
366                 *last_idx = order;
367         }
368         return 1;
369 }
370
371 #ifdef CONFIG_IP_ROUTE_MULTIPATH
372
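/* Count the nexthops in an RTA_MULTIPATH attribute; returns 0 if the
   attribute is malformed (leftover bytes that do not form a complete
   rtnexthop entry). */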
373 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
374 {
375         int nhs = 0;
376
377         while (rtnh_ok(rtnh, remaining)) {
378                 nhs++;
379                 rtnh = rtnh_next(rtnh, &remaining);
380         }
381
382         /* leftover implies invalid nexthop configuration, discard it */
383         return remaining > 0 ? 0 : nhs;
384 }
385
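/* Fill fi->fib_nh[] from the rtnexthop entries of an RTA_MULTIPATH
   attribute: per-nexthop flags, output interface, weight, and the
   optional RTA_GATEWAY/RTA_FLOW sub-attributes. */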
386 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
387                        int remaining, struct fib_config *cfg)
388 {
389         change_nexthops(fi) {
390                 int attrlen;
391
392                 if (!rtnh_ok(rtnh, remaining))
393                         return -EINVAL;
394
395                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
396                 nh->nh_oif = rtnh->rtnh_ifindex;
397                 nh->nh_weight = rtnh->rtnh_hops + 1;
398
399                 attrlen = rtnh_attrlen(rtnh);
400                 if (attrlen > 0) {
401                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
402
403                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
404                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
405 #ifdef CONFIG_NET_CLS_ROUTE
406                         nla = nla_find(attrs, attrlen, RTA_FLOW);
407                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
408 #endif
409                 }
410
411                 rtnh = rtnh_next(rtnh, &remaining);
412         } endfor_nexthops(fi);
413
414         return 0;
415 }
416
417 #endif
418
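/* Check whether the nexthop(s) described by cfg match the nexthops of
   an existing fib_info.  Returns 0 on match, 1 on mismatch, and -EINVAL
   for a malformed multipath spec; used by the route delete path to pick
   the right entry. */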
419 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
420 {
421 #ifdef CONFIG_IP_ROUTE_MULTIPATH
422         struct rtnexthop *rtnh;
423         int remaining;
424 #endif
425
426         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
427                 return 1;
428
429         if (cfg->fc_oif || cfg->fc_gw) {
430                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
431                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
432                         return 0;
433                 return 1;
434         }
435
436 #ifdef CONFIG_IP_ROUTE_MULTIPATH
437         if (cfg->fc_mp == NULL)
438                 return 0;
439
440         rtnh = cfg->fc_mp;
441         remaining = cfg->fc_mp_len;
442
443         for_nexthops(fi) {
444                 int attrlen;
445
446                 if (!rtnh_ok(rtnh, remaining))
447                         return -EINVAL;
448
449                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
450                         return 1;
451
452                 attrlen = rtnh_attrlen(rtnh);
453                 if (attrlen > 0) {
454                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
455
456                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
457                         if (nla && nla_get_be32(nla) != nh->nh_gw)
458                                 return 1;
459 #ifdef CONFIG_NET_CLS_ROUTE
460                         nla = nla_find(attrs, attrlen, RTA_FLOW);
461                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
462                                 return 1;
463 #endif
464                 }
465
466                 rtnh = rtnh_next(rtnh, &remaining);
467         } endfor_nexthops(fi);
468 #endif
469         return 0;
470 }
471
472
473 /*
474    Picture
475    -------
476
477    Nexthop semantics are very messy for historical reasons.
478    We have to take into account that:
479    a) the gateway can actually be a local interface address,
480       so that a gatewayed route is direct.
481    b) the gateway must be an on-link address, possibly
482       described not by an ifaddr but by a direct route.
483    c) if both gateway and interface are specified, they must not
484       contradict each other.
485    d) with tunnel routes, the gateway may not be on-link.
486
487    Attempting to reconcile all of these (alas, self-contradictory)
488    conditions results in pretty ugly and hairy code with obscure logic.
489
490    I chose to generalize it instead, so that the amount of code
491    barely increases, but the result becomes much more
492    general.
493    Every prefix is assigned a "scope" value: "host" is a local address,
494    "link" is a direct route,
495    [ ... "site" ... "interior" ... ]
496    and "universe" is a true gateway route with global meaning.
497
498    Every prefix refers to a set of "nexthop"s (gw, oif),
499    where gw must have a narrower scope. This recursion stops
500    when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
501    which means that gw is forced to be on-link.
502
503    The code is still hairy, but now it is apparently logically
504    consistent and very flexible. E.g. as a by-product it allows
505    independent exterior and interior routing processes to
506    coexist in peace.
507
508    Normally it looks like the following.
509
510    {universe prefix}  -> (gw, oif) [scope link]
511                          |
512                          |-> {link prefix} -> (gw, oif) [scope local]
513                                                |
514                                                |-> {local prefix} (terminal node)
515  */
516
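/* Validate one nexthop of a new route and resolve its device and scope.
   A gatewayed nexthop requires the gateway to be reachable with a
   narrower scope (or to be declared ONLINK); a device-only nexthop just
   needs an interface that is up. */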
517 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
518                         struct fib_nh *nh)
519 {
520         int err;
521
522         if (nh->nh_gw) {
523                 struct fib_result res;
524
525 #ifdef CONFIG_IP_ROUTE_PERVASIVE
526                 if (nh->nh_flags&RTNH_F_PERVASIVE)
527                         return 0;
528 #endif
529                 if (nh->nh_flags&RTNH_F_ONLINK) {
530                         struct net_device *dev;
531
532                         if (cfg->fc_scope >= RT_SCOPE_LINK)
533                                 return -EINVAL;
534                         if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
535                                 return -EINVAL;
536                         if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
537                                 return -ENODEV;
538                         if (!(dev->flags&IFF_UP))
539                                 return -ENETDOWN;
540                         nh->nh_dev = dev;
541                         dev_hold(dev);
542                         nh->nh_scope = RT_SCOPE_LINK;
543                         return 0;
544                 }
545                 {
546                         struct flowi fl = {
547                                 .nl_u = {
548                                         .ip4_u = {
549                                                 .daddr = nh->nh_gw,
550                                                 .scope = cfg->fc_scope + 1,
551                                         },
552                                 },
553                                 .oif = nh->nh_oif,
554                         };
555
556                         /* It is not necessary, but requires a bit of thinking */
557                         if (fl.fl4_scope < RT_SCOPE_LINK)
558                                 fl.fl4_scope = RT_SCOPE_LINK;
559                         if ((err = fib_lookup(&fl, &res)) != 0)
560                                 return err;
561                 }
562                 err = -EINVAL;
563                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
564                         goto out;
565                 nh->nh_scope = res.scope;
566                 nh->nh_oif = FIB_RES_OIF(res);
567                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
568                         goto out;
569                 dev_hold(nh->nh_dev);
570                 err = -ENETDOWN;
571                 if (!(nh->nh_dev->flags & IFF_UP))
572                         goto out;
573                 err = 0;
574 out:
575                 fib_res_put(&res);
576                 return err;
577         } else {
578                 struct in_device *in_dev;
579
580                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
581                         return -EINVAL;
582
583                 in_dev = inetdev_by_index(nh->nh_oif);
584                 if (in_dev == NULL)
585                         return -ENODEV;
586                 if (!(in_dev->dev->flags&IFF_UP)) {
587                         in_dev_put(in_dev);
588                         return -ENETDOWN;
589                 }
590                 nh->nh_dev = in_dev->dev;
591                 dev_hold(nh->nh_dev);
592                 nh->nh_scope = RT_SCOPE_HOST;
593                 in_dev_put(in_dev);
594         }
595         return 0;
596 }
597
598 static inline unsigned int fib_laddr_hashfn(__be32 val)
599 {
600         unsigned int mask = (fib_hash_size - 1);
601
602         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
603 }
604
605 static struct hlist_head *fib_hash_alloc(int bytes)
606 {
607         if (bytes <= PAGE_SIZE)
608                 return kmalloc(bytes, GFP_KERNEL);
609         else
610                 return (struct hlist_head *)
611                         __get_free_pages(GFP_KERNEL, get_order(bytes));
612 }
613
614 static void fib_hash_free(struct hlist_head *hash, int bytes)
615 {
616         if (!hash)
617                 return;
618
619         if (bytes <= PAGE_SIZE)
620                 kfree(hash);
621         else
622                 free_pages((unsigned long) hash, get_order(bytes));
623 }
624
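/* Rehash all fib_info entries into the newly allocated info and
   local-address (prefsrc) hash tables of new_size buckets; the rehash
   runs under fib_info_lock and the old tables are freed afterwards. */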
625 static void fib_hash_move(struct hlist_head *new_info_hash,
626                           struct hlist_head *new_laddrhash,
627                           unsigned int new_size)
628 {
629         struct hlist_head *old_info_hash, *old_laddrhash;
630         unsigned int old_size = fib_hash_size;
631         unsigned int i, bytes;
632
633         spin_lock_bh(&fib_info_lock);
634         old_info_hash = fib_info_hash;
635         old_laddrhash = fib_info_laddrhash;
636         fib_hash_size = new_size;
637
638         for (i = 0; i < old_size; i++) {
639                 struct hlist_head *head = &fib_info_hash[i];
640                 struct hlist_node *node, *n;
641                 struct fib_info *fi;
642
643                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
644                         struct hlist_head *dest;
645                         unsigned int new_hash;
646
647                         hlist_del(&fi->fib_hash);
648
649                         new_hash = fib_info_hashfn(fi);
650                         dest = &new_info_hash[new_hash];
651                         hlist_add_head(&fi->fib_hash, dest);
652                 }
653         }
654         fib_info_hash = new_info_hash;
655
656         for (i = 0; i < old_size; i++) {
657                 struct hlist_head *lhead = &fib_info_laddrhash[i];
658                 struct hlist_node *node, *n;
659                 struct fib_info *fi;
660
661                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
662                         struct hlist_head *ldest;
663                         unsigned int new_hash;
664
665                         hlist_del(&fi->fib_lhash);
666
667                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
668                         ldest = &new_laddrhash[new_hash];
669                         hlist_add_head(&fi->fib_lhash, ldest);
670                 }
671         }
672         fib_info_laddrhash = new_laddrhash;
673
674         spin_unlock_bh(&fib_info_lock);
675
676         bytes = old_size * sizeof(struct hlist_head *);
677         fib_hash_free(old_info_hash, bytes);
678         fib_hash_free(old_laddrhash, bytes);
679 }
680
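/* Build a fib_info from a route configuration: allocate it, parse the
   metrics and nexthops, validate them (fib_check_nh), and either link
   the new entry into the hash tables or reuse an equivalent one already
   found by fib_find_info(). */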
681 struct fib_info *fib_create_info(struct fib_config *cfg)
682 {
683         int err;
684         struct fib_info *fi = NULL;
685         struct fib_info *ofi;
686         int nhs = 1;
687
688         /* Fast check to catch the most weird cases */
689         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
690                 goto err_inval;
691
692 #ifdef CONFIG_IP_ROUTE_MULTIPATH
693         if (cfg->fc_mp) {
694                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
695                 if (nhs == 0)
696                         goto err_inval;
697         }
698 #endif
699
700         err = -ENOBUFS;
701         if (fib_info_cnt >= fib_hash_size) {
702                 unsigned int new_size = fib_hash_size << 1;
703                 struct hlist_head *new_info_hash;
704                 struct hlist_head *new_laddrhash;
705                 unsigned int bytes;
706
707                 if (!new_size)
708                         new_size = 1;
709                 bytes = new_size * sizeof(struct hlist_head *);
710                 new_info_hash = fib_hash_alloc(bytes);
711                 new_laddrhash = fib_hash_alloc(bytes);
712                 if (!new_info_hash || !new_laddrhash) {
713                         fib_hash_free(new_info_hash, bytes);
714                         fib_hash_free(new_laddrhash, bytes);
715                 } else {
716                         memset(new_info_hash, 0, bytes);
717                         memset(new_laddrhash, 0, bytes);
718
719                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
720                 }
721
722                 if (!fib_hash_size)
723                         goto failure;
724         }
725
726         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
727         if (fi == NULL)
728                 goto failure;
729         fib_info_cnt++;
730
731         fi->fib_protocol = cfg->fc_protocol;
732         fi->fib_flags = cfg->fc_flags;
733         fi->fib_priority = cfg->fc_priority;
734         fi->fib_prefsrc = cfg->fc_prefsrc;
735
736         fi->fib_nhs = nhs;
737         change_nexthops(fi) {
738                 nh->nh_parent = fi;
739         } endfor_nexthops(fi)
740
741         if (cfg->fc_mx) {
742                 struct nlattr *nla;
743                 int remaining;
744
745                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
746                         int type = nla->nla_type;
747
748                         if (type) {
749                                 if (type > RTAX_MAX)
750                                         goto err_inval;
751                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
752                         }
753                 }
754         }
755
756         if (cfg->fc_mp) {
757 #ifdef CONFIG_IP_ROUTE_MULTIPATH
758                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
759                 if (err != 0)
760                         goto failure;
761                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
762                         goto err_inval;
763                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
764                         goto err_inval;
765 #ifdef CONFIG_NET_CLS_ROUTE
766                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
767                         goto err_inval;
768 #endif
769 #else
770                 goto err_inval;
771 #endif
772         } else {
773                 struct fib_nh *nh = fi->fib_nh;
774
775                 nh->nh_oif = cfg->fc_oif;
776                 nh->nh_gw = cfg->fc_gw;
777                 nh->nh_flags = cfg->fc_flags;
778 #ifdef CONFIG_NET_CLS_ROUTE
779                 nh->nh_tclassid = cfg->fc_flow;
780 #endif
781 #ifdef CONFIG_IP_ROUTE_MULTIPATH
782                 nh->nh_weight = 1;
783 #endif
784         }
785
786         if (fib_props[cfg->fc_type].error) {
787                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
788                         goto err_inval;
789                 goto link_it;
790         }
791
792         if (cfg->fc_scope > RT_SCOPE_HOST)
793                 goto err_inval;
794
795         if (cfg->fc_scope == RT_SCOPE_HOST) {
796                 struct fib_nh *nh = fi->fib_nh;
797
798                 /* Local address is added. */
799                 if (nhs != 1 || nh->nh_gw)
800                         goto err_inval;
801                 nh->nh_scope = RT_SCOPE_NOWHERE;
802                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
803                 err = -ENODEV;
804                 if (nh->nh_dev == NULL)
805                         goto failure;
806         } else {
807                 change_nexthops(fi) {
808                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
809                                 goto failure;
810                 } endfor_nexthops(fi)
811         }
812
813         if (fi->fib_prefsrc) {
814                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
815                     fi->fib_prefsrc != cfg->fc_dst)
816                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
817                                 goto err_inval;
818         }
819
820 link_it:
821         if ((ofi = fib_find_info(fi)) != NULL) {
822                 fi->fib_dead = 1;
823                 free_fib_info(fi);
824                 ofi->fib_treeref++;
825                 return ofi;
826         }
827
828         fi->fib_treeref++;
829         atomic_inc(&fi->fib_clntref);
830         spin_lock_bh(&fib_info_lock);
831         hlist_add_head(&fi->fib_hash,
832                        &fib_info_hash[fib_info_hashfn(fi)]);
833         if (fi->fib_prefsrc) {
834                 struct hlist_head *head;
835
836                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
837                 hlist_add_head(&fi->fib_lhash, head);
838         }
839         change_nexthops(fi) {
840                 struct hlist_head *head;
841                 unsigned int hash;
842
843                 if (!nh->nh_dev)
844                         continue;
845                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
846                 head = &fib_info_devhash[hash];
847                 hlist_add_head(&nh->nh_hash, head);
848         } endfor_nexthops(fi)
849         spin_unlock_bh(&fib_info_lock);
850         return fi;
851
852 err_inval:
853         err = -EINVAL;
854
855 failure:
856         if (fi) {
857                 fi->fib_dead = 1;
858                 free_fib_info(fi);
859         }
860
861         return ERR_PTR(err);
862 }
863
864 /* Note! fib_semantic_match intentionally uses RCU list functions. */
865 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
866                        struct fib_result *res, __be32 zone, __be32 mask,
867                         int prefixlen)
868 {
869         struct fib_alias *fa;
870         int nh_sel = 0;
871
872         list_for_each_entry_rcu(fa, head, fa_list) {
873                 int err;
874
875                 if (fa->fa_tos &&
876                     fa->fa_tos != flp->fl4_tos)
877                         continue;
878
879                 if (fa->fa_scope < flp->fl4_scope)
880                         continue;
881
882                 fa->fa_state |= FA_S_ACCESSED;
883
884                 err = fib_props[fa->fa_type].error;
885                 if (err == 0) {
886                         struct fib_info *fi = fa->fa_info;
887
888                         if (fi->fib_flags & RTNH_F_DEAD)
889                                 continue;
890
891                         switch (fa->fa_type) {
892                         case RTN_UNICAST:
893                         case RTN_LOCAL:
894                         case RTN_BROADCAST:
895                         case RTN_ANYCAST:
896                         case RTN_MULTICAST:
897                                 for_nexthops(fi) {
898                                         if (nh->nh_flags&RTNH_F_DEAD)
899                                                 continue;
900                                         if (!flp->oif || flp->oif == nh->nh_oif)
901                                                 break;
902                                 }
903 #ifdef CONFIG_IP_ROUTE_MULTIPATH
904                                 if (nhsel < fi->fib_nhs) {
905                                         nh_sel = nhsel;
906                                         goto out_fill_res;
907                                 }
908 #else
909                                 if (nhsel < 1) {
910                                         goto out_fill_res;
911                                 }
912 #endif
913                                 endfor_nexthops(fi);
914                                 continue;
915
916                         default:
917                                 printk(KERN_DEBUG "impossible 102\n");
918                                 return -EINVAL;
919                         }
920                 }
921                 return err;
922         }
923         return 1;
924
925 out_fill_res:
926         res->prefixlen = prefixlen;
927         res->nh_sel = nh_sel;
928         res->type = fa->fa_type;
929         res->scope = fa->fa_scope;
930         res->fi = fa->fa_info;
931         atomic_inc(&res->fi->fib_clntref);
932         return 0;
933 }
934
935 /* Find an appropriate source address for this destination */
936
937 __be32 __fib_res_prefsrc(struct fib_result *res)
938 {
939         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
940 }
941
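/* Fill a netlink route message (rtmsg plus attributes) for the given
   destination and fib_info, including a nested RTA_MULTIPATH block for
   multipath routes; returns -EMSGSIZE if the skb is too small. */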
942 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
943                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
944                   struct fib_info *fi, unsigned int flags)
945 {
946         struct nlmsghdr *nlh;
947         struct rtmsg *rtm;
948
949         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
950         if (nlh == NULL)
951                 return -EMSGSIZE;
952
953         rtm = nlmsg_data(nlh);
954         rtm->rtm_family = AF_INET;
955         rtm->rtm_dst_len = dst_len;
956         rtm->rtm_src_len = 0;
957         rtm->rtm_tos = tos;
958         rtm->rtm_table = tb_id;
959         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
960         rtm->rtm_type = type;
961         rtm->rtm_flags = fi->fib_flags;
962         rtm->rtm_scope = scope;
963         rtm->rtm_protocol = fi->fib_protocol;
964
965         if (rtm->rtm_dst_len)
966                 NLA_PUT_BE32(skb, RTA_DST, dst);
967
968         if (fi->fib_priority)
969                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
970
971         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
972                 goto nla_put_failure;
973
974         if (fi->fib_prefsrc)
975                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
976
977         if (fi->fib_nhs == 1) {
978                 if (fi->fib_nh->nh_gw)
979                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
980
981                 if (fi->fib_nh->nh_oif)
982                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
983 #ifdef CONFIG_NET_CLS_ROUTE
984                 if (fi->fib_nh[0].nh_tclassid)
985                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
986 #endif
987         }
988 #ifdef CONFIG_IP_ROUTE_MULTIPATH
989         if (fi->fib_nhs > 1) {
990                 struct rtnexthop *rtnh;
991                 struct nlattr *mp;
992
993                 mp = nla_nest_start(skb, RTA_MULTIPATH);
994                 if (mp == NULL)
995                         goto nla_put_failure;
996
997                 for_nexthops(fi) {
998                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
999                         if (rtnh == NULL)
1000                                 goto nla_put_failure;
1001
1002                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1003                         rtnh->rtnh_hops = nh->nh_weight - 1;
1004                         rtnh->rtnh_ifindex = nh->nh_oif;
1005
1006                         if (nh->nh_gw)
1007                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1008 #ifdef CONFIG_NET_CLS_ROUTE
1009                         if (nh->nh_tclassid)
1010                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1011 #endif
1012                         /* length of rtnetlink header + attributes */
1013                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1014                 } endfor_nexthops(fi);
1015
1016                 nla_nest_end(skb, mp);
1017         }
1018 #endif
1019         return nlmsg_end(skb, nlh);
1020
1021 nla_put_failure:
1022         nlmsg_cancel(skb, nlh);
1023         return -EMSGSIZE;
1024 }
1025
1026 /*
1027    Update the FIB if:
1028    - a local address disappeared -> we must delete all entries
1029      referring to it.
1030    - a device went down -> we must shut down all nexthops going via it.
1031  */
1032
1033 int fib_sync_down(__be32 local, struct net_device *dev, int force)
1034 {
1035         int ret = 0;
1036         int scope = RT_SCOPE_NOWHERE;
1037
1038         if (force)
1039                 scope = -1;
1040
1041         if (local && fib_info_laddrhash) {
1042                 unsigned int hash = fib_laddr_hashfn(local);
1043                 struct hlist_head *head = &fib_info_laddrhash[hash];
1044                 struct hlist_node *node;
1045                 struct fib_info *fi;
1046
1047                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1048                         if (fi->fib_prefsrc == local) {
1049                                 fi->fib_flags |= RTNH_F_DEAD;
1050                                 ret++;
1051                         }
1052                 }
1053         }
1054
1055         if (dev) {
1056                 struct fib_info *prev_fi = NULL;
1057                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1058                 struct hlist_head *head = &fib_info_devhash[hash];
1059                 struct hlist_node *node;
1060                 struct fib_nh *nh;
1061
1062                 hlist_for_each_entry(nh, node, head, nh_hash) {
1063                         struct fib_info *fi = nh->nh_parent;
1064                         int dead;
1065
1066                         BUG_ON(!fi->fib_nhs);
1067                         if (nh->nh_dev != dev || fi == prev_fi)
1068                                 continue;
1069                         prev_fi = fi;
1070                         dead = 0;
1071                         change_nexthops(fi) {
1072                                 if (nh->nh_flags&RTNH_F_DEAD)
1073                                         dead++;
1074                                 else if (nh->nh_dev == dev &&
1075                                          nh->nh_scope != scope) {
1076                                         nh->nh_flags |= RTNH_F_DEAD;
1077 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1078                                         spin_lock_bh(&fib_multipath_lock);
1079                                         fi->fib_power -= nh->nh_power;
1080                                         nh->nh_power = 0;
1081                                         spin_unlock_bh(&fib_multipath_lock);
1082 #endif
1083                                         dead++;
1084                                 }
1085 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1086                                 if (force > 1 && nh->nh_dev == dev) {
1087                                         dead = fi->fib_nhs;
1088                                         break;
1089                                 }
1090 #endif
1091                         } endfor_nexthops(fi)
1092                         if (dead == fi->fib_nhs) {
1093                                 fi->fib_flags |= RTNH_F_DEAD;
1094                                 ret++;
1095                         }
1096                 }
1097         }
1098
1099         return ret;
1100 }
1101
1102 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1103
1104 /*
1105    A dead device goes up: wake up its dead nexthops.
1106    This only makes sense for multipath routes.
1107  */
1108
1109 int fib_sync_up(struct net_device *dev)
1110 {
1111         struct fib_info *prev_fi;
1112         unsigned int hash;
1113         struct hlist_head *head;
1114         struct hlist_node *node;
1115         struct fib_nh *nh;
1116         int ret;
1117
1118         if (!(dev->flags&IFF_UP))
1119                 return 0;
1120
1121         prev_fi = NULL;
1122         hash = fib_devindex_hashfn(dev->ifindex);
1123         head = &fib_info_devhash[hash];
1124         ret = 0;
1125
1126         hlist_for_each_entry(nh, node, head, nh_hash) {
1127                 struct fib_info *fi = nh->nh_parent;
1128                 int alive;
1129
1130                 BUG_ON(!fi->fib_nhs);
1131                 if (nh->nh_dev != dev || fi == prev_fi)
1132                         continue;
1133
1134                 prev_fi = fi;
1135                 alive = 0;
1136                 change_nexthops(fi) {
1137                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1138                                 alive++;
1139                                 continue;
1140                         }
1141                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1142                                 continue;
1143                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1144                                 continue;
1145                         alive++;
1146                         spin_lock_bh(&fib_multipath_lock);
1147                         nh->nh_power = 0;
1148                         nh->nh_flags &= ~RTNH_F_DEAD;
1149                         spin_unlock_bh(&fib_multipath_lock);
1150                 } endfor_nexthops(fi)
1151
1152                 if (alive > 0) {
1153                         fi->fib_flags &= ~RTNH_F_DEAD;
1154                         ret++;
1155                 }
1156         }
1157
1158         return ret;
1159 }
1160
1161 /*
1162    The algorithm is suboptimal, but it provides really
1163    fair weighted route distribution.
1164  */
1165
1166 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1167 {
1168         struct fib_info *fi = res->fi;
1169         int w;
1170
1171         spin_lock_bh(&fib_multipath_lock);
1172         if (fi->fib_power <= 0) {
1173                 int power = 0;
1174                 change_nexthops(fi) {
1175                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1176                                 power += nh->nh_weight;
1177                                 nh->nh_power = nh->nh_weight;
1178                         }
1179                 } endfor_nexthops(fi);
1180                 fi->fib_power = power;
1181                 if (power <= 0) {
1182                         spin_unlock_bh(&fib_multipath_lock);
1183                         /* Race condition: route has just become dead. */
1184                         res->nh_sel = 0;
1185                         return;
1186                 }
1187         }
1188
1189
1190         /* w should be a random number in [0..fi->fib_power-1];
1191            jiffies is a pretty bad approximation of that.
1192          */
1193
1194         w = jiffies % fi->fib_power;
1195
1196         change_nexthops(fi) {
1197                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1198                         if ((w -= nh->nh_power) <= 0) {
1199                                 nh->nh_power--;
1200                                 fi->fib_power--;
1201                                 res->nh_sel = nhsel;
1202                                 spin_unlock_bh(&fib_multipath_lock);
1203                                 return;
1204                         }
1205                 }
1206         } endfor_nexthops(fi);
1207
1208         /* Race condition: route has just become dead. */
1209         res->nh_sel = 0;
1210         spin_unlock_bh(&fib_multipath_lock);
1211 }
1212 #endif