Merge commit 'kumar/next' into next
[linux-2.6] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45
46 #include "fib_lookup.h"
47
48 static DEFINE_SPINLOCK(fib_info_lock);
49 static struct hlist_head *fib_info_hash;
50 static struct hlist_head *fib_info_laddrhash;
51 static unsigned int fib_hash_size;
52 static unsigned int fib_info_cnt;
53
54 #define DEVINDEX_HASHBITS 8
55 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
56 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace, declare the loop
 * variables "nhsel" (index) and "nh" (current nexthop; const in the
 * for_nexthops flavour) and start a for loop.  The statement that follows
 * the macro is the loop body, and endfor_nexthops(fi) closes the brace
 * opened by the macro.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {                                              \
        int nhsel;                                                      \
        const struct fib_nh *nh;                                        \
        for (nhsel = 0, nh = (fi)->fib_nh;                              \
             nhsel < (fi)->fib_nhs;                                     \
             nh++, nhsel++)

#define change_nexthops(fi) {                                           \
        int nhsel;                                                      \
        struct fib_nh *nh;                                              \
        for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);           \
             nhsel < (fi)->fib_nhs;                                     \
             nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath the loop body runs exactly once, for the first
 * nexthop.  The hope is that gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) {                                              \
        int nhsel = 0;                                                  \
        const struct fib_nh *nh = (fi)->fib_nh;                         \
        for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {                                           \
        int nhsel = 0;                                                  \
        struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);            \
        for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
81
82
83 static const struct
84 {
85         int     error;
86         u8      scope;
87 } fib_props[RTN_MAX + 1] = {
88         {
89                 .error  = 0,
90                 .scope  = RT_SCOPE_NOWHERE,
91         },      /* RTN_UNSPEC */
92         {
93                 .error  = 0,
94                 .scope  = RT_SCOPE_UNIVERSE,
95         },      /* RTN_UNICAST */
96         {
97                 .error  = 0,
98                 .scope  = RT_SCOPE_HOST,
99         },      /* RTN_LOCAL */
100         {
101                 .error  = 0,
102                 .scope  = RT_SCOPE_LINK,
103         },      /* RTN_BROADCAST */
104         {
105                 .error  = 0,
106                 .scope  = RT_SCOPE_LINK,
107         },      /* RTN_ANYCAST */
108         {
109                 .error  = 0,
110                 .scope  = RT_SCOPE_UNIVERSE,
111         },      /* RTN_MULTICAST */
112         {
113                 .error  = -EINVAL,
114                 .scope  = RT_SCOPE_UNIVERSE,
115         },      /* RTN_BLACKHOLE */
116         {
117                 .error  = -EHOSTUNREACH,
118                 .scope  = RT_SCOPE_UNIVERSE,
119         },      /* RTN_UNREACHABLE */
120         {
121                 .error  = -EACCES,
122                 .scope  = RT_SCOPE_UNIVERSE,
123         },      /* RTN_PROHIBIT */
124         {
125                 .error  = -EAGAIN,
126                 .scope  = RT_SCOPE_UNIVERSE,
127         },      /* RTN_THROW */
128         {
129                 .error  = -EINVAL,
130                 .scope  = RT_SCOPE_NOWHERE,
131         },      /* RTN_NAT */
132         {
133                 .error  = -EINVAL,
134                 .scope  = RT_SCOPE_NOWHERE,
135         },      /* RTN_XRESOLVE */
136 };
137
138
139 /* Release a nexthop info record */
140
141 void free_fib_info(struct fib_info *fi)
142 {
143         if (fi->fib_dead == 0) {
144                 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
145                 return;
146         }
147         change_nexthops(fi) {
148                 if (nh->nh_dev)
149                         dev_put(nh->nh_dev);
150                 nh->nh_dev = NULL;
151         } endfor_nexthops(fi);
152         fib_info_cnt--;
153         release_net(fi->fib_net);
154         kfree(fi);
155 }
156
157 void fib_release_info(struct fib_info *fi)
158 {
159         spin_lock_bh(&fib_info_lock);
160         if (fi && --fi->fib_treeref == 0) {
161                 hlist_del(&fi->fib_hash);
162                 if (fi->fib_prefsrc)
163                         hlist_del(&fi->fib_lhash);
164                 change_nexthops(fi) {
165                         if (!nh->nh_dev)
166                                 continue;
167                         hlist_del(&nh->nh_hash);
168                 } endfor_nexthops(fi)
169                 fi->fib_dead = 1;
170                 fib_info_put(fi);
171         }
172         spin_unlock_bh(&fib_info_lock);
173 }
174
175 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176 {
177         const struct fib_nh *onh = ofi->fib_nh;
178
179         for_nexthops(fi) {
180                 if (nh->nh_oif != onh->nh_oif ||
181                     nh->nh_gw  != onh->nh_gw ||
182                     nh->nh_scope != onh->nh_scope ||
183 #ifdef CONFIG_IP_ROUTE_MULTIPATH
184                     nh->nh_weight != onh->nh_weight ||
185 #endif
186 #ifdef CONFIG_NET_CLS_ROUTE
187                     nh->nh_tclassid != onh->nh_tclassid ||
188 #endif
189                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
190                         return -1;
191                 onh++;
192         } endfor_nexthops(fi);
193         return 0;
194 }
195
196 static inline unsigned int fib_devindex_hashfn(unsigned int val)
197 {
198         unsigned int mask = DEVINDEX_HASHSIZE - 1;
199
200         return (val ^
201                 (val >> DEVINDEX_HASHBITS) ^
202                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
203 }
204
205 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
206 {
207         unsigned int mask = (fib_hash_size - 1);
208         unsigned int val = fi->fib_nhs;
209
210         val ^= fi->fib_protocol;
211         val ^= (__force u32)fi->fib_prefsrc;
212         val ^= fi->fib_priority;
213         for_nexthops(fi) {
214                 val ^= fib_devindex_hashfn(nh->nh_oif);
215         } endfor_nexthops(fi)
216
217         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
218 }
219
220 static struct fib_info *fib_find_info(const struct fib_info *nfi)
221 {
222         struct hlist_head *head;
223         struct hlist_node *node;
224         struct fib_info *fi;
225         unsigned int hash;
226
227         hash = fib_info_hashfn(nfi);
228         head = &fib_info_hash[hash];
229
230         hlist_for_each_entry(fi, node, head, fib_hash) {
231                 if (fi->fib_net != nfi->fib_net)
232                         continue;
233                 if (fi->fib_nhs != nfi->fib_nhs)
234                         continue;
235                 if (nfi->fib_protocol == fi->fib_protocol &&
236                     nfi->fib_prefsrc == fi->fib_prefsrc &&
237                     nfi->fib_priority == fi->fib_priority &&
238                     memcmp(nfi->fib_metrics, fi->fib_metrics,
239                            sizeof(fi->fib_metrics)) == 0 &&
240                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
241                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242                         return fi;
243         }
244
245         return NULL;
246 }
247
248 /* Check, that the gateway is already configured.
249    Used only by redirect accept routine.
250  */
251
252 int ip_fib_check_default(__be32 gw, struct net_device *dev)
253 {
254         struct hlist_head *head;
255         struct hlist_node *node;
256         struct fib_nh *nh;
257         unsigned int hash;
258
259         spin_lock(&fib_info_lock);
260
261         hash = fib_devindex_hashfn(dev->ifindex);
262         head = &fib_info_devhash[hash];
263         hlist_for_each_entry(nh, node, head, nh_hash) {
264                 if (nh->nh_dev == dev &&
265                     nh->nh_gw == gw &&
266                     !(nh->nh_flags&RTNH_F_DEAD)) {
267                         spin_unlock(&fib_info_lock);
268                         return 0;
269                 }
270         }
271
272         spin_unlock(&fib_info_lock);
273
274         return -1;
275 }
276
277 static inline size_t fib_nlmsg_size(struct fib_info *fi)
278 {
279         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280                          + nla_total_size(4) /* RTA_TABLE */
281                          + nla_total_size(4) /* RTA_DST */
282                          + nla_total_size(4) /* RTA_PRIORITY */
283                          + nla_total_size(4); /* RTA_PREFSRC */
284
285         /* space for nested metrics */
286         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287
288         if (fi->fib_nhs) {
289                 /* Also handles the special case fib_nhs == 1 */
290
291                 /* each nexthop is packed in an attribute */
292                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293
294                 /* may contain flow and gateway attribute */
295                 nhsize += 2 * nla_total_size(4);
296
297                 /* all nexthops are packed in a nested attribute */
298                 payload += nla_total_size(fi->fib_nhs * nhsize);
299         }
300
301         return payload;
302 }
303
304 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
305                int dst_len, u32 tb_id, struct nl_info *info,
306                unsigned int nlm_flags)
307 {
308         struct sk_buff *skb;
309         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
310         int err = -ENOBUFS;
311
312         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
313         if (skb == NULL)
314                 goto errout;
315
316         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
317                             fa->fa_type, fa->fa_scope, key, dst_len,
318                             fa->fa_tos, fa->fa_info, nlm_flags);
319         if (err < 0) {
320                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
321                 WARN_ON(err == -EMSGSIZE);
322                 kfree_skb(skb);
323                 goto errout;
324         }
325         err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326                           info->nlh, GFP_KERNEL);
327 errout:
328         if (err < 0)
329                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
330 }
331
332 /* Return the first fib alias matching TOS with
333  * priority less than or equal to PRIO.
334  */
335 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
336 {
337         if (fah) {
338                 struct fib_alias *fa;
339                 list_for_each_entry(fa, fah, fa_list) {
340                         if (fa->fa_tos > tos)
341                                 continue;
342                         if (fa->fa_info->fib_priority >= prio ||
343                             fa->fa_tos < tos)
344                                 return fa;
345                 }
346         }
347         return NULL;
348 }
349
350 int fib_detect_death(struct fib_info *fi, int order,
351                      struct fib_info **last_resort, int *last_idx, int dflt)
352 {
353         struct neighbour *n;
354         int state = NUD_NONE;
355
356         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
357         if (n) {
358                 state = n->nud_state;
359                 neigh_release(n);
360         }
361         if (state == NUD_REACHABLE)
362                 return 0;
363         if ((state&NUD_VALID) && order != dflt)
364                 return 0;
365         if ((state&NUD_VALID) ||
366             (*last_idx<0 && order > dflt)) {
367                 *last_resort = fi;
368                 *last_idx = order;
369         }
370         return 1;
371 }
372
373 #ifdef CONFIG_IP_ROUTE_MULTIPATH
374
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting at
 * @rtnh.  Returns 0 when bytes are left over after the last well-formed
 * entry, since leftover implies an invalid nexthop configuration.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int count;

        for (count = 0; rtnh_ok(rtnh, remaining); count++)
                rtnh = rtnh_next(rtnh, &remaining);

        if (remaining > 0)
                return 0;
        return count;
}
387
388 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
389                        int remaining, struct fib_config *cfg)
390 {
391         change_nexthops(fi) {
392                 int attrlen;
393
394                 if (!rtnh_ok(rtnh, remaining))
395                         return -EINVAL;
396
397                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
398                 nh->nh_oif = rtnh->rtnh_ifindex;
399                 nh->nh_weight = rtnh->rtnh_hops + 1;
400
401                 attrlen = rtnh_attrlen(rtnh);
402                 if (attrlen > 0) {
403                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
404
405                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
406                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
407 #ifdef CONFIG_NET_CLS_ROUTE
408                         nla = nla_find(attrs, attrlen, RTA_FLOW);
409                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
410 #endif
411                 }
412
413                 rtnh = rtnh_next(rtnh, &remaining);
414         } endfor_nexthops(fi);
415
416         return 0;
417 }
418
419 #endif
420
421 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
422 {
423 #ifdef CONFIG_IP_ROUTE_MULTIPATH
424         struct rtnexthop *rtnh;
425         int remaining;
426 #endif
427
428         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
429                 return 1;
430
431         if (cfg->fc_oif || cfg->fc_gw) {
432                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
433                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
434                         return 0;
435                 return 1;
436         }
437
438 #ifdef CONFIG_IP_ROUTE_MULTIPATH
439         if (cfg->fc_mp == NULL)
440                 return 0;
441
442         rtnh = cfg->fc_mp;
443         remaining = cfg->fc_mp_len;
444
445         for_nexthops(fi) {
446                 int attrlen;
447
448                 if (!rtnh_ok(rtnh, remaining))
449                         return -EINVAL;
450
451                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
452                         return 1;
453
454                 attrlen = rtnh_attrlen(rtnh);
455                 if (attrlen < 0) {
456                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
457
458                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
459                         if (nla && nla_get_be32(nla) != nh->nh_gw)
460                                 return 1;
461 #ifdef CONFIG_NET_CLS_ROUTE
462                         nla = nla_find(attrs, attrlen, RTA_FLOW);
463                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
464                                 return 1;
465 #endif
466                 }
467
468                 rtnh = rtnh_next(rtnh, &remaining);
469         } endfor_nexthops(fi);
470 #endif
471         return 0;
472 }
473
474
475 /*
476    Picture
477    -------
478
479    Semantics of nexthop are very messy for historical reasons.
480    We have to take into account, that:
481    a) gateway can be actually local interface address,
482       so that gatewayed route is direct.
483    b) gateway must be on-link address, possibly
484       described not by an ifaddr, but also by a direct route.
485    c) If both gateway and interface are specified, they should not
486       contradict.
487    d) If we use tunnel routes, gateway could be not on-link.
488
489    Attempt to reconcile all of these (alas, self-contradictory) conditions
490    results in pretty ugly and hairy code with obscure logic.
491
492    I chose to generalize it instead, so that the size
493    of code does not increase practically, but it becomes
494    much more general.
495    Every prefix is assigned a "scope" value: "host" is local address,
496    "link" is direct route,
497    [ ... "site" ... "interior" ... ]
498    and "universe" is true gateway route with global meaning.
499
500    Every prefix refers to a set of "nexthop"s (gw, oif),
501    where gw must have narrower scope. This recursion stops
502    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
503    which means that gw is forced to be on link.
504
505    Code is still hairy, but now it is apparently logically
506    consistent and very flexible. E.g., as a by-product it allows
507    independent exterior and interior routing processes to coexist
508    in peace.
509
510    Normally it looks as following.
511
512    {universe prefix}  -> (gw, oif) [scope link]
513                           |
514                           |-> {link prefix} -> (gw, oif) [scope local]
515                                                 |
516                                                 |-> {local prefix} (terminal node)
517  */
518
519 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
520                         struct fib_nh *nh)
521 {
522         int err;
523         struct net *net;
524
525         net = cfg->fc_nlinfo.nl_net;
526         if (nh->nh_gw) {
527                 struct fib_result res;
528
529 #ifdef CONFIG_IP_ROUTE_PERVASIVE
530                 if (nh->nh_flags&RTNH_F_PERVASIVE)
531                         return 0;
532 #endif
533                 if (nh->nh_flags&RTNH_F_ONLINK) {
534                         struct net_device *dev;
535
536                         if (cfg->fc_scope >= RT_SCOPE_LINK)
537                                 return -EINVAL;
538                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
539                                 return -EINVAL;
540                         if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
541                                 return -ENODEV;
542                         if (!(dev->flags&IFF_UP))
543                                 return -ENETDOWN;
544                         nh->nh_dev = dev;
545                         dev_hold(dev);
546                         nh->nh_scope = RT_SCOPE_LINK;
547                         return 0;
548                 }
549                 {
550                         struct flowi fl = {
551                                 .nl_u = {
552                                         .ip4_u = {
553                                                 .daddr = nh->nh_gw,
554                                                 .scope = cfg->fc_scope + 1,
555                                         },
556                                 },
557                                 .oif = nh->nh_oif,
558                         };
559
560                         /* It is not necessary, but requires a bit of thinking */
561                         if (fl.fl4_scope < RT_SCOPE_LINK)
562                                 fl.fl4_scope = RT_SCOPE_LINK;
563                         if ((err = fib_lookup(net, &fl, &res)) != 0)
564                                 return err;
565                 }
566                 err = -EINVAL;
567                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
568                         goto out;
569                 nh->nh_scope = res.scope;
570                 nh->nh_oif = FIB_RES_OIF(res);
571                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
572                         goto out;
573                 dev_hold(nh->nh_dev);
574                 err = -ENETDOWN;
575                 if (!(nh->nh_dev->flags & IFF_UP))
576                         goto out;
577                 err = 0;
578 out:
579                 fib_res_put(&res);
580                 return err;
581         } else {
582                 struct in_device *in_dev;
583
584                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
585                         return -EINVAL;
586
587                 in_dev = inetdev_by_index(net, nh->nh_oif);
588                 if (in_dev == NULL)
589                         return -ENODEV;
590                 if (!(in_dev->dev->flags&IFF_UP)) {
591                         in_dev_put(in_dev);
592                         return -ENETDOWN;
593                 }
594                 nh->nh_dev = in_dev->dev;
595                 dev_hold(nh->nh_dev);
596                 nh->nh_scope = RT_SCOPE_HOST;
597                 in_dev_put(in_dev);
598         }
599         return 0;
600 }
601
602 static inline unsigned int fib_laddr_hashfn(__be32 val)
603 {
604         unsigned int mask = (fib_hash_size - 1);
605
606         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
607 }
608
609 static struct hlist_head *fib_hash_alloc(int bytes)
610 {
611         if (bytes <= PAGE_SIZE)
612                 return kzalloc(bytes, GFP_KERNEL);
613         else
614                 return (struct hlist_head *)
615                         __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
616 }
617
618 static void fib_hash_free(struct hlist_head *hash, int bytes)
619 {
620         if (!hash)
621                 return;
622
623         if (bytes <= PAGE_SIZE)
624                 kfree(hash);
625         else
626                 free_pages((unsigned long) hash, get_order(bytes));
627 }
628
629 static void fib_hash_move(struct hlist_head *new_info_hash,
630                           struct hlist_head *new_laddrhash,
631                           unsigned int new_size)
632 {
633         struct hlist_head *old_info_hash, *old_laddrhash;
634         unsigned int old_size = fib_hash_size;
635         unsigned int i, bytes;
636
637         spin_lock_bh(&fib_info_lock);
638         old_info_hash = fib_info_hash;
639         old_laddrhash = fib_info_laddrhash;
640         fib_hash_size = new_size;
641
642         for (i = 0; i < old_size; i++) {
643                 struct hlist_head *head = &fib_info_hash[i];
644                 struct hlist_node *node, *n;
645                 struct fib_info *fi;
646
647                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
648                         struct hlist_head *dest;
649                         unsigned int new_hash;
650
651                         hlist_del(&fi->fib_hash);
652
653                         new_hash = fib_info_hashfn(fi);
654                         dest = &new_info_hash[new_hash];
655                         hlist_add_head(&fi->fib_hash, dest);
656                 }
657         }
658         fib_info_hash = new_info_hash;
659
660         for (i = 0; i < old_size; i++) {
661                 struct hlist_head *lhead = &fib_info_laddrhash[i];
662                 struct hlist_node *node, *n;
663                 struct fib_info *fi;
664
665                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
666                         struct hlist_head *ldest;
667                         unsigned int new_hash;
668
669                         hlist_del(&fi->fib_lhash);
670
671                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
672                         ldest = &new_laddrhash[new_hash];
673                         hlist_add_head(&fi->fib_lhash, ldest);
674                 }
675         }
676         fib_info_laddrhash = new_laddrhash;
677
678         spin_unlock_bh(&fib_info_lock);
679
680         bytes = old_size * sizeof(struct hlist_head *);
681         fib_hash_free(old_info_hash, bytes);
682         fib_hash_free(old_laddrhash, bytes);
683 }
684
685 struct fib_info *fib_create_info(struct fib_config *cfg)
686 {
687         int err;
688         struct fib_info *fi = NULL;
689         struct fib_info *ofi;
690         int nhs = 1;
691         struct net *net = cfg->fc_nlinfo.nl_net;
692
693         /* Fast check to catch the most weird cases */
694         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
695                 goto err_inval;
696
697 #ifdef CONFIG_IP_ROUTE_MULTIPATH
698         if (cfg->fc_mp) {
699                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
700                 if (nhs == 0)
701                         goto err_inval;
702         }
703 #endif
704
705         err = -ENOBUFS;
706         if (fib_info_cnt >= fib_hash_size) {
707                 unsigned int new_size = fib_hash_size << 1;
708                 struct hlist_head *new_info_hash;
709                 struct hlist_head *new_laddrhash;
710                 unsigned int bytes;
711
712                 if (!new_size)
713                         new_size = 1;
714                 bytes = new_size * sizeof(struct hlist_head *);
715                 new_info_hash = fib_hash_alloc(bytes);
716                 new_laddrhash = fib_hash_alloc(bytes);
717                 if (!new_info_hash || !new_laddrhash) {
718                         fib_hash_free(new_info_hash, bytes);
719                         fib_hash_free(new_laddrhash, bytes);
720                 } else
721                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
722
723                 if (!fib_hash_size)
724                         goto failure;
725         }
726
727         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
728         if (fi == NULL)
729                 goto failure;
730         fib_info_cnt++;
731
732         fi->fib_net = hold_net(net);
733         fi->fib_protocol = cfg->fc_protocol;
734         fi->fib_flags = cfg->fc_flags;
735         fi->fib_priority = cfg->fc_priority;
736         fi->fib_prefsrc = cfg->fc_prefsrc;
737
738         fi->fib_nhs = nhs;
739         change_nexthops(fi) {
740                 nh->nh_parent = fi;
741         } endfor_nexthops(fi)
742
743         if (cfg->fc_mx) {
744                 struct nlattr *nla;
745                 int remaining;
746
747                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
748                         int type = nla_type(nla);
749
750                         if (type) {
751                                 if (type > RTAX_MAX)
752                                         goto err_inval;
753                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
754                         }
755                 }
756         }
757
758         if (cfg->fc_mp) {
759 #ifdef CONFIG_IP_ROUTE_MULTIPATH
760                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
761                 if (err != 0)
762                         goto failure;
763                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
764                         goto err_inval;
765                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
766                         goto err_inval;
767 #ifdef CONFIG_NET_CLS_ROUTE
768                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
769                         goto err_inval;
770 #endif
771 #else
772                 goto err_inval;
773 #endif
774         } else {
775                 struct fib_nh *nh = fi->fib_nh;
776
777                 nh->nh_oif = cfg->fc_oif;
778                 nh->nh_gw = cfg->fc_gw;
779                 nh->nh_flags = cfg->fc_flags;
780 #ifdef CONFIG_NET_CLS_ROUTE
781                 nh->nh_tclassid = cfg->fc_flow;
782 #endif
783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
784                 nh->nh_weight = 1;
785 #endif
786         }
787
788         if (fib_props[cfg->fc_type].error) {
789                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
790                         goto err_inval;
791                 goto link_it;
792         }
793
794         if (cfg->fc_scope > RT_SCOPE_HOST)
795                 goto err_inval;
796
797         if (cfg->fc_scope == RT_SCOPE_HOST) {
798                 struct fib_nh *nh = fi->fib_nh;
799
800                 /* Local address is added. */
801                 if (nhs != 1 || nh->nh_gw)
802                         goto err_inval;
803                 nh->nh_scope = RT_SCOPE_NOWHERE;
804                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
805                 err = -ENODEV;
806                 if (nh->nh_dev == NULL)
807                         goto failure;
808         } else {
809                 change_nexthops(fi) {
810                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
811                                 goto failure;
812                 } endfor_nexthops(fi)
813         }
814
815         if (fi->fib_prefsrc) {
816                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
817                     fi->fib_prefsrc != cfg->fc_dst)
818                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
819                                 goto err_inval;
820         }
821
822 link_it:
823         if ((ofi = fib_find_info(fi)) != NULL) {
824                 fi->fib_dead = 1;
825                 free_fib_info(fi);
826                 ofi->fib_treeref++;
827                 return ofi;
828         }
829
830         fi->fib_treeref++;
831         atomic_inc(&fi->fib_clntref);
832         spin_lock_bh(&fib_info_lock);
833         hlist_add_head(&fi->fib_hash,
834                        &fib_info_hash[fib_info_hashfn(fi)]);
835         if (fi->fib_prefsrc) {
836                 struct hlist_head *head;
837
838                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
839                 hlist_add_head(&fi->fib_lhash, head);
840         }
841         change_nexthops(fi) {
842                 struct hlist_head *head;
843                 unsigned int hash;
844
845                 if (!nh->nh_dev)
846                         continue;
847                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
848                 head = &fib_info_devhash[hash];
849                 hlist_add_head(&nh->nh_hash, head);
850         } endfor_nexthops(fi)
851         spin_unlock_bh(&fib_info_lock);
852         return fi;
853
854 err_inval:
855         err = -EINVAL;
856
857 failure:
858         if (fi) {
859                 fi->fib_dead = 1;
860                 free_fib_info(fi);
861         }
862
863         return ERR_PTR(err);
864 }
865
866 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
867 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
868                        struct fib_result *res, __be32 zone, __be32 mask,
869                         int prefixlen)
870 {
871         struct fib_alias *fa;
872         int nh_sel = 0;
873
874         list_for_each_entry_rcu(fa, head, fa_list) {
875                 int err;
876
877                 if (fa->fa_tos &&
878                     fa->fa_tos != flp->fl4_tos)
879                         continue;
880
881                 if (fa->fa_scope < flp->fl4_scope)
882                         continue;
883
884                 fa->fa_state |= FA_S_ACCESSED;
885
886                 err = fib_props[fa->fa_type].error;
887                 if (err == 0) {
888                         struct fib_info *fi = fa->fa_info;
889
890                         if (fi->fib_flags & RTNH_F_DEAD)
891                                 continue;
892
893                         switch (fa->fa_type) {
894                         case RTN_UNICAST:
895                         case RTN_LOCAL:
896                         case RTN_BROADCAST:
897                         case RTN_ANYCAST:
898                         case RTN_MULTICAST:
899                                 for_nexthops(fi) {
900                                         if (nh->nh_flags&RTNH_F_DEAD)
901                                                 continue;
902                                         if (!flp->oif || flp->oif == nh->nh_oif)
903                                                 break;
904                                 }
905 #ifdef CONFIG_IP_ROUTE_MULTIPATH
906                                 if (nhsel < fi->fib_nhs) {
907                                         nh_sel = nhsel;
908                                         goto out_fill_res;
909                                 }
910 #else
911                                 if (nhsel < 1) {
912                                         goto out_fill_res;
913                                 }
914 #endif
915                                 endfor_nexthops(fi);
916                                 continue;
917
918                         default:
919                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
920                                         fa->fa_type);
921                                 return -EINVAL;
922                         }
923                 }
924                 return err;
925         }
926         return 1;
927
928 out_fill_res:
929         res->prefixlen = prefixlen;
930         res->nh_sel = nh_sel;
931         res->type = fa->fa_type;
932         res->scope = fa->fa_scope;
933         res->fi = fa->fa_info;
934         atomic_inc(&res->fi->fib_clntref);
935         return 0;
936 }
937
938 /* Find appropriate source address to this destination */
939
940 __be32 __fib_res_prefsrc(struct fib_result *res)
941 {
942         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
943 }
944
945 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
946                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
947                   struct fib_info *fi, unsigned int flags)
948 {
949         struct nlmsghdr *nlh;
950         struct rtmsg *rtm;
951
952         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
953         if (nlh == NULL)
954                 return -EMSGSIZE;
955
956         rtm = nlmsg_data(nlh);
957         rtm->rtm_family = AF_INET;
958         rtm->rtm_dst_len = dst_len;
959         rtm->rtm_src_len = 0;
960         rtm->rtm_tos = tos;
961         if (tb_id < 256)
962                 rtm->rtm_table = tb_id;
963         else
964                 rtm->rtm_table = RT_TABLE_COMPAT;
965         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
966         rtm->rtm_type = type;
967         rtm->rtm_flags = fi->fib_flags;
968         rtm->rtm_scope = scope;
969         rtm->rtm_protocol = fi->fib_protocol;
970
971         if (rtm->rtm_dst_len)
972                 NLA_PUT_BE32(skb, RTA_DST, dst);
973
974         if (fi->fib_priority)
975                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
976
977         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
978                 goto nla_put_failure;
979
980         if (fi->fib_prefsrc)
981                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
982
983         if (fi->fib_nhs == 1) {
984                 if (fi->fib_nh->nh_gw)
985                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
986
987                 if (fi->fib_nh->nh_oif)
988                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
989 #ifdef CONFIG_NET_CLS_ROUTE
990                 if (fi->fib_nh[0].nh_tclassid)
991                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
992 #endif
993         }
994 #ifdef CONFIG_IP_ROUTE_MULTIPATH
995         if (fi->fib_nhs > 1) {
996                 struct rtnexthop *rtnh;
997                 struct nlattr *mp;
998
999                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1000                 if (mp == NULL)
1001                         goto nla_put_failure;
1002
1003                 for_nexthops(fi) {
1004                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1005                         if (rtnh == NULL)
1006                                 goto nla_put_failure;
1007
1008                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1009                         rtnh->rtnh_hops = nh->nh_weight - 1;
1010                         rtnh->rtnh_ifindex = nh->nh_oif;
1011
1012                         if (nh->nh_gw)
1013                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1014 #ifdef CONFIG_NET_CLS_ROUTE
1015                         if (nh->nh_tclassid)
1016                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1017 #endif
1018                         /* length of rtnetlink header + attributes */
1019                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1020                 } endfor_nexthops(fi);
1021
1022                 nla_nest_end(skb, mp);
1023         }
1024 #endif
1025         return nlmsg_end(skb, nlh);
1026
1027 nla_put_failure:
1028         nlmsg_cancel(skb, nlh);
1029         return -EMSGSIZE;
1030 }
1031
1032 /*
1033    Update FIB if:
1034    - local address disappeared -> we must delete all the entries
1035      referring to it.
1036    - device went down -> we must shutdown all nexthops going via it.
1037  */
1038 int fib_sync_down_addr(struct net *net, __be32 local)
1039 {
1040         int ret = 0;
1041         unsigned int hash = fib_laddr_hashfn(local);
1042         struct hlist_head *head = &fib_info_laddrhash[hash];
1043         struct hlist_node *node;
1044         struct fib_info *fi;
1045
1046         if (fib_info_laddrhash == NULL || local == 0)
1047                 return 0;
1048
1049         hlist_for_each_entry(fi, node, head, fib_lhash) {
1050                 if (fi->fib_net != net)
1051                         continue;
1052                 if (fi->fib_prefsrc == local) {
1053                         fi->fib_flags |= RTNH_F_DEAD;
1054                         ret++;
1055                 }
1056         }
1057         return ret;
1058 }
1059
1060 int fib_sync_down_dev(struct net_device *dev, int force)
1061 {
1062         int ret = 0;
1063         int scope = RT_SCOPE_NOWHERE;
1064         struct fib_info *prev_fi = NULL;
1065         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1066         struct hlist_head *head = &fib_info_devhash[hash];
1067         struct hlist_node *node;
1068         struct fib_nh *nh;
1069
1070         if (force)
1071                 scope = -1;
1072
1073         hlist_for_each_entry(nh, node, head, nh_hash) {
1074                 struct fib_info *fi = nh->nh_parent;
1075                 int dead;
1076
1077                 BUG_ON(!fi->fib_nhs);
1078                 if (nh->nh_dev != dev || fi == prev_fi)
1079                         continue;
1080                 prev_fi = fi;
1081                 dead = 0;
1082                 change_nexthops(fi) {
1083                         if (nh->nh_flags&RTNH_F_DEAD)
1084                                 dead++;
1085                         else if (nh->nh_dev == dev &&
1086                                         nh->nh_scope != scope) {
1087                                 nh->nh_flags |= RTNH_F_DEAD;
1088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1089                                 spin_lock_bh(&fib_multipath_lock);
1090                                 fi->fib_power -= nh->nh_power;
1091                                 nh->nh_power = 0;
1092                                 spin_unlock_bh(&fib_multipath_lock);
1093 #endif
1094                                 dead++;
1095                         }
1096 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1097                         if (force > 1 && nh->nh_dev == dev) {
1098                                 dead = fi->fib_nhs;
1099                                 break;
1100                         }
1101 #endif
1102                 } endfor_nexthops(fi)
1103                 if (dead == fi->fib_nhs) {
1104                         fi->fib_flags |= RTNH_F_DEAD;
1105                         ret++;
1106                 }
1107         }
1108
1109         return ret;
1110 }
1111
1112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1113
1114 /*
1115    Dead device goes up. We wake up dead nexthops.
1116    It takes sense only on multipath routes.
1117  */
1118
1119 int fib_sync_up(struct net_device *dev)
1120 {
1121         struct fib_info *prev_fi;
1122         unsigned int hash;
1123         struct hlist_head *head;
1124         struct hlist_node *node;
1125         struct fib_nh *nh;
1126         int ret;
1127
1128         if (!(dev->flags&IFF_UP))
1129                 return 0;
1130
1131         prev_fi = NULL;
1132         hash = fib_devindex_hashfn(dev->ifindex);
1133         head = &fib_info_devhash[hash];
1134         ret = 0;
1135
1136         hlist_for_each_entry(nh, node, head, nh_hash) {
1137                 struct fib_info *fi = nh->nh_parent;
1138                 int alive;
1139
1140                 BUG_ON(!fi->fib_nhs);
1141                 if (nh->nh_dev != dev || fi == prev_fi)
1142                         continue;
1143
1144                 prev_fi = fi;
1145                 alive = 0;
1146                 change_nexthops(fi) {
1147                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1148                                 alive++;
1149                                 continue;
1150                         }
1151                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1152                                 continue;
1153                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1154                                 continue;
1155                         alive++;
1156                         spin_lock_bh(&fib_multipath_lock);
1157                         nh->nh_power = 0;
1158                         nh->nh_flags &= ~RTNH_F_DEAD;
1159                         spin_unlock_bh(&fib_multipath_lock);
1160                 } endfor_nexthops(fi)
1161
1162                 if (alive > 0) {
1163                         fi->fib_flags &= ~RTNH_F_DEAD;
1164                         ret++;
1165                 }
1166         }
1167
1168         return ret;
1169 }
1170
1171 /*
1172    The algorithm is suboptimal, but it provides really
1173    fair weighted route distribution.
1174  */
1175
1176 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177 {
1178         struct fib_info *fi = res->fi;
1179         int w;
1180
1181         spin_lock_bh(&fib_multipath_lock);
1182         if (fi->fib_power <= 0) {
1183                 int power = 0;
1184                 change_nexthops(fi) {
1185                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1186                                 power += nh->nh_weight;
1187                                 nh->nh_power = nh->nh_weight;
1188                         }
1189                 } endfor_nexthops(fi);
1190                 fi->fib_power = power;
1191                 if (power <= 0) {
1192                         spin_unlock_bh(&fib_multipath_lock);
1193                         /* Race condition: route has just become dead. */
1194                         res->nh_sel = 0;
1195                         return;
1196                 }
1197         }
1198
1199
1200         /* w should be random number [0..fi->fib_power-1],
1201            it is pretty bad approximation.
1202          */
1203
1204         w = jiffies % fi->fib_power;
1205
1206         change_nexthops(fi) {
1207                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1208                         if ((w -= nh->nh_power) <= 0) {
1209                                 nh->nh_power--;
1210                                 fi->fib_power--;
1211                                 res->nh_sel = nhsel;
1212                                 spin_unlock_bh(&fib_multipath_lock);
1213                                 return;
1214                         }
1215                 }
1216         } endfor_nexthops(fi);
1217
1218         /* Race condition: route has just become dead. */
1219         res->nh_sel = 0;
1220         spin_unlock_bh(&fib_multipath_lock);
1221 }
1222 #endif