[NET]: Clean up sk_buff walkers.
[linux-2.6] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
46 #include <net/netlink.h>
47 #include <net/nexthop.h>
48
49 #include "fib_lookup.h"
50
51 #define FSprintk(a...)
52
53 static DEFINE_SPINLOCK(fib_info_lock);
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
57 static unsigned int fib_info_cnt;
58
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
62
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace scope declaring
 * `nhsel` (nexthop index) and `nh` (current nexthop; const in the
 * for_nexthops flavour) followed by a `for` header; the statement that
 * follows is the loop body.  The scope must be closed with
 * endfor_nexthops(fi).
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh *nh;					\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nh;						\
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited; the loop runs
 * exactly once and the compiler is hoped to optimise it away.
 */

#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
86
87
88 static const struct
89 {
90         int     error;
91         u8      scope;
92 } fib_props[RTN_MAX + 1] = {
93         {
94                 .error  = 0,
95                 .scope  = RT_SCOPE_NOWHERE,
96         },      /* RTN_UNSPEC */
97         {
98                 .error  = 0,
99                 .scope  = RT_SCOPE_UNIVERSE,
100         },      /* RTN_UNICAST */
101         {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },      /* RTN_LOCAL */
105         {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },      /* RTN_BROADCAST */
109         {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },      /* RTN_ANYCAST */
113         {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },      /* RTN_MULTICAST */
117         {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },      /* RTN_BLACKHOLE */
121         {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },      /* RTN_UNREACHABLE */
125         {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },      /* RTN_PROHIBIT */
129         {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },      /* RTN_THROW */
133         {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },      /* RTN_NAT */
137         {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },      /* RTN_XRESOLVE */
141 };
142
143
144 /* Release a nexthop info record */
145
146 void free_fib_info(struct fib_info *fi)
147 {
148         if (fi->fib_dead == 0) {
149                 printk("Freeing alive fib_info %p\n", fi);
150                 return;
151         }
152         change_nexthops(fi) {
153                 if (nh->nh_dev)
154                         dev_put(nh->nh_dev);
155                 nh->nh_dev = NULL;
156         } endfor_nexthops(fi);
157         fib_info_cnt--;
158         kfree(fi);
159 }
160
161 void fib_release_info(struct fib_info *fi)
162 {
163         spin_lock_bh(&fib_info_lock);
164         if (fi && --fi->fib_treeref == 0) {
165                 hlist_del(&fi->fib_hash);
166                 if (fi->fib_prefsrc)
167                         hlist_del(&fi->fib_lhash);
168                 change_nexthops(fi) {
169                         if (!nh->nh_dev)
170                                 continue;
171                         hlist_del(&nh->nh_hash);
172                 } endfor_nexthops(fi)
173                 fi->fib_dead = 1;
174                 fib_info_put(fi);
175         }
176         spin_unlock_bh(&fib_info_lock);
177 }
178
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 {
181         const struct fib_nh *onh = ofi->fib_nh;
182
183         for_nexthops(fi) {
184                 if (nh->nh_oif != onh->nh_oif ||
185                     nh->nh_gw  != onh->nh_gw ||
186                     nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188                     nh->nh_weight != onh->nh_weight ||
189 #endif
190 #ifdef CONFIG_NET_CLS_ROUTE
191                     nh->nh_tclassid != onh->nh_tclassid ||
192 #endif
193                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194                         return -1;
195                 onh++;
196         } endfor_nexthops(fi);
197         return 0;
198 }
199
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 {
202         unsigned int mask = (fib_hash_size - 1);
203         unsigned int val = fi->fib_nhs;
204
205         val ^= fi->fib_protocol;
206         val ^= (__force u32)fi->fib_prefsrc;
207         val ^= fi->fib_priority;
208
209         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
210 }
211
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 {
214         struct hlist_head *head;
215         struct hlist_node *node;
216         struct fib_info *fi;
217         unsigned int hash;
218
219         hash = fib_info_hashfn(nfi);
220         head = &fib_info_hash[hash];
221
222         hlist_for_each_entry(fi, node, head, fib_hash) {
223                 if (fi->fib_nhs != nfi->fib_nhs)
224                         continue;
225                 if (nfi->fib_protocol == fi->fib_protocol &&
226                     nfi->fib_prefsrc == fi->fib_prefsrc &&
227                     nfi->fib_priority == fi->fib_priority &&
228                     memcmp(nfi->fib_metrics, fi->fib_metrics,
229                            sizeof(fi->fib_metrics)) == 0 &&
230                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
232                         return fi;
233         }
234
235         return NULL;
236 }
237
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 {
240         unsigned int mask = DEVINDEX_HASHSIZE - 1;
241
242         return (val ^
243                 (val >> DEVINDEX_HASHBITS) ^
244                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 }
246
247 /* Check, that the gateway is already configured.
248    Used only by redirect accept routine.
249  */
250
251 int ip_fib_check_default(__be32 gw, struct net_device *dev)
252 {
253         struct hlist_head *head;
254         struct hlist_node *node;
255         struct fib_nh *nh;
256         unsigned int hash;
257
258         spin_lock(&fib_info_lock);
259
260         hash = fib_devindex_hashfn(dev->ifindex);
261         head = &fib_info_devhash[hash];
262         hlist_for_each_entry(nh, node, head, nh_hash) {
263                 if (nh->nh_dev == dev &&
264                     nh->nh_gw == gw &&
265                     !(nh->nh_flags&RTNH_F_DEAD)) {
266                         spin_unlock(&fib_info_lock);
267                         return 0;
268                 }
269         }
270
271         spin_unlock(&fib_info_lock);
272
273         return -1;
274 }
275
276 static inline size_t fib_nlmsg_size(struct fib_info *fi)
277 {
278         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
279                          + nla_total_size(4) /* RTA_TABLE */
280                          + nla_total_size(4) /* RTA_DST */
281                          + nla_total_size(4) /* RTA_PRIORITY */
282                          + nla_total_size(4); /* RTA_PREFSRC */
283
284         /* space for nested metrics */
285         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
286
287         if (fi->fib_nhs) {
288                 /* Also handles the special case fib_nhs == 1 */
289
290                 /* each nexthop is packed in an attribute */
291                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
292
293                 /* may contain flow and gateway attribute */
294                 nhsize += 2 * nla_total_size(4);
295
296                 /* all nexthops are packed in a nested attribute */
297                 payload += nla_total_size(fi->fib_nhs * nhsize);
298         }
299
300         return payload;
301 }
302
303 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
304                int dst_len, u32 tb_id, struct nl_info *info)
305 {
306         struct sk_buff *skb;
307         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
308         int err = -ENOBUFS;
309
310         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
311         if (skb == NULL)
312                 goto errout;
313
314         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
315                             fa->fa_type, fa->fa_scope, key, dst_len,
316                             fa->fa_tos, fa->fa_info, 0);
317         if (err < 0) {
318                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
319                 WARN_ON(err == -EMSGSIZE);
320                 kfree_skb(skb);
321                 goto errout;
322         }
323         err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
324                           info->nlh, GFP_KERNEL);
325 errout:
326         if (err < 0)
327                 rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
328 }
329
330 /* Return the first fib alias matching TOS with
331  * priority less than or equal to PRIO.
332  */
333 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
334 {
335         if (fah) {
336                 struct fib_alias *fa;
337                 list_for_each_entry(fa, fah, fa_list) {
338                         if (fa->fa_tos > tos)
339                                 continue;
340                         if (fa->fa_info->fib_priority >= prio ||
341                             fa->fa_tos < tos)
342                                 return fa;
343                 }
344         }
345         return NULL;
346 }
347
348 int fib_detect_death(struct fib_info *fi, int order,
349                      struct fib_info **last_resort, int *last_idx, int *dflt)
350 {
351         struct neighbour *n;
352         int state = NUD_NONE;
353
354         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
355         if (n) {
356                 state = n->nud_state;
357                 neigh_release(n);
358         }
359         if (state==NUD_REACHABLE)
360                 return 0;
361         if ((state&NUD_VALID) && order != *dflt)
362                 return 0;
363         if ((state&NUD_VALID) ||
364             (*last_idx<0 && order > *dflt)) {
365                 *last_resort = fi;
366                 *last_idx = order;
367         }
368         return 1;
369 }
370
371 #ifdef CONFIG_IP_ROUTE_MULTIPATH
372
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting
 * at @rtnh.  Returns 0 if bytes are left over after the last well-formed
 * entry, which marks the configuration as invalid.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}
385
386 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
387                        int remaining, struct fib_config *cfg)
388 {
389         change_nexthops(fi) {
390                 int attrlen;
391
392                 if (!rtnh_ok(rtnh, remaining))
393                         return -EINVAL;
394
395                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
396                 nh->nh_oif = rtnh->rtnh_ifindex;
397                 nh->nh_weight = rtnh->rtnh_hops + 1;
398
399                 attrlen = rtnh_attrlen(rtnh);
400                 if (attrlen > 0) {
401                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
402
403                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
404                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
405 #ifdef CONFIG_NET_CLS_ROUTE
406                         nla = nla_find(attrs, attrlen, RTA_FLOW);
407                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
408 #endif
409                 }
410
411                 rtnh = rtnh_next(rtnh, &remaining);
412         } endfor_nexthops(fi);
413
414         return 0;
415 }
416
417 #endif
418
419 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
420 {
421 #ifdef CONFIG_IP_ROUTE_MULTIPATH
422         struct rtnexthop *rtnh;
423         int remaining;
424 #endif
425
426         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
427                 return 1;
428
429         if (cfg->fc_oif || cfg->fc_gw) {
430                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
431                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
432                         return 0;
433                 return 1;
434         }
435
436 #ifdef CONFIG_IP_ROUTE_MULTIPATH
437         if (cfg->fc_mp == NULL)
438                 return 0;
439
440         rtnh = cfg->fc_mp;
441         remaining = cfg->fc_mp_len;
442
443         for_nexthops(fi) {
444                 int attrlen;
445
446                 if (!rtnh_ok(rtnh, remaining))
447                         return -EINVAL;
448
449                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
450                         return 1;
451
452                 attrlen = rtnh_attrlen(rtnh);
453                 if (attrlen < 0) {
454                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
455
456                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
457                         if (nla && nla_get_be32(nla) != nh->nh_gw)
458                                 return 1;
459 #ifdef CONFIG_NET_CLS_ROUTE
460                         nla = nla_find(attrs, attrlen, RTA_FLOW);
461                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
462                                 return 1;
463 #endif
464                 }
465
466                 rtnh = rtnh_next(rtnh, &remaining);
467         } endfor_nexthops(fi);
468 #endif
469         return 0;
470 }
471
472
473 /*
474    Picture
475    -------
476
477    Semantics of nexthop are very messy for historical reasons.
478    We have to take into account, that:
479    a) gateway can be actually local interface address,
480       so that gatewayed route is direct.
481    b) gateway must be on-link address, possibly
482       described not by an ifaddr, but also by a direct route.
483    c) If both gateway and interface are specified, they should not
484       contradict.
485    d) If we use tunnel routes, gateway could be not on-link.
486
487    Attempt to reconcile all of these (alas, self-contradictory) conditions
488    results in pretty ugly and hairy code with obscure logic.
489
490    I chose to generalize it instead, so that the size
491    of code does not increase practically, but it becomes
492    much more general.
493    Every prefix is assigned a "scope" value: "host" is local address,
494    "link" is direct route,
495    [ ... "site" ... "interior" ... ]
496    and "universe" is true gateway route with global meaning.
497
498    Every prefix refers to a set of "nexthop"s (gw, oif),
499    where gw must have narrower scope. This recursion stops
500    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
501    which means that gw is forced to be on link.
502
503    Code is still hairy, but now it is apparently logically
504    consistent and very flexible. E.g., as a by-product it allows
505    independent exterior and interior routing processes
506    to co-exist in peace.
507
508    Normally it looks as following.
509
510    {universe prefix}  -> (gw, oif) [scope link]
511                           |
512                           |-> {link prefix} -> (gw, oif) [scope local]
513                                                 |
514                                                 |-> {local prefix} (terminal node)
515  */
516
517 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
518                         struct fib_nh *nh)
519 {
520         int err;
521
522         if (nh->nh_gw) {
523                 struct fib_result res;
524
525 #ifdef CONFIG_IP_ROUTE_PERVASIVE
526                 if (nh->nh_flags&RTNH_F_PERVASIVE)
527                         return 0;
528 #endif
529                 if (nh->nh_flags&RTNH_F_ONLINK) {
530                         struct net_device *dev;
531
532                         if (cfg->fc_scope >= RT_SCOPE_LINK)
533                                 return -EINVAL;
534                         if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
535                                 return -EINVAL;
536                         if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
537                                 return -ENODEV;
538                         if (!(dev->flags&IFF_UP))
539                                 return -ENETDOWN;
540                         nh->nh_dev = dev;
541                         dev_hold(dev);
542                         nh->nh_scope = RT_SCOPE_LINK;
543                         return 0;
544                 }
545                 {
546                         struct flowi fl = {
547                                 .nl_u = {
548                                         .ip4_u = {
549                                                 .daddr = nh->nh_gw,
550                                                 .scope = cfg->fc_scope + 1,
551                                         },
552                                 },
553                                 .oif = nh->nh_oif,
554                         };
555
556                         /* It is not necessary, but requires a bit of thinking */
557                         if (fl.fl4_scope < RT_SCOPE_LINK)
558                                 fl.fl4_scope = RT_SCOPE_LINK;
559                         if ((err = fib_lookup(&fl, &res)) != 0)
560                                 return err;
561                 }
562                 err = -EINVAL;
563                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
564                         goto out;
565                 nh->nh_scope = res.scope;
566                 nh->nh_oif = FIB_RES_OIF(res);
567                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
568                         goto out;
569                 dev_hold(nh->nh_dev);
570                 err = -ENETDOWN;
571                 if (!(nh->nh_dev->flags & IFF_UP))
572                         goto out;
573                 err = 0;
574 out:
575                 fib_res_put(&res);
576                 return err;
577         } else {
578                 struct in_device *in_dev;
579
580                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
581                         return -EINVAL;
582
583                 in_dev = inetdev_by_index(nh->nh_oif);
584                 if (in_dev == NULL)
585                         return -ENODEV;
586                 if (!(in_dev->dev->flags&IFF_UP)) {
587                         in_dev_put(in_dev);
588                         return -ENETDOWN;
589                 }
590                 nh->nh_dev = in_dev->dev;
591                 dev_hold(nh->nh_dev);
592                 nh->nh_scope = RT_SCOPE_HOST;
593                 in_dev_put(in_dev);
594         }
595         return 0;
596 }
597
598 static inline unsigned int fib_laddr_hashfn(__be32 val)
599 {
600         unsigned int mask = (fib_hash_size - 1);
601
602         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
603 }
604
605 static struct hlist_head *fib_hash_alloc(int bytes)
606 {
607         if (bytes <= PAGE_SIZE)
608                 return kmalloc(bytes, GFP_KERNEL);
609         else
610                 return (struct hlist_head *)
611                         __get_free_pages(GFP_KERNEL, get_order(bytes));
612 }
613
614 static void fib_hash_free(struct hlist_head *hash, int bytes)
615 {
616         if (!hash)
617                 return;
618
619         if (bytes <= PAGE_SIZE)
620                 kfree(hash);
621         else
622                 free_pages((unsigned long) hash, get_order(bytes));
623 }
624
625 static void fib_hash_move(struct hlist_head *new_info_hash,
626                           struct hlist_head *new_laddrhash,
627                           unsigned int new_size)
628 {
629         struct hlist_head *old_info_hash, *old_laddrhash;
630         unsigned int old_size = fib_hash_size;
631         unsigned int i, bytes;
632
633         spin_lock_bh(&fib_info_lock);
634         old_info_hash = fib_info_hash;
635         old_laddrhash = fib_info_laddrhash;
636         fib_hash_size = new_size;
637
638         for (i = 0; i < old_size; i++) {
639                 struct hlist_head *head = &fib_info_hash[i];
640                 struct hlist_node *node, *n;
641                 struct fib_info *fi;
642
643                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
644                         struct hlist_head *dest;
645                         unsigned int new_hash;
646
647                         hlist_del(&fi->fib_hash);
648
649                         new_hash = fib_info_hashfn(fi);
650                         dest = &new_info_hash[new_hash];
651                         hlist_add_head(&fi->fib_hash, dest);
652                 }
653         }
654         fib_info_hash = new_info_hash;
655
656         for (i = 0; i < old_size; i++) {
657                 struct hlist_head *lhead = &fib_info_laddrhash[i];
658                 struct hlist_node *node, *n;
659                 struct fib_info *fi;
660
661                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
662                         struct hlist_head *ldest;
663                         unsigned int new_hash;
664
665                         hlist_del(&fi->fib_lhash);
666
667                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
668                         ldest = &new_laddrhash[new_hash];
669                         hlist_add_head(&fi->fib_lhash, ldest);
670                 }
671         }
672         fib_info_laddrhash = new_laddrhash;
673
674         spin_unlock_bh(&fib_info_lock);
675
676         bytes = old_size * sizeof(struct hlist_head *);
677         fib_hash_free(old_info_hash, bytes);
678         fib_hash_free(old_laddrhash, bytes);
679 }
680
681 struct fib_info *fib_create_info(struct fib_config *cfg)
682 {
683         int err;
684         struct fib_info *fi = NULL;
685         struct fib_info *ofi;
686         int nhs = 1;
687
688         /* Fast check to catch the most weird cases */
689         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
690                 goto err_inval;
691
692 #ifdef CONFIG_IP_ROUTE_MULTIPATH
693         if (cfg->fc_mp) {
694                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
695                 if (nhs == 0)
696                         goto err_inval;
697         }
698 #endif
699 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
700         if (cfg->fc_mp_alg) {
701                 if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
702                     cfg->fc_mp_alg > IP_MP_ALG_MAX)
703                         goto err_inval;
704         }
705 #endif
706
707         err = -ENOBUFS;
708         if (fib_info_cnt >= fib_hash_size) {
709                 unsigned int new_size = fib_hash_size << 1;
710                 struct hlist_head *new_info_hash;
711                 struct hlist_head *new_laddrhash;
712                 unsigned int bytes;
713
714                 if (!new_size)
715                         new_size = 1;
716                 bytes = new_size * sizeof(struct hlist_head *);
717                 new_info_hash = fib_hash_alloc(bytes);
718                 new_laddrhash = fib_hash_alloc(bytes);
719                 if (!new_info_hash || !new_laddrhash) {
720                         fib_hash_free(new_info_hash, bytes);
721                         fib_hash_free(new_laddrhash, bytes);
722                 } else {
723                         memset(new_info_hash, 0, bytes);
724                         memset(new_laddrhash, 0, bytes);
725
726                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
727                 }
728
729                 if (!fib_hash_size)
730                         goto failure;
731         }
732
733         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
734         if (fi == NULL)
735                 goto failure;
736         fib_info_cnt++;
737
738         fi->fib_protocol = cfg->fc_protocol;
739         fi->fib_flags = cfg->fc_flags;
740         fi->fib_priority = cfg->fc_priority;
741         fi->fib_prefsrc = cfg->fc_prefsrc;
742
743         fi->fib_nhs = nhs;
744         change_nexthops(fi) {
745                 nh->nh_parent = fi;
746         } endfor_nexthops(fi)
747
748         if (cfg->fc_mx) {
749                 struct nlattr *nla;
750                 int remaining;
751
752                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
753                         int type = nla->nla_type;
754
755                         if (type) {
756                                 if (type > RTAX_MAX)
757                                         goto err_inval;
758                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
759                         }
760                 }
761         }
762
763         if (cfg->fc_mp) {
764 #ifdef CONFIG_IP_ROUTE_MULTIPATH
765                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
766                 if (err != 0)
767                         goto failure;
768                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
769                         goto err_inval;
770                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
771                         goto err_inval;
772 #ifdef CONFIG_NET_CLS_ROUTE
773                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
774                         goto err_inval;
775 #endif
776 #else
777                 goto err_inval;
778 #endif
779         } else {
780                 struct fib_nh *nh = fi->fib_nh;
781
782                 nh->nh_oif = cfg->fc_oif;
783                 nh->nh_gw = cfg->fc_gw;
784                 nh->nh_flags = cfg->fc_flags;
785 #ifdef CONFIG_NET_CLS_ROUTE
786                 nh->nh_tclassid = cfg->fc_flow;
787 #endif
788 #ifdef CONFIG_IP_ROUTE_MULTIPATH
789                 nh->nh_weight = 1;
790 #endif
791         }
792
793 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
794         fi->fib_mp_alg = cfg->fc_mp_alg;
795 #endif
796
797         if (fib_props[cfg->fc_type].error) {
798                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
799                         goto err_inval;
800                 goto link_it;
801         }
802
803         if (cfg->fc_scope > RT_SCOPE_HOST)
804                 goto err_inval;
805
806         if (cfg->fc_scope == RT_SCOPE_HOST) {
807                 struct fib_nh *nh = fi->fib_nh;
808
809                 /* Local address is added. */
810                 if (nhs != 1 || nh->nh_gw)
811                         goto err_inval;
812                 nh->nh_scope = RT_SCOPE_NOWHERE;
813                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
814                 err = -ENODEV;
815                 if (nh->nh_dev == NULL)
816                         goto failure;
817         } else {
818                 change_nexthops(fi) {
819                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
820                                 goto failure;
821                 } endfor_nexthops(fi)
822         }
823
824         if (fi->fib_prefsrc) {
825                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
826                     fi->fib_prefsrc != cfg->fc_dst)
827                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
828                                 goto err_inval;
829         }
830
831 link_it:
832         if ((ofi = fib_find_info(fi)) != NULL) {
833                 fi->fib_dead = 1;
834                 free_fib_info(fi);
835                 ofi->fib_treeref++;
836                 return ofi;
837         }
838
839         fi->fib_treeref++;
840         atomic_inc(&fi->fib_clntref);
841         spin_lock_bh(&fib_info_lock);
842         hlist_add_head(&fi->fib_hash,
843                        &fib_info_hash[fib_info_hashfn(fi)]);
844         if (fi->fib_prefsrc) {
845                 struct hlist_head *head;
846
847                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
848                 hlist_add_head(&fi->fib_lhash, head);
849         }
850         change_nexthops(fi) {
851                 struct hlist_head *head;
852                 unsigned int hash;
853
854                 if (!nh->nh_dev)
855                         continue;
856                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
857                 head = &fib_info_devhash[hash];
858                 hlist_add_head(&nh->nh_hash, head);
859         } endfor_nexthops(fi)
860         spin_unlock_bh(&fib_info_lock);
861         return fi;
862
863 err_inval:
864         err = -EINVAL;
865
866 failure:
867         if (fi) {
868                 fi->fib_dead = 1;
869                 free_fib_info(fi);
870         }
871
872         return ERR_PTR(err);
873 }
874
875 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
876 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
877                        struct fib_result *res, __be32 zone, __be32 mask,
878                         int prefixlen)
879 {
880         struct fib_alias *fa;
881         int nh_sel = 0;
882
883         list_for_each_entry_rcu(fa, head, fa_list) {
884                 int err;
885
886                 if (fa->fa_tos &&
887                     fa->fa_tos != flp->fl4_tos)
888                         continue;
889
890                 if (fa->fa_scope < flp->fl4_scope)
891                         continue;
892
893                 fa->fa_state |= FA_S_ACCESSED;
894
895                 err = fib_props[fa->fa_type].error;
896                 if (err == 0) {
897                         struct fib_info *fi = fa->fa_info;
898
899                         if (fi->fib_flags & RTNH_F_DEAD)
900                                 continue;
901
902                         switch (fa->fa_type) {
903                         case RTN_UNICAST:
904                         case RTN_LOCAL:
905                         case RTN_BROADCAST:
906                         case RTN_ANYCAST:
907                         case RTN_MULTICAST:
908                                 for_nexthops(fi) {
909                                         if (nh->nh_flags&RTNH_F_DEAD)
910                                                 continue;
911                                         if (!flp->oif || flp->oif == nh->nh_oif)
912                                                 break;
913                                 }
914 #ifdef CONFIG_IP_ROUTE_MULTIPATH
915                                 if (nhsel < fi->fib_nhs) {
916                                         nh_sel = nhsel;
917                                         goto out_fill_res;
918                                 }
919 #else
920                                 if (nhsel < 1) {
921                                         goto out_fill_res;
922                                 }
923 #endif
924                                 endfor_nexthops(fi);
925                                 continue;
926
927                         default:
928                                 printk(KERN_DEBUG "impossible 102\n");
929                                 return -EINVAL;
930                         }
931                 }
932                 return err;
933         }
934         return 1;
935
936 out_fill_res:
937         res->prefixlen = prefixlen;
938         res->nh_sel = nh_sel;
939         res->type = fa->fa_type;
940         res->scope = fa->fa_scope;
941         res->fi = fa->fa_info;
942 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
943         res->netmask = mask;
944         res->network = zone & inet_make_mask(prefixlen);
945 #endif
946         atomic_inc(&res->fi->fib_clntref);
947         return 0;
948 }
949
950 /* Find appropriate source address to this destination */
951
952 __be32 __fib_res_prefsrc(struct fib_result *res)
953 {
954         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
955 }
956
957 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
958                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
959                   struct fib_info *fi, unsigned int flags)
960 {
961         struct nlmsghdr *nlh;
962         struct rtmsg *rtm;
963
964         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
965         if (nlh == NULL)
966                 return -EMSGSIZE;
967
968         rtm = nlmsg_data(nlh);
969         rtm->rtm_family = AF_INET;
970         rtm->rtm_dst_len = dst_len;
971         rtm->rtm_src_len = 0;
972         rtm->rtm_tos = tos;
973         rtm->rtm_table = tb_id;
974         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
975         rtm->rtm_type = type;
976         rtm->rtm_flags = fi->fib_flags;
977         rtm->rtm_scope = scope;
978         rtm->rtm_protocol = fi->fib_protocol;
979
980         if (rtm->rtm_dst_len)
981                 NLA_PUT_BE32(skb, RTA_DST, dst);
982
983         if (fi->fib_priority)
984                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
985
986         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
987                 goto nla_put_failure;
988
989         if (fi->fib_prefsrc)
990                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
991
992         if (fi->fib_nhs == 1) {
993                 if (fi->fib_nh->nh_gw)
994                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
995
996                 if (fi->fib_nh->nh_oif)
997                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
998 #ifdef CONFIG_NET_CLS_ROUTE
999                 if (fi->fib_nh[0].nh_tclassid)
1000                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1001 #endif
1002         }
1003 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1004         if (fi->fib_nhs > 1) {
1005                 struct rtnexthop *rtnh;
1006                 struct nlattr *mp;
1007
1008                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1009                 if (mp == NULL)
1010                         goto nla_put_failure;
1011
1012                 for_nexthops(fi) {
1013                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1014                         if (rtnh == NULL)
1015                                 goto nla_put_failure;
1016
1017                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1018                         rtnh->rtnh_hops = nh->nh_weight - 1;
1019                         rtnh->rtnh_ifindex = nh->nh_oif;
1020
1021                         if (nh->nh_gw)
1022                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1023 #ifdef CONFIG_NET_CLS_ROUTE
1024                         if (nh->nh_tclassid)
1025                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1026 #endif
1027                         /* length of rtnetlink header + attributes */
1028                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1029                 } endfor_nexthops(fi);
1030
1031                 nla_nest_end(skb, mp);
1032         }
1033 #endif
1034         return nlmsg_end(skb, nlh);
1035
1036 nla_put_failure:
1037         nlmsg_cancel(skb, nlh);
1038         return -EMSGSIZE;
1039 }
1040
1041 /*
1042    Update FIB if:
1043    - local address disappeared -> we must delete all the entries
1044      referring to it.
1045    - device went down -> we must shutdown all nexthops going via it.
1046  */
1047
1048 int fib_sync_down(__be32 local, struct net_device *dev, int force)
1049 {
1050         int ret = 0;
1051         int scope = RT_SCOPE_NOWHERE;
1052
1053         if (force)
1054                 scope = -1;
1055
1056         if (local && fib_info_laddrhash) {
1057                 unsigned int hash = fib_laddr_hashfn(local);
1058                 struct hlist_head *head = &fib_info_laddrhash[hash];
1059                 struct hlist_node *node;
1060                 struct fib_info *fi;
1061
1062                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1063                         if (fi->fib_prefsrc == local) {
1064                                 fi->fib_flags |= RTNH_F_DEAD;
1065                                 ret++;
1066                         }
1067                 }
1068         }
1069
1070         if (dev) {
1071                 struct fib_info *prev_fi = NULL;
1072                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1073                 struct hlist_head *head = &fib_info_devhash[hash];
1074                 struct hlist_node *node;
1075                 struct fib_nh *nh;
1076
1077                 hlist_for_each_entry(nh, node, head, nh_hash) {
1078                         struct fib_info *fi = nh->nh_parent;
1079                         int dead;
1080
1081                         BUG_ON(!fi->fib_nhs);
1082                         if (nh->nh_dev != dev || fi == prev_fi)
1083                                 continue;
1084                         prev_fi = fi;
1085                         dead = 0;
1086                         change_nexthops(fi) {
1087                                 if (nh->nh_flags&RTNH_F_DEAD)
1088                                         dead++;
1089                                 else if (nh->nh_dev == dev &&
1090                                          nh->nh_scope != scope) {
1091                                         nh->nh_flags |= RTNH_F_DEAD;
1092 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1093                                         spin_lock_bh(&fib_multipath_lock);
1094                                         fi->fib_power -= nh->nh_power;
1095                                         nh->nh_power = 0;
1096                                         spin_unlock_bh(&fib_multipath_lock);
1097 #endif
1098                                         dead++;
1099                                 }
1100 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1101                                 if (force > 1 && nh->nh_dev == dev) {
1102                                         dead = fi->fib_nhs;
1103                                         break;
1104                                 }
1105 #endif
1106                         } endfor_nexthops(fi)
1107                         if (dead == fi->fib_nhs) {
1108                                 fi->fib_flags |= RTNH_F_DEAD;
1109                                 ret++;
1110                         }
1111                 }
1112         }
1113
1114         return ret;
1115 }
1116
1117 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1118
1119 /*
1120    Dead device goes up. We wake up dead nexthops.
1121    It takes sense only on multipath routes.
1122  */
1123
1124 int fib_sync_up(struct net_device *dev)
1125 {
1126         struct fib_info *prev_fi;
1127         unsigned int hash;
1128         struct hlist_head *head;
1129         struct hlist_node *node;
1130         struct fib_nh *nh;
1131         int ret;
1132
1133         if (!(dev->flags&IFF_UP))
1134                 return 0;
1135
1136         prev_fi = NULL;
1137         hash = fib_devindex_hashfn(dev->ifindex);
1138         head = &fib_info_devhash[hash];
1139         ret = 0;
1140
1141         hlist_for_each_entry(nh, node, head, nh_hash) {
1142                 struct fib_info *fi = nh->nh_parent;
1143                 int alive;
1144
1145                 BUG_ON(!fi->fib_nhs);
1146                 if (nh->nh_dev != dev || fi == prev_fi)
1147                         continue;
1148
1149                 prev_fi = fi;
1150                 alive = 0;
1151                 change_nexthops(fi) {
1152                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1153                                 alive++;
1154                                 continue;
1155                         }
1156                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1157                                 continue;
1158                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1159                                 continue;
1160                         alive++;
1161                         spin_lock_bh(&fib_multipath_lock);
1162                         nh->nh_power = 0;
1163                         nh->nh_flags &= ~RTNH_F_DEAD;
1164                         spin_unlock_bh(&fib_multipath_lock);
1165                 } endfor_nexthops(fi)
1166
1167                 if (alive > 0) {
1168                         fi->fib_flags &= ~RTNH_F_DEAD;
1169                         ret++;
1170                 }
1171         }
1172
1173         return ret;
1174 }
1175
1176 /*
1177    The algorithm is suboptimal, but it provides really
1178    fair weighted route distribution.
1179  */
1180
1181 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 {
1183         struct fib_info *fi = res->fi;
1184         int w;
1185
1186         spin_lock_bh(&fib_multipath_lock);
1187         if (fi->fib_power <= 0) {
1188                 int power = 0;
1189                 change_nexthops(fi) {
1190                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1191                                 power += nh->nh_weight;
1192                                 nh->nh_power = nh->nh_weight;
1193                         }
1194                 } endfor_nexthops(fi);
1195                 fi->fib_power = power;
1196                 if (power <= 0) {
1197                         spin_unlock_bh(&fib_multipath_lock);
1198                         /* Race condition: route has just become dead. */
1199                         res->nh_sel = 0;
1200                         return;
1201                 }
1202         }
1203
1204
1205         /* w should be random number [0..fi->fib_power-1],
1206            it is pretty bad approximation.
1207          */
1208
1209         w = jiffies % fi->fib_power;
1210
1211         change_nexthops(fi) {
1212                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1213                         if ((w -= nh->nh_power) <= 0) {
1214                                 nh->nh_power--;
1215                                 fi->fib_power--;
1216                                 res->nh_sel = nhsel;
1217                                 spin_unlock_bh(&fib_multipath_lock);
1218                                 return;
1219                         }
1220                 }
1221         } endfor_nexthops(fi);
1222
1223         /* Race condition: route has just become dead. */
1224         res->nh_sel = 0;
1225         spin_unlock_bh(&fib_multipath_lock);
1226 }
1227 #endif