2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
47 #include "fib_lookup.h"
/* Debug hook that expands to nothing: FSprintk(...) calls compile away. */
49 #define FSprintk(a...)
/*
 * Lock for the fib_info hash tables below.  In this file it is taken for
 * writing by fib_release_info(), fib_hash_move() and fib_create_info(),
 * and for reading by ip_fib_check_default().
 */
51 static DEFINE_RWLOCK(fib_info_lock);
/* Buckets of fib_info entries, indexed by fib_info_hashfn(). */
52 static struct hlist_head *fib_info_hash;
/* Buckets of fib_info entries, indexed by fib_laddr_hashfn(fib_prefsrc). */
53 static struct hlist_head *fib_info_laddrhash;
/* Bucket count shared by both tables; the hash functions mask with size - 1. */
54 static unsigned int fib_hash_size;
/* Compared against fib_hash_size in fib_create_info() to decide on a resize. */
55 static unsigned int fib_info_cnt;
/* Fixed-size table of nexthops, indexed by fib_devindex_hashfn(ifindex). */
57 #define DEVINDEX_HASHBITS 8
58 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
59 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
61 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * Spinlock taken (with spin_lock_bh) around the nh_power / fib_power
 * updates in fib_sync_down(), fib_sync_up() and fib_select_multipath().
 */
63 static DEFINE_SPINLOCK(fib_multipath_lock);
/*
 * Nexthop iteration helpers.  Each macro opens a brace scope that declares
 * "nhsel" (index) and "nh" (current nexthop) and then emits a for-header;
 * the caller supplies the loop body and must close the scope with
 * endfor_nexthops(fi), which expands to the matching closing brace.
 * for_nexthops() gives a const nexthop pointer, change_nexthops() a
 * writable one (it casts fi->fib_nh to a non-const pointer).
 */
65 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
66 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
68 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
69 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
71 #else /* CONFIG_IP_ROUTE_MULTIPATH */
/*
 * Without multipath support the same macros visit only the first nexthop:
 * the loop runs exactly once and "nh" never advances.
 */
73 /* Hope, that gcc will optimize it to get rid of dummy loop */
75 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
76 for (nhsel=0; nhsel < 1; nhsel++)
78 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
79 for (nhsel=0; nhsel < 1; nhsel++)
81 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
/* Closes the brace scope opened by for_nexthops() / change_nexthops(). */
83 #define endfor_nexthops(fi) }
/*
 * Per-route-type property table: each entry carries a .scope and, for some
 * types, an .error (only the RTN_UNREACHABLE one is visible here).
 * fib_create_info() indexes it with r->rtm_type and fib_semantic_match()
 * with fa->fa_type.
 * NOTE(review): the head of this declaration and several initializer lines
 * are missing from this listing, so the entry-to-type mapping can only be
 * read off the trailing RTN_* labels that survived.
 * NOTE(review): the array is sized by RTA_MAX + 1 although it is indexed by
 * a route type - confirm that this bound covers every RTN_* value.
 */
90 } fib_props[RTA_MAX + 1] = {
93 .scope = RT_SCOPE_NOWHERE,
97 .scope = RT_SCOPE_UNIVERSE,
101 .scope = RT_SCOPE_HOST,
105 .scope = RT_SCOPE_LINK,
106 }, /* RTN_BROADCAST */
109 .scope = RT_SCOPE_LINK,
113 .scope = RT_SCOPE_UNIVERSE,
114 }, /* RTN_MULTICAST */
117 .scope = RT_SCOPE_UNIVERSE,
118 }, /* RTN_BLACKHOLE */
120 .error = -EHOSTUNREACH,
121 .scope = RT_SCOPE_UNIVERSE,
122 }, /* RTN_UNREACHABLE */
125 .scope = RT_SCOPE_UNIVERSE,
126 }, /* RTN_PROHIBIT */
129 .scope = RT_SCOPE_UNIVERSE,
133 .scope = RT_SCOPE_NOWHERE,
137 .scope = RT_SCOPE_NOWHERE,
138 }, /* RTN_XRESOLVE */
142 /* Release a nexthop info record */
/*
 * @fi: record to release.
 * Logs "Freeing alive fib_info" when fi->fib_dead is still 0, then walks
 * every nexthop of fi with change_nexthops().
 * NOTE(review): the statements inside the nexthop loop and after it are
 * missing from this listing.
 */
144 void free_fib_info(struct fib_info *fi)
146 if (fi->fib_dead == 0) {
147 printk("Freeing alive fib_info %p\n", fi);
150 change_nexthops(fi) {
154 } endfor_nexthops(fi);
/*
 * Drop one tree reference on @fi while holding fib_info_lock for writing.
 * A NULL @fi is tolerated.  When fib_treeref reaches zero the visible code
 * unlinks fi from its fib_hash chain, from its fib_lhash chain and unlinks
 * each nexthop from its nh_hash chain.
 * NOTE(review): lines are missing between these statements, so some of the
 * unlinks may be conditional and further teardown may follow.
 */
159 void fib_release_info(struct fib_info *fi)
161 write_lock(&fib_info_lock);
162 if (fi && --fi->fib_treeref == 0) {
163 hlist_del(&fi->fib_hash);
165 hlist_del(&fi->fib_lhash);
166 change_nexthops(fi) {
169 hlist_del(&nh->nh_hash);
170 } endfor_nexthops(fi)
174 write_unlock(&fib_info_lock);
/*
 * Compare the nexthops of @fi against those of @ofi, starting at
 * ofi->fib_nh.  Two nexthops differ when any of these differ: nh_oif,
 * nh_gw, nh_scope, nh_weight (multipath builds), nh_tclassid
 * (CONFIG_NET_CLS_ROUTE builds), or nh_flags with RTNH_F_DEAD ignored.
 * fib_find_info() treats a return of 0 as "equal".
 * NOTE(review): the loop opener, the advance of onh and the return
 * statements are missing from this listing.
 */
177 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
179 const struct fib_nh *onh = ofi->fib_nh;
182 if (nh->nh_oif != onh->nh_oif ||
183 nh->nh_gw != onh->nh_gw ||
184 nh->nh_scope != onh->nh_scope ||
185 #ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh->nh_weight != onh->nh_weight ||
188 #ifdef CONFIG_NET_CLS_ROUTE
189 nh->nh_tclassid != onh->nh_tclassid ||
191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194 } endfor_nexthops(fi);
/*
 * Bucket index for @fi in fib_info_hash: XOR of fib_nhs, fib_protocol,
 * fib_prefsrc and fib_priority, folded with shifts by 7 and 12 and masked
 * with fib_hash_size - 1.
 * NOTE(review): masking with size - 1 presumes fib_hash_size is a power of
 * two - confirm against the resize code in fib_create_info().
 */
198 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
200 unsigned int mask = (fib_hash_size - 1);
201 unsigned int val = fi->fib_nhs;
203 val ^= fi->fib_protocol;
204 val ^= fi->fib_prefsrc;
205 val ^= fi->fib_priority;
207 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
/*
 * Search the fib_info_hash bucket chosen by fib_info_hashfn(@nfi) for an
 * entry equivalent to @nfi.  The visible equivalence test requires the same
 * fib_nhs, fib_protocol, fib_prefsrc and fib_priority, byte-identical
 * fib_metrics, the same fib_flags apart from RTNH_F_DEAD, and - when there
 * are nexthops - nh_comp() returning 0.
 * NOTE(review): the return statements are missing from this listing.
 */
210 static struct fib_info *fib_find_info(const struct fib_info *nfi)
212 struct hlist_head *head;
213 struct hlist_node *node;
217 hash = fib_info_hashfn(nfi);
218 head = &fib_info_hash[hash];
220 hlist_for_each_entry(fi, node, head, fib_hash) {
221 if (fi->fib_nhs != nfi->fib_nhs)
223 if (nfi->fib_protocol == fi->fib_protocol &&
224 nfi->fib_prefsrc == fi->fib_prefsrc &&
225 nfi->fib_priority == fi->fib_priority &&
226 memcmp(nfi->fib_metrics, fi->fib_metrics,
227 sizeof(fi->fib_metrics)) == 0 &&
228 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
229 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
/*
 * Bucket index into fib_info_devhash for the value @val (callers in this
 * file pass a device ifindex).  The visible part folds val shifted right by
 * DEVINDEX_HASHBITS and by twice that, masked with DEVINDEX_HASHSIZE - 1.
 * NOTE(review): the first line of the return expression is missing here.
 */
236 static inline unsigned int fib_devindex_hashfn(unsigned int val)
238 unsigned int mask = DEVINDEX_HASHSIZE - 1;
241 (val >> DEVINDEX_HASHBITS) ^
242 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 /* Check, that the gateway is already configured.
246 Used only by redirect accept routine.
/*
 * @gw:  gateway address to look for.
 * @dev: device the gateway should be reachable through.
 * Holds fib_info_lock for reading while scanning the fib_info_devhash
 * bucket for dev->ifindex.  A nexthop is accepted when it is on @dev and
 * is not flagged RTNH_F_DEAD; the lock is dropped on both the match path
 * and the fall-through path.
 * NOTE(review): one line of the match condition (presumably the comparison
 * against @gw) and the return values are missing from this listing.
 */
249 int ip_fib_check_default(u32 gw, struct net_device *dev)
251 struct hlist_head *head;
252 struct hlist_node *node;
256 read_lock(&fib_info_lock);
258 hash = fib_devindex_hashfn(dev->ifindex);
259 head = &fib_info_devhash[hash];
260 hlist_for_each_entry(nh, node, head, nh_hash) {
261 if (nh->nh_dev == dev &&
263 !(nh->nh_flags&RTNH_F_DEAD)) {
264 read_unlock(&fib_info_lock);
269 read_unlock(&fib_info_lock);
/*
 * Announce a route change over rtnetlink.
 * Allocates an skb sized for an rtmsg plus 256 bytes of attributes, fills
 * it through fib_dump_info() from the alias @fa, and broadcasts it to the
 * RTMGRP_IPV4_ROUTE group.  The destination pid is req->pid when @req is
 * given, otherwise n->nlmsg_pid.  When the request carried NLM_F_ECHO an
 * extra reference on the skb is taken before the broadcast and the same
 * message is then also unicast back to that pid.
 * NOTE(review): part of the parameter list and the handling of a failed
 * alloc_skb() / fib_dump_info() are missing from this listing.
 */
274 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
276 struct nlmsghdr *n, struct netlink_skb_parms *req)
279 u32 pid = req ? req->pid : n->nlmsg_pid;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
282 skb = alloc_skb(size, GFP_KERNEL);
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z,
289 fa->fa_info, 0) < 0) {
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
301 /* Return the first fib alias matching TOS with
302 * priority less than or equal to PRIO.
/*
 * @fah:  list of fib_alias entries to scan, walked in list order.
 * @tos:  TOS value compared against each fa_tos.
 * @prio: priority compared against each fa_info->fib_priority.
 * NOTE(review): the visible priority test is "fib_priority >= prio", which
 * does not read like the "less than or equal" wording of the comment that
 * precedes this function - confirm which one is intended.
 * NOTE(review): the rest of that condition and all return statements are
 * missing from this listing.
 */
304 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
307 struct fib_alias *fa;
308 list_for_each_entry(fa, fah, fa_list) {
309 if (fa->fa_tos > tos)
311 if (fa->fa_info->fib_priority >= prio ||
/*
 * Classify a route by the ARP neighbour state of its first nexthop.
 * Looks up fi->fib_nh[0].nh_gw on fi->fib_dev in arp_tbl; "state" stays
 * NUD_NONE unless it is overwritten from the neighbour entry.  The visible
 * decisions are, in order: state is exactly NUD_REACHABLE; state is
 * NUD_VALID and @order differs from *@dflt; state is NUD_VALID, or no last
 * resort index is recorded yet (*@last_idx < 0) and @order > *@dflt.
 * NOTE(review): the return values and the updates of @last_resort and
 * @last_idx are missing from this listing.
 */
319 int fib_detect_death(struct fib_info *fi, int order,
320 struct fib_info **last_resort, int *last_idx, int *dflt)
323 int state = NUD_NONE;
325 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
327 state = n->nud_state;
330 if (state==NUD_REACHABLE)
332 if ((state&NUD_VALID) && order != *dflt)
334 if ((state&NUD_VALID) ||
335 (*last_idx<0 && order > *dflt)) {
342 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * Walk the attribute list starting at @attr (@attrlen bytes) and return the
 * payload of the first attribute whose rta_type equals @type, read as u32.
 * NOTE(review): the value returned when nothing matches is not visible in
 * this listing, and no check that the payload holds at least 4 bytes is
 * visible either - confirm.
 */
344 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
346 while (RTA_OK(attr,attrlen)) {
347 if (attr->rta_type == type)
348 return *(u32*)RTA_DATA(attr);
349 attr = RTA_NEXT(attr, attrlen);
/*
 * Walk the rtnexthop records stored in the payload of @rta.  Each
 * iteration needs at least sizeof(struct rtnexthop) bytes left, subtracts
 * the record's rtnh_len from the remaining length and detects an overrun
 * when that goes negative.
 * NOTE(review): the return type, the counter and the return statements are
 * missing from this listing.  No lower bound on rtnh_len is visible here -
 * confirm that a too-small rtnh_len cannot stall or misparse the walk.
 */
355 fib_count_nexthops(struct rtattr *rta)
358 struct rtnexthop *nhp = RTA_DATA(rta);
359 int nhlen = RTA_PAYLOAD(rta);
361 while (nhlen >= (int)sizeof(struct rtnexthop)) {
362 if ((nhlen -= nhp->rtnh_len) < 0)
365 nhp = RTNH_NEXT(nhp);
/*
 * Fill the nexthops of @fi from the rtnexthop records in @rta's payload,
 * one record per nexthop, in order.
 * Per nexthop: nh_flags takes everything above the low byte from
 * r->rtm_flags and the low byte from the record; nh_oif is the record's
 * ifindex; nh_weight is rtnh_hops + 1; nh_gw comes from a nested
 * RTA_GATEWAY attribute and, with CONFIG_NET_CLS_ROUTE, nh_tclassid from a
 * nested RTA_FLOW attribute.
 * Each iteration first checks that a full rtnexthop header fits and that
 * the record's rtnh_len does not run past the remaining payload.
 * NOTE(review): the return type and return statements are missing here.
 */
371 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
373 struct rtnexthop *nhp = RTA_DATA(rta);
374 int nhlen = RTA_PAYLOAD(rta);
376 change_nexthops(fi) {
377 int attrlen = nhlen - sizeof(struct rtnexthop);
378 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
380 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
381 nh->nh_oif = nhp->rtnh_ifindex;
382 nh->nh_weight = nhp->rtnh_hops + 1;
384 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
385 #ifdef CONFIG_NET_CLS_ROUTE
386 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
389 nhp = RTNH_NEXT(nhp);
390 } endfor_nexthops(fi);
/*
 * Test whether the route request described by @rta matches a fib_info
 * (referred to as "fi" in the body).
 * Visible checks:
 *  - a priority given in the request is compared with fi->fib_priority;
 *  - when an oif or gateway is given, each one that is present is compared
 *    with the first nexthop of fi;
 *  - in multipath builds, when rta_mp is present, its rtnexthop records are
 *    compared with the nexthops of fi in order: a non-zero ifindex must
 *    equal nh_oif, a non-zero gateway must equal nh_gw and, with
 *    CONFIG_NET_CLS_ROUTE, a non-zero flow must equal nh_tclassid.  Zero
 *    values in the request therefore act as wildcards.
 * NOTE(review): the tail of the parameter list, the loop opener and every
 * return statement are missing from this listing.
 */
396 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
399 #ifdef CONFIG_IP_ROUTE_MULTIPATH
400 struct rtnexthop *nhp;
404 if (rta->rta_priority &&
405 *rta->rta_priority != fi->fib_priority)
408 if (rta->rta_oif || rta->rta_gw) {
409 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
410 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
415 #ifdef CONFIG_IP_ROUTE_MULTIPATH
416 if (rta->rta_mp == NULL)
418 nhp = RTA_DATA(rta->rta_mp);
419 nhlen = RTA_PAYLOAD(rta->rta_mp);
422 int attrlen = nhlen - sizeof(struct rtnexthop);
425 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
427 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
430 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
431 if (gw && gw != nh->nh_gw)
433 #ifdef CONFIG_NET_CLS_ROUTE
434 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
435 if (gw && gw != nh->nh_tclassid)
439 nhp = RTNH_NEXT(nhp);
440 } endfor_nexthops(fi);
450 Semantics of nexthop is very messy for historical reasons.
451 We have to take into account, that:
452 a) gateway can be actually local interface address,
453 so that gatewayed route is direct.
454 b) gateway must be on-link address, possibly
455 described not by an ifaddr, but also by a direct route.
456 c) If both gateway and interface are specified, they should not
458 d) If we use tunnel routes, gateway could be not on-link.
460 Attempt to reconcile all of these (alas, self-contradictory) conditions
461 results in pretty ugly and hairy code with obscure logic.
463 I chose to generalize it instead, so that the size
464 of code does not increase practically, but it becomes
466 Every prefix is assigned a "scope" value: "host" is local address,
467 "link" is direct route,
468 [ ... "site" ... "interior" ... ]
469 and "universe" is true gateway route with global meaning.
471 Every prefix refers to a set of "nexthop"s (gw, oif),
472 where gw must have narrower scope. This recursion stops
473 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
474 which means that gw is forced to be on link.
476 Code is still hairy, but now it is apparently logically
477 consistent and very flexible. F.e. as by-product it allows
478 to co-exist in peace independent exterior and interior
481 Normally it looks as following.
483 {universe prefix} -> (gw, oif) [scope link]
485 |-> {link prefix} -> (gw, oif) [scope local]
487 |-> {local prefix} (terminal node)
/*
 * Validate one nexthop @nh of @fi for the route request @r and resolve its
 * device and scope.  Three paths are visible:
 *  1. RTNH_F_ONLINK set: tests r->rtm_scope against RT_SCOPE_LINK, that the
 *     gateway is of type RTN_UNICAST, that the oif device exists and is
 *     IFF_UP, then sets nh_scope to RT_SCOPE_LINK.
 *  2. Gateway resolved by lookup: fib_lookup() on nh_gw with a scope of
 *     r->rtm_scope + 1, raised to at least RT_SCOPE_LINK.  The result must
 *     be RTN_UNICAST or RTN_LOCAL; nh_scope, nh_oif and nh_dev are taken
 *     from it, a reference on the device is held, and the device must be
 *     IFF_UP.
 *  3. Device-only nexthop: the in_device for nh_oif must be IFF_UP; nh_dev
 *     is set from it with a reference held and nh_scope becomes
 *     RT_SCOPE_HOST.
 * NOTE(review): the conditions selecting between these paths, the outcome
 * of each failed test and the release of lookup results are missing from
 * this listing.
 */
490 static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
495 struct fib_result res;
497 #ifdef CONFIG_IP_ROUTE_PERVASIVE
498 if (nh->nh_flags&RTNH_F_PERVASIVE)
501 if (nh->nh_flags&RTNH_F_ONLINK) {
502 struct net_device *dev;
504 if (r->rtm_scope >= RT_SCOPE_LINK)
506 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
508 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
510 if (!(dev->flags&IFF_UP))
514 nh->nh_scope = RT_SCOPE_LINK;
518 struct flowi fl = { .nl_u = { .ip4_u =
519 { .daddr = nh->nh_gw,
520 .scope = r->rtm_scope + 1 } },
523 /* It is not necessary, but requires a bit of thinking */
524 if (fl.fl4_scope < RT_SCOPE_LINK)
525 fl.fl4_scope = RT_SCOPE_LINK;
526 if ((err = fib_lookup(&fl, &res)) != 0)
530 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
532 nh->nh_scope = res.scope;
533 nh->nh_oif = FIB_RES_OIF(res);
534 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
536 dev_hold(nh->nh_dev);
538 if (!(nh->nh_dev->flags & IFF_UP))
545 struct in_device *in_dev;
547 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
550 in_dev = inetdev_by_index(nh->nh_oif);
553 if (!(in_dev->dev->flags&IFF_UP)) {
557 nh->nh_dev = in_dev->dev;
558 dev_hold(nh->nh_dev);
559 nh->nh_scope = RT_SCOPE_HOST;
/*
 * Bucket index into fib_info_laddrhash for the address @val: val folded
 * with shifts by 7 and 14, masked with fib_hash_size - 1.
 */
565 static inline unsigned int fib_laddr_hashfn(u32 val)
567 unsigned int mask = (fib_hash_size - 1);
569 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
/*
 * Allocate @bytes of hash-table storage with GFP_KERNEL: kmalloc() for
 * requests of at most one page, whole pages from __get_free_pages()
 * otherwise.  The memory is not cleared here; fib_create_info() zeroes it
 * with memset() afterwards.
 */
572 static struct hlist_head *fib_hash_alloc(int bytes)
574 if (bytes <= PAGE_SIZE)
575 return kmalloc(bytes, GFP_KERNEL);
577 return (struct hlist_head *)
578 __get_free_pages(GFP_KERNEL, get_order(bytes));
/*
 * Release a table obtained from fib_hash_alloc().  @bytes must be the size
 * that was requested, because it selects the release path: tables larger
 * than one page go back through free_pages() with the same get_order()
 * computation used at allocation time.
 * NOTE(review): the body of the small-table branch and any NULL check are
 * missing from this listing; fib_create_info() calls this with pointers
 * that may be NULL.
 */
581 static void fib_hash_free(struct hlist_head *hash, int bytes)
586 if (bytes <= PAGE_SIZE)
589 free_pages((unsigned long) hash, get_order(bytes));
/*
 * Rehash every fib_info into the larger tables @new_info_hash and
 * @new_laddrhash, which hold @new_size buckets each, then free the old
 * tables.
 * The rehash runs under fib_info_lock held for writing.  fib_hash_size is
 * switched to @new_size before the loops so that fib_info_hashfn() and
 * fib_laddr_hashfn() already produce indices for the new tables, while the
 * loops still iterate over the old bucket count (old_size).
 */
592 static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash,
594 unsigned int new_size)
596 struct hlist_head *old_info_hash, *old_laddrhash;
597 unsigned int old_size = fib_hash_size;
598 unsigned int i, bytes;
600 write_lock(&fib_info_lock);
601 old_info_hash = fib_info_hash;
602 old_laddrhash = fib_info_laddrhash;
603 fib_hash_size = new_size;
/* fib_info_hash still points at the old table until after this loop. */
605 for (i = 0; i < old_size; i++) {
606 struct hlist_head *head = &fib_info_hash[i];
607 struct hlist_node *node, *n;
610 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
611 struct hlist_head *dest;
612 unsigned int new_hash;
614 hlist_del(&fi->fib_hash);
616 new_hash = fib_info_hashfn(fi);
617 dest = &new_info_hash[new_hash];
618 hlist_add_head(&fi->fib_hash, dest);
621 fib_info_hash = new_info_hash;
/* Same move for the table keyed by preferred source address. */
623 for (i = 0; i < old_size; i++) {
624 struct hlist_head *lhead = &fib_info_laddrhash[i];
625 struct hlist_node *node, *n;
628 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
629 struct hlist_head *ldest;
630 unsigned int new_hash;
632 hlist_del(&fi->fib_lhash);
634 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
635 ldest = &new_laddrhash[new_hash];
636 hlist_add_head(&fi->fib_lhash, ldest);
639 fib_info_laddrhash = new_laddrhash;
641 write_unlock(&fib_info_lock);
/*
 * The old tables are freed outside the lock.
 * NOTE(review): the size uses sizeof(struct hlist_head *) rather than
 * sizeof(struct hlist_head); fib_create_info() computes its allocation
 * size the same way, so the two agree - confirm both sizes are equal.
 */
643 bytes = old_size * sizeof(struct hlist_head *);
644 fib_hash_free(old_info_hash, bytes);
645 fib_hash_free(old_laddrhash, bytes);
/*
 * Build a fib_info for the route request (@r, @rta), or find an
 * equivalent existing one.
 * @r:    rtmsg header of the request (type, scope, protocol, flags).
 * @rta:  parsed attribute pointers (priority, metrics, prefsrc, oif,
 *        gateway, flow, multipath, ...); absent attributes are NULL.
 * @nlh:  netlink header of the request.
 * @errp: error output (its use is not visible in this listing).
 * Visible stages, in order: scope sanity check against fib_props; nexthop
 * counting and multipath-algorithm validation; hash-table growth;
 * allocation and zeroing of the record; copying of protocol, flags,
 * priority, metrics and preferred source; nexthop setup from either the
 * multipath attribute or the single oif/gateway/flow attributes; per-scope
 * nexthop validation; preferred-source validation; lookup of an equivalent
 * record with fib_find_info(); and finally insertion into the three hash
 * tables under fib_info_lock.
 * NOTE(review): the return type, all error exits, the reuse path after
 * fib_find_info() and the final return are missing from this listing.
 */
649 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
650 const struct nlmsghdr *nlh, int *errp)
653 struct fib_info *fi = NULL;
654 struct fib_info *ofi;
655 #ifdef CONFIG_IP_ROUTE_MULTIPATH
660 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
661 u32 mp_alg = IP_MP_ALG_NONE;
/*
 * NOTE(review): r->rtm_type indexes fib_props here and no range check on
 * it is visible in this listing - confirm it is validated beforehand.
 */
664 /* Fast check to catch the most weird cases */
665 if (fib_props[r->rtm_type].scope > r->rtm_scope)
668 #ifdef CONFIG_IP_ROUTE_MULTIPATH
670 nhs = fib_count_nexthops(rta->rta_mp);
675 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
676 if (rta->rta_mp_alg) {
677 mp_alg = *rta->rta_mp_alg;
679 if (mp_alg < IP_MP_ALG_NONE ||
680 mp_alg > IP_MP_ALG_MAX)
/*
 * Grow both hash tables to twice the bucket count once there are at least
 * as many records as buckets.  If either allocation fails both are handed
 * to fib_hash_free(); otherwise the new tables are zeroed and populated by
 * fib_hash_move().
 */
686 if (fib_info_cnt >= fib_hash_size) {
687 unsigned int new_size = fib_hash_size << 1;
688 struct hlist_head *new_info_hash;
689 struct hlist_head *new_laddrhash;
694 bytes = new_size * sizeof(struct hlist_head *);
695 new_info_hash = fib_hash_alloc(bytes);
696 new_laddrhash = fib_hash_alloc(bytes);
697 if (!new_info_hash || !new_laddrhash) {
698 fib_hash_free(new_info_hash, bytes);
699 fib_hash_free(new_laddrhash, bytes);
701 memset(new_info_hash, 0, bytes);
702 memset(new_laddrhash, 0, bytes);
704 fib_hash_move(new_info_hash, new_laddrhash, new_size);
/* The record is allocated with nhs fib_nh entries directly behind it. */
711 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
715 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
717 fi->fib_protocol = r->rtm_protocol;
720 change_nexthops(fi) {
722 } endfor_nexthops(fi)
724 fi->fib_flags = r->rtm_flags;
725 if (rta->rta_priority)
726 fi->fib_priority = *rta->rta_priority;
/*
 * Metrics: each nested attribute of type T is stored in fib_metrics[T-1];
 * types above RTAX_MAX are tested for explicitly.
 */
728 int attrlen = RTA_PAYLOAD(rta->rta_mx);
729 struct rtattr *attr = RTA_DATA(rta->rta_mx);
731 while (RTA_OK(attr, attrlen)) {
732 unsigned flavor = attr->rta_type;
734 if (flavor > RTAX_MAX)
736 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
738 attr = RTA_NEXT(attr, attrlen);
741 if (rta->rta_prefsrc)
742 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
/*
 * Multipath request: nexthops come from the multipath attribute, and any
 * top-level oif / gateway / flow attribute is compared with the first
 * parsed nexthop.
 */
745 #ifdef CONFIG_IP_ROUTE_MULTIPATH
746 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
748 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
750 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
752 #ifdef CONFIG_NET_CLS_ROUTE
753 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
/* Single-nexthop request: fill the first nexthop from the attributes. */
760 struct fib_nh *nh = fi->fib_nh;
762 nh->nh_oif = *rta->rta_oif;
764 memcpy(&nh->nh_gw, rta->rta_gw, 4);
765 #ifdef CONFIG_NET_CLS_ROUTE
767 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
769 nh->nh_flags = r->rtm_flags;
770 #ifdef CONFIG_IP_ROUTE_MULTIPATH
775 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
776 fi->fib_mp_alg = mp_alg;
/* Route types that carry an error are tested for gateway/oif/multipath. */
779 if (fib_props[r->rtm_type].error) {
780 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
785 if (r->rtm_scope > RT_SCOPE_HOST)
788 if (r->rtm_scope == RT_SCOPE_HOST) {
789 struct fib_nh *nh = fi->fib_nh;
791 /* Local address is added. */
792 if (nhs != 1 || nh->nh_gw)
794 nh->nh_scope = RT_SCOPE_NOWHERE;
795 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
797 if (nh->nh_dev == NULL)
/* Other scopes: every nexthop is validated by fib_check_nh(). */
800 change_nexthops(fi) {
801 if ((err = fib_check_nh(r, fi, nh)) != 0)
803 } endfor_nexthops(fi)
/*
 * A preferred source is checked with inet_addr_type() unless the route is
 * RTN_LOCAL and its destination equals that address.
 */
806 if (fi->fib_prefsrc) {
807 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
808 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
809 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
814 if ((ofi = fib_find_info(fi)) != NULL) {
/*
 * New record: take a client reference and, under the write lock, link it
 * into fib_info_hash, into fib_info_laddrhash when it has a preferred
 * source, and link each nexthop into fib_info_devhash by device ifindex.
 */
822 atomic_inc(&fi->fib_clntref);
823 write_lock(&fib_info_lock);
824 hlist_add_head(&fi->fib_hash,
825 &fib_info_hash[fib_info_hashfn(fi)]);
826 if (fi->fib_prefsrc) {
827 struct hlist_head *head;
829 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
830 hlist_add_head(&fi->fib_lhash, head);
832 change_nexthops(fi) {
833 struct hlist_head *head;
838 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
839 head = &fib_info_devhash[hash];
840 hlist_add_head(&nh->nh_hash, head);
841 } endfor_nexthops(fi)
842 write_unlock(&fib_info_lock);
/*
 * Pick the alias from @head that serves the flow @flp and describe it in
 * @res.
 * Visible per-alias steps: compare fa_tos with the flow's TOS; test
 * fa_scope against the flow's scope; mark the alias FA_S_ACCESSED; fetch
 * the error code for the alias type from fib_props; test the fib_info for
 * RTNH_F_DEAD; then scan its nexthops for one that is not dead and whose
 * nh_oif equals flp->oif when the flow names an output interface.
 * On a match @res receives prefixlen, nh_sel, type, scope and the fib_info
 * (plus the network under CONFIG_IP_ROUTE_MULTIPATH_CACHED), and a client
 * reference on the fib_info is taken.
 * NOTE(review): the tail of the parameter list, the switch cases, the
 * control flow between these steps and the return values are missing from
 * this listing.
 */
857 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
858 struct fib_result *res, __u32 zone, __u32 mask,
861 struct fib_alias *fa;
864 list_for_each_entry(fa, head, fa_list) {
868 fa->fa_tos != flp->fl4_tos)
871 if (fa->fa_scope < flp->fl4_scope)
874 fa->fa_state |= FA_S_ACCESSED;
876 err = fib_props[fa->fa_type].error;
878 struct fib_info *fi = fa->fa_info;
880 if (fi->fib_flags & RTNH_F_DEAD)
883 switch (fa->fa_type) {
890 if (nh->nh_flags&RTNH_F_DEAD)
892 if (!flp->oif || flp->oif == nh->nh_oif)
895 #ifdef CONFIG_IP_ROUTE_MULTIPATH
896 if (nhsel < fi->fib_nhs) {
909 printk(KERN_DEBUG "impossible 102\n");
918 res->prefixlen = prefixlen;
919 res->nh_sel = nh_sel;
920 res->type = fa->fa_type;
921 res->scope = fa->fa_scope;
922 res->fi = fa->fa_info;
923 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
925 res->network = zone &
926 (0xFFFFFFFF >> (32 - prefixlen));
928 atomic_inc(&res->fi->fib_clntref);
932 /* Find appropriate source address to this destination */
/*
 * Delegates to inet_select_addr() with the device and gateway of the
 * selected nexthop in @res and the scope of the result.
 */
934 u32 __fib_res_prefsrc(struct fib_result *res)
936 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
/*
 * Append one route message describing @fi to @skb.
 * The rtmsg header is filled from the arguments (family AF_INET,
 * @dst_len, @tb_id, @type, @scope) and from fi (flags, protocol).
 * Attributes written when applicable: RTA_DST (when dst_len is non-zero),
 * RTA_PRIORITY, RTA_FLOW (CONFIG_NET_CLS_ROUTE), the metrics, RTA_PREFSRC,
 * and for a single nexthop RTA_GATEWAY and RTA_OIF.  In multipath builds a
 * route with more than one nexthop gets an RTA_MULTIPATH attribute holding
 * one rtnexthop record per nexthop.
 * "b" remembers skb->tail on entry: on success nlmsg_len is the number of
 * bytes added since then, and the skb_trim() on the failure path cuts the
 * skb back to that point.
 * NOTE(review): the return type, the failure labels and the return values
 * are missing from this listing; NLMSG_NEW / RTA_PUT presumably jump to
 * those labels when the skb runs out of room - confirm.
 */
940 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
941 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
942 struct fib_info *fi, unsigned int flags)
945 struct nlmsghdr *nlh;
946 unsigned char *b = skb->tail;
948 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
949 rtm = NLMSG_DATA(nlh);
950 rtm->rtm_family = AF_INET;
951 rtm->rtm_dst_len = dst_len;
952 rtm->rtm_src_len = 0;
954 rtm->rtm_table = tb_id;
955 rtm->rtm_type = type;
956 rtm->rtm_flags = fi->fib_flags;
957 rtm->rtm_scope = scope;
958 if (rtm->rtm_dst_len)
959 RTA_PUT(skb, RTA_DST, 4, dst);
960 rtm->rtm_protocol = fi->fib_protocol;
961 if (fi->fib_priority)
962 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
963 #ifdef CONFIG_NET_CLS_ROUTE
964 if (fi->fib_nh[0].nh_tclassid)
965 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
967 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
970 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
971 if (fi->fib_nhs == 1) {
972 if (fi->fib_nh->nh_gw)
973 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
974 if (fi->fib_nh->nh_oif)
975 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
977 #ifdef CONFIG_IP_ROUTE_MULTIPATH
978 if (fi->fib_nhs > 1) {
979 struct rtnexthop *nhp;
980 struct rtattr *mp_head;
981 if (skb_tailroom(skb) <= RTA_SPACE(0))
/* Reserve the attribute header now; type and length are set after the loop. */
983 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
986 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
988 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
989 nhp->rtnh_flags = nh->nh_flags & 0xFF;
/* Inverse of fib_get_nhs(), which stores rtnh_hops + 1 as the weight. */
990 nhp->rtnh_hops = nh->nh_weight-1;
991 nhp->rtnh_ifindex = nh->nh_oif;
993 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
994 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
995 } endfor_nexthops(fi);
996 mp_head->rta_type = RTA_MULTIPATH;
997 mp_head->rta_len = skb->tail - (u8*)mp_head;
1000 nlh->nlmsg_len = skb->tail - b;
1005 skb_trim(skb, b - skb->data);
1009 #ifndef CONFIG_IP_NOSIOCRT
/*
 * Translate an old-style routing ioctl request @r (struct rtentry) into
 * the netlink form used by the rest of this file: header @nl, route
 * message @rtm and attribute pointers @rta.  @rtm and @rta are zeroed
 * first.
 * Visible behaviour:
 *  - only AF_INET destinations are accepted (-EAFNOSUPPORT otherwise);
 *  - unless RTF_HOST is set the prefix length is derived from rt_genmask,
 *    which is validated with bad_mask();
 *  - SIOCDELRT becomes RTM_DELROUTE, anything else RTM_NEWROUTE with
 *    NLM_F_CREATE and protocol RTPROT_BOOT;
 *  - the priority is rt_metric - 1, stored in the rt_pad3 field of @r and
 *    referenced from rta->rta_priority, so @r must outlive @rta;
 *  - RTF_REJECT selects RTN_UNREACHABLE with host scope, otherwise
 *    RTN_UNICAST;
 *  - a device name is copied from user space (IFNAMSIZ - 1 bytes, then
 *    NUL-terminated), searched for ':' and resolved with
 *    __dev_get_by_name(); an address label match supplies rta_prefsrc;
 *  - an AF_INET gateway with RTF_GATEWAY that is of type RTN_UNICAST
 *    raises the scope to RT_SCOPE_UNIVERSE; an unset scope falls back to
 *    RT_SCOPE_LINK;
 *  - RTF_MTU / RTF_WINDOW / RTF_IRTT are packed into a kmalloc'ed
 *    RTA_METRICS attribute as RTAX_ADVMSS (rt_mtu - 40), RTAX_WINDOW and
 *    RTAX_RTT (rt_irtt << 3).
 * NOTE(review): the return type, several error exits, the attachment of
 * the metrics buffer to @rta and its ownership are missing from this
 * listing.
 */
1012 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1013 struct kern_rta *rta, struct rtentry *r)
1018 memset(rtm, 0, sizeof(*rtm));
1019 memset(rta, 0, sizeof(*rta));
1021 if (r->rt_dst.sa_family != AF_INET)
1022 return -EAFNOSUPPORT;
1024 /* Check mask for validity:
1025 a) it must be contiguous.
1026 b) destination must have all host bits clear.
1027 c) if application forgot to set correct family (AF_INET),
1028 reject request unless it is absolutely clear i.e.
1029 both family and mask are zero.
1032 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1033 if (!(r->rt_flags&RTF_HOST)) {
1034 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1035 if (r->rt_genmask.sa_family != AF_INET) {
1036 if (mask || r->rt_genmask.sa_family)
1037 return -EAFNOSUPPORT;
1039 if (bad_mask(mask, *ptr))
1041 plen = inet_mask_len(mask);
1044 nl->nlmsg_flags = NLM_F_REQUEST;
1045 nl->nlmsg_pid = current->pid;
1047 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1048 if (cmd == SIOCDELRT) {
1049 nl->nlmsg_type = RTM_DELROUTE;
1050 nl->nlmsg_flags = 0;
1052 nl->nlmsg_type = RTM_NEWROUTE;
1053 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1054 rtm->rtm_protocol = RTPROT_BOOT;
1057 rtm->rtm_dst_len = plen;
1061 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1062 rta->rta_priority = (u32*)&r->rt_pad3;
1064 if (r->rt_flags&RTF_REJECT) {
1065 rtm->rtm_scope = RT_SCOPE_HOST;
1066 rtm->rtm_type = RTN_UNREACHABLE;
1069 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1070 rtm->rtm_type = RTN_UNICAST;
1074 struct net_device *dev;
1075 char devname[IFNAMSIZ];
1077 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1079 devname[IFNAMSIZ-1] = 0;
1080 colon = strchr(devname, ':');
1083 dev = __dev_get_by_name(devname);
1086 rta->rta_oif = &dev->ifindex;
1088 struct in_ifaddr *ifa;
1089 struct in_device *in_dev = __in_dev_get(dev);
1093 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1094 if (strcmp(ifa->ifa_label, devname) == 0)
1098 rta->rta_prefsrc = &ifa->ifa_local;
1102 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1103 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1105 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1106 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1109 if (cmd == SIOCDELRT)
1112 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1115 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1116 rtm->rtm_scope = RT_SCOPE_LINK;
1118 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1120 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1124 mx->rta_type = RTA_METRICS;
1125 mx->rta_len = RTA_LENGTH(0);
1126 if (r->rt_flags&RTF_MTU) {
1127 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1128 rec->rta_type = RTAX_ADVMSS;
1129 rec->rta_len = RTA_LENGTH(4);
1130 mx->rta_len += RTA_LENGTH(4);
1131 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1133 if (r->rt_flags&RTF_WINDOW) {
1134 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1135 rec->rta_type = RTAX_WINDOW;
1136 rec->rta_len = RTA_LENGTH(4);
1137 mx->rta_len += RTA_LENGTH(4);
1138 *(u32*)RTA_DATA(rec) = r->rt_window;
1140 if (r->rt_flags&RTF_IRTT) {
1141 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1142 rec->rta_type = RTAX_RTT;
1143 rec->rta_len = RTA_LENGTH(4);
1144 mx->rta_len += RTA_LENGTH(4);
1145 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1155 - local address disappeared -> we must delete all the entries
1157 - device went down -> we must shutdown all nexthops going via it.
/*
 * Mark routes dead after a local address (@local) disappeared and/or a
 * device (@dev) went down.
 * Address part: when @local is non-zero and the address hash exists, every
 * fib_info in the matching bucket whose fib_prefsrc equals @local gets
 * RTNH_F_DEAD set in fib_flags.
 * Device part: the fib_info_devhash bucket for dev->ifindex is scanned;
 * for each parent fib_info with a nexthop on @dev (an immediate repeat of
 * the previous parent is skipped via prev_fi) all nexthops are walked.  A
 * live nexthop on @dev whose nh_scope differs from "scope" is flagged
 * RTNH_F_DEAD and, in multipath builds, its nh_power is subtracted from
 * fib_power under fib_multipath_lock.  When the dead count equals fib_nhs
 * the whole fib_info is flagged RTNH_F_DEAD.
 * NOTE(review): how @force changes "scope" (initially RT_SCOPE_NOWHERE),
 * the counting of dead nexthops, the body of the "force > 1" branch and
 * the return value are missing from this listing.
 */
1160 int fib_sync_down(u32 local, struct net_device *dev, int force)
1163 int scope = RT_SCOPE_NOWHERE;
1168 if (local && fib_info_laddrhash) {
1169 unsigned int hash = fib_laddr_hashfn(local);
1170 struct hlist_head *head = &fib_info_laddrhash[hash];
1171 struct hlist_node *node;
1172 struct fib_info *fi;
1174 hlist_for_each_entry(fi, node, head, fib_lhash) {
1175 if (fi->fib_prefsrc == local) {
1176 fi->fib_flags |= RTNH_F_DEAD;
1183 struct fib_info *prev_fi = NULL;
1184 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1185 struct hlist_head *head = &fib_info_devhash[hash];
1186 struct hlist_node *node;
1189 hlist_for_each_entry(nh, node, head, nh_hash) {
1190 struct fib_info *fi = nh->nh_parent;
1193 BUG_ON(!fi->fib_nhs);
1194 if (nh->nh_dev != dev || fi == prev_fi)
1198 change_nexthops(fi) {
1199 if (nh->nh_flags&RTNH_F_DEAD)
1201 else if (nh->nh_dev == dev &&
1202 nh->nh_scope != scope) {
1203 nh->nh_flags |= RTNH_F_DEAD;
1204 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1205 spin_lock_bh(&fib_multipath_lock);
1206 fi->fib_power -= nh->nh_power;
1208 spin_unlock_bh(&fib_multipath_lock);
1212 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1213 if (force > 1 && nh->nh_dev == dev) {
1218 } endfor_nexthops(fi)
1219 if (dead == fi->fib_nhs) {
1220 fi->fib_flags |= RTNH_F_DEAD;
1229 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1232 Dead device goes up. We wake up dead nexthops.
1233 It makes sense only on multipath routes.
/*
 * Revive nexthops that go through @dev after the device came up; the
 * function first tests that @dev is IFF_UP.
 * The fib_info_devhash bucket for dev->ifindex is scanned; for each parent
 * fib_info with a nexthop on @dev (an immediate repeat of the previous
 * parent is skipped via prev_fi) all nexthops are walked.  A nexthop that
 * is flagged dead, has a device that is up, is on @dev and whose device
 * has IPv4 configuration (__in_dev_get() non-NULL) gets RTNH_F_DEAD
 * cleared under fib_multipath_lock.  RTNH_F_DEAD is also cleared on the
 * fib_info itself.
 * NOTE(review): the condition guarding the fib_info flag clear, the
 * counting of live nexthops, the nh_power reset and the return value are
 * missing from this listing.
 */
1236 int fib_sync_up(struct net_device *dev)
1238 struct fib_info *prev_fi;
1240 struct hlist_head *head;
1241 struct hlist_node *node;
1245 if (!(dev->flags&IFF_UP))
1249 hash = fib_devindex_hashfn(dev->ifindex);
1250 head = &fib_info_devhash[hash];
1253 hlist_for_each_entry(nh, node, head, nh_hash) {
1254 struct fib_info *fi = nh->nh_parent;
1257 BUG_ON(!fi->fib_nhs);
1258 if (nh->nh_dev != dev || fi == prev_fi)
1263 change_nexthops(fi) {
1264 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1268 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1270 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1273 spin_lock_bh(&fib_multipath_lock);
1275 nh->nh_flags &= ~RTNH_F_DEAD;
1276 spin_unlock_bh(&fib_multipath_lock);
1277 } endfor_nexthops(fi)
1280 fi->fib_flags &= ~RTNH_F_DEAD;
1289 The algorithm is suboptimal, but it provides really
1290 fair weighted route distribution.
/*
 * Choose one nexthop of res->fi for the flow @flp and store its index in
 * res->nh_sel.  Everything runs under fib_multipath_lock.
 * When fib_power is used up (<= 0) it is refilled: every live nexthop gets
 * nh_power = nh_weight and fib_power becomes the sum of those weights.
 * A pseudo-random value w = jiffies % fib_power is then walked down the
 * live nexthops that still have power, subtracting each nh_power; the
 * first nexthop that brings w to zero or below is selected.
 * NOTE(review): the handling of a refill that yields no power (which the
 * modulo relies on), the decrement of the chosen nexthop's power and the
 * fall-through result when no nexthop is selected are missing from this
 * listing.
 */
1293 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1295 struct fib_info *fi = res->fi;
1298 spin_lock_bh(&fib_multipath_lock);
1299 if (fi->fib_power <= 0) {
1301 change_nexthops(fi) {
1302 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1303 power += nh->nh_weight;
1304 nh->nh_power = nh->nh_weight;
1306 } endfor_nexthops(fi);
1307 fi->fib_power = power;
1309 spin_unlock_bh(&fib_multipath_lock);
1310 /* Race condition: route has just become dead. */
1317 /* w should be random number [0..fi->fib_power-1],
1318 it is pretty bad approximation.
1321 w = jiffies % fi->fib_power;
1323 change_nexthops(fi) {
1324 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1325 if ((w -= nh->nh_power) <= 0) {
1328 res->nh_sel = nhsel;
1329 spin_unlock_bh(&fib_multipath_lock);
1333 } endfor_nexthops(fi);
1335 /* Race condition: route has just become dead. */
1337 spin_unlock_bh(&fib_multipath_lock);