2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/inetdevice.h>
33 #include <linux/netdevice.h>
34 #include <linux/if_arp.h>
35 #include <linux/proc_fs.h>
36 #include <linux/skbuff.h>
37 #include <linux/netlink.h>
38 #include <linux/init.h>
42 #include <net/protocol.h>
43 #include <net/route.h>
46 #include <net/ip_fib.h>
47 #include <net/ip_mp_alg.h>
49 #include "fib_lookup.h"
// File-scope state shared by the fib_info bookkeeping in this file.
// FSprintk is defined with an empty expansion, so any use of it compiles to
// nothing.
51 #define FSprintk(a...)
// Reader/writer lock taken around every visible access to the hash tables
// below (read_lock in ip_fib_check_default, write_lock in the insert, remove
// and rehash paths).
53 static DEFINE_RWLOCK(fib_info_lock);
// Two dynamically sized bucket arrays. fib_info_hash is indexed with
// fib_info_hashfn and fib_info_laddrhash with fib_laddr_hashfn applied to
// fib_prefsrc; both hash functions mask with fib_hash_size - 1.
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
// Compared against fib_hash_size in fib_create_info to decide when the two
// tables above are grown.
57 static unsigned int fib_info_cnt;
// Fixed-size table of nexthops (linked through nh_hash), bucketed by
// fib_devindex_hashfn of the device ifindex; 1 << 8 = 256 buckets.
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
// Nexthop iteration helpers.
// for_nexthops(fi) and change_nexthops(fi) each open a brace scope that
// declares nhsel (index) and nh (current nexthop) and then start a for loop;
// endfor_nexthops(fi) expands to the closing brace of that scope, so an
// opener must always be paired with endfor_nexthops.
// for_nexthops yields a const nexthop pointer; change_nexthops casts to a
// writable one.
63 #ifdef CONFIG_IP_ROUTE_MULTIPATH
// Spinlock used (with bottom halves disabled) around the fib_power and
// nh_power updates later in this file.
65 static DEFINE_SPINLOCK(fib_multipath_lock);
// Multipath build: visit all (fi)->fib_nhs nexthops.
67 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
68 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
70 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
71 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
73 #else /* CONFIG_IP_ROUTE_MULTIPATH */
// Non-multipath build: the loop runs exactly once, on the first nexthop.
75 /* Hope, that gcc will optimize it to get rid of dummy loop */
77 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
78 for (nhsel=0; nhsel < 1; nhsel++)
80 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
81 for (nhsel=0; nhsel < 1; nhsel++)
83 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
85 #define endfor_nexthops(fi) }
// Table of per-entry properties; the fields visible here are .error and
// .scope. The trailing comments label entries with RTN_* route types, and the
// table is indexed by route type elsewhere in this file
// (fib_props[r->rtm_type] and fib_props[fa->fa_type]).
// NOTE(review): the array is sized RTA_MAX + 1 although it is indexed by a
// route type; confirm whether RTN_MAX + 1 was intended.
// NOTE(review): the head of this declaration and most of the initializer
// lines (including most .error values) are not present in this listing.
92 } fib_props[RTA_MAX + 1] = {
95 .scope = RT_SCOPE_NOWHERE,
99 .scope = RT_SCOPE_UNIVERSE,
103 .scope = RT_SCOPE_HOST,
107 .scope = RT_SCOPE_LINK,
108 }, /* RTN_BROADCAST */
111 .scope = RT_SCOPE_LINK,
115 .scope = RT_SCOPE_UNIVERSE,
116 }, /* RTN_MULTICAST */
119 .scope = RT_SCOPE_UNIVERSE,
120 }, /* RTN_BLACKHOLE */
122 .error = -EHOSTUNREACH,
123 .scope = RT_SCOPE_UNIVERSE,
124 }, /* RTN_UNREACHABLE */
127 .scope = RT_SCOPE_UNIVERSE,
128 }, /* RTN_PROHIBIT */
131 .scope = RT_SCOPE_UNIVERSE,
135 .scope = RT_SCOPE_NOWHERE,
139 .scope = RT_SCOPE_NOWHERE,
140 }, /* RTN_XRESOLVE */
144 /* Release a nexthop info record */
// @fi: record to release.
// If fi->fib_dead is still 0 the record is treated as alive and a diagnostic
// is printed. The function then walks the nexthops of fi.
// NOTE(review): the statements inside the nexthop loop, whatever follows the
// diagnostic, and the final release of fi are not present in this listing.
// NOTE(review): the printk carries no KERN_* level prefix; confirm that is
// intended.
146 void free_fib_info(struct fib_info *fi)
148 if (fi->fib_dead == 0) {
149 printk("Freeing alive fib_info %p\n", fi);
152 change_nexthops(fi) {
156 } endfor_nexthops(fi);
// Drops one tree reference on fi while holding the fib_info_lock write lock.
// @fi: record to release; may be NULL, in which case only the lock is taken
//      and released.
// When fib_treeref reaches zero the record is unlinked from the fib_hash
// list, from the fib_lhash list, and each of its nexthops is unlinked from
// its nh_hash list.
// NOTE(review): lines are missing between the visible statements, so some of
// these unlinks may be conditional and further teardown may follow.
161 void fib_release_info(struct fib_info *fi)
163 write_lock(&fib_info_lock);
164 if (fi && --fi->fib_treeref == 0) {
165 hlist_del(&fi->fib_hash);
167 hlist_del(&fi->fib_lhash);
168 change_nexthops(fi) {
171 hlist_del(&nh->nh_hash);
172 } endfor_nexthops(fi)
176 write_unlock(&fib_info_lock);
// Compares the nexthops of fi against those of ofi; onh starts at the first
// nexthop of ofi.
// Fields compared per nexthop: nh_oif, nh_gw, nh_scope, nh_weight (multipath
// builds only), nh_tclassid (CONFIG_NET_CLS_ROUTE builds only) and nh_flags
// with RTNH_F_DEAD masked out.
// fib_find_info treats a result of 0 as a match.
// NOTE(review): the return statements and the step that advances onh are not
// present in this listing.
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
181 const struct fib_nh *onh = ofi->fib_nh;
184 if (nh->nh_oif != onh->nh_oif ||
185 nh->nh_gw != onh->nh_gw ||
186 nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188 nh->nh_weight != onh->nh_weight ||
190 #ifdef CONFIG_NET_CLS_ROUTE
191 nh->nh_tclassid != onh->nh_tclassid ||
193 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
196 } endfor_nexthops(fi);
// Computes the fib_info_hash bucket index for fi.
// XORs together fib_nhs, fib_protocol, fib_prefsrc and fib_priority, folds
// the value with two right shifts (7 and 12) and masks with
// fib_hash_size - 1.
// NOTE(review): the mask only selects every bucket if fib_hash_size is a
// power of two; fib_create_info grows it by doubling, but the initial value
// is not visible here.
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
202 unsigned int mask = (fib_hash_size - 1);
203 unsigned int val = fi->fib_nhs;
205 val ^= fi->fib_protocol;
206 val ^= fi->fib_prefsrc;
207 val ^= fi->fib_priority;
209 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
// Searches the fib_info_hash bucket selected by fib_info_hashfn(nfi) for an
// existing record equivalent to nfi.
// Each candidate is first tested on fib_nhs; the full comparison then
// requires equal fib_protocol, fib_prefsrc and fib_priority, a byte-identical
// fib_metrics array, equal fib_flags apart from RTNH_F_DEAD, and (when there
// are nexthops) nh_comp returning 0.
// NOTE(review): the action taken on a fib_nhs mismatch (presumably skipping
// the candidate) and the return statements are not present in this listing.
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
214 struct hlist_head *head;
215 struct hlist_node *node;
219 hash = fib_info_hashfn(nfi);
220 head = &fib_info_hash[hash];
222 hlist_for_each_entry(fi, node, head, fib_hash) {
223 if (fi->fib_nhs != nfi->fib_nhs)
225 if (nfi->fib_protocol == fi->fib_protocol &&
226 nfi->fib_prefsrc == fi->fib_prefsrc &&
227 nfi->fib_priority == fi->fib_priority &&
228 memcmp(nfi->fib_metrics, fi->fib_metrics,
229 sizeof(fi->fib_metrics)) == 0 &&
230 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
// Maps val to a bucket index of fib_info_devhash; every caller in this file
// passes a device ifindex.
// The visible part of the expression XORs val shifted right by
// DEVINDEX_HASHBITS and by twice that, then masks to DEVINDEX_HASHSIZE - 1.
// NOTE(review): the first line of the return expression is not present in
// this listing.
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
240 unsigned int mask = DEVINDEX_HASHSIZE - 1;
243 (val >> DEVINDEX_HASHBITS) ^
244 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
247 /* Check that the gateway is already configured.
248 Used only by the redirect accept routine.
// Scans the fib_info_devhash bucket for dev->ifindex, under the
// fib_info_lock read lock, for a nexthop that is bound to dev and is not
// flagged RTNH_F_DEAD.
// @gw:  gateway address to look for.
// @dev: device whose nexthops are examined.
// The read lock is released both on the match path and after the loop.
// NOTE(review): one line of the match condition is missing from this listing
// (presumably the comparison of nh_gw with gw), as are the return statements.
251 int ip_fib_check_default(u32 gw, struct net_device *dev)
253 struct hlist_head *head;
254 struct hlist_node *node;
258 read_lock(&fib_info_lock);
260 hash = fib_devindex_hashfn(dev->ifindex);
261 head = &fib_info_devhash[hash];
262 hlist_for_each_entry(nh, node, head, nh_hash) {
263 if (nh->nh_dev == dev &&
265 !(nh->nh_flags&RTNH_F_DEAD)) {
266 read_unlock(&fib_info_lock);
271 read_unlock(&fib_info_lock);
// Builds a route netlink message for event with fib_dump_info and broadcasts
// it to the RTNLGRP_IPV4_ROUTE group.
// @event: message type handed to fib_dump_info.
// @key:   destination key; its address is passed as the dst argument.
// @fa:    alias supplying type, scope and fib_info.
// @n:     request header; supplies the sequence number and flags.
// @req:   optional sender parameters; when present its pid is used instead of
//         n->nlmsg_pid.
// NOTE(review): n is dereferenced unconditionally below even though the pid
// selection suggests callers may pass only one of n and req; confirm.
// NOTE(review): the second parameter line, the allocation failure check and
// the body of the fib_dump_info failure branch are not in this listing.
276 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
278 struct nlmsghdr *n, struct netlink_skb_parms *req)
281 u32 pid = req ? req->pid : n->nlmsg_pid;
// Room for one rtmsg plus 256 bytes of attributes.
282 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
284 skb = alloc_skb(size, GFP_KERNEL);
288 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
289 fa->fa_type, fa->fa_scope, &key, z,
291 fa->fa_info, 0) < 0) {
295 NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
// With NLM_F_ECHO an extra reference is taken before the broadcast so the
// same skb can still be unicast back to pid afterwards.
296 if (n->nlmsg_flags&NLM_F_ECHO)
297 atomic_inc(&skb->users);
298 netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
299 if (n->nlmsg_flags&NLM_F_ECHO)
300 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
303 /* Return the first fib alias matching TOS with
304 * priority less than or equal to PRIO.
// Walks the alias list fah, testing fa_tos of each entry against tos and the
// fib_priority of its fib_info against prio.
// @fah:  list head of fib_alias entries linked through fa_list.
// @tos:  type-of-service value to match.
// @prio: priority bound.
// NOTE(review): the action taken when fa_tos > tos, the rest of the second
// condition and the return statements are not present in this listing.
// NOTE(review): the visible test is fib_priority >= prio, while the header
// comment above says priority less than or equal to PRIO; confirm which
// wording is intended.
306 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
309 struct fib_alias *fa;
310 list_for_each_entry(fa, fah, fa_list) {
311 if (fa->fa_tos > tos)
313 if (fa->fa_info->fib_priority >= prio ||
// Classifies a route by the ARP neighbour state of the gateway of its first
// nexthop.
// @fi:          route info whose fib_nh[0].nh_gw is looked up on fi->fib_dev.
// @order:       position of this route among the candidates.
// @last_resort: in/out fallback route pointer.
// @last_idx:    in/out index of the fallback.
// @dflt:        in/out index of the current default.
// state stays NUD_NONE unless the lookup result supplies nud_state.
// Three tests are visible: state is exactly NUD_REACHABLE; state is valid
// and order differs from *dflt; state is valid, or no fallback index has
// been recorded yet (*last_idx < 0) and order is beyond *dflt.
// NOTE(review): the result of each test, the NULL check on the lookup result
// and the updates through the pointer parameters are not in this listing.
321 int fib_detect_death(struct fib_info *fi, int order,
322 struct fib_info **last_resort, int *last_idx, int *dflt)
325 int state = NUD_NONE;
327 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
329 state = n->nud_state;
332 if (state==NUD_REACHABLE)
334 if ((state&NUD_VALID) && order != *dflt)
336 if ((state&NUD_VALID) ||
337 (*last_idx<0 && order > *dflt)) {
344 #ifdef CONFIG_IP_ROUTE_MULTIPATH
// Scans the run of attributes starting at attr (attrlen bytes long) and
// returns the first 32 bits of the payload of the first attribute whose
// rta_type equals type.
// @attr:    first attribute of the run.
// @attrlen: number of bytes available in the run.
// @type:    attribute type to look for.
// Callers in this file treat a zero result as attribute absent.
// NOTE(review): the value returned when nothing matches is not present in
// this listing.
// NOTE(review): no visible line checks that the matching attribute carries at
// least 4 bytes of payload before the u32 read; confirm RTA_OK is sufficient.
346 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
348 while (RTA_OK(attr,attrlen)) {
349 if (attr->rta_type == type)
350 return *(u32*)RTA_DATA(attr);
351 attr = RTA_NEXT(attr, attrlen);
// Walks the rtnexthop records packed in the payload of rta, consuming
// rtnh_len bytes per record for as long as at least one full rtnexthop
// header remains. fib_create_info stores the result in nhs.
// NOTE(review): the return type, the counter, the handling of an overrunning
// record and the return statements are not present in this listing.
// NOTE(review): a record with rtnh_len of 0 would not shrink nhlen; confirm
// that input validation or RTNH_NEXT prevents the loop from stalling.
357 fib_count_nexthops(struct rtattr *rta)
360 struct rtnexthop *nhp = RTA_DATA(rta);
361 int nhlen = RTA_PAYLOAD(rta);
363 while (nhlen >= (int)sizeof(struct rtnexthop)) {
364 if ((nhlen -= nhp->rtnh_len) < 0)
367 nhp = RTNH_NEXT(nhp);
// Fills the nexthop array of fi from the rtnexthop records in the payload of
// rta, one record per nexthop slot.
// @fi:  route info whose fib_nh entries are written.
// @rta: multipath attribute holding the packed rtnexthop records.
// @r:   route message; supplies the upper flag bits for every nexthop.
// fib_create_info treats a nonzero return as an error.
// NOTE(review): the return type, the action taken on a truncated record and
// the return statements are not present in this listing.
// NOTE(review): attrlen is derived from the bytes remaining in the whole
// payload rather than from rtnh_len of the current record; confirm the
// attribute scan cannot run on into the following records.
373 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
375 struct rtnexthop *nhp = RTA_DATA(rta);
376 int nhlen = RTA_PAYLOAD(rta);
378 change_nexthops(fi) {
379 int attrlen = nhlen - sizeof(struct rtnexthop);
380 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
// Low 8 flag bits come from the record, the remaining bits from the route.
382 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
383 nh->nh_oif = nhp->rtnh_ifindex;
// Weight is stored as hops + 1; fib_dump_info writes weight - 1 back out.
384 nh->nh_weight = nhp->rtnh_hops + 1;
386 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
387 #ifdef CONFIG_NET_CLS_ROUTE
388 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
391 nhp = RTNH_NEXT(nhp);
392 } endfor_nexthops(fi);
// Checks whether the nexthop description carried in a request matches fi.
// @r:   route message of the request.
// @nlh: netlink header of the request.
// @rta: parsed attributes (priority, oif, gateway, multipath list).
// The final parameter line is missing from this listing; the body refers to
// it as fi.
// Visible checks, in order:
//  - a supplied priority must equal fi->fib_priority;
//  - when an oif or gateway is supplied, each supplied one must equal the
//    corresponding field of the first nexthop of fi;
//  - in multipath builds, every rtnexthop record in rta_mp is compared with
//    the nexthop at the same position: a nonzero ifindex, gateway or (with
//    CONFIG_NET_CLS_ROUTE) flow value must equal nh_oif, nh_gw or
//    nh_tclassid. A zero value in the request acts as a wildcard.
// NOTE(review): the return values for each outcome are not present in this
// listing.
398 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
401 #ifdef CONFIG_IP_ROUTE_MULTIPATH
402 struct rtnexthop *nhp;
406 if (rta->rta_priority &&
407 *rta->rta_priority != fi->fib_priority)
410 if (rta->rta_oif || rta->rta_gw) {
411 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
412 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
417 #ifdef CONFIG_IP_ROUTE_MULTIPATH
418 if (rta->rta_mp == NULL)
420 nhp = RTA_DATA(rta->rta_mp);
421 nhlen = RTA_PAYLOAD(rta->rta_mp);
424 int attrlen = nhlen - sizeof(struct rtnexthop);
427 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
429 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
432 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
433 if (gw && gw != nh->nh_gw)
435 #ifdef CONFIG_NET_CLS_ROUTE
// The gw variable is reused here to hold the flow (tclassid) value.
436 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
437 if (gw && gw != nh->nh_tclassid)
441 nhp = RTNH_NEXT(nhp);
442 } endfor_nexthops(fi);
452 The semantics of a nexthop are very messy for historical reasons.
453 We have to take into account, that:
454 a) gateway can be actually local interface address,
455 so that gatewayed route is direct.
456 b) gateway must be on-link address, possibly
457 described not by an ifaddr, but also by a direct route.
458 c) If both gateway and interface are specified, they should not
460 d) If we use tunnel routes, gateway could be not on-link.
462 Attempt to reconcile all of these (alas, self-contradictory) conditions
463 results in pretty ugly and hairy code with obscure logic.
465 I chose to generalize it instead, so that the size
466 of the code practically does not increase, but it becomes
468 Every prefix is assigned a "scope" value: "host" is local address,
469 "link" is direct route,
470 [ ... "site" ... "interior" ... ]
471 and "universe" is true gateway route with global meaning.
473 Every prefix refers to a set of "nexthop"s (gw, oif),
474 where gw must have narrower scope. This recursion stops
475 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
476 which means that gw is forced to be on link.
478 The code is still hairy, but now it is apparently logically
479 consistent and very flexible. E.g., as a by-product it allows
480 peaceful co-existence of independent exterior and interior
483 Normally it looks as follows.
485 {universe prefix} -> (gw, oif) [scope link]
487 |-> {link prefix} -> (gw, oif) [scope local]
489 |-> {local prefix} (terminal node)
// Validates one nexthop of a route that is being created and fills in its
// nh_scope, nh_oif and nh_dev.
// @r:  route message; rtm_scope bounds the scope allowed for the gateway.
// @fi: route info that owns nh.
// @nh: nexthop to validate and complete.
// fib_create_info treats a nonzero return as an error.
// Three groups of statements are visible:
//  1. RTNH_F_ONLINK nexthops: rtm_scope is tested against RT_SCOPE_LINK, the
//     gateway address type must be RTN_UNICAST, and the nh_oif device must
//     exist and be up; nh_scope becomes RT_SCOPE_LINK.
//  2. Gateway lookup: the gateway is resolved with fib_lookup at scope
//     rtm_scope + 1, raised to at least RT_SCOPE_LINK. The result must be
//     RTN_UNICAST or RTN_LOCAL; nh_scope, nh_oif and nh_dev are taken from it
//     and a device reference is taken with dev_hold.
//  3. Lookup by nh_oif via inetdev_by_index: the device must be up; nh_dev is
//     set with a reference held and nh_scope becomes RT_SCOPE_HOST.
// NOTE(review): the conditions that choose between these groups, all error
// returns and any reference drops on failure are not present in this
// listing.
492 static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
497 struct fib_result res;
499 #ifdef CONFIG_IP_ROUTE_PERVASIVE
500 if (nh->nh_flags&RTNH_F_PERVASIVE)
503 if (nh->nh_flags&RTNH_F_ONLINK) {
504 struct net_device *dev;
506 if (r->rtm_scope >= RT_SCOPE_LINK)
508 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
510 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
512 if (!(dev->flags&IFF_UP))
516 nh->nh_scope = RT_SCOPE_LINK;
520 struct flowi fl = { .nl_u = { .ip4_u =
521 { .daddr = nh->nh_gw,
522 .scope = r->rtm_scope + 1 } },
525 /* It is not necessary, but requires a bit of thinking */
526 if (fl.fl4_scope < RT_SCOPE_LINK)
527 fl.fl4_scope = RT_SCOPE_LINK;
528 if ((err = fib_lookup(&fl, &res)) != 0)
532 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
534 nh->nh_scope = res.scope;
535 nh->nh_oif = FIB_RES_OIF(res);
536 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
538 dev_hold(nh->nh_dev);
540 if (!(nh->nh_dev->flags & IFF_UP))
547 struct in_device *in_dev;
549 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
552 in_dev = inetdev_by_index(nh->nh_oif);
555 if (!(in_dev->dev->flags&IFF_UP)) {
559 nh->nh_dev = in_dev->dev;
560 dev_hold(nh->nh_dev);
561 nh->nh_scope = RT_SCOPE_HOST;
// Computes the fib_info_laddrhash bucket index for an address value.
// @val: callers in this file pass a preferred-source address (fib_prefsrc or
//       the local address handed to fib_sync_down).
// Folds val with right shifts of 7 and 14 and masks with fib_hash_size - 1.
567 static inline unsigned int fib_laddr_hashfn(u32 val)
569 unsigned int mask = (fib_hash_size - 1);
571 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
// Allocates storage for a hash table of the given size.
// @bytes: size of the table in bytes.
// Requests of up to one page use kmalloc; larger ones take whole pages from
// __get_free_pages. fib_hash_free applies the same threshold.
// The memory is not cleared here; fib_create_info memsets it and checks the
// result for NULL.
574 static struct hlist_head *fib_hash_alloc(int bytes)
576 if (bytes <= PAGE_SIZE)
577 return kmalloc(bytes, GFP_KERNEL);
579 return (struct hlist_head *)
580 __get_free_pages(GFP_KERNEL, get_order(bytes));
// Releases a table obtained from fib_hash_alloc.
// @hash:  table to release.
// @bytes: size that was passed to fib_hash_alloc; it selects the release
//         routine using the same one-page threshold.
// Tables larger than a page are returned with free_pages.
// NOTE(review): the statement for the small case and any NULL check are not
// present in this listing; fib_create_info can pass a NULL table here.
583 static void fib_hash_free(struct hlist_head *hash, int bytes)
588 if (bytes <= PAGE_SIZE)
591 free_pages((unsigned long) hash, get_order(bytes));
// Rehashes every fib_info from the current tables into new, larger tables
// and then frees the old ones.
// @new_info_hash: zeroed replacement for fib_info_hash.
// @new_laddrhash: zeroed replacement for fib_info_laddrhash.
// @new_size:      bucket count of both new tables.
// The whole move runs under the fib_info_lock write lock; the old tables are
// freed only after the lock is dropped.
// NOTE(review): the declaration of fi used by both loops is not present in
// this listing.
594 static void fib_hash_move(struct hlist_head *new_info_hash,
595 struct hlist_head *new_laddrhash,
596 unsigned int new_size)
598 struct hlist_head *old_info_hash, *old_laddrhash;
599 unsigned int old_size = fib_hash_size;
600 unsigned int i, bytes;
602 write_lock(&fib_info_lock);
603 old_info_hash = fib_info_hash;
604 old_laddrhash = fib_info_laddrhash;
// fib_hash_size is switched first so that fib_info_hashfn and
// fib_laddr_hashfn below already mask against the new table size.
605 fib_hash_size = new_size;
// Pass 1: move entries linked through fib_hash.
607 for (i = 0; i < old_size; i++) {
608 struct hlist_head *head = &fib_info_hash[i];
609 struct hlist_node *node, *n;
612 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
613 struct hlist_head *dest;
614 unsigned int new_hash;
616 hlist_del(&fi->fib_hash);
618 new_hash = fib_info_hashfn(fi);
619 dest = &new_info_hash[new_hash];
620 hlist_add_head(&fi->fib_hash, dest);
623 fib_info_hash = new_info_hash;
// Pass 2: move entries linked through fib_lhash, keyed by fib_prefsrc.
625 for (i = 0; i < old_size; i++) {
626 struct hlist_head *lhead = &fib_info_laddrhash[i];
627 struct hlist_node *node, *n;
630 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
631 struct hlist_head *ldest;
632 unsigned int new_hash;
634 hlist_del(&fi->fib_lhash);
636 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
637 ldest = &new_laddrhash[new_hash];
638 hlist_add_head(&fi->fib_lhash, ldest);
641 fib_info_laddrhash = new_laddrhash;
643 write_unlock(&fib_info_lock);
// NOTE(review): the size uses sizeof(struct hlist_head *) although the
// tables hold struct hlist_head elements; confirm the two sizes are equal.
645 bytes = old_size * sizeof(struct hlist_head *);
646 fib_hash_free(old_info_hash, bytes);
647 fib_hash_free(old_laddrhash, bytes);
// Builds a fib_info from a route request, or finds an existing equivalent
// one.
// @r:    route message (type, scope, protocol, flags).
// @rta:  parsed attributes (priority, metrics, prefsrc, oif, gateway, flow,
//        multipath list, multipath algorithm).
// @nlh:  netlink header of the request.
// @errp: out parameter for an error code.
// Visible stages: scope sanity check, nexthop counting, multipath algorithm
// range check, hash table growth, allocation and filling of the record,
// nexthop setup and validation, prefsrc validation, lookup of an equivalent
// record, and finally insertion into the three hash tables.
// NOTE(review): the return type, several declarations (err, nhs, bytes,
// hash), every error exit, the matching #else and #endif lines and the
// return statements are not present in this listing, so the comments below
// describe only the visible statements.
651 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
652 const struct nlmsghdr *nlh, int *errp)
655 struct fib_info *fi = NULL;
656 struct fib_info *ofi;
657 #ifdef CONFIG_IP_ROUTE_MULTIPATH
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 u32 mp_alg = IP_MP_ALG_NONE;
// NOTE(review): r->rtm_type indexes fib_props with no bounds check in the
// visible lines; confirm the value is validated before this point.
666 /* Fast check to catch the most weird cases */
667 if (fib_props[r->rtm_type].scope > r->rtm_scope)
670 #ifdef CONFIG_IP_ROUTE_MULTIPATH
672 nhs = fib_count_nexthops(rta->rta_mp);
677 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
678 if (rta->rta_mp_alg) {
679 mp_alg = *rta->rta_mp_alg;
// NOTE(review): mp_alg is u32, so the lower-bound test can only fire if
// IP_MP_ALG_NONE is nonzero; confirm.
681 if (mp_alg < IP_MP_ALG_NONE ||
682 mp_alg > IP_MP_ALG_MAX)
// Grow both hash tables by doubling once the record count reaches the
// bucket count. Both new tables are allocated before either is used; if
// either allocation fails, both are handed to fib_hash_free.
688 if (fib_info_cnt >= fib_hash_size) {
689 unsigned int new_size = fib_hash_size << 1;
690 struct hlist_head *new_info_hash;
691 struct hlist_head *new_laddrhash;
// NOTE(review): sizeof(struct hlist_head *) is used for an array of
// struct hlist_head; confirm the two sizes are equal.
696 bytes = new_size * sizeof(struct hlist_head *);
697 new_info_hash = fib_hash_alloc(bytes);
698 new_laddrhash = fib_hash_alloc(bytes);
699 if (!new_info_hash || !new_laddrhash) {
700 fib_hash_free(new_info_hash, bytes);
701 fib_hash_free(new_laddrhash, bytes);
703 memset(new_info_hash, 0, bytes);
704 memset(new_laddrhash, 0, bytes);
706 fib_hash_move(new_info_hash, new_laddrhash, new_size);
// The record is allocated with nhs trailing fib_nh slots and zeroed.
713 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
717 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
719 fi->fib_protocol = r->rtm_protocol;
722 change_nexthops(fi) {
724 } endfor_nexthops(fi)
726 fi->fib_flags = r->rtm_flags;
727 if (rta->rta_priority)
728 fi->fib_priority = *rta->rta_priority;
// Metrics: each nested attribute type selects slot type - 1 of fib_metrics.
730 int attrlen = RTA_PAYLOAD(rta->rta_mx);
731 struct rtattr *attr = RTA_DATA(rta->rta_mx);
733 while (RTA_OK(attr, attrlen)) {
734 unsigned flavor = attr->rta_type;
736 if (flavor > RTAX_MAX)
738 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
740 attr = RTA_NEXT(attr, attrlen);
743 if (rta->rta_prefsrc)
744 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
// Multipath request: nexthops come from rta_mp, and any top-level oif,
// gateway or flow attribute must agree with the first nexthop.
747 #ifdef CONFIG_IP_ROUTE_MULTIPATH
748 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
750 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
752 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
754 #ifdef CONFIG_NET_CLS_ROUTE
755 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
// Single nexthop request: the first slot is filled from the top-level
// attributes. The NULL guards on those attributes are presumably on the
// lines missing from this listing.
762 struct fib_nh *nh = fi->fib_nh;
764 nh->nh_oif = *rta->rta_oif;
766 memcpy(&nh->nh_gw, rta->rta_gw, 4);
767 #ifdef CONFIG_NET_CLS_ROUTE
769 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
771 nh->nh_flags = r->rtm_flags;
772 #ifdef CONFIG_IP_ROUTE_MULTIPATH
777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
778 fi->fib_mp_alg = mp_alg;
// Route types that carry an error code are tested for a gateway, an oif or
// a multipath list.
781 if (fib_props[r->rtm_type].error) {
782 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
787 if (r->rtm_scope > RT_SCOPE_HOST)
790 if (r->rtm_scope == RT_SCOPE_HOST) {
791 struct fib_nh *nh = fi->fib_nh;
793 /* Local address is added. */
794 if (nhs != 1 || nh->nh_gw)
796 nh->nh_scope = RT_SCOPE_NOWHERE;
797 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
799 if (nh->nh_dev == NULL)
// Otherwise every nexthop is validated and completed by fib_check_nh.
802 change_nexthops(fi) {
803 if ((err = fib_check_nh(r, fi, nh)) != 0)
805 } endfor_nexthops(fi)
// A preferred source is tested with inet_addr_type against RTN_LOCAL unless
// this is an RTN_LOCAL route whose destination equals that same address.
808 if (fi->fib_prefsrc) {
809 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
810 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
811 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
// Look for an existing equivalent record; the body of this branch is
// missing from this listing.
816 if ((ofi = fib_find_info(fi)) != NULL) {
// New record: take a client reference and link it into the info hash, the
// local-address hash (only when a prefsrc is set) and the per-device
// nexthop hash, all under the write lock.
824 atomic_inc(&fi->fib_clntref);
825 write_lock(&fib_info_lock);
826 hlist_add_head(&fi->fib_hash,
827 &fib_info_hash[fib_info_hashfn(fi)]);
828 if (fi->fib_prefsrc) {
829 struct hlist_head *head;
831 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
832 hlist_add_head(&fi->fib_lhash, head);
834 change_nexthops(fi) {
835 struct hlist_head *head;
840 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
841 head = &fib_info_devhash[hash];
842 hlist_add_head(&nh->nh_hash, head);
843 } endfor_nexthops(fi)
844 write_unlock(&fib_info_lock);
859 /* Note! fib_semantic_match intentionally uses RCU list functions. */
// Selects, from the alias list head, the first alias and nexthop usable for
// the flow flp and stores the choice in res.
// @head: list of fib_alias entries for one prefix, walked with
//        list_for_each_entry_rcu.
// @flp:  flow being routed (tos, scope, oif).
// @res:  result to fill (prefixlen, nh_sel, type, scope, fi).
// @zone, @mask: prefix value and netmask, used for res->network in
//        CONFIG_IP_ROUTE_MULTIPATH_CACHED builds.
// The last parameter line is missing from this listing; the body refers to
// it as prefixlen.
// Visible per-alias steps: compare fa_tos with the flow tos, test fa_scope
// against the flow scope, mark the alias FA_S_ACCESSED, take the error code
// for the route type from fib_props, test the fib_info for RTNH_F_DEAD, then
// scan nexthops testing each for RTNH_F_DEAD and for the flow oif (no oif in
// the flow accepts any).
// On success a client reference is taken on res->fi.
// NOTE(review): the actions taken on each test, the switch cases and the
// return statements are not present in this listing.
860 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
861 struct fib_result *res, __u32 zone, __u32 mask,
864 struct fib_alias *fa;
867 list_for_each_entry_rcu(fa, head, fa_list) {
871 fa->fa_tos != flp->fl4_tos)
874 if (fa->fa_scope < flp->fl4_scope)
877 fa->fa_state |= FA_S_ACCESSED;
879 err = fib_props[fa->fa_type].error;
881 struct fib_info *fi = fa->fa_info;
883 if (fi->fib_flags & RTNH_F_DEAD)
886 switch (fa->fa_type) {
893 if (nh->nh_flags&RTNH_F_DEAD)
895 if (!flp->oif || flp->oif == nh->nh_oif)
898 #ifdef CONFIG_IP_ROUTE_MULTIPATH
899 if (nhsel < fi->fib_nhs) {
912 printk(KERN_DEBUG "impossible 102\n");
921 res->prefixlen = prefixlen;
922 res->nh_sel = nh_sel;
923 res->type = fa->fa_type;
924 res->scope = fa->fa_scope;
925 res->fi = fa->fa_info;
926 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
// NOTE(review): when prefixlen is 0 the shift count is 32, which is undefined
// for a 32-bit operand in C; confirm prefixlen cannot be 0 here.
928 res->network = zone &
929 (0xFFFFFFFF >> (32 - prefixlen));
931 atomic_inc(&res->fi->fib_clntref);
935 /* Find appropriate source address to this destination */
// @res: lookup result; supplies the device and gateway of the selected
//       nexthop (via FIB_RES_DEV and FIB_RES_GW) and the route scope.
// Returns whatever inet_select_addr yields for that device, gateway and
// scope.
937 u32 __fib_res_prefsrc(struct fib_result *res)
939 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
// Serialises one route into skb as a netlink message: an rtmsg header
// followed by attributes.
// @skb:     buffer to append to.
// @pid, @seq, @event, @flags: values placed in the netlink header.
// @tb_id, @type, @scope, @tos: copied into the rtmsg (the tos assignment is
//           on a line missing from this listing).
// @dst, @dst_len: destination prefix; RTA_DST is emitted only when dst_len
//           is nonzero.
// @fi:      route info supplying flags, protocol, priority, metrics, prefsrc
//           and nexthops.
// rtmsg_fib treats a negative return as failure.
// b remembers the tail of skb on entry; on the failure path skb_trim cuts the
// buffer back to that point so a partial message is not left behind.
// NOTE(review): the return type, the declaration of rtm, the failure labels
// and the return statements are not present in this listing.
943 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
944 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
945 struct fib_info *fi, unsigned int flags)
948 struct nlmsghdr *nlh;
949 unsigned char *b = skb->tail;
951 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
952 rtm = NLMSG_DATA(nlh);
953 rtm->rtm_family = AF_INET;
954 rtm->rtm_dst_len = dst_len;
955 rtm->rtm_src_len = 0;
957 rtm->rtm_table = tb_id;
958 rtm->rtm_type = type;
959 rtm->rtm_flags = fi->fib_flags;
960 rtm->rtm_scope = scope;
961 if (rtm->rtm_dst_len)
962 RTA_PUT(skb, RTA_DST, 4, dst);
963 rtm->rtm_protocol = fi->fib_protocol;
// Optional attributes are emitted only when their value is nonzero.
964 if (fi->fib_priority)
965 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
966 #ifdef CONFIG_NET_CLS_ROUTE
967 if (fi->fib_nh[0].nh_tclassid)
968 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
970 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
// NOTE(review): the guard for RTA_PREFSRC is presumably on the line missing
// just above.
973 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
// A single nexthop is reported with top-level RTA_GATEWAY and RTA_OIF.
974 if (fi->fib_nhs == 1) {
975 if (fi->fib_nh->nh_gw)
976 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
977 if (fi->fib_nh->nh_oif)
978 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
// Several nexthops are reported as one RTA_MULTIPATH attribute holding one
// rtnexthop record per nexthop. The attribute header is reserved first and
// its type and length are filled in after the records are written.
980 #ifdef CONFIG_IP_ROUTE_MULTIPATH
981 if (fi->fib_nhs > 1) {
982 struct rtnexthop *nhp;
983 struct rtattr *mp_head;
984 if (skb_tailroom(skb) <= RTA_SPACE(0))
986 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
// Room is checked for the record header plus a 4-byte gateway attribute.
989 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
991 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
992 nhp->rtnh_flags = nh->nh_flags & 0xFF;
// Inverse of the hops + 1 conversion done in fib_get_nhs.
993 nhp->rtnh_hops = nh->nh_weight-1;
994 nhp->rtnh_ifindex = nh->nh_oif;
996 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
997 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
998 } endfor_nexthops(fi);
999 mp_head->rta_type = RTA_MULTIPATH;
1000 mp_head->rta_len = skb->tail - (u8*)mp_head;
1003 nlh->nlmsg_len = skb->tail - b;
1008 skb_trim(skb, b - skb->data);
// Compiled only when CONFIG_IP_NOSIOCRT is not defined.
// Translates an ioctl-style struct rtentry into the netlink form used by the
// rest of the routing code.
// @cmd: ioctl command; SIOCDELRT selects RTM_DELROUTE with no flags, any
//       other value selects RTM_NEWROUTE with NLM_F_REQUEST|NLM_F_CREATE and
//       protocol RTPROT_BOOT.
// @nl:  netlink header to fill.
// @rtm: route message to fill; zeroed first.
// @rta: attribute pointer table to fill; zeroed first.
// @r:   source rtentry.
// Returns -EAFNOSUPPORT when the destination family is not AF_INET, or when
// the genmask family is not AF_INET while the mask or that family is
// nonzero. Other return values are on lines missing from this listing.
// Visible steps after the family checks:
//  - for non-host routes the genmask is validated with bad_mask and
//    converted to a prefix length with inet_mask_len;
//  - rt_metric - 1 is written into r->rt_pad3 and rta_priority is pointed at
//    it, so rta refers to storage inside r;
//  - RTF_REJECT yields scope RT_SCOPE_HOST and type RTN_UNREACHABLE; the
//    other visible assignment is scope RT_SCOPE_NOWHERE, type RTN_UNICAST;
//  - a device name is copied from user space (at most IFNAMSIZ - 1 bytes,
//    then NUL-terminated), searched for a colon, resolved with
//    __dev_get_by_name for rta_oif, and the address list of the device is
//    searched for a matching ifa_label to supply rta_prefsrc;
//  - a nonzero AF_INET gateway with RTF_GATEWAY and a unicast address type
//    raises the scope to RT_SCOPE_UNIVERSE; a scope still at
//    RT_SCOPE_NOWHERE later becomes RT_SCOPE_LINK;
//  - RTF_MTU, RTF_WINDOW and RTF_IRTT build a kmalloc-ed RTA_METRICS
//    attribute with up to three 4-byte records: RTAX_ADVMSS = rt_mtu - 40,
//    RTAX_WINDOW = rt_window, RTAX_RTT = rt_irtt << 3.
// NOTE(review): where the metrics buffer is stored and who frees it is not
// visible in this listing; confirm the caller releases it.
// NOTE(review): the return type, several declarations, the allocation
// failure check and many branch bodies are not present in this listing.
1012 #ifndef CONFIG_IP_NOSIOCRT
1015 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1016 struct kern_rta *rta, struct rtentry *r)
1021 memset(rtm, 0, sizeof(*rtm));
1022 memset(rta, 0, sizeof(*rta));
1024 if (r->rt_dst.sa_family != AF_INET)
1025 return -EAFNOSUPPORT;
1027 /* Check mask for validity:
1028 a) it must be contiguous.
1029 b) destination must have all host bits clear.
1030 c) if application forgot to set correct family (AF_INET),
1031 reject request unless it is absolutely clear i.e.
1032 both family and mask are zero.
1035 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1036 if (!(r->rt_flags&RTF_HOST)) {
1037 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1038 if (r->rt_genmask.sa_family != AF_INET) {
1039 if (mask || r->rt_genmask.sa_family)
1040 return -EAFNOSUPPORT;
1042 if (bad_mask(mask, *ptr))
1044 plen = inet_mask_len(mask);
1047 nl->nlmsg_flags = NLM_F_REQUEST;
1050 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1051 if (cmd == SIOCDELRT) {
1052 nl->nlmsg_type = RTM_DELROUTE;
1053 nl->nlmsg_flags = 0;
1055 nl->nlmsg_type = RTM_NEWROUTE;
1056 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1057 rtm->rtm_protocol = RTPROT_BOOT;
1060 rtm->rtm_dst_len = plen;
1064 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1065 rta->rta_priority = (u32*)&r->rt_pad3;
1067 if (r->rt_flags&RTF_REJECT) {
1068 rtm->rtm_scope = RT_SCOPE_HOST;
1069 rtm->rtm_type = RTN_UNREACHABLE;
1072 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1073 rtm->rtm_type = RTN_UNICAST;
1077 struct net_device *dev;
1078 char devname[IFNAMSIZ];
1080 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1082 devname[IFNAMSIZ-1] = 0;
1083 colon = strchr(devname, ':');
1086 dev = __dev_get_by_name(devname);
1089 rta->rta_oif = &dev->ifindex;
1091 struct in_ifaddr *ifa;
1092 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1096 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1097 if (strcmp(ifa->ifa_label, devname) == 0)
1101 rta->rta_prefsrc = &ifa->ifa_local;
1105 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1106 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1108 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1109 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1112 if (cmd == SIOCDELRT)
1115 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1118 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1119 rtm->rtm_scope = RT_SCOPE_LINK;
1121 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1123 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1127 mx->rta_type = RTA_METRICS;
1128 mx->rta_len = RTA_LENGTH(0);
1129 if (r->rt_flags&RTF_MTU) {
1130 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1131 rec->rta_type = RTAX_ADVMSS;
1132 rec->rta_len = RTA_LENGTH(4);
1133 mx->rta_len += RTA_LENGTH(4);
1134 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1136 if (r->rt_flags&RTF_WINDOW) {
1137 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1138 rec->rta_type = RTAX_WINDOW;
1139 rec->rta_len = RTA_LENGTH(4);
1140 mx->rta_len += RTA_LENGTH(4);
1141 *(u32*)RTA_DATA(rec) = r->rt_window;
1143 if (r->rt_flags&RTF_IRTT) {
1144 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1145 rec->rta_type = RTAX_RTT;
1146 rec->rta_len = RTA_LENGTH(4);
1147 mx->rta_len += RTA_LENGTH(4);
1148 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1158 - local address disappeared -> we must delete all the entries
1160 - device went down -> we must shutdown all nexthops going via it.
// Marks routes dead after a local address disappears and/or a device goes
// down.
// @local: address that disappeared, or 0; when nonzero (and the laddr table
//         exists) every fib_info in the matching bucket whose fib_prefsrc
//         equals local is flagged RTNH_F_DEAD.
// @dev:   device that went down; its bucket of fib_info_devhash is scanned.
// @force: values above 1 trigger extra handling for nexthops on dev in
//         multipath builds (the body of that branch is missing from this
//         listing).
// In the device pass each fib_info is processed once per run of nexthops
// (prev_fi remembers the last one handled). Live nexthops on dev whose scope
// differs from the local variable scope are flagged RTNH_F_DEAD, and in
// multipath builds their nh_power is subtracted from fib_power under
// fib_multipath_lock. When the dead count equals fib_nhs the whole fib_info
// is flagged RTNH_F_DEAD.
// NOTE(review): the declarations of nh and dead, the updates of scope, dead
// and prev_fi, the condition guarding the device pass and the return value
// are not present in this listing.
1163 int fib_sync_down(u32 local, struct net_device *dev, int force)
1166 int scope = RT_SCOPE_NOWHERE;
1171 if (local && fib_info_laddrhash) {
1172 unsigned int hash = fib_laddr_hashfn(local);
1173 struct hlist_head *head = &fib_info_laddrhash[hash];
1174 struct hlist_node *node;
1175 struct fib_info *fi;
1177 hlist_for_each_entry(fi, node, head, fib_lhash) {
1178 if (fi->fib_prefsrc == local) {
1179 fi->fib_flags |= RTNH_F_DEAD;
1186 struct fib_info *prev_fi = NULL;
1187 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1188 struct hlist_head *head = &fib_info_devhash[hash];
1189 struct hlist_node *node;
1192 hlist_for_each_entry(nh, node, head, nh_hash) {
1193 struct fib_info *fi = nh->nh_parent;
1196 BUG_ON(!fi->fib_nhs);
1197 if (nh->nh_dev != dev || fi == prev_fi)
1201 change_nexthops(fi) {
1202 if (nh->nh_flags&RTNH_F_DEAD)
1204 else if (nh->nh_dev == dev &&
1205 nh->nh_scope != scope) {
1206 nh->nh_flags |= RTNH_F_DEAD;
1207 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1208 spin_lock_bh(&fib_multipath_lock);
1209 fi->fib_power -= nh->nh_power;
1211 spin_unlock_bh(&fib_multipath_lock);
1215 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1216 if (force > 1 && nh->nh_dev == dev) {
1221 } endfor_nexthops(fi)
1222 if (dead == fi->fib_nhs) {
1223 fi->fib_flags |= RTNH_F_DEAD;
1232 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1235 Dead device goes up. We wake up dead nexthops.
1236 It makes sense only on multipath routes.
// Revives nexthops that go through dev after the device has come up.
// @dev: device that came up; the function first tests IFF_UP on it.
// Scans the fib_info_devhash bucket for dev->ifindex. For each fib_info
// found there (handled once per run, tracked through prev_fi) every nexthop
// is examined: nexthops that are not dead are handled first; dead ones are
// tested for having a device that is up, for being on dev and for dev having
// an in_device, and are then revived by clearing RTNH_F_DEAD under
// fib_multipath_lock. The RTNH_F_DEAD flag of the fib_info is cleared as
// well.
// NOTE(review): the declarations of nh and hash, the initialisation of
// prev_fi, the action taken by each test, the condition guarding the final
// flag clear and the return value are not present in this listing.
1239 int fib_sync_up(struct net_device *dev)
1241 struct fib_info *prev_fi;
1243 struct hlist_head *head;
1244 struct hlist_node *node;
1248 if (!(dev->flags&IFF_UP))
1252 hash = fib_devindex_hashfn(dev->ifindex);
1253 head = &fib_info_devhash[hash];
1256 hlist_for_each_entry(nh, node, head, nh_hash) {
1257 struct fib_info *fi = nh->nh_parent;
1260 BUG_ON(!fi->fib_nhs);
1261 if (nh->nh_dev != dev || fi == prev_fi)
1266 change_nexthops(fi) {
1267 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1271 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1273 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1276 spin_lock_bh(&fib_multipath_lock);
1278 nh->nh_flags &= ~RTNH_F_DEAD;
1279 spin_unlock_bh(&fib_multipath_lock);
1280 } endfor_nexthops(fi)
1283 fi->fib_flags &= ~RTNH_F_DEAD;
1292 The algorithm is suboptimal, but it provides really
1293 fair weighted route distribution.
// Picks one nexthop of a multipath route by weight and records its index in
// res->nh_sel.
// @flp: flow being routed (not referenced in the visible lines).
// @res: lookup result; res->fi is the route, res->nh_sel receives the choice.
// All work on fib_power and nh_power is done under fib_multipath_lock with
// bottom halves disabled; every visible exit path releases the lock.
// When fib_power has run out (<= 0) it is refilled: each live nexthop gets
// nh_power = nh_weight and fib_power becomes the sum of those weights.
// A value w is then derived from jiffies modulo fib_power, and the live
// nexthops that still have power are walked, subtracting nh_power from w
// until w drops to 0 or below; that nexthop is selected.
// NOTE(review): the declarations of w and power, the handling of a zero
// refill sum (before the modulo) and the power decrements on selection are
// not present in this listing.
1296 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1298 struct fib_info *fi = res->fi;
1301 spin_lock_bh(&fib_multipath_lock);
1302 if (fi->fib_power <= 0) {
1304 change_nexthops(fi) {
1305 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1306 power += nh->nh_weight;
1307 nh->nh_power = nh->nh_weight;
1309 } endfor_nexthops(fi);
1310 fi->fib_power = power;
1312 spin_unlock_bh(&fib_multipath_lock);
1313 /* Race condition: route has just become dead. */
1320 /* w should be random number [0..fi->fib_power-1],
1321 it is pretty bad approximation.
1324 w = jiffies % fi->fib_power;
1326 change_nexthops(fi) {
1327 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1328 if ((w -= nh->nh_power) <= 0) {
1331 res->nh_sel = nhsel;
1332 spin_unlock_bh(&fib_multipath_lock);
1336 } endfor_nexthops(fi);
1338 /* Race condition: route has just become dead. */
1340 spin_unlock_bh(&fib_multipath_lock);