2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.408"
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
/* Depth limit for the statistics gathered when dumping the trie. */
#define MAX_STAT_DEPTH 32

/* A key is an IPv4 destination address, kept in host byte order. */
typedef unsigned int t_key;

/* Number of bits in a key: 32 for IPv4. */
#define KEYLENGTH (8*sizeof(t_key))
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
103 unsigned long parent;
105 struct hlist_head list;
110 struct hlist_node hlist;
113 struct list_head falh;
117 unsigned long parent;
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
125 struct work_struct work;
126 struct tnode *tnode_free;
128 struct node *child[0];
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
138 unsigned int resize_node_skipped;
143 unsigned int totdepth;
144 unsigned int maxdepth;
147 unsigned int nullpointers;
148 unsigned int prefixes;
149 unsigned int nodesizes[MAX_STAT_DEPTH];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
168 static struct kmem_cache *fn_alias_kmem __read_mostly;
169 static struct kmem_cache *trie_leaf_kmem __read_mostly;
171 static inline struct tnode *node_parent(struct node *node)
173 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
/* RCU read-side variant of node_parent(). */
static inline struct tnode *node_parent_rcu(struct node *node)
{
	struct tnode *ret = node_parent(node);

	return rcu_dereference(ret);
}
183 /* Same as rcu_assign_pointer
184 * but that macro() assumes that value is a pointer.
186 static inline void node_set_parent(struct node *node, struct tnode *ptr)
189 node->parent = (unsigned long)ptr | NODE_TYPE(node);
192 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
194 BUG_ON(i >= 1U << tn->bits);
/* RCU read-side variant of tnode_get_child(). */
static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
	struct node *ret = tnode_get_child(tn, i);

	return rcu_dereference(ret);
}
206 static inline int tnode_child_length(const struct tnode *tn)
208 return 1 << tn->bits;
211 static inline t_key mask_pfx(t_key k, unsigned short l)
213 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
216 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
218 if (offset < KEYLENGTH)
219 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
224 static inline int tkey_equals(t_key a, t_key b)
229 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231 if (bits == 0 || offset >= KEYLENGTH)
233 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
234 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237 static inline int tkey_mismatch(t_key a, int offset, t_key b)
244 while ((diff << i) >> (KEYLENGTH-1) == 0)
250 To understand this stuff, an understanding of keys and all their bits is
251 necessary. Every node in the trie has a key associated with it, but not
252 all of the bits in that key are significant.
254 Consider a node 'n' and its parent 'tp'.
256 If n is a leaf, every bit in its key is significant. Its presence is
257 necessitated by path compression, since during a tree traversal (when
258 searching for a leaf - unless we are doing an insertion) we will completely
259 ignore all skipped bits we encounter. Thus we need to verify, at the end of
260 a potentially successful search, that we have indeed been walking the
263 Note that we can never "miss" the correct key in the tree if present by
264 following the wrong path. Path compression ensures that segments of the key
265 that are the same for all keys with a given prefix are skipped, but the
266 skipped part *is* identical for each node in the subtrie below the skipped
267 bit! trie_insert() in this implementation takes care of that - note the
268 call to tkey_sub_equals() in trie_insert().
270 if n is an internal node - a 'tnode' here, the various parts of its key
271 have many different meanings.
274 _________________________________________________________________
275 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276 -----------------------------------------------------------------
277 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
279 _________________________________________________________________
280 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281 -----------------------------------------------------------------
282 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
289 First, let's just ignore the bits that come before the parent tp, that is
290 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291 not use them for anything.
293 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294 index into the parent's child array. That is, they will be used to find
295 'n' among tp's children.
297 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
300 All the bits we have seen so far are significant to the node n. The rest
301 of the bits are really not needed or indeed known in n->key.
303 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304 n's child array, and will of course be different for each child.
307 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
312 static inline void check_tnode(const struct tnode *tn)
314 WARN_ON(tn && tn->pos+tn->bits > 32);
/*
 * Resize thresholds, expressed in percent of child slots.  A tnode is
 * doubled (inflated) when the resulting node would be at least
 * inflate_threshold percent full, and halved when fewer than
 * halve_threshold percent of its slots are in use.  The root node gets
 * more lenient thresholds so it stays large, keeping lookup paths short.
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 8;
static const int inflate_threshold_root = 15;
323 static void __alias_free_mem(struct rcu_head *head)
325 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326 kmem_cache_free(fn_alias_kmem, fa);
329 static inline void alias_free_mem_rcu(struct fib_alias *fa)
331 call_rcu(&fa->rcu, __alias_free_mem);
334 static void __leaf_free_rcu(struct rcu_head *head)
336 struct leaf *l = container_of(head, struct leaf, rcu);
337 kmem_cache_free(trie_leaf_kmem, l);
340 static inline void free_leaf(struct leaf *l)
342 call_rcu_bh(&l->rcu, __leaf_free_rcu);
345 static void __leaf_info_free_rcu(struct rcu_head *head)
347 kfree(container_of(head, struct leaf_info, rcu));
350 static inline void free_leaf_info(struct leaf_info *leaf)
352 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
355 static struct tnode *tnode_alloc(size_t size)
357 if (size <= PAGE_SIZE)
358 return kzalloc(size, GFP_KERNEL);
360 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
363 static void __tnode_vfree(struct work_struct *arg)
365 struct tnode *tn = container_of(arg, struct tnode, work);
369 static void __tnode_free_rcu(struct rcu_head *head)
371 struct tnode *tn = container_of(head, struct tnode, rcu);
372 size_t size = sizeof(struct tnode) +
373 (sizeof(struct node *) << tn->bits);
375 if (size <= PAGE_SIZE)
378 INIT_WORK(&tn->work, __tnode_vfree);
379 schedule_work(&tn->work);
383 static inline void tnode_free(struct tnode *tn)
386 free_leaf((struct leaf *) tn);
388 call_rcu(&tn->rcu, __tnode_free_rcu);
391 static void tnode_free_safe(struct tnode *tn)
394 tn->tnode_free = tnode_free_head;
395 tnode_free_head = tn;
398 static void tnode_free_flush(void)
402 while ((tn = tnode_free_head)) {
403 tnode_free_head = tn->tnode_free;
404 tn->tnode_free = NULL;
409 static struct leaf *leaf_new(void)
411 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
414 INIT_HLIST_HEAD(&l->list);
419 static struct leaf_info *leaf_info_new(int plen)
421 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
424 INIT_LIST_HEAD(&li->falh);
429 static struct tnode *tnode_new(t_key key, int pos, int bits)
431 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
432 struct tnode *tn = tnode_alloc(sz);
435 tn->parent = T_TNODE;
439 tn->full_children = 0;
440 tn->empty_children = 1<<bits;
443 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
444 (unsigned long) (sizeof(struct node) << bits));
449 * Check whether a tnode 'n' is "full", i.e. it is an internal node
450 * and no bits are skipped. See discussion in dyntree paper p. 6
453 static inline int tnode_full(const struct tnode *tn, const struct node *n)
455 if (n == NULL || IS_LEAF(n))
458 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
/* Store child 'n' at slot 'i' of 'tn'; -1 asks the reorg helper to
 * recompute the old child's fullness itself. */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
468 * Add a child at position i overwriting the old value.
469 * Update the value of full_children and empty_children.
472 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
475 struct node *chi = tn->child[i];
478 BUG_ON(i >= 1<<tn->bits);
480 /* update emptyChildren */
481 if (n == NULL && chi != NULL)
482 tn->empty_children++;
483 else if (n != NULL && chi == NULL)
484 tn->empty_children--;
486 /* update fullChildren */
488 wasfull = tnode_full(tn, chi);
490 isfull = tnode_full(tn, n);
491 if (wasfull && !isfull)
493 else if (!wasfull && isfull)
497 node_set_parent(n, tn);
499 rcu_assign_pointer(tn->child[i], n);
502 static struct node *resize(struct trie *t, struct tnode *tn)
506 struct tnode *old_tn;
507 int inflate_threshold_use;
508 int halve_threshold_use;
514 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
515 tn, inflate_threshold, halve_threshold);
518 if (tn->empty_children == tnode_child_length(tn)) {
523 if (tn->empty_children == tnode_child_length(tn) - 1)
524 for (i = 0; i < tnode_child_length(tn); i++) {
531 /* compress one level */
532 node_set_parent(n, NULL);
537 * Double as long as the resulting node has a number of
538 * nonempty nodes that are above the threshold.
542 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
543 * the Helsinki University of Technology and Matti Tikkanen of Nokia
544 * Telecommunications, page 6:
545 * "A node is doubled if the ratio of non-empty children to all
546 * children in the *doubled* node is at least 'high'."
548 * 'high' in this instance is the variable 'inflate_threshold'. It
549 * is expressed as a percentage, so we multiply it with
550 * tnode_child_length() and instead of multiplying by 2 (since the
551 * child array will be doubled by inflate()) and multiplying
552 * the left-hand side by 100 (to handle the percentage thing) we
553 * multiply the left-hand side by 50.
555 * The left-hand side may look a bit weird: tnode_child_length(tn)
556 * - tn->empty_children is of course the number of non-null children
557 * in the current node. tn->full_children is the number of "full"
558 * children, that is non-null tnodes with a skip value of 0.
559 * All of those will be doubled in the resulting inflated tnode, so
560 * we just count them one extra time here.
562 * A clearer way to write this would be:
564 * to_be_doubled = tn->full_children;
565 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
568 * new_child_length = tnode_child_length(tn) * 2;
570 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
572 * if (new_fill_factor >= inflate_threshold)
574 * ...and so on, tho it would mess up the while () loop.
577 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
581 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
582 * inflate_threshold * new_child_length
584 * expand not_to_be_doubled and to_be_doubled, and shorten:
585 * 100 * (tnode_child_length(tn) - tn->empty_children +
586 * tn->full_children) >= inflate_threshold * new_child_length
588 * expand new_child_length:
589 * 100 * (tnode_child_length(tn) - tn->empty_children +
590 * tn->full_children) >=
591 * inflate_threshold * tnode_child_length(tn) * 2
594 * 50 * (tn->full_children + tnode_child_length(tn) -
595 * tn->empty_children) >= inflate_threshold *
596 * tnode_child_length(tn)
602 /* Keep root node larger */
605 inflate_threshold_use = inflate_threshold_root;
607 inflate_threshold_use = inflate_threshold;
611 while ((tn->full_children > 0 && max_resize-- &&
612 50 * (tn->full_children + tnode_child_length(tn)
613 - tn->empty_children)
614 >= inflate_threshold_use * tnode_child_length(tn))) {
621 #ifdef CONFIG_IP_FIB_TRIE_STATS
622 t->stats.resize_node_skipped++;
628 if (max_resize < 0) {
630 pr_warning("Fix inflate_threshold_root."
631 " Now=%d size=%d bits\n",
632 inflate_threshold_root, tn->bits);
634 pr_warning("Fix inflate_threshold."
635 " Now=%d size=%d bits\n",
636 inflate_threshold, tn->bits);
642 * Halve as long as the number of empty children in this
643 * node is above threshold.
647 /* Keep root node larger */
650 halve_threshold_use = halve_threshold_root;
652 halve_threshold_use = halve_threshold;
656 while (tn->bits > 1 && max_resize-- &&
657 100 * (tnode_child_length(tn) - tn->empty_children) <
658 halve_threshold_use * tnode_child_length(tn)) {
664 #ifdef CONFIG_IP_FIB_TRIE_STATS
665 t->stats.resize_node_skipped++;
671 if (max_resize < 0) {
673 pr_warning("Fix halve_threshold_root."
674 " Now=%d size=%d bits\n",
675 halve_threshold_root, tn->bits);
677 pr_warning("Fix halve_threshold."
678 " Now=%d size=%d bits\n",
679 halve_threshold, tn->bits);
682 /* Only one child remains */
683 if (tn->empty_children == tnode_child_length(tn) - 1)
684 for (i = 0; i < tnode_child_length(tn); i++) {
691 /* compress one level */
693 node_set_parent(n, NULL);
698 return (struct node *) tn;
701 static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 struct tnode *oldtnode = tn;
704 int olen = tnode_child_length(tn);
707 pr_debug("In inflate\n");
709 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
712 return ERR_PTR(-ENOMEM);
715 * Preallocate and store tnodes before the actual work so we
716 * don't get into an inconsistent state if memory allocation
717 * fails. In case of failure we return the oldnode and inflate
718 * of tnode is ignored.
721 for (i = 0; i < olen; i++) {
724 inode = (struct tnode *) tnode_get_child(oldtnode, i);
727 inode->pos == oldtnode->pos + oldtnode->bits &&
729 struct tnode *left, *right;
730 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732 left = tnode_new(inode->key&(~m), inode->pos + 1,
737 right = tnode_new(inode->key|m, inode->pos + 1,
745 put_child(t, tn, 2*i, (struct node *) left);
746 put_child(t, tn, 2*i+1, (struct node *) right);
750 for (i = 0; i < olen; i++) {
752 struct node *node = tnode_get_child(oldtnode, i);
753 struct tnode *left, *right;
760 /* A leaf or an internal node with skipped bits */
762 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
763 tn->pos + tn->bits - 1) {
764 if (tkey_extract_bits(node->key,
765 oldtnode->pos + oldtnode->bits,
767 put_child(t, tn, 2*i, node);
769 put_child(t, tn, 2*i+1, node);
773 /* An internal node with two children */
774 inode = (struct tnode *) node;
776 if (inode->bits == 1) {
777 put_child(t, tn, 2*i, inode->child[0]);
778 put_child(t, tn, 2*i+1, inode->child[1]);
780 tnode_free_safe(inode);
784 /* An internal node with more than two children */
786 /* We will replace this node 'inode' with two new
787 * ones, 'left' and 'right', each with half of the
788 * original children. The two new nodes will have
789 * a position one bit further down the key and this
790 * means that the "significant" part of their keys
791 * (see the discussion near the top of this file)
792 * will differ by one bit, which will be "0" in
793 * left's key and "1" in right's key. Since we are
794 * moving the key position by one step, the bit that
795 * we are moving away from - the bit at position
796 * (inode->pos) - is the one that will differ between
797 * left and right. So... we synthesize that bit in the
799 * The mask 'm' below will be a single "one" bit at
800 * the position (inode->pos)
803 /* Use the old key, but set the new significant
807 left = (struct tnode *) tnode_get_child(tn, 2*i);
808 put_child(t, tn, 2*i, NULL);
812 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
813 put_child(t, tn, 2*i+1, NULL);
817 size = tnode_child_length(left);
818 for (j = 0; j < size; j++) {
819 put_child(t, left, j, inode->child[j]);
820 put_child(t, right, j, inode->child[j + size]);
822 put_child(t, tn, 2*i, resize(t, left));
823 put_child(t, tn, 2*i+1, resize(t, right));
825 tnode_free_safe(inode);
827 tnode_free_safe(oldtnode);
831 int size = tnode_child_length(tn);
834 for (j = 0; j < size; j++)
836 tnode_free((struct tnode *)tn->child[j]);
840 return ERR_PTR(-ENOMEM);
844 static struct tnode *halve(struct trie *t, struct tnode *tn)
846 struct tnode *oldtnode = tn;
847 struct node *left, *right;
849 int olen = tnode_child_length(tn);
851 pr_debug("In halve\n");
853 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
856 return ERR_PTR(-ENOMEM);
859 * Preallocate and store tnodes before the actual work so we
860 * don't get into an inconsistent state if memory allocation
861 * fails. In case of failure we return the oldnode and halve
862 * of tnode is ignored.
865 for (i = 0; i < olen; i += 2) {
866 left = tnode_get_child(oldtnode, i);
867 right = tnode_get_child(oldtnode, i+1);
869 /* Two nonempty children */
873 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
878 put_child(t, tn, i/2, (struct node *)newn);
883 for (i = 0; i < olen; i += 2) {
884 struct tnode *newBinNode;
886 left = tnode_get_child(oldtnode, i);
887 right = tnode_get_child(oldtnode, i+1);
889 /* At least one of the children is empty */
891 if (right == NULL) /* Both are empty */
893 put_child(t, tn, i/2, right);
898 put_child(t, tn, i/2, left);
902 /* Two nonempty children */
903 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
904 put_child(t, tn, i/2, NULL);
905 put_child(t, newBinNode, 0, left);
906 put_child(t, newBinNode, 1, right);
907 put_child(t, tn, i/2, resize(t, newBinNode));
909 tnode_free_safe(oldtnode);
913 int size = tnode_child_length(tn);
916 for (j = 0; j < size; j++)
918 tnode_free((struct tnode *)tn->child[j]);
922 return ERR_PTR(-ENOMEM);
926 /* readside must use rcu_read_lock currently dump routines
927 via get_fa_head and dump */
929 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
931 struct hlist_head *head = &l->list;
932 struct hlist_node *node;
933 struct leaf_info *li;
935 hlist_for_each_entry_rcu(li, node, head, hlist)
936 if (li->plen == plen)
942 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
944 struct leaf_info *li = find_leaf_info(l, plen);
952 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
954 struct leaf_info *li = NULL, *last = NULL;
955 struct hlist_node *node;
957 if (hlist_empty(head)) {
958 hlist_add_head_rcu(&new->hlist, head);
960 hlist_for_each_entry(li, node, head, hlist) {
961 if (new->plen > li->plen)
967 hlist_add_after_rcu(&last->hlist, &new->hlist);
969 hlist_add_before_rcu(&new->hlist, &li->hlist);
973 /* rcu_read_lock needs to be hold by caller from readside */
976 fib_find_node(struct trie *t, u32 key)
983 n = rcu_dereference(t->trie);
985 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
986 tn = (struct tnode *) n;
990 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
991 pos = tn->pos + tn->bits;
992 n = tnode_get_child_rcu(tn,
993 tkey_extract_bits(key,
999 /* Case we have found a leaf. Compare prefixes */
1001 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1002 return (struct leaf *)n;
1007 static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1018 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1020 tnode_put_child_reorg((struct tnode *)tp, cindex,
1021 (struct node *)tn, wasfull);
1023 tp = node_parent((struct node *) tn);
1030 /* Handle last (top) tnode */
1032 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1034 rcu_assign_pointer(t->trie, (struct node *)tn);
1040 /* only used from updater-side */
1042 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1045 struct tnode *tp = NULL, *tn = NULL;
1049 struct list_head *fa_head = NULL;
1050 struct leaf_info *li;
1056 /* If we point to NULL, stop. Either the tree is empty and we should
1057 * just put a new leaf in if, or we have reached an empty child slot,
1058 * and we should just put our new leaf in that.
1059 * If we point to a T_TNODE, check if it matches our key. Note that
1060 * a T_TNODE might be skipping any number of bits - its 'pos' need
1061 * not be the parent's 'pos'+'bits'!
1063 * If it does match the current key, get pos/bits from it, extract
1064 * the index from our key, push the T_TNODE and walk the tree.
1066 * If it doesn't, we have to replace it with a new T_TNODE.
1068 * If we point to a T_LEAF, it might or might not have the same key
1069 * as we do. If it does, just change the value, update the T_LEAF's
1070 * value, and return it.
1071 * If it doesn't, we need to replace it with a T_TNODE.
1074 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1075 tn = (struct tnode *) n;
1079 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1081 pos = tn->pos + tn->bits;
1082 n = tnode_get_child(tn,
1083 tkey_extract_bits(key,
1087 BUG_ON(n && node_parent(n) != tn);
1093 * n ----> NULL, LEAF or TNODE
1095 * tp is n's (parent) ----> NULL or TNODE
1098 BUG_ON(tp && IS_LEAF(tp));
1100 /* Case 1: n is a leaf. Compare prefixes */
1102 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1103 l = (struct leaf *) n;
1104 li = leaf_info_new(plen);
1109 fa_head = &li->falh;
1110 insert_leaf_info(&l->list, li);
1119 li = leaf_info_new(plen);
1126 fa_head = &li->falh;
1127 insert_leaf_info(&l->list, li);
1129 if (t->trie && n == NULL) {
1130 /* Case 2: n is NULL, and will just insert a new leaf */
1132 node_set_parent((struct node *)l, tp);
1134 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1135 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1137 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1139 * Add a new tnode here
1140 * first tnode need some special handling
1144 pos = tp->pos+tp->bits;
1149 newpos = tkey_mismatch(key, pos, n->key);
1150 tn = tnode_new(n->key, newpos, 1);
1153 tn = tnode_new(key, newpos, 1); /* First tnode */
1162 node_set_parent((struct node *)tn, tp);
1164 missbit = tkey_extract_bits(key, newpos, 1);
1165 put_child(t, tn, missbit, (struct node *)l);
1166 put_child(t, tn, 1-missbit, n);
1169 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1170 put_child(t, (struct tnode *)tp, cindex,
1173 rcu_assign_pointer(t->trie, (struct node *)tn);
1178 if (tp && tp->pos + tp->bits > 32)
1179 pr_warning("fib_trie"
1180 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1181 tp, tp->pos, tp->bits, key, plen);
1183 /* Rebalance the trie */
1185 trie_rebalance(t, tp);
1191 * Caller must hold RTNL.
1193 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1195 struct trie *t = (struct trie *) tb->tb_data;
1196 struct fib_alias *fa, *new_fa;
1197 struct list_head *fa_head = NULL;
1198 struct fib_info *fi;
1199 int plen = cfg->fc_dst_len;
1200 u8 tos = cfg->fc_tos;
1208 key = ntohl(cfg->fc_dst);
1210 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1212 mask = ntohl(inet_make_mask(plen));
1219 fi = fib_create_info(cfg);
1225 l = fib_find_node(t, key);
1229 fa_head = get_fa_head(l, plen);
1230 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1233 /* Now fa, if non-NULL, points to the first fib alias
1234 * with the same keys [prefix,tos,priority], if such key already
1235 * exists or to the node before which we will insert new one.
1237 * If fa is NULL, we will need to allocate a new one and
1238 * insert to the head of f.
1240 * If f is NULL, no fib node matched the destination key
1241 * and we need to allocate a new one of those as well.
1244 if (fa && fa->fa_tos == tos &&
1245 fa->fa_info->fib_priority == fi->fib_priority) {
1246 struct fib_alias *fa_first, *fa_match;
1249 if (cfg->fc_nlflags & NLM_F_EXCL)
1253 * 1. Find exact match for type, scope, fib_info to avoid
1255 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1259 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1260 list_for_each_entry_continue(fa, fa_head, fa_list) {
1261 if (fa->fa_tos != tos)
1263 if (fa->fa_info->fib_priority != fi->fib_priority)
1265 if (fa->fa_type == cfg->fc_type &&
1266 fa->fa_scope == cfg->fc_scope &&
1267 fa->fa_info == fi) {
1273 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1274 struct fib_info *fi_drop;
1284 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1288 fi_drop = fa->fa_info;
1289 new_fa->fa_tos = fa->fa_tos;
1290 new_fa->fa_info = fi;
1291 new_fa->fa_type = cfg->fc_type;
1292 new_fa->fa_scope = cfg->fc_scope;
1293 state = fa->fa_state;
1294 new_fa->fa_state = state & ~FA_S_ACCESSED;
1296 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1297 alias_free_mem_rcu(fa);
1299 fib_release_info(fi_drop);
1300 if (state & FA_S_ACCESSED)
1301 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1302 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1303 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1307 /* Error if we find a perfect match which
1308 * uses the same scope, type, and nexthop
1314 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1318 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1322 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1326 new_fa->fa_info = fi;
1327 new_fa->fa_tos = tos;
1328 new_fa->fa_type = cfg->fc_type;
1329 new_fa->fa_scope = cfg->fc_scope;
1330 new_fa->fa_state = 0;
1332 * Insert new entry to the list.
1336 fa_head = fib_insert_node(t, key, plen);
1337 if (unlikely(!fa_head)) {
1339 goto out_free_new_fa;
1343 list_add_tail_rcu(&new_fa->fa_list,
1344 (fa ? &fa->fa_list : fa_head));
1346 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1347 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1348 &cfg->fc_nlinfo, 0);
1353 kmem_cache_free(fn_alias_kmem, new_fa);
1355 fib_release_info(fi);
1360 /* should be called with rcu_read_lock */
1361 static int check_leaf(struct trie *t, struct leaf *l,
1362 t_key key, const struct flowi *flp,
1363 struct fib_result *res)
1365 struct leaf_info *li;
1366 struct hlist_head *hhead = &l->list;
1367 struct hlist_node *node;
1369 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1371 int plen = li->plen;
1372 __be32 mask = inet_make_mask(plen);
1374 if (l->key != (key & ntohl(mask)))
1377 err = fib_semantic_match(&li->falh, flp, res, plen);
1379 #ifdef CONFIG_IP_FIB_TRIE_STATS
1381 t->stats.semantic_match_passed++;
1383 t->stats.semantic_match_miss++;
1392 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1393 struct fib_result *res)
1395 struct trie *t = (struct trie *) tb->tb_data;
1400 t_key key = ntohl(flp->fl4_dst);
1403 int current_prefix_length = KEYLENGTH;
1405 t_key node_prefix, key_prefix, pref_mismatch;
1410 n = rcu_dereference(t->trie);
1414 #ifdef CONFIG_IP_FIB_TRIE_STATS
1420 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1424 pn = (struct tnode *) n;
1432 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1435 n = tnode_get_child(pn, cindex);
1438 #ifdef CONFIG_IP_FIB_TRIE_STATS
1439 t->stats.null_node_hit++;
1445 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1451 cn = (struct tnode *)n;
1454 * It's a tnode, and we can do some extra checks here if we
1455 * like, to avoid descending into a dead-end branch.
1456 * This tnode is in the parent's child array at index
1457 * key[p_pos..p_pos+p_bits] but potentially with some bits
1458 * chopped off, so in reality the index may be just a
1459 * subprefix, padded with zero at the end.
1460 * We can also take a look at any skipped bits in this
1461 * tnode - everything up to p_pos is supposed to be ok,
 * and the non-chopped bits of the index (see previous
1463 * paragraph) are also guaranteed ok, but the rest is
1464 * considered unknown.
1466 * The skipped bits are key[pos+bits..cn->pos].
1469 /* If current_prefix_length < pos+bits, we are already doing
1470 * actual prefix matching, which means everything from
1471 * pos+(bits-chopped_off) onward must be zero along some
1472 * branch of this subtree - otherwise there is *no* valid
1473 * prefix present. Here we can only check the skipped
1474 * bits. Remember, since we have already indexed into the
1475 * parent's child array, we know that the bits we chopped of
1479 /* NOTA BENE: Checking only skipped bits
1480 for the new node here */
1482 if (current_prefix_length < pos+bits) {
1483 if (tkey_extract_bits(cn->key, current_prefix_length,
1484 cn->pos - current_prefix_length)
1490 * If chopped_off=0, the index is fully validated and we
1491 * only need to look at the skipped bits for this, the new,
1492 * tnode. What we actually want to do is to find out if
1493 * these skipped bits match our key perfectly, or if we will
1494 * have to count on finding a matching prefix further down,
1495 * because if we do, we would like to have some way of
1496 * verifying the existence of such a prefix at this point.
1499 /* The only thing we can do at this point is to verify that
1500 * any such matching prefix can indeed be a prefix to our
1501 * key, and if the bits in the node we are inspecting that
1502 * do not match our key are not ZERO, this cannot be true.
1503 * Thus, find out where there is a mismatch (before cn->pos)
1504 * and verify that all the mismatching bits are zero in the
1509 * Note: We aren't very concerned about the piece of
1510 * the key that precede pn->pos+pn->bits, since these
1511 * have already been checked. The bits after cn->pos
1512 * aren't checked since these are by definition
1513 * "unknown" at this point. Thus, what we want to see
1514 * is if we are about to enter the "prefix matching"
1515 * state, and in that case verify that the skipped
1516 * bits that will prevail throughout this subtree are
1517 * zero, as they have to be if we are to find a
1521 node_prefix = mask_pfx(cn->key, cn->pos);
1522 key_prefix = mask_pfx(key, cn->pos);
1523 pref_mismatch = key_prefix^node_prefix;
1527 * In short: If skipped bits in this node do not match
1528 * the search key, enter the "prefix matching"
1531 if (pref_mismatch) {
1532 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1534 pref_mismatch = pref_mismatch << 1;
1536 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1538 if (key_prefix != 0)
1541 if (current_prefix_length >= cn->pos)
1542 current_prefix_length = mp;
1545 pn = (struct tnode *)n; /* Descend */
1552 /* As zero don't change the child key (cindex) */
1553 while ((chopped_off <= pn->bits)
1554 && !(cindex & (1<<(chopped_off-1))))
1557 /* Decrease current_... with bits chopped off */
1558 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1559 current_prefix_length = pn->pos + pn->bits
1563 * Either we do the actual chop off according or if we have
1564 * chopped off all bits in this tnode walk up to our parent.
1567 if (chopped_off <= pn->bits) {
1568 cindex &= ~(1 << (chopped_off-1));
1570 struct tnode *parent = node_parent((struct node *) pn);
1574 /* Get Child's index */
1575 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1579 #ifdef CONFIG_IP_FIB_TRIE_STATS
1580 t->stats.backtrack++;
1592 * Remove the leaf and return parent.
/* Unlink leaf l from the trie. With a parent tnode the child slot is
 * cleared and the trie is rebalanced from that parent; with no parent
 * the leaf was the root, so the trie pointer is published as empty via
 * RCU. NOTE(review): the if/else lines around the two branches are
 * missing from this fragment (original numbering jumps 1604->1606). */
1595 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1597 struct tnode *tp = node_parent((struct node *) l);
1599 pr_debug("entering trie_leaf_remove(%p)\n", l);
/* Parent case: index of l in tp's child array, then detach + rebalance. */
1602 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1603 put_child(t, (struct tnode *)tp, cindex, NULL);
1604 trie_rebalance(t, tp);
/* Root case: make the whole trie empty, RCU-safely for readers. */
1606 rcu_assign_pointer(t->trie, NULL);
1612 * Caller must hold RTNL.
/* Delete one route (fib_alias) matching cfg from table tb.
 * Looks up the leaf for cfg->fc_dst, finds the alias that matches
 * tos/type/scope/protocol/nexthop, notifies via netlink, unlinks it
 * RCU-style, and removes the leaf_info/leaf when they become empty.
 * NOTE(review): several lines are missing from this fragment (early
 * -ESRCH/-EINVAL returns, loop break, final return). */
1614 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1616 struct trie *t = (struct trie *) tb->tb_data;
1618 int plen = cfg->fc_dst_len;
1619 u8 tos = cfg->fc_tos;
1620 struct fib_alias *fa, *fa_to_delete;
1621 struct list_head *fa_head;
1623 struct leaf_info *li;
/* Destination key in host byte order; mask from the prefix length. */
1628 key = ntohl(cfg->fc_dst);
1629 mask = ntohl(inet_make_mask(plen));
1635 l = fib_find_node(t, key);
1640 fa_head = get_fa_head(l, plen);
1641 fa = fib_find_alias(fa_head, tos, 0);
1646 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1648 fa_to_delete = NULL;
/* Back up one entry so list_for_each_entry_continue starts at fa. */
1649 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1650 list_for_each_entry_continue(fa, fa_head, fa_list) {
1651 struct fib_info *fi = fa->fa_info;
1653 if (fa->fa_tos != tos)
/* Unspecified (zero) type/scope/protocol in cfg act as wildcards. */
1656 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1657 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1658 fa->fa_scope == cfg->fc_scope) &&
1659 (!cfg->fc_protocol ||
1660 fi->fib_protocol == cfg->fc_protocol) &&
1661 fib_nh_match(cfg, fi) == 0) {
/* Announce the deletion to netlink listeners before unlinking. */
1671 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1672 &cfg->fc_nlinfo, 0);
1674 l = fib_find_node(t, key);
1675 li = find_leaf_info(l, plen);
1677 list_del_rcu(&fa->fa_list);
/* Last alias for this prefix length: drop the leaf_info too. */
1679 if (list_empty(fa_head)) {
1680 hlist_del_rcu(&li->hlist);
/* Leaf now holds no prefixes at all: remove it from the trie. */
1684 if (hlist_empty(&l->list))
1685 trie_leaf_remove(t, l);
1687 if (fa->fa_state & FA_S_ACCESSED)
1688 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
/* Release the fib_info reference and free the alias after RCU grace. */
1690 fib_release_info(fa->fa_info);
1691 alias_free_mem_rcu(fa);
/* Remove all aliases on head whose fib_info is marked RTNH_F_DEAD.
 * Returns the number of entries removed (the counter increment line
 * is missing from this fragment). */
1695 static int trie_flush_list(struct list_head *head)
1697 struct fib_alias *fa, *fa_node;
/* _safe variant: entries are unlinked while iterating. */
1700 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1701 struct fib_info *fi = fa->fa_info;
1703 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1704 list_del_rcu(&fa->fa_list);
1705 fib_release_info(fa->fa_info);
1706 alias_free_mem_rcu(fa);
/* Flush dead routes from every leaf_info hanging off leaf l.
 * Returns the total number of aliases removed; leaf_infos left empty
 * are unlinked (their free call is missing from this fragment). */
1713 static int trie_flush_leaf(struct leaf *l)
1716 struct hlist_head *lih = &l->list;
1717 struct hlist_node *node, *tmp;
1718 struct leaf_info *li = NULL;
1720 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1721 found += trie_flush_list(&li->falh);
1723 if (list_empty(&li->falh)) {
1724 hlist_del_rcu(&li->hlist);
1732 * Scan for the next right leaf starting at node p->child[idx]
1733 * Since we have back pointer, no recursion necessary.
/* In-order successor walk over the trie under RCU: starting after
 * child c of tnode p, find the next leaf to the "right"; descend into
 * tnodes, and when a tnode is exhausted climb to its parent and keep
 * scanning. Returns NULL once the root is exhausted. */
1735 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
/* Resume position: index of c within p, plus one. */
1741 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1745 while (idx < 1u << p->bits) {
1746 c = tnode_get_child_rcu(p, idx++);
/* Leaf found; prefetch the following sibling for the next call. */
1751 prefetch(p->child[idx]);
1752 return (struct leaf *) c;
1755 /* Rescan start scanning in new node */
1756 p = (struct tnode *) c;
1760 /* Node empty, walk back up to parent */
1761 c = (struct node *) p;
1762 } while ( (p = node_parent_rcu(c)) != NULL);
1764 return NULL; /* Root of trie */
/* Return the leftmost (first) leaf of trie t, or NULL if t is empty
 * (the NULL check line is missing from this fragment). Must run under
 * rcu_read_lock per the rcu_dereference on t->trie. */
1767 static struct leaf *trie_firstleaf(struct trie *t)
1769 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1774 if (IS_LEAF(n)) /* trie is just a leaf */
1775 return (struct leaf *) n;
1777 return leaf_walk_rcu(n, NULL);
/* Return the leaf following l in walk order, or NULL if l has no
 * parent (single-leaf trie) or is the last leaf. */
1780 static struct leaf *trie_nextleaf(struct leaf *l)
1782 struct node *c = (struct node *) l;
1783 struct tnode *p = node_parent(c);
1786 return NULL; /* trie with just one leaf */
1788 return leaf_walk_rcu(p, c);
/* Return the index-th leaf of t (0-based) by linear walk, or NULL if
 * the trie has fewer leaves (the final return is missing here). */
1791 static struct leaf *trie_leafindex(struct trie *t, int index)
1793 struct leaf *l = trie_firstleaf(t);
1795 while (l && index-- > 0)
1796 l = trie_nextleaf(l);
1803 * Caller must hold RTNL.
/* Flush all RTNH_F_DEAD routes from table tb. Leaves are removed one
 * step behind the walk (ll trails l) so the iterator never stands on
 * a leaf being deleted; the last leaf is handled after the loop. */
1805 static int fn_trie_flush(struct fib_table *tb)
1807 struct trie *t = (struct trie *) tb->tb_data;
1808 struct leaf *l, *ll = NULL;
1811 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1812 found += trie_flush_leaf(l);
/* Remove the previous leaf if flushing emptied it. */
1814 if (ll && hlist_empty(&ll->list))
1815 trie_leaf_remove(t, ll);
/* Tail case: the final leaf visited may also be empty now. */
1819 if (ll && hlist_empty(&ll->list))
1820 trie_leaf_remove(t, ll);
1822 pr_debug("trie_flush found=%d\n", found);
/* Pick a default route (prefix 0.0.0.0/0) for res, mirroring
 * fn_hash_select_default: scan the aliases at key 0, tracking order
 * and the last usable ("alive") candidate, and prefer keeping the
 * currently cached tb->tb_default when it is not dead.
 * NOTE(review): many lines (order bookkeeping, locking, early
 * returns) are missing from this fragment; the visible flow is only
 * the skeleton of the selection loop. */
1826 static void fn_trie_select_default(struct fib_table *tb,
1827 const struct flowi *flp,
1828 struct fib_result *res)
1830 struct trie *t = (struct trie *) tb->tb_data;
1831 int order, last_idx;
1832 struct fib_info *fi = NULL;
1833 struct fib_info *last_resort;
1834 struct fib_alias *fa = NULL;
1835 struct list_head *fa_head;
/* Default routes live in the leaf with key 0. */
1844 l = fib_find_node(t, 0);
1848 fa_head = get_fa_head(l, 0);
1852 if (list_empty(fa_head))
1855 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1856 struct fib_info *next_fi = fa->fa_info;
/* Only unicast aliases in the requested scope qualify. */
1858 if (fa->fa_scope != res->scope ||
1859 fa->fa_type != RTN_UNICAST)
/* Aliases are priority-ordered; a worse priority ends the scan. */
1862 if (next_fi->fib_priority > res->fi->fib_priority)
/* Candidate must have a link-scope gateway nexthop. */
1864 if (!next_fi->fib_nh[0].nh_gw ||
1865 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1867 fa->fa_state |= FA_S_ACCESSED;
1870 if (next_fi != res->fi)
1872 } else if (!fib_detect_death(fi, order, &last_resort,
1873 &last_idx, tb->tb_default)) {
1874 fib_result_assign(res, fi);
1875 tb->tb_default = order;
/* No usable candidate found during the scan. */
1881 if (order <= 0 || fi == NULL) {
1882 tb->tb_default = -1;
/* Last candidate alive: use it and cache its order. */
1886 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1888 fib_result_assign(res, fi);
1889 tb->tb_default = order;
/* Everything dead: fall back to the last resort entry. */
1893 fib_result_assign(res, last_resort);
1894 tb->tb_default = last_idx;
/* Dump the aliases of one prefix (key/plen) into a netlink skb,
 * honoring the resume cursor in cb (cursor lines are missing from
 * this fragment). Returns <0 on a full skb so the dump can resume. */
1899 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1900 struct fib_table *tb,
1901 struct sk_buff *skb, struct netlink_callback *cb)
1904 struct fib_alias *fa;
/* Back to network byte order for the netlink message. */
1905 __be32 xkey = htonl(key);
1910 /* rcu_read_lock is hold by caller */
1912 list_for_each_entry_rcu(fa, fah, fa_list) {
1918 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1927 fa->fa_info, NLM_F_MULTI) < 0) {
/* Dump all prefixes (leaf_infos) of one leaf into the netlink skb,
 * skipping empty alias lists; <0 return propagates "skb full". */
1937 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1938 struct sk_buff *skb, struct netlink_callback *cb)
1940 struct leaf_info *li;
1941 struct hlist_node *node;
1947 /* rcu_read_lock is hold by caller */
1948 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1957 if (list_empty(&li->falh))
1960 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
/* Netlink route dump for one trie table. Resumable: cb->args[2]
 * stores the last dumped key and cb->args[3] the leaf count, so a
 * continued dump restarts at that key (or falls back to counting
 * leaves if the key's leaf was deleted meanwhile). */
1971 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1972 struct netlink_callback *cb)
1975 struct trie *t = (struct trie *) tb->tb_data;
1976 t_key key = cb->args[2];
1977 int count = cb->args[3];
1980 /* Dump starting at last key.
1981 * Note: 0.0.0.0/0 (ie default) is first key.
1984 l = trie_firstleaf(t);
1986 /* Normally, continue from last key, but if that is missing
1987 * fallback to using slow rescan
1989 l = fib_find_node(t, key);
1991 l = trie_leafindex(t, count);
1995 cb->args[2] = l->key;
1996 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
/* skb full: record progress and stop; dump resumes later. */
1997 cb->args[3] = count;
2003 l = trie_nextleaf(l);
/* Reset the per-leaf cursors (args[4..]) for the next leaf. */
2004 memset(&cb->args[4], 0,
2005 sizeof(cb->args) - 4*sizeof(cb->args[0]));
2007 cb->args[3] = count;
/* Boot-time setup: create the slab caches for fib_alias entries and
 * for leaves/leaf_infos (one cache sized for the larger of the two).
 * SLAB_PANIC: allocation failure here panics rather than returns. */
2013 void __init fib_hash_init(void)
2015 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2016 sizeof(struct fib_alias),
2017 0, SLAB_PANIC, NULL);
2019 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2020 max(sizeof(struct leaf),
2021 sizeof(struct leaf_info)),
2022 0, SLAB_PANIC, NULL);
2026 /* Fix more generic FIB names for init later */
/* Allocate and initialize a FIB table backed by an LC-trie. The
 * trie is embedded in tb->tb_data (single allocation). Returns the
 * table; the NULL-check/return lines are missing in this fragment. */
2027 struct fib_table *fib_hash_table(u32 id)
2029 struct fib_table *tb;
2032 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2038 tb->tb_default = -1;
/* Wire the trie implementations into the generic table ops. */
2039 tb->tb_lookup = fn_trie_lookup;
2040 tb->tb_insert = fn_trie_insert;
2041 tb->tb_delete = fn_trie_delete;
2042 tb->tb_flush = fn_trie_flush;
2043 tb->tb_select_default = fn_trie_select_default;
2044 tb->tb_dump = fn_trie_dump;
2046 t = (struct trie *) tb->tb_data;
2047 memset(t, 0, sizeof(*t));
/* Print the banner only once, when the first (local) table is made. */
2049 if (id == RT_TABLE_LOCAL)
2050 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2055 #ifdef CONFIG_PROC_FS
2056 /* Depth first Trie walk iterator */
/* State for the /proc trie walkers: current table, current tnode and
 * (per the field list being truncated here) the child index/depth. */
2057 struct fib_trie_iter {
2058 struct seq_net_private p;
2059 struct fib_table *tb;
2060 struct tnode *tnode;
/* Advance the depth-first iterator one node. Scans the remaining
 * children of the current tnode; descends into child tnodes; when a
 * tnode is exhausted pops to the parent and resumes after the slot
 * we came from. Returns the next node or (in missing lines) NULL. */
2065 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2067 struct tnode *tn = iter->tnode;
2068 unsigned cindex = iter->index;
2071 /* A single entry routing table */
2075 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2076 iter->tnode, iter->index, iter->depth);
2078 while (cindex < (1<<tn->bits)) {
2079 struct node *n = tnode_get_child_rcu(tn, cindex);
/* Leaf: remember the next sibling index and return it. */
2084 iter->index = cindex + 1;
2086 /* push down one level */
2087 iter->tnode = (struct tnode *) n;
2097 /* Current node exhausted, pop back up */
2098 p = node_parent_rcu((struct node *)tn);
/* Resume in the parent just after the slot holding tn. */
2100 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/* Position the iterator at the root of trie t and return the root
 * node (RCU-dereferenced); remaining setup lines are not visible. */
2110 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2118 n = rcu_dereference(t->trie);
2123 iter->tnode = (struct tnode *) n;
/* Walk the whole trie and fill s with aggregate statistics: leaf
 * count/depths, prefix counts, tnode counts, node-size histogram and
 * (in missing lines) null-pointer counts. */
2135 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2138 struct fib_trie_iter iter;
2140 memset(s, 0, sizeof(*s));
2143 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2145 struct leaf *l = (struct leaf *)n;
2146 struct leaf_info *li;
2147 struct hlist_node *tmp;
/* Leaf branch: track depth sum and maximum for averaging later. */
2150 s->totdepth += iter.depth;
2151 if (iter.depth > s->maxdepth)
2152 s->maxdepth = iter.depth;
2154 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
/* Tnode branch: bucket by bits, then scan children (body missing). */
2157 const struct tnode *tn = (const struct tnode *) n;
2161 if (tn->bits < MAX_STAT_DEPTH)
2162 s->nodesizes[tn->bits]++;
2164 for (i = 0; i < (1<<tn->bits); i++)
2173 * This outputs /proc/net/fib_triestats
/* Pretty-print the statistics collected by trie_collect_stats,
 * including an estimated memory footprint in kB. */
2175 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2177 unsigned i, max, pointers, bytes, avdepth;
/* Average depth scaled by 100 to print two decimals without floats. */
2180 avdepth = stat->totdepth*100 / stat->leaves;
2184 seq_printf(seq, "\tAver depth: %u.%02d\n",
2185 avdepth / 100, avdepth % 100);
2186 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2188 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2189 bytes = sizeof(struct leaf) * stat->leaves;
2191 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2192 bytes += sizeof(struct leaf_info) * stat->prefixes;
2194 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2195 bytes += sizeof(struct tnode) * stat->tnodes;
/* Trim trailing zero buckets from the size histogram. */
2197 max = MAX_STAT_DEPTH;
2198 while (max > 0 && stat->nodesizes[max-1] == 0)
2202 for (i = 1; i <= max; i++)
2203 if (stat->nodesizes[i] != 0) {
2204 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
/* A tnode with b bits owns 2^b child pointers. */
2205 pointers += (1<<i) * stat->nodesizes[i];
2207 seq_putc(seq, '\n');
2208 seq_printf(seq, "\tPointers: %u\n", pointers);
2210 bytes += sizeof(struct node *) * pointers;
2211 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2212 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2215 #ifdef CONFIG_IP_FIB_TRIE_STATS
/* Print the optional per-trie runtime counters (lookup gets,
 * backtracks, semantic match hits/misses, etc.) to the seq_file. */
2216 static void trie_show_usage(struct seq_file *seq,
2217 const struct trie_use_stats *stats)
2219 seq_printf(seq, "\nCounters:\n---------\n");
2220 seq_printf(seq, "gets = %u\n", stats->gets);
2221 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2222 seq_printf(seq, "semantic match passed = %u\n",
2223 stats->semantic_match_passed);
2224 seq_printf(seq, "semantic match miss = %u\n",
2225 stats->semantic_match_miss);
2226 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2227 seq_printf(seq, "skipped node resize = %u\n\n",
2228 stats->resize_node_skipped);
2230 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/* Print a human-readable heading for table tb: the well-known local
 * and main tables by name, anything else by numeric id. */
2232 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2234 if (tb->tb_id == RT_TABLE_LOCAL)
2235 seq_puts(seq, "Local:\n");
2236 else if (tb->tb_id == RT_TABLE_MAIN)
2237 seq_puts(seq, "Main:\n");
2239 seq_printf(seq, "Id %d:\n", tb->tb_id);
/* seq_file show callback for /proc/net/fib_triestats: for every FIB
 * table in this netns, collect and print trie statistics (plus usage
 * counters when CONFIG_IP_FIB_TRIE_STATS is on). */
2243 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2245 struct net *net = (struct net *)seq->private;
2249 "Basic info: size of leaf:"
2250 " %Zd bytes, size of tnode: %Zd bytes.\n",
2251 sizeof(struct leaf), sizeof(struct tnode));
/* Iterate every hash bucket of the per-netns table list. */
2253 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2254 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2255 struct hlist_node *node;
2256 struct fib_table *tb;
2258 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2259 struct trie *t = (struct trie *) tb->tb_data;
2260 struct trie_stat stat;
2265 fib_table_print(seq, tb);
2267 trie_collect_stats(t, &stat);
2268 trie_show_stats(seq, &stat);
2269 #ifdef CONFIG_IP_FIB_TRIE_STATS
2270 trie_show_usage(seq, &t->stats);
/* open() for /proc/net/fib_triestats: single-shot, netns-aware. */
2278 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2280 return single_open_net(inode, file, fib_triestat_seq_show);
/* File operations for fib_triestats (read/llseek lines truncated). */
2283 static const struct file_operations fib_triestat_fops = {
2284 .owner = THIS_MODULE,
2285 .open = fib_triestat_seq_open,
2287 .llseek = seq_lseek,
2288 .release = single_release_net,
/* Find the pos-th node across all tables in this netns for seq_file
 * positioning: walk each table's trie, counting nodes until pos is
 * reached (the countdown/return lines are missing here). */
2291 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2293 struct fib_trie_iter *iter = seq->private;
2294 struct net *net = seq_file_net(seq);
2298 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2299 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2300 struct hlist_node *node;
2301 struct fib_table *tb;
2303 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2306 for (n = fib_trie_get_first(iter,
2307 (struct trie *) tb->tb_data);
2308 n; n = fib_trie_get_next(iter))
/* seq_file start: position at the *pos-th trie node (the
 * rcu_read_lock call preceding this return is not visible here). */
2319 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2323 return fib_trie_get_idx(seq, *pos);
/* seq_file next: advance within the current trie; when it is
 * exhausted, move to the next table on the same hash chain, then to
 * subsequent hash chains, starting each new trie from its root. */
2326 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2328 struct fib_trie_iter *iter = seq->private;
2329 struct net *net = seq_file_net(seq);
2330 struct fib_table *tb = iter->tb;
2331 struct hlist_node *tb_node;
2336 /* next node in same table */
2337 n = fib_trie_get_next(iter);
2341 /* walk rest of this hash chain */
2342 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2343 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2344 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2345 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2350 /* new hash chain */
2351 while (++h < FIB_TABLE_HASHSZ) {
2352 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2353 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2354 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
/* seq_file stop: counterpart of start (rcu_read_unlock body is
 * not visible in this fragment). */
2366 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/* Emit n levels of three-space indentation for the trie dump. */
2372 static void seq_indent(struct seq_file *seq, int n)
2374 while (n-- > 0) seq_puts(seq, "   ");
/* Map an rt_scope_t to a printable name; unknown scopes are
 * formatted numerically into buf (returned via a missing line). */
2377 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2380 case RT_SCOPE_UNIVERSE: return "universe";
2381 case RT_SCOPE_SITE: return "site";
2382 case RT_SCOPE_LINK: return "link";
2383 case RT_SCOPE_HOST: return "host";
2384 case RT_SCOPE_NOWHERE: return "nowhere";
2386 snprintf(buf, len, "scope=%d", s);
/* Table of printable names for the RTN_* route types, indexed by
 * type value; gaps stay NULL and fall back to rtn_type() below. */
2391 static const char *rtn_type_names[__RTN_MAX] = {
2392 [RTN_UNSPEC] = "UNSPEC",
2393 [RTN_UNICAST] = "UNICAST",
2394 [RTN_LOCAL] = "LOCAL",
2395 [RTN_BROADCAST] = "BROADCAST",
2396 [RTN_ANYCAST] = "ANYCAST",
2397 [RTN_MULTICAST] = "MULTICAST",
2398 [RTN_BLACKHOLE] = "BLACKHOLE",
2399 [RTN_UNREACHABLE] = "UNREACHABLE",
2400 [RTN_PROHIBIT] = "PROHIBIT",
2401 [RTN_THROW] = "THROW",
2403 [RTN_XRESOLVE] = "XRESOLVE",
/* Look up a route type name, formatting unknown values into buf. */
2406 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2408 if (t < __RTN_MAX && rtn_type_names[t])
2409 return rtn_type_names[t];
2410 snprintf(buf, len, "type %u", t);
2414 /* Pretty print the trie */
/* seq_file show for /proc/net/fib_trie: prints the table heading at
 * each trie root, tnodes as "+--" lines with prefix/pos/bits/child
 * counts, and leaves as "|--" lines followed by one line per alias
 * (prefix length, scope, type, optional tos). */
2415 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2417 const struct fib_trie_iter *iter = seq->private;
/* Root node: emit the owning table's name first. */
2420 if (!node_parent_rcu(n))
2421 fib_table_print(seq, iter->tb);
2424 struct tnode *tn = (struct tnode *) n;
/* Network-order prefix of the tnode key, masked to tn->pos bits. */
2425 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2427 seq_indent(seq, iter->depth-1);
2428 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2429 &prf, tn->pos, tn->bits, tn->full_children,
2430 tn->empty_children);
2433 struct leaf *l = (struct leaf *) n;
2434 struct leaf_info *li;
2435 struct hlist_node *node;
2436 __be32 val = htonl(l->key);
2438 seq_indent(seq, iter->depth);
2439 seq_printf(seq, " |-- %pI4\n", &val);
2441 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2442 struct fib_alias *fa;
2444 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2445 char buf1[32], buf2[32];
2447 seq_indent(seq, iter->depth+1);
2448 seq_printf(seq, " /%d %s %s", li->plen,
2449 rtn_scope(buf1, sizeof(buf1),
2451 rtn_type(buf2, sizeof(buf2),
/* Only printed when tos is non-zero (condition line missing). */
2454 seq_printf(seq, " tos=%d", fa->fa_tos);
2455 seq_putc(seq, '\n');
/* seq_file operation table for /proc/net/fib_trie. */
2463 static const struct seq_operations fib_trie_seq_ops = {
2464 .start = fib_trie_seq_start,
2465 .next = fib_trie_seq_next,
2466 .stop = fib_trie_seq_stop,
2467 .show = fib_trie_seq_show,
/* open(): netns-aware seq_file with a fib_trie_iter as private. */
2470 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2472 return seq_open_net(inode, file, &fib_trie_seq_ops,
2473 sizeof(struct fib_trie_iter));
/* File operations for /proc/net/fib_trie (read line truncated). */
2476 static const struct file_operations fib_trie_fops = {
2477 .owner = THIS_MODULE,
2478 .open = fib_trie_seq_open,
2480 .llseek = seq_lseek,
2481 .release = seq_release_net,
/* Iterator state for /proc/net/route: the main table's trie plus a
 * cached key/pos (fields truncated here) to resume walks cheaply. */
2484 struct fib_route_iter {
2485 struct seq_net_private p;
2486 struct trie *main_trie;
/* Return the pos-th leaf of the main trie. Uses the cached key from
 * the previous call to avoid rewalking from the start when the new
 * pos is at or past the cached position; otherwise restarts from the
 * first leaf. Caches the found key, or clears the cache on miss. */
2491 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2493 struct leaf *l = NULL;
2494 struct trie *t = iter->main_trie;
2496 /* use cache location of last found key */
2497 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2501 l = trie_firstleaf(t);
2504 while (l && pos-- > 0) {
2506 l = trie_nextleaf(l);
2510 iter->key = pos; /* remember it */
2512 iter->pos = 0; /* forget it */
/* seq_file start for /proc/net/route: look up the main table in this
 * netns; position 0 yields the header token, otherwise the
 * (pos-1)-th leaf. Missing lines include rcu_read_lock and the
 * NULL-table check. */
2517 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2520 struct fib_route_iter *iter = seq->private;
2521 struct fib_table *tb;
2524 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2528 iter->main_trie = (struct trie *) tb->tb_data;
2530 return SEQ_START_TOKEN;
2532 return fib_route_get_idx(iter, *pos - 1);
/* seq_file next: first leaf after the header token, otherwise the
 * following leaf (cursor update lines are truncated). */
2535 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2537 struct fib_route_iter *iter = seq->private;
2541 if (v == SEQ_START_TOKEN) {
2543 l = trie_firstleaf(iter->main_trie);
2546 l = trie_nextleaf(l);
/* seq_file stop: unlock counterpart (body not visible here). */
2556 static void fib_route_seq_stop(struct seq_file *seq, void *v)
/* Translate a route type/mask/fib_info into the RTF_* flag word used
 * by the legacy /proc/net/route format: REJECT for unreachable (7)
 * and prohibit (8) types, GATEWAY when a nexthop gateway is set, and
 * (in a missing line) HOST for /32 masks; RTF_UP is added later. */
2562 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2564 static unsigned type2flags[RTN_MAX + 1] = {
2565 [7] = RTF_REJECT, [8] = RTF_REJECT,
2567 unsigned flags = type2flags[type];
2569 if (fi && fi->fib_nh->nh_gw)
2570 flags |= RTF_GATEWAY;
2571 if (mask == htonl(0xFFFFFFFF))
2578 * This outputs /proc/net/route.
2579 * The format of the file is not supposed to be changed
2580 * and needs to be same as fib_hash output to avoid breaking
/* seq_file show for /proc/net/route: header line for the start
 * token, then one fixed-width line per (prefix, alias), skipping
 * broadcast/multicast entries. %n captures the line length so the
 * line can be padded to 127 characters, matching fib_hash. */
2583 static int fib_route_seq_show(struct seq_file *seq, void *v)
2586 struct leaf_info *li;
2587 struct hlist_node *node;
2589 if (v == SEQ_START_TOKEN) {
2590 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2591 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2596 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2597 struct fib_alias *fa;
2598 __be32 mask, prefix;
2600 mask = inet_make_mask(li->plen);
2601 prefix = htonl(l->key);
2603 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2604 const struct fib_info *fi = fa->fa_info;
2605 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
/* These types never appear in the legacy route listing. */
2608 if (fa->fa_type == RTN_BROADCAST
2609 || fa->fa_type == RTN_MULTICAST)
/* Route with fib_info: real device, gateway and metrics. */
2614 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2615 "%d\t%08X\t%d\t%u\t%u%n",
2616 fi->fib_dev ? fi->fib_dev->name : "*",
2618 fi->fib_nh->nh_gw, flags, 0, 0,
/* MTU column subtracts 40 header bytes from advmss. */
2622 fi->fib_advmss + 40 : 0),
2624 fi->fib_rtt >> 3, &len);
/* No fib_info: placeholder device and zeroed columns. */
2627 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2628 "%d\t%08X\t%d\t%u\t%u%n",
2629 prefix, 0, flags, 0, 0, 0,
2630 mask, 0, 0, 0, &len);
/* Pad the record out to the fixed 127-column width. */
2632 seq_printf(seq, "%*s\n", 127 - len, "");
/* seq_file operation table for /proc/net/route. */
2639 static const struct seq_operations fib_route_seq_ops = {
2640 .start = fib_route_seq_start,
2641 .next = fib_route_seq_next,
2642 .stop = fib_route_seq_stop,
2643 .show = fib_route_seq_show,
/* open(): netns-aware seq_file with a fib_route_iter as private. */
2646 static int fib_route_seq_open(struct inode *inode, struct file *file)
2648 return seq_open_net(inode, file, &fib_route_seq_ops,
2649 sizeof(struct fib_route_iter));
/* File operations for /proc/net/route (read line truncated). */
2652 static const struct file_operations fib_route_fops = {
2653 .owner = THIS_MODULE,
2654 .open = fib_route_seq_open,
2656 .llseek = seq_lseek,
2657 .release = seq_release_net,
/* Per-netns /proc setup: create fib_trie, fib_triestat and route
 * entries. On failure the already-created entries are removed in
 * reverse order via the labels below (goto lines are truncated). */
2660 int __net_init fib_proc_init(struct net *net)
2662 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2665 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2666 &fib_triestat_fops))
2669 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
/* Error unwind: remove in reverse creation order. */
2675 proc_net_remove(net, "fib_triestat");
2677 proc_net_remove(net, "fib_trie");
/* Per-netns /proc teardown: remove all three entries created by
 * fib_proc_init. */
2682 void __net_exit fib_proc_exit(struct net *net)
2684 proc_net_remove(net, "fib_trie");
2685 proc_net_remove(net, "fib_triestat");
2686 proc_net_remove(net, "route");
2689 #endif /* CONFIG_PROC_FS */