2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.403"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/rcupdate.h>
66 #include <linux/skbuff.h>
67 #include <linux/netlink.h>
68 #include <linux/init.h>
69 #include <linux/list.h>
71 #include <net/protocol.h>
72 #include <net/route.h>
75 #include <net/ip_fib.h>
76 #include "fib_lookup.h"
/*
 * The statistics config symbol is unconditionally undefined here, so the
 * CONFIG_IP_FIB_TRIE_STATS sections below it are compiled out.
 * MAX_CHILDS is the length of the nodesizes[] array in the stats record.
 */
78 #undef CONFIG_IP_FIB_TRIE_STATS
79 #define MAX_CHILDS 16384
/* Number of bits in a trie key (t_key). */
#define KEYLENGTH (8*sizeof(t_key))

/*
 * MASK_PFX(k, l): keep the l most significant bits of key k and clear the
 * remaining low bits.  l == 0 yields 0 without shifting by the full key
 * width.  Every argument is parenthesized so that expression arguments
 * (for example a sum of two lengths) produce the intended shift count.
 * Note that l is evaluated more than once.
 */
#define MASK_PFX(k, l) \
	(((l) == 0) ? 0 : (((k) >> (KEYLENGTH - (l))) << (KEYLENGTH - (l))))

/*
 * TKEY_GET_MASK(offset, bits): a mask of `bits` consecutive one bits that
 * starts `offset` bits below the most significant bit.  bits == 0 yields 0.
 * Arguments are parenthesized; bits is evaluated more than once.
 */
#define TKEY_GET_MASK(offset, bits) \
	(((bits) == 0) ? 0 : ((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
/* Key type of the trie; KEYLENGTH is derived from its size. */
85 typedef unsigned int t_key;
/*
 * The parent word of every node doubles as a type tag: the lowest bit holds
 * the node type and the remaining bits hold the parent tnode pointer.
 */
#define NODE_TYPE_MASK	0x1UL

/* Parent tnode of a node with the type bit stripped, read via RCU. */
#define NODE_PARENT(node) \
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

/* Type tag stored in the lowest bit of the parent word. */
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

/* Publish a new parent pointer while keeping the existing type tag. */
#define NODE_SET_PARENT(node, ptr) \
	rcu_assign_pointer((node)->parent, \
			   ((unsigned long)(ptr)) | NODE_TYPE(node))

/*
 * Type predicates.  The argument is parenthesized so that any pointer
 * expression (not just a plain identifier) can be passed.
 */
#define IS_TNODE(n) (!((n)->parent & T_LEAF))
#define IS_LEAF(n) ((n)->parent & T_LEAF)
/*
 * Field fragments of the smaller trie records; the struct header lines are
 * not part of this listing.  Judging by their users, list is the hlist head
 * initialised in leaf_new(), and hlist/falh are the leaf_info members used
 * by insert_leaf_info() and leaf_info_new() - TODO confirm against the full
 * definitions.  The parent word is an unsigned long so that the NODE_*
 * macros can keep the node type in its lowest bit.
 */
104 unsigned long parent;
109 unsigned long parent;
110 struct hlist_head list;
115 struct hlist_node hlist;
118 struct list_head falh;
/*
 * Fields of an internal node (tnode); the struct header and some members
 * are not part of this listing.  child[] is a trailing array with
 * 1 << bits slots (see tnode_new()).
 *
 * NOTE(review): full_children and empty_children are unsigned short, yet
 * tnode_new() stores 1<<bits into empty_children.  A node with bits == 16
 * would wrap that counter to 0.  Nothing visible here caps bits below 16 -
 * confirm a limit exists or widen the counters.
 */
123 unsigned long parent;
124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
/*
 * Optional usage counters, compiled only with CONFIG_IP_FIB_TRIE_STATS.
 * In the visible code: backtrack and null_node_hit are bumped in
 * fn_trie_lookup(), semantic_match_passed and semantic_match_miss in
 * check_leaf(), and resize_node_skipped in resize().
 */
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
/*
 * Fields of a trie shape summary (struct header not part of this listing).
 * nodesizes[] has MAX_CHILDS entries; the code that fills these fields is
 * not visible here.
 */
144 unsigned int totdepth;
145 unsigned int maxdepth;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
/*
 * Fields of the per-table trie record (struct header and root pointer are
 * not part of this listing).  The usage counters are embedded only when
 * CONFIG_IP_FIB_TRIE_STATS is defined.
 */
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
158 unsigned int revision;
/*
 * Forward declarations for the mutually dependent node-maintenance
 * routines, followed by the file-scope state: the slab cache used for
 * fib_alias objects and the two trie pointers (local and main).
 */
161 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
162 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 static void tnode_free(struct tnode *tn);
168 static kmem_cache_t *fn_alias_kmem __read_mostly;
169 static struct trie *trie_local = NULL, *trie_main = NULL;
172 /* rcu_read_lock needs to be held by caller from readside */
/*
 * Return child slot i of tn.  The index must be below 1 << tn->bits
 * (enforced with BUG_ON); the slot is read through rcu_dereference().
 */
174 static inline struct node *tnode_get_child(struct tnode *tn, int i)
176 BUG_ON(i >= 1 << tn->bits);
178 return rcu_dereference(tn->child[i]);
/* Number of child slots of tn, that is 1 << tn->bits. */
181 static inline int tnode_child_length(const struct tnode *tn)
183 return 1 << tn->bits;
/*
 * Extract `bits` bits of key a, starting `offset` bits below the most
 * significant bit, and return them right-aligned.  The branch taken when
 * offset >= KEYLENGTH is not part of this listing.
 *
 * NOTE(review): with bits == 0 the right shift count equals KEYLENGTH,
 * which is undefined in C.  Callers appear to always pass bits >= 1 -
 * confirm.
 */
186 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
188 if (offset < KEYLENGTH)
189 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
/*
 * Whole-key comparison helper.  The body is not part of this listing;
 * callers treat a nonzero result as a match between a and b.
 */
194 static inline int tkey_equals(t_key a, t_key b)
/*
 * Compare a and b on the `bits` bits that start `offset` bits below the
 * most significant bit; nonzero means that range is identical.  bits is
 * clamped to KEYLENGTH.  The result for the early-out case (bits == 0 or
 * offset >= KEYLENGTH) is not part of this listing - presumably a match.
 */
199 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
201 if (bits == 0 || offset >= KEYLENGTH)
203 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
204 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
/*
 * Find a bit position at which keys a and b differ, scanning from the
 * most significant side.  Only the scan loop is visible: it advances i
 * until bit i of diff (counted from the top) is set.  fib_insert_node()
 * uses the result as the position of a new tnode.
 *
 * NOTE(review): if diff were 0 the loop would shift past the key width;
 * presumably an earlier check, not visible here, rules that out - confirm.
 */
207 static inline int tkey_mismatch(t_key a, int offset, t_key b)
214 while ((diff << i) >> (KEYLENGTH-1) == 0)
220 To understand this stuff, an understanding of keys and all their bits is
221 necessary. Every node in the trie has a key associated with it, but not
222 all of the bits in that key are significant.
224 Consider a node 'n' and its parent 'tp'.
226 If n is a leaf, every bit in its key is significant. Its presence is
227 necessitated by path compression, since during a tree traversal (when
228 searching for a leaf - unless we are doing an insertion) we will completely
229 ignore all skipped bits we encounter. Thus we need to verify, at the end of
230 a potentially successful search, that we have indeed been walking the
233 Note that we can never "miss" the correct key in the tree if present by
234 following the wrong path. Path compression ensures that segments of the key
235 that are the same for all keys with a given prefix are skipped, but the
236 skipped part *is* identical for each node in the subtrie below the skipped
237 bit! fib_insert_node() in this implementation takes care of that - note the
238 call to tkey_sub_equals() in fib_insert_node().
240 if n is an internal node - a 'tnode' here, the various parts of its key
241 have many different meanings.
244 _________________________________________________________________
245 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
246 -----------------------------------------------------------------
247 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
249 _________________________________________________________________
250 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
251 -----------------------------------------------------------------
252 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
259 First, let's just ignore the bits that come before the parent tp, that is
260 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
261 not use them for anything.
263 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
264 index into the parent's child array. That is, they will be used to find
265 'n' among tp's children.
267 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
270 All the bits we have seen so far are significant to the node n. The rest
271 of the bits are really not needed or indeed known in n->key.
273 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
274 n's child array, and will of course be different for each child.
277 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
/*
 * Sanity check: warn when a non-NULL tnode would index past bit 32 of the
 * key, that is when pos + bits exceeds 32.
 */
282 static inline void check_tnode(const struct tnode *tn)
284 WARN_ON(tn && tn->pos+tn->bits > 32);
/*
 * Fill-factor thresholds, in percent, consulted by resize(): a node is
 * halved while its fill is below halve_threshold and inflated while the
 * projected fill of the doubled node is at least inflate_threshold.
 */
287 static int halve_threshold = 25;
288 static int inflate_threshold = 50;
/*
 * RCU callback: recover the fib_alias that embeds head and return it to
 * the fn_alias_kmem slab cache.
 */
291 static void __alias_free_mem(struct rcu_head *head)
293 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
294 kmem_cache_free(fn_alias_kmem, fa);
/* Queue fa for release through call_rcu() with __alias_free_mem. */
297 static inline void alias_free_mem_rcu(struct fib_alias *fa)
299 call_rcu(&fa->rcu, __alias_free_mem);
/* RCU callback: kfree() the leaf that embeds head. */
302 static void __leaf_free_rcu(struct rcu_head *head)
304 kfree(container_of(head, struct leaf, rcu));
/* Queue a leaf for release through call_rcu() with __leaf_free_rcu. */
307 static inline void free_leaf(struct leaf *leaf)
309 call_rcu(&leaf->rcu, __leaf_free_rcu);
/* RCU callback: kfree() the leaf_info that embeds head. */
312 static void __leaf_info_free_rcu(struct rcu_head *head)
314 kfree(container_of(head, struct leaf_info, rcu));
/*
 * Queue a leaf_info for release through call_rcu() with
 * __leaf_info_free_rcu.
 */
317 static inline void free_leaf_info(struct leaf_info *leaf)
319 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
/*
 * Allocate `size` bytes of zeroed memory for a tnode.  Requests of at
 * most PAGE_SIZE come from kcalloc(); larger ones come from whole pages
 * requested with __GFP_ZERO.  __tnode_free_rcu() applies the same size
 * split when releasing.  The handling of a failed page allocation is not
 * part of this listing.
 */
322 static struct tnode *tnode_alloc(unsigned int size)
326 if (size <= PAGE_SIZE)
327 return kcalloc(size, 1, GFP_KERNEL);
329 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
333 return page_address(pages);
/*
 * RCU callback that releases a tnode.  The allocation size is recomputed
 * from tn->bits so that the release path matches the one tnode_alloc()
 * chose: page-based allocations go to free_pages(); the small-object
 * branch is not part of this listing (presumably kfree).
 */
336 static void __tnode_free_rcu(struct rcu_head *head)
338 struct tnode *tn = container_of(head, struct tnode, rcu);
339 unsigned int size = sizeof(struct tnode) +
340 (1 << tn->bits) * sizeof(struct node *);
342 if (size <= PAGE_SIZE)
345 free_pages((unsigned long)tn, get_order(size));
/* Queue a tnode for release through call_rcu() with __tnode_free_rcu. */
348 static inline void tnode_free(struct tnode *tn)
350 call_rcu(&tn->rcu, __tnode_free_rcu);
/*
 * Allocate a leaf with kmalloc(GFP_KERNEL) and initialise its hlist head.
 * Any further initialisation and the return statement are not part of
 * this listing.
 */
353 static struct leaf *leaf_new(void)
355 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
358 INIT_HLIST_HEAD(&l->list);
/*
 * Allocate a leaf_info with kmalloc(GFP_KERNEL) and initialise its alias
 * list head (falh).  How plen is recorded is not part of this listing -
 * presumably stored in the plen member that find_leaf_info() compares.
 */
363 static struct leaf_info *leaf_info_new(int plen)
365 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
368 INIT_LIST_HEAD(&li->falh);
/*
 * Allocate a tnode with 1 << bits child slots through tnode_alloc() and
 * mark it as a T_TNODE with no full children and all slots empty.  The
 * assignments of key, pos and bits are not part of this listing.
 *
 * NOTE(review): the debug message computes the child area with
 * sizeof(struct node) while the allocation uses sizeof(struct node *),
 * so the logged size does not describe the allocation.
 * NOTE(review): empty_children is an unsigned short; 1<<bits wraps to 0
 * once bits reaches 16.
 */
373 static struct tnode* tnode_new(t_key key, int pos, int bits)
375 int nchildren = 1<<bits;
376 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
377 struct tnode *tn = tnode_alloc(sz);
381 tn->parent = T_TNODE;
385 tn->full_children = 0;
386 tn->empty_children = 1<<bits;
389 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
390 (unsigned int) (sizeof(struct node) * 1<<bits));
/*
 * For a non-NULL, non-leaf child n the result is nonzero exactly when n
 * starts at the bit right after the range indexed by tn, that is when
 * n->pos == tn->pos + tn->bits.  The value returned for NULL or leaf
 * children is not part of this listing (presumably 0).
 */
395 * Check whether a tnode 'n' is "full", i.e. it is an internal node
396 * and no bits are skipped. See discussion in dyntree paper p. 6
399 static inline int tnode_full(const struct tnode *tn, const struct node *n)
401 if (n == NULL || IS_LEAF(n))
404 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
/*
 * Store n in slot i of tn by calling tnode_put_child_reorg() with
 * wasfull = -1.  The trie argument t is not used in the visible body.
 */
407 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
409 tnode_put_child_reorg(tn, i, n, -1);
/*
 * Replace slot i of tn with n and keep the bookkeeping in step:
 * empty_children changes when the slot goes between NULL and non-NULL,
 * and the full-children accounting is driven by comparing the old state
 * (wasfull) with tnode_full(tn, n).  put_child() passes wasfull = -1; the
 * condition under which wasfull is recomputed from the old child is not
 * part of this listing.  The parent link of n is set before the slot is
 * published with rcu_assign_pointer().
 */
413 * Add a child at position i overwriting the old value.
414 * Update the value of full_children and empty_children.
417 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
419 struct node *chi = tn->child[i];
422 BUG_ON(i >= 1<<tn->bits);
425 /* update emptyChildren */
426 if (n == NULL && chi != NULL)
427 tn->empty_children++;
428 else if (n != NULL && chi == NULL)
429 tn->empty_children--;
431 /* update fullChildren */
433 wasfull = tnode_full(tn, chi);
435 isfull = tnode_full(tn, n);
436 if (wasfull && !isfull)
438 else if (!wasfull && isfull)
442 NODE_SET_PARENT(n, tn);
444 rcu_assign_pointer(tn->child[i], n);
/*
 * Rebalance a single tnode and return the node that should take its
 * place.  Visible steps, in order:
 *   - a node whose slots are all empty is handled first;
 *   - a node with exactly one child is compressed away: the child gets a
 *     NULL parent and replaces the node;
 *   - while the node has full children and the projected fill of the
 *     doubled node reaches inflate_threshold, it is inflated;
 *   - while bits > 1 and the fill is below halve_threshold, it is halved;
 *   - the single-child compression is tried once more.
 * The calls to inflate() and halve() and their failure handling are not
 * part of this listing; the resize_node_skipped counter is bumped inside
 * both loops when statistics are enabled.
 */
447 static struct node *resize(struct trie *t, struct tnode *tn)
451 struct tnode *old_tn;
456 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
457 tn, inflate_threshold, halve_threshold);
460 if (tn->empty_children == tnode_child_length(tn)) {
465 if (tn->empty_children == tnode_child_length(tn) - 1)
466 for (i = 0; i < tnode_child_length(tn); i++) {
473 /* compress one level */
474 NODE_SET_PARENT(n, NULL);
479 * Double as long as the resulting node has a number of
480 * nonempty nodes that are above the threshold.
484 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
485 * the Helsinki University of Technology and Matti Tikkanen of Nokia
486 * Telecommunications, page 6:
487 * "A node is doubled if the ratio of non-empty children to all
488 * children in the *doubled* node is at least 'high'."
490 * 'high' in this instance is the variable 'inflate_threshold'. It
491 * is expressed as a percentage, so we multiply it with
492 * tnode_child_length() and instead of multiplying by 2 (since the
493 * child array will be doubled by inflate()) and multiplying
494 * the left-hand side by 100 (to handle the percentage thing) we
495 * multiply the left-hand side by 50.
497 * The left-hand side may look a bit weird: tnode_child_length(tn)
498 * - tn->empty_children is of course the number of non-null children
499 * in the current node. tn->full_children is the number of "full"
500 * children, that is non-null tnodes with a skip value of 0.
501 * All of those will be doubled in the resulting inflated tnode, so
502 * we just count them one extra time here.
504 * A clearer way to write this would be:
506 * to_be_doubled = tn->full_children;
507 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
510 * new_child_length = tnode_child_length(tn) * 2;
512 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
514 * if (new_fill_factor >= inflate_threshold)
516 * ...and so on, tho it would mess up the while () loop.
519 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
523 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
524 * inflate_threshold * new_child_length
526 * expand not_to_be_doubled and to_be_doubled, and shorten:
527 * 100 * (tnode_child_length(tn) - tn->empty_children +
528 * tn->full_children) >= inflate_threshold * new_child_length
530 * expand new_child_length:
531 * 100 * (tnode_child_length(tn) - tn->empty_children +
532 * tn->full_children) >=
533 * inflate_threshold * tnode_child_length(tn) * 2
536 * 50 * (tn->full_children + tnode_child_length(tn) -
537 * tn->empty_children) >= inflate_threshold *
538 * tnode_child_length(tn)
545 while ((tn->full_children > 0 &&
546 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
547 inflate_threshold * tnode_child_length(tn))) {
553 #ifdef CONFIG_IP_FIB_TRIE_STATS
554 t->stats.resize_node_skipped++;
563 * Halve as long as the number of empty children in this
564 * node is above threshold.
568 while (tn->bits > 1 &&
569 100 * (tnode_child_length(tn) - tn->empty_children) <
570 halve_threshold * tnode_child_length(tn)) {
576 #ifdef CONFIG_IP_FIB_TRIE_STATS
577 t->stats.resize_node_skipped++;
584 /* Only one child remains */
585 if (tn->empty_children == tnode_child_length(tn) - 1)
586 for (i = 0; i < tnode_child_length(tn); i++) {
593 /* compress one level */
595 NODE_SET_PARENT(n, NULL);
600 return (struct node *) tn;
/*
 * Build a replacement for tn with one more index bit (twice the slots)
 * and return it; the old tnode is handed to tnode_free().
 *
 * Pass 1 preallocates a left and a right tnode for every child that is a
 * tnode starting directly after the bits indexed by the old node, and
 * parks them in slots 2*i and 2*i+1 of the new node, so that an
 * allocation failure can be unwound before anything is moved.
 *
 * Pass 2 moves the children across: a leaf, or a tnode that still skips
 * bits, goes to slot 2*i or 2*i+1 depending on its next key bit; a 1-bit
 * tnode is dissolved and its two children take both slots; a larger tnode
 * has its children split between the preallocated left and right nodes,
 * each of which is then passed through resize().
 *
 * ERR_PTR(-ENOMEM) is returned when the new node cannot be allocated and
 * from the unwind path, which releases the tnodes parked in the new node.
 */
603 static struct tnode *inflate(struct trie *t, struct tnode *tn)
606 struct tnode *oldtnode = tn;
607 int olen = tnode_child_length(tn);
610 pr_debug("In inflate\n");
612 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
615 return ERR_PTR(-ENOMEM);
618 * Preallocate and store tnodes before the actual work so we
619 * don't get into an inconsistent state if memory allocation
620 * fails. In case of failure we return the oldnode and inflate
621 * of tnode is ignored.
624 for (i = 0; i < olen; i++) {
625 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
629 inode->pos == oldtnode->pos + oldtnode->bits &&
631 struct tnode *left, *right;
632 t_key m = TKEY_GET_MASK(inode->pos, 1);
634 left = tnode_new(inode->key&(~m), inode->pos + 1,
639 right = tnode_new(inode->key|m, inode->pos + 1,
647 put_child(t, tn, 2*i, (struct node *) left);
648 put_child(t, tn, 2*i+1, (struct node *) right);
652 for (i = 0; i < olen; i++) {
653 struct node *node = tnode_get_child(oldtnode, i);
654 struct tnode *left, *right;
661 /* A leaf or an internal node with skipped bits */
663 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
664 tn->pos + tn->bits - 1) {
665 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
667 put_child(t, tn, 2*i, node);
669 put_child(t, tn, 2*i+1, node);
673 /* An internal node with two children */
674 inode = (struct tnode *) node;
676 if (inode->bits == 1) {
677 put_child(t, tn, 2*i, inode->child[0]);
678 put_child(t, tn, 2*i+1, inode->child[1]);
684 /* An internal node with more than two children */
686 /* We will replace this node 'inode' with two new
687 * ones, 'left' and 'right', each with half of the
688 * original children. The two new nodes will have
689 * a position one bit further down the key and this
690 * means that the "significant" part of their keys
691 * (see the discussion near the top of this file)
692 * will differ by one bit, which will be "0" in
693 * left's key and "1" in right's key. Since we are
694 * moving the key position by one step, the bit that
695 * we are moving away from - the bit at position
696 * (inode->pos) - is the one that will differ between
697 * left and right. So... we synthesize that bit in the
699 * The mask 'm' below will be a single "one" bit at
700 * the position (inode->pos)
703 /* Use the old key, but set the new significant
707 left = (struct tnode *) tnode_get_child(tn, 2*i);
708 put_child(t, tn, 2*i, NULL);
712 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
713 put_child(t, tn, 2*i+1, NULL);
717 size = tnode_child_length(left);
718 for (j = 0; j < size; j++) {
719 put_child(t, left, j, inode->child[j]);
720 put_child(t, right, j, inode->child[j + size]);
722 put_child(t, tn, 2*i, resize(t, left));
723 put_child(t, tn, 2*i+1, resize(t, right));
727 tnode_free(oldtnode);
731 int size = tnode_child_length(tn);
734 for (j = 0; j < size; j++)
736 tnode_free((struct tnode *)tn->child[j]);
740 return ERR_PTR(-ENOMEM);
/*
 * Build a replacement for tn with one index bit fewer (half the slots)
 * and return it; the old tnode is handed to tnode_free().
 *
 * Pass 1 walks the old children in pairs and, where both members of a
 * pair are present, preallocates a 1-bit tnode and parks it in slot i/2
 * of the new node, so that an allocation failure can be unwound before
 * anything is moved.
 *
 * Pass 2 moves the children across: when at least one member of a pair
 * is empty the other one (possibly NULL) goes straight into slot i/2;
 * otherwise both are attached to the preallocated binary node, which is
 * passed through resize() before being stored.
 *
 * ERR_PTR(-ENOMEM) is returned when the new node cannot be allocated and
 * from the unwind path, which releases the tnodes parked in the new node.
 */
744 static struct tnode *halve(struct trie *t, struct tnode *tn)
746 struct tnode *oldtnode = tn;
747 struct node *left, *right;
749 int olen = tnode_child_length(tn);
751 pr_debug("In halve\n");
753 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
756 return ERR_PTR(-ENOMEM);
759 * Preallocate and store tnodes before the actual work so we
760 * don't get into an inconsistent state if memory allocation
761 * fails. In case of failure we return the oldnode and halve
762 * of tnode is ignored.
765 for (i = 0; i < olen; i += 2) {
766 left = tnode_get_child(oldtnode, i);
767 right = tnode_get_child(oldtnode, i+1);
769 /* Two nonempty children */
773 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
778 put_child(t, tn, i/2, (struct node *)newn);
783 for (i = 0; i < olen; i += 2) {
784 struct tnode *newBinNode;
786 left = tnode_get_child(oldtnode, i);
787 right = tnode_get_child(oldtnode, i+1);
789 /* At least one of the children is empty */
791 if (right == NULL) /* Both are empty */
793 put_child(t, tn, i/2, right);
798 put_child(t, tn, i/2, left);
802 /* Two nonempty children */
803 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
804 put_child(t, tn, i/2, NULL);
805 put_child(t, newBinNode, 0, left);
806 put_child(t, newBinNode, 1, right);
807 put_child(t, tn, i/2, resize(t, newBinNode));
809 tnode_free(oldtnode);
813 int size = tnode_child_length(tn);
816 for (j = 0; j < size; j++)
818 tnode_free((struct tnode *)tn->child[j]);
822 return ERR_PTR(-ENOMEM);
/*
 * Reset a trie to the empty state: the root pointer is published as NULL
 * and, when statistics are compiled in, the usage counters are zeroed.
 * Initialisation of any other members is not part of this listing.
 */
826 static void trie_init(struct trie *t)
832 rcu_assign_pointer(t->trie, NULL);
834 #ifdef CONFIG_IP_FIB_TRIE_STATS
835 memset(&t->stats, 0, sizeof(struct trie_use_stats));
839 /* readside must use rcu_read_lock; currently dump routines
840 via get_fa_head and dump */
/*
 * Walk the leaf_info hlist at head with the RCU iterator and look for the
 * entry whose plen equals the requested prefix length.  The return
 * statements are not part of this listing.
 */
842 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
844 struct hlist_node *node;
845 struct leaf_info *li;
847 hlist_for_each_entry_rcu(li, node, head, hlist)
848 if (li->plen == plen)
/*
 * Look up the leaf_info for prefix length plen on leaf l.  What is
 * returned is not part of this listing; callers pass the result to
 * fib_find_alias(), so presumably it is the alias list head (falh) of the
 * matching entry, or NULL when there is none - TODO confirm.
 */
854 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
856 struct leaf_info *li = find_leaf_info(&l->list, plen);
/*
 * Link a new leaf_info into the list at head.  An empty list simply gets
 * the entry as its head.  Otherwise the list is scanned for the first
 * entry with a smaller plen than the new one, and the new entry is linked
 * either after the last entry passed or before the entry found; the
 * statements that choose between the two are not part of this listing.
 * All links are made with the RCU-safe hlist helpers.
 */
864 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
866 struct leaf_info *li = NULL, *last = NULL;
867 struct hlist_node *node;
869 if (hlist_empty(head)) {
870 hlist_add_head_rcu(&new->hlist, head);
872 hlist_for_each_entry(li, node, head, hlist) {
873 if (new->plen > li->plen)
879 hlist_add_after_rcu(&last->hlist, &new->hlist);
881 hlist_add_before_rcu(&new->hlist, &li->hlist);
885 /* rcu_read_lock needs to be held by caller from readside */
/*
 * Exact-match search for the leaf that carries key.  Starting at the
 * RCU-dereferenced root, each tnode is entered only if the bits skipped
 * since the previous level agree with key (tkey_sub_equals); the child
 * slot is then selected with the bits of key that the tnode indexes.  A
 * leaf reached this way is returned only if its full key equals key.  The
 * not-found return is not part of this listing.
 */
888 fib_find_node(struct trie *t, u32 key)
895 n = rcu_dereference(t->trie);
897 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
898 tn = (struct tnode *) n;
902 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
903 pos = tn->pos + tn->bits;
904 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
908 /* Case we have found a leaf. Compare prefixes */
910 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
911 return (struct leaf *)n;
/*
 * Resize every tnode on the path from tn up to the root and return the
 * node that ends up on top.  At each level the tnode is resized and the
 * result is stored back into its slot in the parent, passing along
 * whether the old occupant was full so the parent keeps correct counts.
 * The walk stops when a node without a parent is reached; that top node
 * is resized once more before being returned.  Where key is taken from is
 * not part of this listing.
 */
916 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
920 struct tnode *tp = NULL;
924 while (tn != NULL && NODE_PARENT(tn) != NULL) {
926 tp = NODE_PARENT(tn);
927 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
928 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
929 tn = (struct tnode *) resize (t, (struct tnode *)tn);
930 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
932 if (!NODE_PARENT(tn))
935 tn = NODE_PARENT(tn);
937 /* Handle last (top) tnode */
939 tn = (struct tnode*) resize(t, (struct tnode *)tn);
941 return (struct node*) tn;
/*
 * Make sure the trie has a leaf for key with a leaf_info for plen, and
 * return the alias list head (falh) of that leaf_info.
 *
 * The trie is descended as in fib_find_node().  Then:
 *   Case 1 - an existing leaf with the same key gets a new leaf_info.
 *   Case 2 - an empty child slot receives a newly created leaf.
 *   Case 3 - a leaf or tnode with a different key is pushed below a new
 *            1-bit tnode placed at the first mismatching bit, next to the
 *            new leaf; the new tnode is linked into the parent or becomes
 *            the root.
 * Finally the path is rebalanced and the resulting root is published.
 * How failures are reported through err is not part of this listing.
 *
 * NOTE(review): on the two visible failure paths the new leaf l is
 * released with tnode_free((struct tnode *) l).  The tnode RCU callback
 * reads tn->bits and uses the tnode layout, which does not look valid for
 * a struct leaf; free_leaf() exists for this purpose - confirm.
 */
944 /* only used from updater-side */
946 static struct list_head *
947 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
950 struct tnode *tp = NULL, *tn = NULL;
954 struct list_head *fa_head = NULL;
955 struct leaf_info *li;
961 /* If we point to NULL, stop. Either the tree is empty and we should
962 * just put a new leaf in if, or we have reached an empty child slot,
963 * and we should just put our new leaf in that.
964 * If we point to a T_TNODE, check if it matches our key. Note that
965 * a T_TNODE might be skipping any number of bits - its 'pos' need
966 * not be the parent's 'pos'+'bits'!
968 * If it does match the current key, get pos/bits from it, extract
969 * the index from our key, push the T_TNODE and walk the tree.
971 * If it doesn't, we have to replace it with a new T_TNODE.
973 * If we point to a T_LEAF, it might or might not have the same key
974 * as we do. If it does, just change the value, update the T_LEAF's
975 * value, and return it.
976 * If it doesn't, we need to replace it with a T_TNODE.
979 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
980 tn = (struct tnode *) n;
984 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
986 pos = tn->pos + tn->bits;
987 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
989 BUG_ON(n && NODE_PARENT(n) != tn);
995 * n ----> NULL, LEAF or TNODE
997 * tp is n's (parent) ----> NULL or TNODE
1000 BUG_ON(tp && IS_LEAF(tp));
1002 /* Case 1: n is a leaf. Compare prefixes */
1004 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1005 struct leaf *l = (struct leaf *) n;
1007 li = leaf_info_new(plen);
1014 fa_head = &li->falh;
1015 insert_leaf_info(&l->list, li);
1027 li = leaf_info_new(plen);
1030 tnode_free((struct tnode *) l);
1035 fa_head = &li->falh;
1036 insert_leaf_info(&l->list, li);
1038 if (t->trie && n == NULL) {
1039 /* Case 2: n is NULL, and will just insert a new leaf */
1041 NODE_SET_PARENT(l, tp);
1043 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1044 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1046 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1048 * Add a new tnode here
1049 * first tnode need some special handling
1053 pos = tp->pos+tp->bits;
1058 newpos = tkey_mismatch(key, pos, n->key);
1059 tn = tnode_new(n->key, newpos, 1);
1062 tn = tnode_new(key, newpos, 1); /* First tnode */
1067 tnode_free((struct tnode *) l);
1072 NODE_SET_PARENT(tn, tp);
1074 missbit = tkey_extract_bits(key, newpos, 1);
1075 put_child(t, tn, missbit, (struct node *)l);
1076 put_child(t, tn, 1-missbit, n);
1079 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1080 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1082 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1087 if (tp && tp->pos + tp->bits > 32)
1088 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1089 tp, tp->pos, tp->bits, key, plen);
1091 /* Rebalance the trie */
1093 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
/*
 * Add or replace a route described by a netlink request.
 *
 * The destination is copied from rta->rta_dst into key, a fib_info is
 * created for the request, and the existing leaf and alias list for
 * (key, plen) are looked up.  If an alias with the same tos and priority
 * already exists, the netlink flags decide: NLM_F_EXCL is tested first,
 * NLM_F_REPLACE swaps a newly allocated alias in for the old one with
 * list_replace_rcu() and releases the old alias and its fib_info, and
 * otherwise the list is scanned for an alias with identical type, scope
 * and fib_info before NLM_F_APPEND is considered.  Without a usable
 * existing alias NLM_F_CREATE is required; a new alias is allocated, the
 * trie node is created through fib_insert_node() when there is no alias
 * list yet, the alias is linked with list_add_tail_rcu(), and a
 * RTM_NEWROUTE notification is sent.  The error exits free the new alias
 * and release the fib_info.  Return values are not part of this listing.
 *
 * NOTE(review): in the NLM_F_REPLACE path new_fa comes straight from
 * kmem_cache_alloc() and the only visible write to new_fa->fa_state is
 * an and-assignment clearing FA_S_ACCESSED, right after state is loaded
 * from fa->fa_state.  That reads an uninitialised field; the intent looks
 * like new_fa->fa_state = state & ~FA_S_ACCESSED - confirm and fix.
 */
1101 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1102 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1104 struct trie *t = (struct trie *) tb->tb_data;
1105 struct fib_alias *fa, *new_fa;
1106 struct list_head *fa_head = NULL;
1107 struct fib_info *fi;
1108 int plen = r->rtm_dst_len;
1109 int type = r->rtm_type;
1110 u8 tos = r->rtm_tos;
1120 memcpy(&key, rta->rta_dst, 4);
1124 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1126 mask = ntohl(inet_make_mask(plen));
1133 fi = fib_create_info(r, rta, nlhdr, &err);
1138 l = fib_find_node(t, key);
1142 fa_head = get_fa_head(l, plen);
1143 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1146 /* Now fa, if non-NULL, points to the first fib alias
1147 * with the same keys [prefix,tos,priority], if such key already
1148 * exists or to the node before which we will insert new one.
1150 * If fa is NULL, we will need to allocate a new one and
1151 * insert to the head of f.
1153 * If f is NULL, no fib node matched the destination key
1154 * and we need to allocate a new one of those as well.
1157 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1158 struct fib_alias *fa_orig;
1161 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1164 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1165 struct fib_info *fi_drop;
1169 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1173 fi_drop = fa->fa_info;
1174 new_fa->fa_tos = fa->fa_tos;
1175 new_fa->fa_info = fi;
1176 new_fa->fa_type = type;
1177 new_fa->fa_scope = r->rtm_scope;
1178 state = fa->fa_state;
1179 new_fa->fa_state &= ~FA_S_ACCESSED;
1181 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1182 alias_free_mem_rcu(fa);
1184 fib_release_info(fi_drop);
1185 if (state & FA_S_ACCESSED)
1190 /* Error if we find a perfect match which
1191 * uses the same scope, type, and nexthop
1195 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1196 if (fa->fa_tos != tos)
1198 if (fa->fa_info->fib_priority != fi->fib_priority)
1200 if (fa->fa_type == type &&
1201 fa->fa_scope == r->rtm_scope &&
1202 fa->fa_info == fi) {
1206 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1210 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1214 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1218 new_fa->fa_info = fi;
1219 new_fa->fa_tos = tos;
1220 new_fa->fa_type = type;
1221 new_fa->fa_scope = r->rtm_scope;
1222 new_fa->fa_state = 0;
1224 * Insert new entry to the list.
1228 fa_head = fib_insert_node(t, &err, key, plen);
1231 goto out_free_new_fa;
1234 list_add_tail_rcu(&new_fa->fa_list,
1235 (fa ? &fa->fa_list : fa_head));
1238 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1243 kmem_cache_free(fn_alias_kmem, new_fa);
1245 fib_release_info(fi);
1251 /* should be called with rcu_read_lock */
/*
 * Try every prefix length stored on leaf l against the lookup key.  For
 * each leaf_info the key is masked to that prefix length and compared
 * with the leaf key; on a match fib_semantic_match() is run over the
 * alias list.  A result <= 0 from the semantic match ends the search and
 * counts as semantic_match_passed; any other result counts as
 * semantic_match_miss and the next entry is tried.  The assignments to i
 * and *plen and the return statements are not part of this listing.
 */
1252 static inline int check_leaf(struct trie *t, struct leaf *l,
1253 t_key key, int *plen, const struct flowi *flp,
1254 struct fib_result *res)
1258 struct leaf_info *li;
1259 struct hlist_head *hhead = &l->list;
1260 struct hlist_node *node;
1262 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1264 mask = ntohl(inet_make_mask(i));
1265 if (l->key != (key & mask))
1268 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1270 #ifdef CONFIG_IP_FIB_TRIE_STATS
1271 t->stats.semantic_match_passed++;
1275 #ifdef CONFIG_IP_FIB_TRIE_STATS
1276 t->stats.semantic_match_miss++;
/*
 * Longest-prefix-match lookup of flp->fl4_dst (converted to host order)
 * in the trie of table tb.
 *
 * The search starts at the RCU-dereferenced root with
 * current_prefix_length = KEYLENGTH.  A leaf is handed to check_leaf().
 * Inside a tnode the child index is taken from the key masked to
 * current_prefix_length.  An empty slot (counted as null_node_hit) leads
 * to backtracking.  Before descending into a child tnode its skipped bits
 * are compared with the key; a mismatch either rules the subtree out or
 * lowers current_prefix_length to the mismatch position.
 *
 * Backtracking clears the lowest set bit still present in cindex,
 * shortening current_prefix_length to match; once every bit indexed by
 * the current tnode has been chopped off the search moves to the parent,
 * and it ends when the root has no parent.  Each backtrack step is
 * counted when statistics are enabled.  Locking and return statements
 * are not part of this listing.
 *
 * NOTE(review): the mismatch scan tests pref_mismatch against
 * 1<<(KEYLENGTH-1).  The left operand is a signed int, so shifting into
 * bit 31 is not well defined in C; an unsigned one would be safer.
 */
1283 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1285 struct trie *t = (struct trie *) tb->tb_data;
1290 t_key key = ntohl(flp->fl4_dst);
1293 int current_prefix_length = KEYLENGTH;
1295 t_key node_prefix, key_prefix, pref_mismatch;
1300 n = rcu_dereference(t->trie);
1304 #ifdef CONFIG_IP_FIB_TRIE_STATS
1310 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1314 pn = (struct tnode *) n;
1322 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1324 n = tnode_get_child(pn, cindex);
1327 #ifdef CONFIG_IP_FIB_TRIE_STATS
1328 t->stats.null_node_hit++;
1334 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1342 cn = (struct tnode *)n;
1345 * It's a tnode, and we can do some extra checks here if we
1346 * like, to avoid descending into a dead-end branch.
1347 * This tnode is in the parent's child array at index
1348 * key[p_pos..p_pos+p_bits] but potentially with some bits
1349 * chopped off, so in reality the index may be just a
1350 * subprefix, padded with zero at the end.
1351 * We can also take a look at any skipped bits in this
1352 * tnode - everything up to p_pos is supposed to be ok,
1353 * and the non-chopped bits of the index (se previous
1354 * paragraph) are also guaranteed ok, but the rest is
1355 * considered unknown.
1357 * The skipped bits are key[pos+bits..cn->pos].
1360 /* If current_prefix_length < pos+bits, we are already doing
1361 * actual prefix matching, which means everything from
1362 * pos+(bits-chopped_off) onward must be zero along some
1363 * branch of this subtree - otherwise there is *no* valid
1364 * prefix present. Here we can only check the skipped
1365 * bits. Remember, since we have already indexed into the
1366 * parent's child array, we know that the bits we chopped of
1370 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1372 if (current_prefix_length < pos+bits) {
1373 if (tkey_extract_bits(cn->key, current_prefix_length,
1374 cn->pos - current_prefix_length) != 0 ||
1380 * If chopped_off=0, the index is fully validated and we
1381 * only need to look at the skipped bits for this, the new,
1382 * tnode. What we actually want to do is to find out if
1383 * these skipped bits match our key perfectly, or if we will
1384 * have to count on finding a matching prefix further down,
1385 * because if we do, we would like to have some way of
1386 * verifying the existence of such a prefix at this point.
1389 /* The only thing we can do at this point is to verify that
1390 * any such matching prefix can indeed be a prefix to our
1391 * key, and if the bits in the node we are inspecting that
1392 * do not match our key are not ZERO, this cannot be true.
1393 * Thus, find out where there is a mismatch (before cn->pos)
1394 * and verify that all the mismatching bits are zero in the
1398 /* Note: We aren't very concerned about the piece of the key
1399 * that precede pn->pos+pn->bits, since these have already been
1400 * checked. The bits after cn->pos aren't checked since these are
1401 * by definition "unknown" at this point. Thus, what we want to
1402 * see is if we are about to enter the "prefix matching" state,
1403 * and in that case verify that the skipped bits that will prevail
1404 * throughout this subtree are zero, as they have to be if we are
1405 * to find a matching prefix.
1408 node_prefix = MASK_PFX(cn->key, cn->pos);
1409 key_prefix = MASK_PFX(key, cn->pos);
1410 pref_mismatch = key_prefix^node_prefix;
1413 /* In short: If skipped bits in this node do not match the search
1414 * key, enter the "prefix matching" state.directly.
1416 if (pref_mismatch) {
1417 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1419 pref_mismatch = pref_mismatch <<1;
1421 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1423 if (key_prefix != 0)
1426 if (current_prefix_length >= cn->pos)
1427 current_prefix_length = mp;
1430 pn = (struct tnode *)n; /* Descend */
1437 /* As zero don't change the child key (cindex) */
1438 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1441 /* Decrease current_... with bits chopped off */
1442 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1443 current_prefix_length = pn->pos + pn->bits - chopped_off;
1446 * Either we do the actual chop off according or if we have
1447 * chopped off all bits in this tnode walk up to our parent.
1450 if (chopped_off <= pn->bits) {
1451 cindex &= ~(1 << (chopped_off-1));
1453 if (NODE_PARENT(pn) == NULL)
1456 /* Get Child's index */
1457 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1458 pn = NODE_PARENT(pn);
1461 #ifdef CONFIG_IP_FIB_TRIE_STATS
1462 t->stats.backtrack++;
/*
 * trie_leaf_remove - unlink the leaf that stores @key from trie @t.
 *
 * Walks down from t->trie; at each tnode the child is picked with the key
 * bits selected by tn->pos and tn->bits. The walk ends at a leaf or at NULL,
 * and the result is only accepted when its key equals @key. For a match the
 * node is released with tnode_free(), the parent slot is cleared through
 * put_child() and the root pointer is republished with rcu_assign_pointer(),
 * either as the outcome of trie_rebalance() on the parent or as NULL.
 *
 * NOTE(review): the return statements and the test that chooses between the
 * rebalance case and the NULL-root case are not visible in this view;
 * presumably the NULL case is taken when the removed leaf had no parent -
 * confirm.
 */
1474 /* only called from updater side */
1475 static int trie_leaf_remove(struct trie *t, t_key key)
1478 struct tnode *tp = NULL;
1479 struct node *n = t->trie;
1482 pr_debug("entering trie_leaf_remove(%p)\n", n);
1484 /* Note that in the case skipped bits, those bits are *not* checked!
1485 * When we finish this, we will have NULL or a T_LEAF, and the
1486 * T_LEAF may or may not match our key.
1489 while (n != NULL && IS_TNODE(n)) {
1490 struct tnode *tn = (struct tnode *) n;
1492 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1494 BUG_ON(n && NODE_PARENT(n) != tn);
1496 l = (struct leaf *) n;
1498 if (!n || !tkey_equals(l->key, key))
1503 * Remove the leaf and rebalance the tree
1510 tp = NODE_PARENT(n);
1511 tnode_free((struct tnode *) n);
1514 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1515 put_child(t, (struct tnode *)tp, cindex, NULL);
1516 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1518 rcu_assign_pointer(t->trie, NULL);
/*
 * fn_trie_delete - remove one route alias from the table @tb.
 *
 * The destination key is copied from rta->rta_dst and the prefix length and
 * TOS come from @r. The leaf for the key is looked up with fib_find_node(),
 * the alias list for the prefix length with get_fa_head(), and the first
 * alias for the TOS with fib_find_alias(). The list is then scanned for an
 * alias that agrees with the request on type, scope, protocol and nexthop
 * (fib_nh_match() == 0); a zero rtm_type, a zero rtm_protocol and a scope
 * of RT_SCOPE_NOWHERE act as wildcards.
 *
 * The alias is announced with RTM_DELROUTE, unlinked with list_del_rcu()
 * and released; an emptied leaf_info is unhashed and an emptied leaf is
 * taken out of the trie with trie_leaf_remove().
 *
 * NOTE(review): the return type, the error returns, the loop exits and the
 * statement that records fa_to_delete are not visible in this view.
 */
1525 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1526 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1528 struct trie *t = (struct trie *) tb->tb_data;
1530 int plen = r->rtm_dst_len;
1531 u8 tos = r->rtm_tos;
1532 struct fib_alias *fa, *fa_to_delete;
1533 struct list_head *fa_head;
1535 struct leaf_info *li;
/* The key is taken as 4 raw bytes from the destination attribute. */
1543 memcpy(&key, rta->rta_dst, 4);
1546 mask = ntohl(inet_make_mask(plen));
1552 l = fib_find_node(t, key);
1557 fa_head = get_fa_head(l, plen);
1558 fa = fib_find_alias(fa_head, tos, 0);
1563 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1565 fa_to_delete = NULL;
/*
 * Use the entry in front of the first alias with this TOS as the list
 * anchor, so the iteration below begins at that alias.
 */
1566 fa_head = fa->fa_list.prev;
1568 list_for_each_entry(fa, fa_head, fa_list) {
1569 struct fib_info *fi = fa->fa_info;
1571 if (fa->fa_tos != tos)
1574 if ((!r->rtm_type ||
1575 fa->fa_type == r->rtm_type) &&
1576 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1577 fa->fa_scope == r->rtm_scope) &&
1578 (!r->rtm_protocol ||
1579 fi->fib_protocol == r->rtm_protocol) &&
1580 fib_nh_match(r, nlhdr, rta, fi) == 0) {
/* Tell netlink listeners before the alias is unlinked. */
1590 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1592 l = fib_find_node(t, key);
1593 li = find_leaf_info(&l->list, plen);
1595 list_del_rcu(&fa->fa_list);
/* Unhash the per-prefix leaf_info once its alias list is empty. */
1597 if (list_empty(fa_head)) {
1598 hlist_del_rcu(&li->hlist);
/* Remove the leaf itself once it holds no leaf_info any more. */
1602 if (hlist_empty(&l->list))
1603 trie_leaf_remove(t, key);
1605 if (fa->fa_state & FA_S_ACCESSED)
1608 fib_release_info(fa->fa_info);
1609 alias_free_mem_rcu(fa);
/*
 * trie_flush_list - drop every dead alias from the alias list @head.
 *
 * An alias is dropped when it has a fib_info whose fib_flags carry
 * RTNH_F_DEAD: it is unlinked with list_del_rcu(), its fib_info reference
 * is released and the alias is handed to alias_free_mem_rcu(). The _safe
 * iterator is used because entries are unlinked during the walk.
 *
 * NOTE(review): the counter update and the return statement are not visible
 * in this view; presumably the number of dropped aliases is returned.
 */
1613 static int trie_flush_list(struct trie *t, struct list_head *head)
1615 struct fib_alias *fa, *fa_node;
1618 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1619 struct fib_info *fi = fa->fa_info;
1621 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1622 list_del_rcu(&fa->fa_list);
1623 fib_release_info(fa->fa_info);
1624 alias_free_mem_rcu(fa);
/*
 * trie_flush_leaf - flush the alias lists of every leaf_info of leaf @l.
 *
 * Each leaf_info on l->list has its alias list passed to trie_flush_list()
 * and the results are summed in found. A leaf_info whose alias list became
 * empty is unhashed with hlist_del_rcu(); the _safe iterator allows that
 * during the walk.
 *
 * NOTE(review): the declaration of found, the release of the unhashed
 * leaf_info and the return statement are not visible in this view.
 */
1631 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1634 struct hlist_head *lih = &l->list;
1635 struct hlist_node *node, *tmp;
1636 struct leaf_info *li = NULL;
1638 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1639 found += trie_flush_list(t, &li->falh);
1641 if (list_empty(&li->falh)) {
1642 hlist_del_rcu(&li->hlist);
/*
 * nextleaf - return the leaf that follows @thisleaf in trie @t.
 *
 * The root is read with rcu_dereference(). A trie consisting of a single
 * leaf yields that leaf. Otherwise the scan works on a parent tnode p:
 * either the root tnode or the parent of @thisleaf, in which case scanning
 * resumes at the slot after the one @thisleaf occupies. Children are
 * visited in index order; a tnode child is descended into until a leaf is
 * found. When a tnode is exhausted the scan moves up to its parent, and
 * NULL is returned once the root has been exhausted.
 *
 * NOTE(review): the tests that select between the start cases and the loop
 * framing are not visible in this view.
 */
1649 /* rcu_read_lock needs to be held by caller from readside */
1651 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1653 struct node *c = (struct node *) thisleaf;
1656 struct node *trie = rcu_dereference(t->trie);
1662 if (IS_LEAF(trie)) /* trie w. just a leaf */
1663 return (struct leaf *) trie;
1665 p = (struct tnode*) trie; /* Start */
1667 p = (struct tnode *) NODE_PARENT(c);
1672 /* Find the next child of the parent */
1674 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1678 last = 1 << p->bits;
1679 for (idx = pos; idx < last ; idx++) {
1680 c = rcu_dereference(p->child[idx]);
1685 /* Descend if tnode */
1686 while (IS_TNODE(c)) {
1687 p = (struct tnode *) c;
1690 /* Rightmost non-NULL branch */
/*
 * NOTE(review): in the loop below p->child[idx] is read before idx is
 * compared with 1<<p->bits, so when every remaining slot is NULL the final
 * pass presumably reads one entry past the child array - confirm and test
 * the bound first.
 */
1691 if (p && IS_TNODE(p))
1692 while (!(c = rcu_dereference(p->child[idx]))
1693 && idx < (1<<p->bits)) idx++;
1695 /* Done with this tnode? */
1696 if (idx >= (1 << p->bits) || !c)
1699 return (struct leaf *) c;
1702 /* No more children go up one step */
1703 c = (struct node *) p;
1704 p = (struct tnode *) NODE_PARENT(p);
1706 return NULL; /* Ready. Root of trie */
/*
 * fn_trie_flush - flush dead routes from every leaf of the table @tb.
 *
 * Leaves are visited with nextleaf() and each one is flushed with
 * trie_flush_leaf(); the results are summed in found and reported through
 * pr_debug(). A leaf tracked in ll is removed with trie_leaf_remove() when
 * its leaf_info list is empty, both inside the loop and once more after it.
 *
 * NOTE(review): the assignment to ll is not visible in this view;
 * presumably ll trails l by one step so that a leaf is only removed after
 * nextleaf() has moved past it - confirm. The return statement is not
 * visible either.
 */
1709 static int fn_trie_flush(struct fib_table *tb)
1711 struct trie *t = (struct trie *) tb->tb_data;
1712 struct leaf *ll = NULL, *l = NULL;
1718 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1719 found += trie_flush_leaf(t, l);
1721 if (ll && hlist_empty(&ll->list))
1722 trie_leaf_remove(t, ll->key);
1727 if (ll && hlist_empty(&ll->list))
1728 trie_leaf_remove(t, ll->key);
1730 pr_debug("trie_flush found=%d\n", found);
/*
 * Index of the default route chosen by the last run of
 * fn_trie_select_default(); -1 when no candidate was available.
 */
1734 static int trie_last_dflt = -1;
/*
 * fn_trie_select_default - choose among the default routes of table @tb.
 *
 * Looks up the leaf for key 0 and its alias list for prefix length 0, then
 * walks the aliases under RCU. Only aliases with the scope of @res and type
 * RTN_UNICAST are considered, and only when their first nexthop has a
 * gateway with scope RT_SCOPE_LINK; such aliases are marked
 * FA_S_ACCESSED. The priority of each candidate is compared with that of
 * res->fi. fib_detect_death() is consulted for candidates; when it returns
 * zero the reference held in res->fi is dropped, a reference on the
 * candidate is taken and trie_last_dflt records its order. If nothing was
 * chosen and last_idx is non-negative, last_resort is installed in res->fi
 * instead.
 *
 * NOTE(review): the return type, the loop exits, the updates of order and
 * fi, and the jump targets are not visible in this view.
 */
1737 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1739 struct trie *t = (struct trie *) tb->tb_data;
1740 int order, last_idx;
1741 struct fib_info *fi = NULL;
1742 struct fib_info *last_resort;
1743 struct fib_alias *fa = NULL;
1744 struct list_head *fa_head;
/* Default routes live in the leaf with key 0, prefix length 0. */
1753 l = fib_find_node(t, 0);
1757 fa_head = get_fa_head(l, 0);
1761 if (list_empty(fa_head))
1764 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1765 struct fib_info *next_fi = fa->fa_info;
1767 if (fa->fa_scope != res->scope ||
1768 fa->fa_type != RTN_UNICAST)
1771 if (next_fi->fib_priority > res->fi->fib_priority)
1773 if (!next_fi->fib_nh[0].nh_gw ||
1774 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1776 fa->fa_state |= FA_S_ACCESSED;
1779 if (next_fi != res->fi)
1781 } else if (!fib_detect_death(fi, order, &last_resort,
1782 &last_idx, &trie_last_dflt)) {
1784 fib_info_put(res->fi);
1786 atomic_inc(&fi->fib_clntref);
1787 trie_last_dflt = order;
/* No usable candidate was seen: forget the previous choice. */
1793 if (order <= 0 || fi == NULL) {
1794 trie_last_dflt = -1;
1798 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1800 fib_info_put(res->fi);
1802 atomic_inc(&fi->fib_clntref);
1803 trie_last_dflt = order;
/* Fall back to the last resort recorded by fib_detect_death(). */
1806 if (last_idx >= 0) {
1808 fib_info_put(res->fi);
1809 res->fi = last_resort;
1811 atomic_inc(&last_resort->fib_clntref);
1813 trie_last_dflt = last_idx;
/*
 * fn_trie_dump_fa - dump the aliases on list @fah for @key/@plen into @skb.
 *
 * The key is converted with htonl() for the dump. The list is walked under
 * RCU and each alias is emitted through fib_dump_info(); a negative result
 * from fib_dump_info() is treated as failure. Aliases with missing data
 * are reported with printk().
 *
 * NOTE(review): the use of cb->args to resume a partial dump and the return
 * statements are not visible in this view.
 */
1818 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1819 struct sk_buff *skb, struct netlink_callback *cb)
1822 struct fib_alias *fa;
1824 u32 xkey = htonl(key);
1829 /* rcu_read_lock is held by caller */
1831 list_for_each_entry_rcu(fa, fah, fa_list) {
/*
 * NOTE(review): fa->fa_info is dereferenced in the first test below before
 * the second test compares it with NULL, so the NULL test cannot protect
 * the dereference - confirm and swap the order of the two checks.
 */
1836 if (fa->fa_info->fib_nh == NULL) {
1837 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1841 if (fa->fa_info == NULL) {
1842 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1847 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1856 fa->fa_info, 0) < 0) {
/*
 * fn_trie_dump_plen - dump every route with prefix length @plen.
 *
 * Walks all leaves of @t with nextleaf(). For each leaf the alias list for
 * @plen is fetched with get_fa_head(); empty lists are skipped and the
 * others are passed to fn_trie_dump_fa(), whose negative result is treated
 * as failure. cb->args from index 3 upward is cleared with memset() as
 * part of the walk.
 *
 * NOTE(review): the conditions guarding the memset and the skip logic that
 * uses the loop counter h are not visible in this view.
 */
1866 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1867 struct netlink_callback *cb)
1870 struct list_head *fa_head;
1871 struct leaf *l = NULL;
1875 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1879 memset(&cb->args[3], 0,
1880 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1882 fa_head = get_fa_head(l, plen);
1887 if (list_empty(fa_head))
1890 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
/*
 * fn_trie_dump - dump the whole table @tb into @skb.
 *
 * Iterates m from 0 to 32 and dumps prefix length 32-m on each pass, so the
 * longest prefixes are emitted first. cb->args from index 2 upward is
 * cleared with memset() as part of the loop, and a negative result from
 * fn_trie_dump_plen() is treated as failure.
 *
 * NOTE(review): the resume logic based on cb->args and the return
 * statements are not visible in this view.
 */
1899 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1902 struct trie *t = (struct trie *) tb->tb_data;
1907 for (m = 0; m <= 32; m++) {
1911 memset(&cb->args[2], 0,
1912 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1914 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
/*
 * fib_hash_init - allocate and set up the FIB table with identifier @id.
 *
 * Creates the ip_fib_alias slab cache on first use, allocates a fib_table
 * with room for a struct trie behind it, installs the fn_trie_* operations
 * and zeroes the trie stored in tb->tb_data. RT_TABLE_LOCAL and
 * RT_TABLE_MAIN are handled specially, and the version banner is printed
 * when the local table is created.
 *
 * Two signatures are provided: without __init when
 * CONFIG_IP_MULTIPLE_TABLES is defined, and presumably with __init
 * otherwise (the preprocessor lines between them are not visible here).
 *
 * NOTE(review): the allocation failure handling, the statements executed
 * for the local and main tables and the return statement are not visible
 * in this view.
 */
1927 /* Fix more generic FIB names for init later */
1929 #ifdef CONFIG_IP_MULTIPLE_TABLES
1930 struct fib_table * fib_hash_init(int id)
1932 struct fib_table * __init fib_hash_init(int id)
1935 struct fib_table *tb;
1938 if (fn_alias_kmem == NULL)
1939 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1940 sizeof(struct fib_alias),
1941 0, SLAB_HWCACHE_ALIGN,
/* One allocation holds the table header and the trie behind it. */
1944 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1950 tb->tb_lookup = fn_trie_lookup;
1951 tb->tb_insert = fn_trie_insert;
1952 tb->tb_delete = fn_trie_delete;
1953 tb->tb_flush = fn_trie_flush;
1954 tb->tb_select_default = fn_trie_select_default;
1955 tb->tb_dump = fn_trie_dump;
1956 memset(tb->tb_data, 0, sizeof(struct trie));
1958 t = (struct trie *) tb->tb_data;
1962 if (id == RT_TABLE_LOCAL)
1964 else if (id == RT_TABLE_MAIN)
1967 if (id == RT_TABLE_LOCAL)
1968 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1973 #ifdef CONFIG_PROC_FS
/*
 * State of a walk over one trie, used by the /proc seq_file handlers.
 * Only the tnode member is visible in this view; the members trie, index
 * and depth are used by the iterator functions in this file.
 */
1974 /* Depth first Trie walk iterator */
1975 struct fib_trie_iter {
1976 struct tnode *tnode;
/*
 * fib_trie_get_next - advance the depth first walk described by @iter.
 *
 * Scans the children of iter->tnode starting at iter->index. When a child
 * is found, iter->index is set to the slot after it, and a tnode child
 * becomes the new iter->tnode so the walk continues one level down. When
 * all children of the current tnode have been visited the walk pops to the
 * parent and resumes at the slot after the one the exhausted tnode
 * occupies there.
 *
 * NOTE(review): the depth bookkeeping, the NULL-child handling, the loop
 * back after the pop and the return statements are not visible in this
 * view.
 */
1982 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1984 struct tnode *tn = iter->tnode;
1985 unsigned cindex = iter->index;
1988 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1989 iter->tnode, iter->index, iter->depth);
1991 while (cindex < (1<<tn->bits)) {
1992 struct node *n = tnode_get_child(tn, cindex);
1997 iter->index = cindex + 1;
1999 /* push down one level */
2000 iter->tnode = (struct tnode *) n;
2010 /* Current node exhausted, pop back up */
2011 p = NODE_PARENT(tn);
2013 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/*
 * fib_trie_get_first - position @iter at the root of a trie.
 *
 * Reads the root with rcu_dereference(); when the root is a tnode it
 * becomes iter->tnode.
 *
 * NOTE(review): the second parameter line, the remaining iterator setup
 * and the return statements are not visible in this view.
 */
2023 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2026 struct node *n = rcu_dereference(t->trie);
2028 if (n && IS_TNODE(n)) {
2029 iter->tnode = (struct tnode *) n;
/*
 * trie_collect_stats - gather size and depth statistics of trie @t into @s.
 *
 * @s is zeroed first. Every node is then visited with the fib_trie_iter
 * walk. The iterator depth is added to s->totdepth and tracked in
 * s->maxdepth; for a tnode the bucket s->nodesizes[tn->bits] is
 * incremented and its 1<<tn->bits child slots are scanned.
 *
 * NOTE(review): the leaf/tnode test, the leaf, tnode and NULL-pointer
 * counters and the RCU locking are not visible in this view.
 */
2038 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2041 struct fib_trie_iter iter;
2043 memset(s, 0, sizeof(*s));
2046 for (n = fib_trie_get_first(&iter, t); n;
2047 n = fib_trie_get_next(&iter)) {
2050 s->totdepth += iter.depth;
2051 if (iter.depth > s->maxdepth)
2052 s->maxdepth = iter.depth;
2054 const struct tnode *tn = (const struct tnode *) n;
2058 s->nodesizes[tn->bits]++;
2059 for (i = 0; i < (1<<tn->bits); i++)
/*
 * trie_show_stats - print the statistics in @stat to @seq.
 *
 * Prints the average depth (totdepth scaled by 100 and divided by the leaf
 * count, shown with two decimals), the maximum depth, the leaf and tnode
 * counts, the non-zero entries of the nodesizes histogram, the pointer
 * total and a size estimate in kB. The estimate is built from sizeof
 * struct leaf, sizeof struct tnode and one struct node pointer per child
 * slot. With CONFIG_IP_FIB_TRIE_STATS the lookup counters are printed too.
 *
 * NOTE(review): the initial values of max and pointers and any guard
 * around the division by stat->leaves are not visible in this view.
 */
2068 * This outputs /proc/net/fib_triestats
2070 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2072 unsigned i, max, pointers, bytes, avdepth;
2075 avdepth = stat->totdepth*100 / stat->leaves;
2079 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2080 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2082 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2084 bytes = sizeof(struct leaf) * stat->leaves;
2085 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2086 bytes += sizeof(struct tnode) * stat->tnodes;
/*
 * NOTE(review): max is declared unsigned, so the test max >= 0 below is
 * always true; if every nodesizes slot is zero the index presumably wraps
 * around instead of stopping - confirm and test max > 0 or use a signed
 * index.
 */
2089 while (max >= 0 && stat->nodesizes[max] == 0)
2093 for (i = 1; i <= max; i++)
2094 if (stat->nodesizes[i] != 0) {
2095 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2096 pointers += (1<<i) * stat->nodesizes[i];
2098 seq_putc(seq, '\n');
2099 seq_printf(seq, "\tPointers: %d\n", pointers);
2101 bytes += sizeof(struct node *) * pointers;
2102 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2103 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
/*
 * NOTE(review): the counter section below reads t->stats, but t is neither
 * a parameter nor a local visible in this view - confirm this section
 * builds when CONFIG_IP_FIB_TRIE_STATS is enabled.
 */
2105 #ifdef CONFIG_IP_FIB_TRIE_STATS
2106 seq_printf(seq, "Counters:\n---------\n");
2107 seq_printf(seq,"gets = %d\n", t->stats.gets);
2108 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2109 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2110 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2111 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2112 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2114 memset(&(t->stats), 0, sizeof(t->stats));
2116 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/*
 * fib_triestat_seq_show - seq_file show handler for the trie statistics.
 *
 * Allocates a scratch trie_stat with kmalloc(GFP_KERNEL), prints the sizes
 * of struct leaf and struct tnode, then collects and prints the statistics
 * of trie_local followed by trie_main, reusing the same scratch buffer.
 *
 * NOTE(review): the allocation failure check, any NULL tests on the two
 * tries, the release of the scratch buffer and the return statement are
 * not visible in this view.
 */
2119 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2121 struct trie_stat *stat;
2123 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2127 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2128 sizeof(struct leaf), sizeof(struct tnode));
2131 seq_printf(seq, "Local:\n");
2132 trie_collect_stats(trie_local, stat);
2133 trie_show_stats(seq, stat);
2137 seq_printf(seq, "Main:\n");
2138 trie_collect_stats(trie_main, stat);
2139 trie_show_stats(seq, stat);
/*
 * fib_triestat_seq_open - open handler for the trie statistics file.
 *
 * Binds the file to fib_triestat_seq_show() through single_open() with no
 * private data and returns the result of single_open().
 */
2146 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2148 return single_open(file, fib_triestat_seq_show, NULL);
/*
 * File operations of the trie statistics file: opened through
 * fib_triestat_seq_open() and released with single_release(), matching the
 * single_open() call made on open.
 */
2151 static struct file_operations fib_triestat_fops = {
2152 .owner = THIS_MODULE,
2153 .open = fib_triestat_seq_open,
2155 .llseek = seq_lseek,
2156 .release = single_release,
/*
 * fib_trie_get_idx - position @iter on a node chosen by a running index.
 *
 * Walks trie_local and then trie_main with the fib_trie_iter helpers,
 * incrementing idx once per visited node.
 *
 * NOTE(review): the second parameter line, the comparison of idx with the
 * requested position and the return statements are not visible in this
 * view; presumably the node at the requested position is returned.
 */
2159 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2165 for (n = fib_trie_get_first(iter, trie_local);
2166 n; ++idx, n = fib_trie_get_next(iter)) {
2171 for (n = fib_trie_get_first(iter, trie_main);
2172 n; ++idx, n = fib_trie_get_next(iter)) {
/*
 * fib_trie_seq_start - seq_file start handler shared by the trie dumps.
 *
 * Returns either SEQ_START_TOKEN or the node found by fib_trie_get_idx()
 * for position *pos - 1, using the iterator kept in seq->private.
 *
 * NOTE(review): the condition that selects SEQ_START_TOKEN and any locking
 * are not visible in this view; presumably the token is returned for
 * position zero.
 */
2179 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2183 return SEQ_START_TOKEN;
2184 return fib_trie_get_idx(seq->private, *pos - 1);
/*
 * fib_trie_seq_next - seq_file next handler shared by the trie dumps.
 *
 * After the start token the walk begins at index 0 through
 * fib_trie_get_idx(). Otherwise the iterator is advanced with
 * fib_trie_get_next(); when the walk was on trie_local it continues with
 * the first node of trie_main.
 *
 * NOTE(review): the update of *pos, the test on the value returned by
 * fib_trie_get_next() and the final return are not visible in this view.
 */
2187 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2189 struct fib_trie_iter *iter = seq->private;
2193 if (v == SEQ_START_TOKEN)
2194 return fib_trie_get_idx(iter, 0);
2196 v = fib_trie_get_next(iter);
2201 /* continue scan in next trie */
2202 if (iter->trie == trie_local)
2203 return fib_trie_get_first(iter, trie_main);
/*
 * fib_trie_seq_stop - seq_file stop handler shared by the trie dumps.
 *
 * NOTE(review): the body is not visible in this view; presumably it ends
 * the read side section begun by the start handler - confirm.
 */
2208 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/*
 * seq_indent - write @n copies of the indentation string to @seq.
 *
 * Nothing is written when @n is zero or negative.
 */
2213 static void seq_indent(struct seq_file *seq, int n)
2215 while (n-- > 0) seq_puts(seq, " ");
/*
 * rtn_scope - return a printable name for route scope @s.
 *
 * The five named scopes map to constant strings. Any other value is
 * formatted as scope=<number> into a function-local static buffer.
 *
 * NOTE(review): because the fallback buffer is static, its content is
 * overwritten by the next call that takes the fallback path; concurrent
 * readers would presumably share it - confirm whether that matters for the
 * callers. The switch framing and the return of buf are not visible in
 * this view.
 */
2218 static inline const char *rtn_scope(enum rt_scope_t s)
2220 static char buf[32];
2223 case RT_SCOPE_UNIVERSE: return "universe";
2224 case RT_SCOPE_SITE: return "site";
2225 case RT_SCOPE_LINK: return "link";
2226 case RT_SCOPE_HOST: return "host";
2227 case RT_SCOPE_NOWHERE: return "nowhere";
2229 snprintf(buf, sizeof(buf), "scope=%d", s);
/*
 * Printable names of the route types, indexed by RTN_* value. The array
 * has __RTN_MAX slots; a slot without an initializer stays NULL, which
 * rtn_type() tests for before using an entry.
 */
2234 static const char *rtn_type_names[__RTN_MAX] = {
2235 [RTN_UNSPEC] = "UNSPEC",
2236 [RTN_UNICAST] = "UNICAST",
2237 [RTN_LOCAL] = "LOCAL",
2238 [RTN_BROADCAST] = "BROADCAST",
2239 [RTN_ANYCAST] = "ANYCAST",
2240 [RTN_MULTICAST] = "MULTICAST",
2241 [RTN_BLACKHOLE] = "BLACKHOLE",
2242 [RTN_UNREACHABLE] = "UNREACHABLE",
2243 [RTN_PROHIBIT] = "PROHIBIT",
2244 [RTN_THROW] = "THROW",
2246 [RTN_XRESOLVE] = "XRESOLVE",
/*
 * rtn_type - return a printable name for route type @t.
 *
 * Values below __RTN_MAX that have an entry in rtn_type_names[] return
 * that entry. Any other value is formatted as type <number> into a
 * function-local static buffer.
 *
 * NOTE(review): the static fallback buffer is overwritten by the next call
 * that takes the fallback path - confirm whether callers can run
 * concurrently. The return of buf is not visible in this view.
 */
2249 static inline const char *rtn_type(unsigned t)
2251 static char buf[32];
2253 if (t < __RTN_MAX && rtn_type_names[t])
2254 return rtn_type_names[t];
2255 snprintf(buf, sizeof(buf), "type %d", t);
/*
 * fib_trie_seq_show - seq_file show handler that prints one trie node.
 *
 * For a tnode the key is masked to tn->pos bits with MASK_PFX and printed
 * as a dotted quad followed by tn->pos, indented by the iterator depth. A
 * tnode without a parent is preceded by a header naming the local or the
 * main trie. For a leaf the key is printed as a dotted quad; then, for
 * every prefix length from 32 down to 0 that has a leaf_info, each alias
 * is printed with its prefix length, scope name and type name. The alias
 * list is walked with the RCU list iterator.
 *
 * NOTE(review): the start token handling, the tnode/leaf test, the
 * condition around the tos output and the return statement are not visible
 * in this view.
 */
2259 /* Pretty print the trie */
2260 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2262 const struct fib_trie_iter *iter = seq->private;
2265 if (v == SEQ_START_TOKEN)
2269 struct tnode *tn = (struct tnode *) n;
2270 t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
/* A node without a parent is the root: print which trie this is. */
2272 if (!NODE_PARENT(n)) {
2273 if (iter->trie == trie_local)
2274 seq_puts(seq, "<local>:\n");
2276 seq_puts(seq, "<main>:\n");
2278 seq_indent(seq, iter->depth-1);
2279 seq_printf(seq, " +-- %d.%d.%d.%d/%d\n",
2280 NIPQUAD(prf), tn->pos);
2283 struct leaf *l = (struct leaf *) n;
2285 u32 val = ntohl(l->key);
2287 seq_indent(seq, iter->depth);
2288 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
/* Longest prefix length first. */
2289 for (i = 32; i >= 0; i--) {
2290 struct leaf_info *li = find_leaf_info(&l->list, i);
2292 struct fib_alias *fa;
2293 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2294 seq_indent(seq, iter->depth+1);
2295 seq_printf(seq, " /%d %s %s", i,
2296 rtn_scope(fa->fa_scope),
2297 rtn_type(fa->fa_type));
2299 seq_printf(seq, "tos =%d\n",
2301 seq_putc(seq, '\n');
/*
 * seq_file operations of the trie dump: the shared start, next and stop
 * handlers combined with fib_trie_seq_show() as the formatter.
 */
2310 static struct seq_operations fib_trie_seq_ops = {
2311 .start = fib_trie_seq_start,
2312 .next = fib_trie_seq_next,
2313 .stop = fib_trie_seq_stop,
2314 .show = fib_trie_seq_show,
/*
 * fib_trie_seq_open - open handler for the trie dump file.
 *
 * Allocates a fib_trie_iter with kmalloc(GFP_KERNEL), opens the seq_file
 * with fib_trie_seq_ops, fetches the seq_file from file->private_data and
 * zeroes the iterator.
 *
 * NOTE(review): the failure handling, the statement that attaches the
 * iterator to the seq_file and the return statement are not visible in
 * this view.
 */
2317 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2319 struct seq_file *seq;
2321 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2326 rc = seq_open(file, &fib_trie_seq_ops);
2330 seq = file->private_data;
2332 memset(s, 0, sizeof(*s));
/*
 * File operations of the trie dump file: opened through
 * fib_trie_seq_open() and released with seq_release_private().
 */
2340 static struct file_operations fib_trie_fops = {
2341 .owner = THIS_MODULE,
2342 .open = fib_trie_seq_open,
2344 .llseek = seq_lseek,
2345 .release = seq_release_private,
/*
 * fib_flag_trans - build the RTF_* flag word shown for a route.
 *
 * Starts from a per-type table in which only indices 7 and 8 are set (to
 * RTF_REJECT), then adds RTF_GATEWAY when @fi is given and its first
 * nexthop has a gateway. A @mask of 0xFFFFFFFF is handled separately.
 *
 * NOTE(review): indices 7 and 8 are presumably RTN_UNREACHABLE and
 * RTN_PROHIBIT - confirm and use the named constants. @type indexes the
 * table without a range check in the visible lines - confirm callers only
 * pass values up to RTN_MAX. The flags added for a full mask and the
 * return statement are not visible in this view.
 */
2348 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2350 static unsigned type2flags[RTN_MAX + 1] = {
2351 [7] = RTF_REJECT, [8] = RTF_REJECT,
2353 unsigned flags = type2flags[type];
2355 if (fi && fi->fib_nh->nh_gw)
2356 flags |= RTF_GATEWAY;
2357 if (mask == 0xFFFFFFFF)
/*
 * fib_route_seq_show - seq_file show handler that prints route table rows.
 *
 * For the start token a header row is printed, padded to 127 columns. For
 * a leaf every prefix length from 32 down to 0 is probed with
 * find_leaf_info(); the mask is derived from li->plen and the prefix from
 * the leaf key. Each alias on the list is formatted into the buffer bf and
 * printed padded to 127 columns. Aliases of type RTN_BROADCAST or
 * RTN_MULTICAST are tested for separately. Two row formats exist: one
 * taking the device name (or an asterisk when there is no device), gateway
 * and advmss from the fib_info, and one that prints an asterisk and zero
 * values.
 *
 * NOTE(review): the test that skips non-leaf nodes, the action taken for
 * broadcast and multicast aliases, the test that chooses between the two
 * row formats and the return statements are not visible in this view.
 */
2364 * This outputs /proc/net/route.
2365 * The format of the file is not supposed to be changed
2366 * and needs to be same as fib_hash output to avoid breaking
2369 static int fib_route_seq_show(struct seq_file *seq, void *v)
2375 if (v == SEQ_START_TOKEN) {
2376 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2377 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2385 for (i=32; i>=0; i--) {
2386 struct leaf_info *li = find_leaf_info(&l->list, i);
2387 struct fib_alias *fa;
2393 mask = inet_make_mask(li->plen);
2394 prefix = htonl(l->key);
2396 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2397 const struct fib_info *fi = rcu_dereference(fa->fa_info);
2398 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2400 if (fa->fa_type == RTN_BROADCAST
2401 || fa->fa_type == RTN_MULTICAST)
2405 snprintf(bf, sizeof(bf),
2406 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2407 fi->fib_dev ? fi->fib_dev->name : "*",
2409 fi->fib_nh->nh_gw, flags, 0, 0,
2412 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2416 snprintf(bf, sizeof(bf),
2417 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2418 prefix, 0, flags, 0, 0, 0,
2421 seq_printf(seq, "%-127s\n", bf);
/*
 * seq_file operations of the route table dump: the shared trie start, next
 * and stop handlers combined with fib_route_seq_show() as the formatter.
 */
2428 static struct seq_operations fib_route_seq_ops = {
2429 .start = fib_trie_seq_start,
2430 .next = fib_trie_seq_next,
2431 .stop = fib_trie_seq_stop,
2432 .show = fib_route_seq_show,
/*
 * fib_route_seq_open - open handler for the route table file.
 *
 * Allocates a fib_trie_iter with kmalloc(GFP_KERNEL), opens the seq_file
 * with fib_route_seq_ops, fetches the seq_file from file->private_data and
 * zeroes the iterator.
 *
 * NOTE(review): the failure handling, the statement that attaches the
 * iterator to the seq_file and the return statement are not visible in
 * this view.
 */
2435 static int fib_route_seq_open(struct inode *inode, struct file *file)
2437 struct seq_file *seq;
2439 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2444 rc = seq_open(file, &fib_route_seq_ops);
2448 seq = file->private_data;
2450 memset(s, 0, sizeof(*s));
/*
 * File operations of the route table file: opened through
 * fib_route_seq_open() and released with seq_release_private().
 */
2458 static struct file_operations fib_route_fops = {
2459 .owner = THIS_MODULE,
2460 .open = fib_route_seq_open,
2462 .llseek = seq_lseek,
2463 .release = seq_release_private,
/*
 * fib_proc_init - register the proc entries fib_trie, fib_triestat and
 * route, each read-only for everyone (S_IRUGO).
 *
 * The entries are created in that order. The two proc_net_remove() calls
 * at the end undo fib_triestat and fib_trie.
 *
 * NOTE(review): the jumps taken on a failed creation, the labels in front
 * of the removals and the return values are not visible in this view;
 * presumably the removals form the error unwind path.
 */
2466 int __init fib_proc_init(void)
2468 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2471 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2474 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2480 proc_net_remove("fib_triestat");
2482 proc_net_remove("fib_trie");
/*
 * fib_proc_exit - remove the three proc entries created by
 * fib_proc_init().
 *
 * NOTE(review): this teardown routine carries the __init annotation -
 * confirm that it is only ever called during initialisation, otherwise the
 * annotation should presumably be dropped or replaced.
 */
2487 void __init fib_proc_exit(void)
2489 proc_net_remove("fib_trie");
2490 proc_net_remove("fib_triestat");
2491 proc_net_remove("route");
2494 #endif /* CONFIG_PROC_FS */