2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.325"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
85 static DEFINE_RWLOCK(fib_lock);
87 typedef unsigned int t_key;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
107 unsigned long _parent;
112 unsigned long _parent;
113 struct hlist_head list;
117 struct hlist_node hlist;
119 struct list_head falh;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
144 unsigned int totdepth;
145 unsigned int maxdepth;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
158 unsigned int revision;
161 static int trie_debug = 0;
163 static int tnode_full(struct tnode *tn, struct node *n);
164 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166 static int tnode_child_length(struct tnode *tn);
167 static struct node *resize(struct trie *t, struct tnode *tn);
168 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
170 static void tnode_free(struct tnode *tn);
171 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173 extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
176 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
179 static kmem_cache_t *fn_alias_kmem;
180 static struct trie *trie_local = NULL, *trie_main = NULL;
182 static void trie_bug(char *err)
184 printk("Trie Bug: %s\n", err);
188 static inline struct node *tnode_get_child(struct tnode *tn, int i)
190 if (i >= 1<<tn->bits)
191 trie_bug("tnode_get_child");
196 static inline int tnode_child_length(struct tnode *tn)
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
219 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
221 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
227 static inline int tkey_equals(t_key a, t_key b)
232 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
234 if (bits == 0 || offset >= KEYLENGTH)
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
240 static inline int tkey_mismatch(t_key a, int offset, t_key b)
247 while ((diff << i) >> (KEYLENGTH-1) == 0)
252 /* Candiate for fib_semantics */
254 static void fn_free_alias(struct fib_alias *fa)
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
265 Consider a node 'n' and its parent 'tp'.
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitaded by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
323 static void check_tnode(struct tnode *tn)
325 if (tn && tn->pos+tn->bits > 32) {
326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
330 static int halve_threshold = 25;
331 static int inflate_threshold = 50;
333 static struct leaf *leaf_new(void)
335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
337 NODE_INIT_PARENT(l, T_LEAF);
338 INIT_HLIST_HEAD(&l->list);
343 static struct leaf_info *leaf_info_new(int plen)
345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
348 INIT_LIST_HEAD(&li->falh);
353 static inline void free_leaf(struct leaf *l)
358 static inline void free_leaf_info(struct leaf_info *li)
363 static struct tnode *tnode_alloc(unsigned int size)
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
368 return (struct tnode *)
369 __get_free_pages(GFP_KERNEL, get_order(size));
373 static void __tnode_free(struct tnode *tn)
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
378 if (size <= PAGE_SIZE)
381 free_pages((unsigned long)tn, get_order(size));
384 static struct tnode* tnode_new(t_key key, int pos, int bits)
386 int nchildren = 1<<bits;
387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
388 struct tnode *tn = tnode_alloc(sz);
392 NODE_INIT_PARENT(tn, T_TNODE);
396 tn->full_children = 0;
397 tn->empty_children = 1<<bits;
401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
402 (unsigned int) (sizeof(struct node) * 1<<bits));
406 static void tnode_free(struct tnode *tn)
409 trie_bug("tnode_free\n");
412 free_leaf((struct leaf *)tn);
414 printk("FL %p \n", tn);
416 else if (IS_TNODE(tn)) {
419 printk("FT %p \n", tn);
422 trie_bug("tnode_free\n");
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6
431 static inline int tnode_full(struct tnode *tn, struct node *n)
433 if (n == NULL || IS_LEAF(n))
436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
439 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
441 tnode_put_child_reorg(tn, i, n, -1);
445 * Add a child at position i overwriting the old value.
446 * Update the value of full_children and empty_children.
449 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
454 if (i >= 1<<tn->bits) {
455 printk("bits=%d, i=%d\n", tn->bits, i);
456 trie_bug("tnode_put_child_reorg bits");
458 write_lock_bh(&fib_lock);
461 /* update emptyChildren */
462 if (n == NULL && chi != NULL)
463 tn->empty_children++;
464 else if (n != NULL && chi == NULL)
465 tn->empty_children--;
467 /* update fullChildren */
469 wasfull = tnode_full(tn, chi);
471 isfull = tnode_full(tn, n);
472 if (wasfull && !isfull)
475 else if (!wasfull && isfull)
478 NODE_SET_PARENT(n, tn);
481 write_unlock_bh(&fib_lock);
484 static struct node *resize(struct trie *t, struct tnode *tn)
493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494 tn, inflate_threshold, halve_threshold);
497 if (tn->empty_children == tnode_child_length(tn)) {
502 if (tn->empty_children == tnode_child_length(tn) - 1)
503 for (i = 0; i < tnode_child_length(tn); i++) {
505 write_lock_bh(&fib_lock);
506 if (tn->child[i] != NULL) {
508 /* compress one level */
509 struct node *n = tn->child[i];
511 NODE_INIT_PARENT(n, NODE_TYPE(n));
513 write_unlock_bh(&fib_lock);
517 write_unlock_bh(&fib_lock);
520 * Double as long as the resulting node has a number of
521 * nonempty nodes that are above the threshold.
525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
527 * Telecommunications, page 6:
528 * "A node is doubled if the ratio of non-empty children to all
529 * children in the *doubled* node is at least 'high'."
531 * 'high' in this instance is the variable 'inflate_threshold'. It
532 * is expressed as a percentage, so we multiply it with
533 * tnode_child_length() and instead of multiplying by 2 (since the
534 * child array will be doubled by inflate()) and multiplying
535 * the left-hand side by 100 (to handle the percentage thing) we
536 * multiply the left-hand side by 50.
538 * The left-hand side may look a bit weird: tnode_child_length(tn)
539 * - tn->empty_children is of course the number of non-null children
540 * in the current node. tn->full_children is the number of "full"
541 * children, that is non-null tnodes with a skip value of 0.
542 * All of those will be doubled in the resulting inflated tnode, so
543 * we just count them one extra time here.
545 * A clearer way to write this would be:
547 * to_be_doubled = tn->full_children;
548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
551 * new_child_length = tnode_child_length(tn) * 2;
553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
555 * if (new_fill_factor >= inflate_threshold)
557 * ...and so on, tho it would mess up the while () loop.
560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
565 * inflate_threshold * new_child_length
567 * expand not_to_be_doubled and to_be_doubled, and shorten:
568 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 * tn->full_children ) >= inflate_threshold * new_child_length
571 * expand new_child_length:
572 * 100 * (tnode_child_length(tn) - tn->empty_children +
573 * tn->full_children ) >=
574 * inflate_threshold * tnode_child_length(tn) * 2
577 * 50 * (tn->full_children + tnode_child_length(tn) -
578 * tn->empty_children ) >= inflate_threshold *
579 * tnode_child_length(tn)
586 while ((tn->full_children > 0 &&
587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
588 inflate_threshold * tnode_child_length(tn))) {
590 tn = inflate(t, tn, &err);
593 #ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
603 * Halve as long as the number of empty children in this
604 * node is above threshold.
608 while (tn->bits > 1 &&
609 100 * (tnode_child_length(tn) - tn->empty_children) <
610 halve_threshold * tnode_child_length(tn)) {
612 tn = halve(t, tn, &err);
615 #ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
623 /* Only one child remains */
625 if (tn->empty_children == tnode_child_length(tn) - 1)
626 for (i = 0; i < tnode_child_length(tn); i++) {
628 write_lock_bh(&fib_lock);
629 if (tn->child[i] != NULL) {
630 /* compress one level */
631 struct node *n = tn->child[i];
634 NODE_INIT_PARENT(n, NODE_TYPE(n));
636 write_unlock_bh(&fib_lock);
640 write_unlock_bh(&fib_lock);
643 return (struct node *) tn;
646 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
649 struct tnode *oldtnode = tn;
650 int olen = tnode_child_length(tn);
654 printk("In inflate\n");
656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
666 * fails. In case of failure we return the oldnode and inflate
667 * of tnode is ignored.
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
675 inode->pos == oldtnode->pos + oldtnode->bits &&
677 struct tnode *left, *right;
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
689 right = tnode_new(inode->key|m, inode->pos + 1,
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
703 int size = tnode_child_length(tn);
706 for(j = 0; j < size; j++)
708 tnode_free((struct tnode *)tn->child[j]);
716 for(i = 0; i < olen; i++) {
717 struct node *node = tnode_get_child(oldtnode, i);
723 /* A leaf or an internal node with skipped bits */
725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
726 tn->pos + tn->bits - 1) {
727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
729 put_child(t, tn, 2*i, node);
731 put_child(t, tn, 2*i+1, node);
735 /* An internal node with two children */
736 inode = (struct tnode *) node;
738 if (inode->bits == 1) {
739 put_child(t, tn, 2*i, inode->child[0]);
740 put_child(t, tn, 2*i+1, inode->child[1]);
745 /* An internal node with more than two children */
747 struct tnode *left, *right;
750 /* We will replace this node 'inode' with two new
751 * ones, 'left' and 'right', each with half of the
752 * original children. The two new nodes will have
753 * a position one bit further down the key and this
754 * means that the "significant" part of their keys
755 * (see the discussion near the top of this file)
756 * will differ by one bit, which will be "0" in
757 * left's key and "1" in right's key. Since we are
758 * moving the key position by one step, the bit that
759 * we are moving away from - the bit at position
760 * (inode->pos) - is the one that will differ between
761 * left and right. So... we synthesize that bit in the
763 * The mask 'm' below will be a single "one" bit at
764 * the position (inode->pos)
767 /* Use the old key, but set the new significant
771 left = (struct tnode *) tnode_get_child(tn, 2*i);
772 put_child(t, tn, 2*i, NULL);
777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
778 put_child(t, tn, 2*i+1, NULL);
783 size = tnode_child_length(left);
784 for(j = 0; j < size; j++) {
785 put_child(t, left, j, inode->child[j]);
786 put_child(t, right, j, inode->child[j + size]);
788 put_child(t, tn, 2*i, resize(t, left));
789 put_child(t, tn, 2*i+1, resize(t, right));
794 tnode_free(oldtnode);
798 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
800 struct tnode *oldtnode = tn;
801 struct node *left, *right;
803 int olen = tnode_child_length(tn);
805 if (trie_debug) printk("In halve\n");
807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
817 * fails. In case of failure we return the oldnode and halve
818 * of tnode is ignored.
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
825 /* Two nonempty children */
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
834 put_child(t, tn, i/2, (struct node *)newBinNode);
839 int size = tnode_child_length(tn);
842 for(j = 0; j < size; j++)
844 tnode_free((struct tnode *)tn->child[j]);
852 for(i = 0; i < olen; i += 2) {
853 left = tnode_get_child(oldtnode, i);
854 right = tnode_get_child(oldtnode, i+1);
856 /* At least one of the children is empty */
858 if (right == NULL) /* Both are empty */
860 put_child(t, tn, i/2, right);
861 } else if (right == NULL)
862 put_child(t, tn, i/2, left);
864 /* Two nonempty children */
866 struct tnode *newBinNode =
867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
873 put_child(t, newBinNode, 0, left);
874 put_child(t, newBinNode, 1, right);
875 put_child(t, tn, i/2, resize(t, newBinNode));
878 tnode_free(oldtnode);
882 static void *trie_init(struct trie *t)
888 #ifdef CONFIG_IP_FIB_TRIE_STATS
889 memset(&t->stats, 0, sizeof(struct trie_use_stats));
895 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
897 struct hlist_node *node;
898 struct leaf_info *li;
900 hlist_for_each_entry(li, node, head, hlist) {
901 if (li->plen == plen)
907 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
909 struct list_head *fa_head = NULL;
910 struct leaf_info *li = find_leaf_info(&l->list, plen);
918 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
920 struct leaf_info *li = NULL, *last = NULL;
921 struct hlist_node *node, *tmp;
923 write_lock_bh(&fib_lock);
925 if (hlist_empty(head))
926 hlist_add_head(&new->hlist, head);
928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
930 if (new->plen > li->plen)
936 hlist_add_after(&last->hlist, &new->hlist);
938 hlist_add_before(&new->hlist, &li->hlist);
940 write_unlock_bh(&fib_lock);
944 fib_find_node(struct trie *t, u32 key)
953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
954 tn = (struct tnode *) n;
958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
959 pos=tn->pos + tn->bits;
960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
965 /* Case we have found a leaf. Compare prefixes */
967 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
968 struct leaf *l = (struct leaf *) n;
974 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
979 struct tnode *tp = NULL;
987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
990 printk("Rebalance tn=%p \n", tn);
991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
993 printk("Rebalance tp=%p \n", tp);
994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
1000 tp = NODE_PARENT(tn);
1001 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1006 if (!NODE_PARENT(tn))
1009 tn = NODE_PARENT(tn);
1011 /* Handle last (top) tnode */
1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1015 return (struct node*) tn;
1018 static struct list_head *
1019 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1022 struct tnode *tp = NULL, *tn = NULL;
1026 struct list_head *fa_head = NULL;
1027 struct leaf_info *li;
1033 /* If we point to NULL, stop. Either the tree is empty and we should
1034 * just put a new leaf in if, or we have reached an empty child slot,
1035 * and we should just put our new leaf in that.
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 * not be the parent's 'pos'+'bits'!
1040 * If it does match the current key, get pos/bits from it, extract
1041 * the index from our key, push the T_TNODE and walk the tree.
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1045 * If we point to a T_LEAF, it might or might not have the same key
1046 * as we do. If it does, just change the value, update the T_LEAF's
1047 * value, and return it.
1048 * If it doesn't, we need to replace it with a T_TNODE.
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1052 tn = (struct tnode *) n;
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1058 pos=tn->pos + tn->bits;
1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1061 if (n && NODE_PARENT(n) != tn) {
1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1071 * n ----> NULL, LEAF or TNODE
1073 * tp is n's (parent) ----> NULL or TNODE
1076 if (tp && IS_LEAF(tp))
1080 /* Case 1: n is a leaf. Compare prefixes */
1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083 struct leaf *l = ( struct leaf *) n;
1085 li = leaf_info_new(plen);
1092 fa_head = &li->falh;
1093 insert_leaf_info(&l->list, li);
1105 li = leaf_info_new(plen);
1108 tnode_free((struct tnode *) l);
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1116 /* Case 2: n is NULL, and will just insert a new leaf */
1117 if (t->trie && n == NULL) {
1119 NODE_SET_PARENT(l, tp);
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1132 * Add a new tnode here
1133 * first tnode need some special handling
1137 pos=tp->pos+tp->bits;
1141 newpos = tkey_mismatch(key, pos, n->key);
1142 tn = tnode_new(n->key, newpos, 1);
1146 tn = tnode_new(key, newpos, 1); /* First tnode */
1151 tnode_free((struct tnode *) l);
1156 NODE_SET_PARENT(tn, tp);
1158 missbit=tkey_extract_bits(key, newpos, 1);
1159 put_child(t, tn, missbit, (struct node *)l);
1160 put_child(t, tn, 1-missbit, n);
1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1167 t->trie = (struct node*) tn; /* First tnode */
1171 if (tp && tp->pos+tp->bits > 32) {
1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1173 tp, tp->pos, tp->bits, key, plen);
1175 /* Rebalance the trie */
1176 t->trie = trie_rebalance(t, tp);
1184 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1185 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1187 struct trie *t = (struct trie *) tb->tb_data;
1188 struct fib_alias *fa, *new_fa;
1189 struct list_head *fa_head = NULL;
1190 struct fib_info *fi;
1191 int plen = r->rtm_dst_len;
1192 int type = r->rtm_type;
1193 u8 tos = r->rtm_tos;
1203 memcpy(&key, rta->rta_dst, 4);
1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1210 mask = ntohl( inet_make_mask(plen) );
1217 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1220 l = fib_find_node(t, key);
1224 fa_head = get_fa_head(l, plen);
1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1228 /* Now fa, if non-NULL, points to the first fib alias
1229 * with the same keys [prefix,tos,priority], if such key already
1230 * exists or to the node before which we will insert new one.
1232 * If fa is NULL, we will need to allocate a new one and
1233 * insert to the head of f.
1235 * If f is NULL, no fib node matched the destination key
1236 * and we need to allocate a new one of those as well.
1240 fa->fa_info->fib_priority == fi->fib_priority) {
1241 struct fib_alias *fa_orig;
1244 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1247 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1248 struct fib_info *fi_drop;
1251 write_lock_bh(&fib_lock);
1253 fi_drop = fa->fa_info;
1256 fa->fa_scope = r->rtm_scope;
1257 state = fa->fa_state;
1258 fa->fa_state &= ~FA_S_ACCESSED;
1260 write_unlock_bh(&fib_lock);
1262 fib_release_info(fi_drop);
1263 if (state & FA_S_ACCESSED)
1268 /* Error if we find a perfect match which
1269 * uses the same scope, type, and nexthop
1273 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1274 if (fa->fa_tos != tos)
1276 if (fa->fa_info->fib_priority != fi->fib_priority)
1278 if (fa->fa_type == type &&
1279 fa->fa_scope == r->rtm_scope &&
1280 fa->fa_info == fi) {
1284 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1288 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1292 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1296 new_fa->fa_info = fi;
1297 new_fa->fa_tos = tos;
1298 new_fa->fa_type = type;
1299 new_fa->fa_scope = r->rtm_scope;
1300 new_fa->fa_state = 0;
1305 * Insert new entry to the list.
1309 fa_head = fib_insert_node(t, &err, key, plen);
1312 goto out_free_new_fa;
1315 write_lock_bh(&fib_lock);
1317 list_add_tail(&new_fa->fa_list,
1318 (fa ? &fa->fa_list : fa_head));
1320 write_unlock_bh(&fib_lock);
1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1328 kmem_cache_free(fn_alias_kmem, new_fa);
1330 fib_release_info(fi);
1335 static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1336 struct fib_result *res)
1340 struct leaf_info *li;
1341 struct hlist_head *hhead = &l->list;
1342 struct hlist_node *node;
1344 hlist_for_each_entry(li, node, hhead, hlist) {
1347 mask = ntohl(inet_make_mask(i));
1348 if (l->key != (key & mask))
1351 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1353 #ifdef CONFIG_IP_FIB_TRIE_STATS
1354 t->stats.semantic_match_passed++;
1358 #ifdef CONFIG_IP_FIB_TRIE_STATS
1359 t->stats.semantic_match_miss++;
1366 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1368 struct trie *t = (struct trie *) tb->tb_data;
1373 t_key key=ntohl(flp->fl4_dst);
1376 int current_prefix_length = KEYLENGTH;
1379 read_lock(&fib_lock);
1383 #ifdef CONFIG_IP_FIB_TRIE_STATS
1389 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1393 pn = (struct tnode *) n;
1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1404 n = tnode_get_child(pn, cindex);
1407 #ifdef CONFIG_IP_FIB_TRIE_STATS
1408 t->stats.null_node_hit++;
1416 struct tnode *cn = (struct tnode *)n;
1417 t_key node_prefix, key_prefix, pref_mismatch;
1421 * It's a tnode, and we can do some extra checks here if we
1422 * like, to avoid descending into a dead-end branch.
1423 * This tnode is in the parent's child array at index
1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1425 * chopped off, so in reality the index may be just a
1426 * subprefix, padded with zero at the end.
1427 * We can also take a look at any skipped bits in this
1428 * tnode - everything up to p_pos is supposed to be ok,
1429 * and the non-chopped bits of the index (se previous
1430 * paragraph) are also guaranteed ok, but the rest is
1431 * considered unknown.
1433 * The skipped bits are key[pos+bits..cn->pos].
1436 /* If current_prefix_length < pos+bits, we are already doing
1437 * actual prefix matching, which means everything from
1438 * pos+(bits-chopped_off) onward must be zero along some
1439 * branch of this subtree - otherwise there is *no* valid
1440 * prefix present. Here we can only check the skipped
1441 * bits. Remember, since we have already indexed into the
1442 * parent's child array, we know that the bits we chopped of
1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1448 if (current_prefix_length < pos+bits) {
1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1450 cn->pos - current_prefix_length) != 0 ||
1456 * If chopped_off=0, the index is fully validated and we
1457 * only need to look at the skipped bits for this, the new,
1458 * tnode. What we actually want to do is to find out if
1459 * these skipped bits match our key perfectly, or if we will
1460 * have to count on finding a matching prefix further down,
1461 * because if we do, we would like to have some way of
1462 * verifying the existence of such a prefix at this point.
1465 /* The only thing we can do at this point is to verify that
1466 * any such matching prefix can indeed be a prefix to our
1467 * key, and if the bits in the node we are inspecting that
1468 * do not match our key are not ZERO, this cannot be true.
1469 * Thus, find out where there is a mismatch (before cn->pos)
1470 * and verify that all the mismatching bits are zero in the
1474 /* Note: We aren't very concerned about the piece of the key
1475 * that precede pn->pos+pn->bits, since these have already been
1476 * checked. The bits after cn->pos aren't checked since these are
1477 * by definition "unknown" at this point. Thus, what we want to
1478 * see is if we are about to enter the "prefix matching" state,
1479 * and in that case verify that the skipped bits that will prevail
1480 * throughout this subtree are zero, as they have to be if we are
1481 * to find a matching prefix.
1484 node_prefix = MASK_PFX(cn->key, cn->pos);
1485 key_prefix = MASK_PFX(key, cn->pos);
1486 pref_mismatch = key_prefix^node_prefix;
1489 /* In short: If skipped bits in this node do not match the search
1490 * key, enter the "prefix matching" state.directly.
1492 if (pref_mismatch) {
1493 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1495 pref_mismatch = pref_mismatch <<1;
1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1499 if (key_prefix != 0)
1502 if (current_prefix_length >= cn->pos)
1503 current_prefix_length=mp;
1506 pn = (struct tnode *)n; /* Descend */
1511 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1517 /* As zero don't change the child key (cindex) */
1518 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1522 /* Decrease current_... with bits chopped off */
1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
1527 * Either we do the actual chop off according or if we have
1528 * chopped off all bits in this tnode walk up to our parent.
1531 if (chopped_off <= pn->bits)
1532 cindex &= ~(1 << (chopped_off-1));
1534 if (NODE_PARENT(pn) == NULL)
1537 /* Get Child's index */
1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1539 pn = NODE_PARENT(pn);
1542 #ifdef CONFIG_IP_FIB_TRIE_STATS
1543 t->stats.backtrack++;
1551 read_unlock(&fib_lock);
1555 static int trie_leaf_remove(struct trie *t, t_key key)
1558 struct tnode *tp = NULL;
1559 struct node *n = t->trie;
1563 printk("entering trie_leaf_remove(%p)\n", n);
1565 /* Note that in the case skipped bits, those bits are *not* checked!
1566 * When we finish this, we will have NULL or a T_LEAF, and the
1567 * T_LEAF may or may not match our key.
1570 while (n != NULL && IS_TNODE(n)) {
1571 struct tnode *tn = (struct tnode *) n;
1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1575 if (n && NODE_PARENT(n) != tn) {
1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1580 l = (struct leaf *) n;
1582 if (!n || !tkey_equals(l->key, key))
1587 * Remove the leaf and rebalance the tree
1593 tp = NODE_PARENT(n);
1594 tnode_free((struct tnode *) n);
1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1599 t->trie = trie_rebalance(t, tp);
1608 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1609 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1611 struct trie *t = (struct trie *) tb->tb_data;
1613 int plen = r->rtm_dst_len;
1614 u8 tos = r->rtm_tos;
1615 struct fib_alias *fa, *fa_to_delete;
1616 struct list_head *fa_head;
1624 memcpy(&key, rta->rta_dst, 4);
1627 mask = ntohl( inet_make_mask(plen) );
1633 l = fib_find_node(t, key);
1638 fa_head = get_fa_head(l, plen);
1639 fa = fib_find_alias(fa_head, tos, 0);
1645 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1647 fa_to_delete = NULL;
1648 fa_head = fa->fa_list.prev;
1649 list_for_each_entry(fa, fa_head, fa_list) {
1650 struct fib_info *fi = fa->fa_info;
1652 if (fa->fa_tos != tos)
1655 if ((!r->rtm_type ||
1656 fa->fa_type == r->rtm_type) &&
1657 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1658 fa->fa_scope == r->rtm_scope) &&
1659 (!r->rtm_protocol ||
1660 fi->fib_protocol == r->rtm_protocol) &&
1661 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1669 struct leaf_info *li;
1672 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1674 l = fib_find_node(t, key);
1675 li = find_leaf_info(&l->list, plen);
1677 write_lock_bh(&fib_lock);
1679 list_del(&fa->fa_list);
1681 if (list_empty(fa_head)) {
1682 hlist_del(&li->hlist);
1685 write_unlock_bh(&fib_lock);
1690 if (hlist_empty(&l->list))
1691 trie_leaf_remove(t, key);
1693 if (fa->fa_state & FA_S_ACCESSED)
1702 static int trie_flush_list(struct trie *t, struct list_head *head)
1704 struct fib_alias *fa, *fa_node;
1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1708 struct fib_info *fi = fa->fa_info;
1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1712 write_lock_bh(&fib_lock);
1713 list_del(&fa->fa_list);
1714 write_unlock_bh(&fib_lock);
1723 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1726 struct hlist_head *lih = &l->list;
1727 struct hlist_node *node, *tmp;
1728 struct leaf_info *li = NULL;
1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1732 found += trie_flush_list(t, &li->falh);
1734 if (list_empty(&li->falh)) {
1736 write_lock_bh(&fib_lock);
1737 hlist_del(&li->hlist);
1738 write_unlock_bh(&fib_lock);
1746 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1748 struct node *c = (struct node *) thisleaf;
1753 if (t->trie == NULL)
1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1757 return (struct leaf *) t->trie;
1759 p = (struct tnode*) t->trie; /* Start */
1762 p = (struct tnode *) NODE_PARENT(c);
1767 /* Find the next child of the parent */
1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1773 last = 1 << p->bits;
1774 for(idx = pos; idx < last ; idx++) {
1775 if (p->child[idx]) {
1777 /* Decend if tnode */
1779 while (IS_TNODE(p->child[idx])) {
1780 p = (struct tnode*) p->child[idx];
1783 /* Rightmost non-NULL branch */
1784 if (p && IS_TNODE(p))
1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1787 /* Done with this tnode? */
1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
1791 return (struct leaf*) p->child[idx];
1795 /* No more children go up one step */
1796 c = (struct node*) p;
1797 p = (struct tnode *) NODE_PARENT(p);
1799 return NULL; /* Ready. Root of trie */
1802 static int fn_trie_flush(struct fib_table *tb)
1804 struct trie *t = (struct trie *) tb->tb_data;
1805 struct leaf *ll = NULL, *l = NULL;
1810 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1811 found += trie_flush_leaf(t, l);
1813 if (ll && hlist_empty(&ll->list))
1814 trie_leaf_remove(t, ll->key);
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll->key);
1822 printk("trie_flush found=%d\n", found);
1826 static int trie_last_dflt=-1;
1829 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1831 struct trie *t = (struct trie *) tb->tb_data;
1832 int order, last_idx;
1833 struct fib_info *fi = NULL;
1834 struct fib_info *last_resort;
1835 struct fib_alias *fa = NULL;
1836 struct list_head *fa_head;
1843 read_lock(&fib_lock);
1845 l = fib_find_node(t, 0);
1849 fa_head = get_fa_head(l, 0);
1853 if (list_empty(fa_head))
1856 list_for_each_entry(fa, fa_head, fa_list) {
1857 struct fib_info *next_fi = fa->fa_info;
1859 if (fa->fa_scope != res->scope ||
1860 fa->fa_type != RTN_UNICAST)
1863 if (next_fi->fib_priority > res->fi->fib_priority)
1865 if (!next_fi->fib_nh[0].nh_gw ||
1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1868 fa->fa_state |= FA_S_ACCESSED;
1871 if (next_fi != res->fi)
1873 } else if (!fib_detect_death(fi, order, &last_resort,
1874 &last_idx, &trie_last_dflt)) {
1876 fib_info_put(res->fi);
1878 atomic_inc(&fi->fib_clntref);
1879 trie_last_dflt = order;
1885 if (order <= 0 || fi == NULL) {
1886 trie_last_dflt = -1;
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1892 fib_info_put(res->fi);
1894 atomic_inc(&fi->fib_clntref);
1895 trie_last_dflt = order;
1898 if (last_idx >= 0) {
1900 fib_info_put(res->fi);
1901 res->fi = last_resort;
1903 atomic_inc(&last_resort->fib_clntref);
1905 trie_last_dflt = last_idx;
1907 read_unlock(&fib_lock);
1910 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1911 struct sk_buff *skb, struct netlink_callback *cb)
1914 struct fib_alias *fa;
1916 u32 xkey=htonl(key);
1921 list_for_each_entry(fa, fah, fa_list) {
1926 if (fa->fa_info->fib_nh == NULL) {
1927 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1931 if (fa->fa_info == NULL) {
1932 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1937 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1946 fa->fa_info, 0) < 0) {
1956 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1957 struct netlink_callback *cb)
1960 struct list_head *fa_head;
1961 struct leaf *l = NULL;
1964 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1969 memset(&cb->args[3], 0,
1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1972 fa_head = get_fa_head(l, plen);
1977 if (list_empty(fa_head))
1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1989 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1992 struct trie *t = (struct trie *) tb->tb_data;
1996 read_lock(&fib_lock);
1997 for (m=0; m<=32; m++) {
2002 memset(&cb->args[2], 0,
2003 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2005 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2010 read_unlock(&fib_lock);
2014 read_unlock(&fib_lock);
2018 /* Fix more generic FIB names for init later */
2020 #ifdef CONFIG_IP_MULTIPLE_TABLES
2021 struct fib_table * fib_hash_init(int id)
2023 struct fib_table * __init fib_hash_init(int id)
2026 struct fib_table *tb;
2029 if (fn_alias_kmem == NULL)
2030 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2031 sizeof(struct fib_alias),
2032 0, SLAB_HWCACHE_ALIGN,
2035 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2041 tb->tb_lookup = fn_trie_lookup;
2042 tb->tb_insert = fn_trie_insert;
2043 tb->tb_delete = fn_trie_delete;
2044 tb->tb_flush = fn_trie_flush;
2045 tb->tb_select_default = fn_trie_select_default;
2046 tb->tb_dump = fn_trie_dump;
2047 memset(tb->tb_data, 0, sizeof(struct trie));
2049 t = (struct trie *) tb->tb_data;
2053 if (id == RT_TABLE_LOCAL)
2055 else if (id == RT_TABLE_MAIN)
2058 if (id == RT_TABLE_LOCAL)
2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2064 /* Trie dump functions */
2066 static void putspace_seq(struct seq_file *seq, int n)
2068 while (n--) seq_printf(seq, " ");
2071 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2077 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2078 int pend, int cindex, int bits)
2080 putspace_seq(seq, indent);
2082 seq_printf(seq, "|");
2084 seq_printf(seq, "+");
2086 seq_printf(seq, "%d/", cindex);
2087 printbin_seq(seq, cindex, bits);
2088 seq_printf(seq, ": ");
2091 seq_printf(seq, "<root>: ");
2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2098 int plen = ((struct tnode *)n)->pos;
2099 t_key prf=MASK_PFX(n->key, plen);
2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2104 struct leaf *l=(struct leaf *)n;
2105 struct fib_alias *fa;
2107 for (i=32; i>=0; i--)
2108 if (find_leaf_info(&l->list, i)) {
2110 struct list_head *fa_head = get_fa_head(l, i);
2115 if (list_empty(fa_head))
2118 putspace_seq(seq, indent+2);
2119 seq_printf(seq, "{/%d...dumping}\n", i);
2122 list_for_each_entry(fa, fa_head, fa_list) {
2123 putspace_seq(seq, indent+2);
2124 if (fa->fa_info->fib_nh == NULL) {
2125 seq_printf(seq, "Error _fib_nh=NULL\n");
2128 if (fa->fa_info == NULL) {
2129 seq_printf(seq, "Error fa_info=NULL\n");
2133 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2140 else if (IS_TNODE(n)) {
2141 struct tnode *tn = (struct tnode *)n;
2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2145 seq_printf(seq, "}\n");
2146 putspace_seq(seq, indent); seq_printf(seq, "| ");
2147 seq_printf(seq, "{pos=%d", tn->pos);
2148 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2149 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2150 putspace_seq(seq, indent); seq_printf(seq, "| ");
2151 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2155 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2157 struct node *n = t->trie;
2163 read_lock(&fib_lock);
2165 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2169 struct tnode *tn = (struct tnode *)n;
2170 pend = tn->pos+tn->bits;
2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2175 while (tn && cindex < (1 << tn->bits)) {
2176 if (tn->child[cindex]) {
2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2181 if (IS_LEAF(tn->child[cindex])) {
2187 * New tnode. Decend one level
2191 n = tn->child[cindex];
2192 tn = (struct tnode *)n;
2193 pend = tn->pos+tn->bits;
2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2203 * Test if we are done
2206 while (cindex >= (1 << tn->bits)) {
2209 * Move upwards and test for root
2210 * pop off all traversed nodes
2213 if (NODE_PARENT(tn) == NULL) {
2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2220 tn = NODE_PARENT(tn);
2222 n = (struct node *)tn;
2223 pend = tn->pos+tn->bits;
2232 else seq_printf(seq, "------ trie is empty\n");
2234 read_unlock(&fib_lock);
2237 static struct trie_stat *trie_stat_new(void)
2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2247 s->nullpointers = 0;
2249 for(i=0; i< MAX_CHILDS; i++)
2250 s->nodesizes[i] = 0;
2255 static struct trie_stat *trie_collect_stats(struct trie *t)
2257 struct node *n = t->trie;
2258 struct trie_stat *s = trie_stat_new();
2264 read_lock(&fib_lock);
2269 struct tnode *tn = (struct tnode *)n;
2270 pend = tn->pos+tn->bits;
2272 s->nodesizes[tn->bits]++;
2275 while (tn && cindex < (1 << tn->bits)) {
2276 if (tn->child[cindex]) {
2279 if (IS_LEAF(tn->child[cindex])) {
2283 if (depth > s->maxdepth)
2284 s->maxdepth = depth;
2285 s->totdepth += depth;
2291 * New tnode. Decend one level
2295 s->nodesizes[tn->bits]++;
2298 n = tn->child[cindex];
2299 tn = (struct tnode *)n;
2300 pend = tn->pos+tn->bits;
2312 * Test if we are done
2315 while (cindex >= (1 << tn->bits)) {
2318 * Move upwards and test for root
2319 * pop off all traversed nodes
2323 if (NODE_PARENT(tn) == NULL) {
2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2330 tn = NODE_PARENT(tn);
2332 n = (struct node *)tn;
2333 pend = tn->pos+tn->bits;
2344 read_unlock(&fib_lock);
2348 #ifdef CONFIG_PROC_FS
2350 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2355 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2360 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2364 if (ip_fib_main_table)
2365 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2369 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2372 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2375 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2381 * This outputs /proc/net/fib_triestats
2383 * It always works in backward compatibility mode.
2384 * The format of the file is not supposed to be changed.
2387 static void collect_and_show(struct trie *t, struct seq_file *seq)
2389 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2390 int i, max, pointers;
2391 struct trie_stat *stat;
2394 stat = trie_collect_stats(t);
2397 seq_printf(seq, "trie=%p\n", t);
2401 avdepth=stat->totdepth*100 / stat->leaves;
2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2408 bytes += sizeof(struct leaf) * stat->leaves;
2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2410 bytes += sizeof(struct tnode) * stat->tnodes;
2414 while (max >= 0 && stat->nodesizes[max] == 0)
2418 for (i = 1; i <= max; i++)
2419 if (stat->nodesizes[i] != 0) {
2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2421 pointers += (1<<i) * stat->nodesizes[i];
2423 seq_printf(seq, "\n");
2424 seq_printf(seq, "Pointers: %d\n", pointers);
2425 bytes += sizeof(struct node *) * pointers;
2426 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2427 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2432 #ifdef CONFIG_IP_FIB_TRIE_STATS
2433 seq_printf(seq, "Counters:\n---------\n");
2434 seq_printf(seq,"gets = %d\n", t->stats.gets);
2435 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2441 memset(&(t->stats), 0, sizeof(t->stats));
2443 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2446 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2450 if (v == SEQ_START_TOKEN) {
2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2452 sizeof(struct leaf), sizeof(struct tnode));
2454 collect_and_show(trie_local, seq);
2457 collect_and_show(trie_main, seq);
2460 snprintf(bf, sizeof(bf),
2461 "*\t%08X\t%08X", 200, 400);
2463 seq_printf(seq, "%-127s\n", bf);
2468 static struct seq_operations fib_triestat_seq_ops = {
2469 .start = fib_triestat_seq_start,
2470 .next = fib_triestat_seq_next,
2471 .stop = fib_triestat_seq_stop,
2472 .show = fib_triestat_seq_show,
2475 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2477 struct seq_file *seq;
2480 rc = seq_open(file, &fib_triestat_seq_ops);
2484 seq = file->private_data;
2491 static struct file_operations fib_triestat_seq_fops = {
2492 .owner = THIS_MODULE,
2493 .open = fib_triestat_seq_open,
2495 .llseek = seq_lseek,
2496 .release = seq_release_private,
2499 int __init fib_stat_proc_init(void)
2501 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2506 void __init fib_stat_proc_exit(void)
2508 proc_net_remove("fib_triestat");
2511 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2516 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2521 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2525 if (ip_fib_main_table)
2526 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2530 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2533 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2536 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2542 * This outputs /proc/net/fib_trie.
2544 * It always works in backward compatibility mode.
2545 * The format of the file is not supposed to be changed.
2548 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2552 if (v == SEQ_START_TOKEN) {
2554 trie_dump_seq(seq, trie_local);
2557 trie_dump_seq(seq, trie_main);
2561 snprintf(bf, sizeof(bf),
2562 "*\t%08X\t%08X", 200, 400);
2563 seq_printf(seq, "%-127s\n", bf);
2569 static struct seq_operations fib_trie_seq_ops = {
2570 .start = fib_trie_seq_start,
2571 .next = fib_trie_seq_next,
2572 .stop = fib_trie_seq_stop,
2573 .show = fib_trie_seq_show,
2576 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2578 struct seq_file *seq;
2581 rc = seq_open(file, &fib_trie_seq_ops);
2585 seq = file->private_data;
2592 static struct file_operations fib_trie_seq_fops = {
2593 .owner = THIS_MODULE,
2594 .open = fib_trie_seq_open,
2596 .llseek = seq_lseek,
2597 .release= seq_release_private,
2600 int __init fib_proc_init(void)
2602 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2607 void __init fib_proc_exit(void)
2609 proc_net_remove("fib_trie");
2612 #endif /* CONFIG_PROC_FS */