skge: rearrange fields
[linux-2.6] / lib / radix-tree.c
1 /*
2  * Copyright (C) 2001 Momchil Velikov
3  * Portions Copyright (C) 2001 Christoph Hellwig
4  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5  * Copyright (C) 2006 Nick Piggin
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2, or (at
10  * your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35
36
37 #ifdef __KERNEL__
38 #define RADIX_TREE_MAP_SHIFT    (CONFIG_BASE_SMALL ? 4 : 6)
39 #else
40 #define RADIX_TREE_MAP_SHIFT    3       /* For more stressful testing */
41 #endif
42
43 #define RADIX_TREE_MAP_SIZE     (1UL << RADIX_TREE_MAP_SHIFT)
44 #define RADIX_TREE_MAP_MASK     (RADIX_TREE_MAP_SIZE-1)
45
46 #define RADIX_TREE_TAG_LONGS    \
47         ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
48
49 struct radix_tree_node {
50         unsigned int    height;         /* Height from the bottom */
51         unsigned int    count;
52         struct rcu_head rcu_head;
53         void            *slots[RADIX_TREE_MAP_SIZE];
54         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
55 };
56
57 struct radix_tree_path {
58         struct radix_tree_node *node;
59         int offset;
60 };
61
62 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
63 #define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
64
65 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
66
67 /*
68  * Radix tree node cache.
69  */
70 static struct kmem_cache *radix_tree_node_cachep;
71
72 /*
73  * Per-cpu pool of preloaded nodes
74  */
75 struct radix_tree_preload {
76         int nr;
77         struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
78 };
79 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
80
81 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
82 {
83         return root->gfp_mask & __GFP_BITS_MASK;
84 }
85
86 /*
87  * This assumes that the caller has performed appropriate preallocation, and
88  * that the caller has pinned this thread of control to the current CPU.
89  */
90 static struct radix_tree_node *
91 radix_tree_node_alloc(struct radix_tree_root *root)
92 {
93         struct radix_tree_node *ret;
94         gfp_t gfp_mask = root_gfp_mask(root);
95
96         ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
97         if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
98                 struct radix_tree_preload *rtp;
99
100                 rtp = &__get_cpu_var(radix_tree_preloads);
101                 if (rtp->nr) {
102                         ret = rtp->nodes[rtp->nr - 1];
103                         rtp->nodes[rtp->nr - 1] = NULL;
104                         rtp->nr--;
105                 }
106         }
107         BUG_ON(radix_tree_is_direct_ptr(ret));
108         return ret;
109 }
110
111 static void radix_tree_node_rcu_free(struct rcu_head *head)
112 {
113         struct radix_tree_node *node =
114                         container_of(head, struct radix_tree_node, rcu_head);
115         kmem_cache_free(radix_tree_node_cachep, node);
116 }
117
118 static inline void
119 radix_tree_node_free(struct radix_tree_node *node)
120 {
121         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
122 }
123
124 /*
125  * Load up this CPU's radix_tree_node buffer with sufficient objects to
126  * ensure that the addition of a single element in the tree cannot fail.  On
127  * success, return zero, with preemption disabled.  On error, return -ENOMEM
128  * with preemption not disabled.
129  */
130 int radix_tree_preload(gfp_t gfp_mask)
131 {
132         struct radix_tree_preload *rtp;
133         struct radix_tree_node *node;
134         int ret = -ENOMEM;
135
136         preempt_disable();
137         rtp = &__get_cpu_var(radix_tree_preloads);
138         while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
139                 preempt_enable();
140                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
141                 if (node == NULL)
142                         goto out;
143                 preempt_disable();
144                 rtp = &__get_cpu_var(radix_tree_preloads);
145                 if (rtp->nr < ARRAY_SIZE(rtp->nodes))
146                         rtp->nodes[rtp->nr++] = node;
147                 else
148                         kmem_cache_free(radix_tree_node_cachep, node);
149         }
150         ret = 0;
151 out:
152         return ret;
153 }
154
155 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
156                 int offset)
157 {
158         __set_bit(offset, node->tags[tag]);
159 }
160
161 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
162                 int offset)
163 {
164         __clear_bit(offset, node->tags[tag]);
165 }
166
167 static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
168                 int offset)
169 {
170         return test_bit(offset, node->tags[tag]);
171 }
172
173 static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
174 {
175         root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
176 }
177
178
179 static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
180 {
181         root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
182 }
183
184 static inline void root_tag_clear_all(struct radix_tree_root *root)
185 {
186         root->gfp_mask &= __GFP_BITS_MASK;
187 }
188
189 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
190 {
191         return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
192 }
193
194 /*
195  * Returns 1 if any slot in the node has this tag set.
196  * Otherwise returns 0.
197  */
198 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
199 {
200         int idx;
201         for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
202                 if (node->tags[tag][idx])
203                         return 1;
204         }
205         return 0;
206 }
207
208 /*
209  *      Return the maximum key which can be store into a
210  *      radix tree with height HEIGHT.
211  */
212 static inline unsigned long radix_tree_maxindex(unsigned int height)
213 {
214         return height_to_maxindex[height];
215 }
216
217 /*
218  *      Extend a radix tree so it can store key @index.
219  */
220 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
221 {
222         struct radix_tree_node *node;
223         unsigned int height;
224         int tag;
225
226         /* Figure out what the height should be.  */
227         height = root->height + 1;
228         while (index > radix_tree_maxindex(height))
229                 height++;
230
231         if (root->rnode == NULL) {
232                 root->height = height;
233                 goto out;
234         }
235
236         do {
237                 unsigned int newheight;
238                 if (!(node = radix_tree_node_alloc(root)))
239                         return -ENOMEM;
240
241                 /* Increase the height.  */
242                 node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
243
244                 /* Propagate the aggregated tag info into the new root */
245                 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
246                         if (root_tag_get(root, tag))
247                                 tag_set(node, tag, 0);
248                 }
249
250                 newheight = root->height+1;
251                 node->height = newheight;
252                 node->count = 1;
253                 rcu_assign_pointer(root->rnode, node);
254                 root->height = newheight;
255         } while (height > root->height);
256 out:
257         return 0;
258 }
259
260 /**
261  *      radix_tree_insert    -    insert into a radix tree
262  *      @root:          radix tree root
263  *      @index:         index key
264  *      @item:          item to insert
265  *
266  *      Insert an item into the radix tree at position @index.
267  */
268 int radix_tree_insert(struct radix_tree_root *root,
269                         unsigned long index, void *item)
270 {
271         struct radix_tree_node *node = NULL, *slot;
272         unsigned int height, shift;
273         int offset;
274         int error;
275
276         BUG_ON(radix_tree_is_direct_ptr(item));
277
278         /* Make sure the tree is high enough.  */
279         if (index > radix_tree_maxindex(root->height)) {
280                 error = radix_tree_extend(root, index);
281                 if (error)
282                         return error;
283         }
284
285         slot = root->rnode;
286         height = root->height;
287         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
288
289         offset = 0;                     /* uninitialised var warning */
290         while (height > 0) {
291                 if (slot == NULL) {
292                         /* Have to add a child node.  */
293                         if (!(slot = radix_tree_node_alloc(root)))
294                                 return -ENOMEM;
295                         slot->height = height;
296                         if (node) {
297                                 rcu_assign_pointer(node->slots[offset], slot);
298                                 node->count++;
299                         } else
300                                 rcu_assign_pointer(root->rnode, slot);
301                 }
302
303                 /* Go a level down */
304                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
305                 node = slot;
306                 slot = node->slots[offset];
307                 shift -= RADIX_TREE_MAP_SHIFT;
308                 height--;
309         }
310
311         if (slot != NULL)
312                 return -EEXIST;
313
314         if (node) {
315                 node->count++;
316                 rcu_assign_pointer(node->slots[offset], item);
317                 BUG_ON(tag_get(node, 0, offset));
318                 BUG_ON(tag_get(node, 1, offset));
319         } else {
320                 rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
321                 BUG_ON(root_tag_get(root, 0));
322                 BUG_ON(root_tag_get(root, 1));
323         }
324
325         return 0;
326 }
327 EXPORT_SYMBOL(radix_tree_insert);
328
329 /**
330  *      radix_tree_lookup_slot    -    lookup a slot in a radix tree
331  *      @root:          radix tree root
332  *      @index:         index key
333  *
334  *      Returns:  the slot corresponding to the position @index in the
335  *      radix tree @root. This is useful for update-if-exists operations.
336  *
337  *      This function cannot be called under rcu_read_lock, it must be
338  *      excluded from writers, as must the returned slot for subsequent
339  *      use by radix_tree_deref_slot() and radix_tree_replace slot.
340  *      Caller must hold tree write locked across slot lookup and
341  *      replace.
342  */
343 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
344 {
345         unsigned int height, shift;
346         struct radix_tree_node *node, **slot;
347
348         node = root->rnode;
349         if (node == NULL)
350                 return NULL;
351
352         if (radix_tree_is_direct_ptr(node)) {
353                 if (index > 0)
354                         return NULL;
355                 return (void **)&root->rnode;
356         }
357
358         height = node->height;
359         if (index > radix_tree_maxindex(height))
360                 return NULL;
361
362         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
363
364         do {
365                 slot = (struct radix_tree_node **)
366                         (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
367                 node = *slot;
368                 if (node == NULL)
369                         return NULL;
370
371                 shift -= RADIX_TREE_MAP_SHIFT;
372                 height--;
373         } while (height > 0);
374
375         return (void **)slot;
376 }
377 EXPORT_SYMBOL(radix_tree_lookup_slot);
378
379 /**
380  *      radix_tree_lookup    -    perform lookup operation on a radix tree
381  *      @root:          radix tree root
382  *      @index:         index key
383  *
384  *      Lookup the item at the position @index in the radix tree @root.
385  *
386  *      This function can be called under rcu_read_lock, however the caller
387  *      must manage lifetimes of leaf nodes (eg. RCU may also be used to free
388  *      them safely). No RCU barriers are required to access or modify the
389  *      returned item, however.
390  */
391 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
392 {
393         unsigned int height, shift;
394         struct radix_tree_node *node, **slot;
395
396         node = rcu_dereference(root->rnode);
397         if (node == NULL)
398                 return NULL;
399
400         if (radix_tree_is_direct_ptr(node)) {
401                 if (index > 0)
402                         return NULL;
403                 return radix_tree_direct_to_ptr(node);
404         }
405
406         height = node->height;
407         if (index > radix_tree_maxindex(height))
408                 return NULL;
409
410         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
411
412         do {
413                 slot = (struct radix_tree_node **)
414                         (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
415                 node = rcu_dereference(*slot);
416                 if (node == NULL)
417                         return NULL;
418
419                 shift -= RADIX_TREE_MAP_SHIFT;
420                 height--;
421         } while (height > 0);
422
423         return node;
424 }
425 EXPORT_SYMBOL(radix_tree_lookup);
426
427 /**
428  *      radix_tree_tag_set - set a tag on a radix tree node
429  *      @root:          radix tree root
430  *      @index:         index key
431  *      @tag:           tag index
432  *
433  *      Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
434  *      corresponding to @index in the radix tree.  From
435  *      the root all the way down to the leaf node.
436  *
437  *      Returns the address of the tagged item.   Setting a tag on a not-present
438  *      item is a bug.
439  */
440 void *radix_tree_tag_set(struct radix_tree_root *root,
441                         unsigned long index, unsigned int tag)
442 {
443         unsigned int height, shift;
444         struct radix_tree_node *slot;
445
446         height = root->height;
447         BUG_ON(index > radix_tree_maxindex(height));
448
449         slot = root->rnode;
450         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
451
452         while (height > 0) {
453                 int offset;
454
455                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
456                 if (!tag_get(slot, tag, offset))
457                         tag_set(slot, tag, offset);
458                 slot = slot->slots[offset];
459                 BUG_ON(slot == NULL);
460                 shift -= RADIX_TREE_MAP_SHIFT;
461                 height--;
462         }
463
464         /* set the root's tag bit */
465         if (slot && !root_tag_get(root, tag))
466                 root_tag_set(root, tag);
467
468         return slot;
469 }
470 EXPORT_SYMBOL(radix_tree_tag_set);
471
472 /**
473  *      radix_tree_tag_clear - clear a tag on a radix tree node
474  *      @root:          radix tree root
475  *      @index:         index key
476  *      @tag:           tag index
477  *
478  *      Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
479  *      corresponding to @index in the radix tree.  If
480  *      this causes the leaf node to have no tags set then clear the tag in the
481  *      next-to-leaf node, etc.
482  *
483  *      Returns the address of the tagged item on success, else NULL.  ie:
484  *      has the same return value and semantics as radix_tree_lookup().
485  */
486 void *radix_tree_tag_clear(struct radix_tree_root *root,
487                         unsigned long index, unsigned int tag)
488 {
489         struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
490         struct radix_tree_node *slot = NULL;
491         unsigned int height, shift;
492
493         height = root->height;
494         if (index > radix_tree_maxindex(height))
495                 goto out;
496
497         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
498         pathp->node = NULL;
499         slot = root->rnode;
500
501         while (height > 0) {
502                 int offset;
503
504                 if (slot == NULL)
505                         goto out;
506
507                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
508                 pathp[1].offset = offset;
509                 pathp[1].node = slot;
510                 slot = slot->slots[offset];
511                 pathp++;
512                 shift -= RADIX_TREE_MAP_SHIFT;
513                 height--;
514         }
515
516         if (slot == NULL)
517                 goto out;
518
519         while (pathp->node) {
520                 if (!tag_get(pathp->node, tag, pathp->offset))
521                         goto out;
522                 tag_clear(pathp->node, tag, pathp->offset);
523                 if (any_tag_set(pathp->node, tag))
524                         goto out;
525                 pathp--;
526         }
527
528         /* clear the root's tag bit */
529         if (root_tag_get(root, tag))
530                 root_tag_clear(root, tag);
531
532 out:
533         return slot;
534 }
535 EXPORT_SYMBOL(radix_tree_tag_clear);
536
537 #ifndef __KERNEL__      /* Only the test harness uses this at present */
538 /**
539  * radix_tree_tag_get - get a tag on a radix tree node
540  * @root:               radix tree root
541  * @index:              index key
542  * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
543  *
544  * Return values:
545  *
546  *  0: tag not present or not set
547  *  1: tag set
548  */
549 int radix_tree_tag_get(struct radix_tree_root *root,
550                         unsigned long index, unsigned int tag)
551 {
552         unsigned int height, shift;
553         struct radix_tree_node *node;
554         int saw_unset_tag = 0;
555
556         /* check the root's tag bit */
557         if (!root_tag_get(root, tag))
558                 return 0;
559
560         node = rcu_dereference(root->rnode);
561         if (node == NULL)
562                 return 0;
563
564         if (radix_tree_is_direct_ptr(node))
565                 return (index == 0);
566
567         height = node->height;
568         if (index > radix_tree_maxindex(height))
569                 return 0;
570
571         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
572
573         for ( ; ; ) {
574                 int offset;
575
576                 if (node == NULL)
577                         return 0;
578
579                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
580
581                 /*
582                  * This is just a debug check.  Later, we can bale as soon as
583                  * we see an unset tag.
584                  */
585                 if (!tag_get(node, tag, offset))
586                         saw_unset_tag = 1;
587                 if (height == 1) {
588                         int ret = tag_get(node, tag, offset);
589
590                         BUG_ON(ret && saw_unset_tag);
591                         return !!ret;
592                 }
593                 node = rcu_dereference(node->slots[offset]);
594                 shift -= RADIX_TREE_MAP_SHIFT;
595                 height--;
596         }
597 }
598 EXPORT_SYMBOL(radix_tree_tag_get);
599 #endif
600
601 static unsigned int
602 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
603         unsigned int max_items, unsigned long *next_index)
604 {
605         unsigned int nr_found = 0;
606         unsigned int shift, height;
607         unsigned long i;
608
609         height = slot->height;
610         if (height == 0)
611                 goto out;
612         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
613
614         for ( ; height > 1; height--) {
615                 i = (index >> shift) & RADIX_TREE_MAP_MASK;
616                 for (;;) {
617                         if (slot->slots[i] != NULL)
618                                 break;
619                         index &= ~((1UL << shift) - 1);
620                         index += 1UL << shift;
621                         if (index == 0)
622                                 goto out;       /* 32-bit wraparound */
623                         i++;
624                         if (i == RADIX_TREE_MAP_SIZE)
625                                 goto out;
626                 }
627
628                 shift -= RADIX_TREE_MAP_SHIFT;
629                 slot = rcu_dereference(slot->slots[i]);
630                 if (slot == NULL)
631                         goto out;
632         }
633
634         /* Bottom level: grab some items */
635         for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
636                 struct radix_tree_node *node;
637                 index++;
638                 node = slot->slots[i];
639                 if (node) {
640                         results[nr_found++] = rcu_dereference(node);
641                         if (nr_found == max_items)
642                                 goto out;
643                 }
644         }
645 out:
646         *next_index = index;
647         return nr_found;
648 }
649
650 /**
651  *      radix_tree_gang_lookup - perform multiple lookup on a radix tree
652  *      @root:          radix tree root
653  *      @results:       where the results of the lookup are placed
654  *      @first_index:   start the lookup from this key
655  *      @max_items:     place up to this many items at *results
656  *
657  *      Performs an index-ascending scan of the tree for present items.  Places
658  *      them at *@results and returns the number of items which were placed at
659  *      *@results.
660  *
661  *      The implementation is naive.
662  *
663  *      Like radix_tree_lookup, radix_tree_gang_lookup may be called under
664  *      rcu_read_lock. In this case, rather than the returned results being
665  *      an atomic snapshot of the tree at a single point in time, the semantics
666  *      of an RCU protected gang lookup are as though multiple radix_tree_lookups
667  *      have been issued in individual locks, and results stored in 'results'.
668  */
669 unsigned int
670 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
671                         unsigned long first_index, unsigned int max_items)
672 {
673         unsigned long max_index;
674         struct radix_tree_node *node;
675         unsigned long cur_index = first_index;
676         unsigned int ret;
677
678         node = rcu_dereference(root->rnode);
679         if (!node)
680                 return 0;
681
682         if (radix_tree_is_direct_ptr(node)) {
683                 if (first_index > 0)
684                         return 0;
685                 node = radix_tree_direct_to_ptr(node);
686                 results[0] = rcu_dereference(node);
687                 return 1;
688         }
689
690         max_index = radix_tree_maxindex(node->height);
691
692         ret = 0;
693         while (ret < max_items) {
694                 unsigned int nr_found;
695                 unsigned long next_index;       /* Index of next search */
696
697                 if (cur_index > max_index)
698                         break;
699                 nr_found = __lookup(node, results + ret, cur_index,
700                                         max_items - ret, &next_index);
701                 ret += nr_found;
702                 if (next_index == 0)
703                         break;
704                 cur_index = next_index;
705         }
706
707         return ret;
708 }
709 EXPORT_SYMBOL(radix_tree_gang_lookup);
710
711 /*
712  * FIXME: the two tag_get()s here should use find_next_bit() instead of
713  * open-coding the search.
714  */
715 static unsigned int
716 __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
717         unsigned int max_items, unsigned long *next_index, unsigned int tag)
718 {
719         unsigned int nr_found = 0;
720         unsigned int shift, height;
721
722         height = slot->height;
723         if (height == 0)
724                 goto out;
725         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
726
727         while (height > 0) {
728                 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
729
730                 for (;;) {
731                         if (tag_get(slot, tag, i))
732                                 break;
733                         index &= ~((1UL << shift) - 1);
734                         index += 1UL << shift;
735                         if (index == 0)
736                                 goto out;       /* 32-bit wraparound */
737                         i++;
738                         if (i == RADIX_TREE_MAP_SIZE)
739                                 goto out;
740                 }
741                 height--;
742                 if (height == 0) {      /* Bottom level: grab some items */
743                         unsigned long j = index & RADIX_TREE_MAP_MASK;
744
745                         for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
746                                 struct radix_tree_node *node;
747                                 index++;
748                                 if (!tag_get(slot, tag, j))
749                                         continue;
750                                 node = slot->slots[j];
751                                 /*
752                                  * Even though the tag was found set, we need to
753                                  * recheck that we have a non-NULL node, because
754                                  * if this lookup is lockless, it may have been
755                                  * subsequently deleted.
756                                  *
757                                  * Similar care must be taken in any place that
758                                  * lookup ->slots[x] without a lock (ie. can't
759                                  * rely on its value remaining the same).
760                                  */
761                                 if (node) {
762                                         node = rcu_dereference(node);
763                                         results[nr_found++] = node;
764                                         if (nr_found == max_items)
765                                                 goto out;
766                                 }
767                         }
768                 }
769                 shift -= RADIX_TREE_MAP_SHIFT;
770                 slot = rcu_dereference(slot->slots[i]);
771                 if (slot == NULL)
772                         break;
773         }
774 out:
775         *next_index = index;
776         return nr_found;
777 }
778
779 /**
780  *      radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
781  *                                   based on a tag
782  *      @root:          radix tree root
783  *      @results:       where the results of the lookup are placed
784  *      @first_index:   start the lookup from this key
785  *      @max_items:     place up to this many items at *results
786  *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
787  *
788  *      Performs an index-ascending scan of the tree for present items which
789  *      have the tag indexed by @tag set.  Places the items at *@results and
790  *      returns the number of items which were placed at *@results.
791  */
792 unsigned int
793 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
794                 unsigned long first_index, unsigned int max_items,
795                 unsigned int tag)
796 {
797         struct radix_tree_node *node;
798         unsigned long max_index;
799         unsigned long cur_index = first_index;
800         unsigned int ret;
801
802         /* check the root's tag bit */
803         if (!root_tag_get(root, tag))
804                 return 0;
805
806         node = rcu_dereference(root->rnode);
807         if (!node)
808                 return 0;
809
810         if (radix_tree_is_direct_ptr(node)) {
811                 if (first_index > 0)
812                         return 0;
813                 node = radix_tree_direct_to_ptr(node);
814                 results[0] = rcu_dereference(node);
815                 return 1;
816         }
817
818         max_index = radix_tree_maxindex(node->height);
819
820         ret = 0;
821         while (ret < max_items) {
822                 unsigned int nr_found;
823                 unsigned long next_index;       /* Index of next search */
824
825                 if (cur_index > max_index)
826                         break;
827                 nr_found = __lookup_tag(node, results + ret, cur_index,
828                                         max_items - ret, &next_index, tag);
829                 ret += nr_found;
830                 if (next_index == 0)
831                         break;
832                 cur_index = next_index;
833         }
834
835         return ret;
836 }
837 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
838
839 /**
840  *      radix_tree_shrink    -    shrink height of a radix tree to minimal
841  *      @root           radix tree root
842  */
843 static inline void radix_tree_shrink(struct radix_tree_root *root)
844 {
845         /* try to shrink tree height */
846         while (root->height > 0 &&
847                         root->rnode->count == 1 &&
848                         root->rnode->slots[0]) {
849                 struct radix_tree_node *to_free = root->rnode;
850                 void *newptr;
851
852                 /*
853                  * We don't need rcu_assign_pointer(), since we are simply
854                  * moving the node from one part of the tree to another. If
855                  * it was safe to dereference the old pointer to it
856                  * (to_free->slots[0]), it will be safe to dereference the new
857                  * one (root->rnode).
858                  */
859                 newptr = to_free->slots[0];
860                 if (root->height == 1)
861                         newptr = radix_tree_ptr_to_direct(newptr);
862                 root->rnode = newptr;
863                 root->height--;
864                 /* must only free zeroed nodes into the slab */
865                 tag_clear(to_free, 0, 0);
866                 tag_clear(to_free, 1, 0);
867                 to_free->slots[0] = NULL;
868                 to_free->count = 0;
869                 radix_tree_node_free(to_free);
870         }
871 }
872
873 /**
874  *      radix_tree_delete    -    delete an item from a radix tree
875  *      @root:          radix tree root
876  *      @index:         index key
877  *
878  *      Remove the item at @index from the radix tree rooted at @root.
879  *
880  *      Returns the address of the deleted item, or NULL if it was not present.
881  */
882 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
883 {
884         struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
885         struct radix_tree_node *slot = NULL;
886         struct radix_tree_node *to_free;
887         unsigned int height, shift;
888         int tag;
889         int offset;
890
891         height = root->height;
892         if (index > radix_tree_maxindex(height))
893                 goto out;
894
895         slot = root->rnode;
896         if (height == 0 && root->rnode) {
897                 slot = radix_tree_direct_to_ptr(slot);
898                 root_tag_clear_all(root);
899                 root->rnode = NULL;
900                 goto out;
901         }
902
903         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
904         pathp->node = NULL;
905
906         do {
907                 if (slot == NULL)
908                         goto out;
909
910                 pathp++;
911                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
912                 pathp->offset = offset;
913                 pathp->node = slot;
914                 slot = slot->slots[offset];
915                 shift -= RADIX_TREE_MAP_SHIFT;
916                 height--;
917         } while (height > 0);
918
919         if (slot == NULL)
920                 goto out;
921
922         /*
923          * Clear all tags associated with the just-deleted item
924          */
925         for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
926                 if (tag_get(pathp->node, tag, pathp->offset))
927                         radix_tree_tag_clear(root, index, tag);
928         }
929
930         to_free = NULL;
931         /* Now free the nodes we do not need anymore */
932         while (pathp->node) {
933                 pathp->node->slots[pathp->offset] = NULL;
934                 pathp->node->count--;
935                 /*
936                  * Queue the node for deferred freeing after the
937                  * last reference to it disappears (set NULL, above).
938                  */
939                 if (to_free)
940                         radix_tree_node_free(to_free);
941
942                 if (pathp->node->count) {
943                         if (pathp->node == root->rnode)
944                                 radix_tree_shrink(root);
945                         goto out;
946                 }
947
948                 /* Node with zero slots in use so free it */
949                 to_free = pathp->node;
950                 pathp--;
951
952         }
953         root_tag_clear_all(root);
954         root->height = 0;
955         root->rnode = NULL;
956         if (to_free)
957                 radix_tree_node_free(to_free);
958
959 out:
960         return slot;
961 }
962 EXPORT_SYMBOL(radix_tree_delete);
963
964 /**
965  *      radix_tree_tagged - test whether any items in the tree are tagged
966  *      @root:          radix tree root
967  *      @tag:           tag to test
968  */
969 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
970 {
971         return root_tag_get(root, tag);
972 }
973 EXPORT_SYMBOL(radix_tree_tagged);
974
975 static void
976 radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
977 {
978         memset(node, 0, sizeof(struct radix_tree_node));
979 }
980
981 static __init unsigned long __maxindex(unsigned int height)
982 {
983         unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
984         unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
985
986         if (tmp >= RADIX_TREE_INDEX_BITS)
987                 index = ~0UL;
988         return index;
989 }
990
991 static __init void radix_tree_init_maxindex(void)
992 {
993         unsigned int i;
994
995         for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
996                 height_to_maxindex[i] = __maxindex(i);
997 }
998
999 static int radix_tree_callback(struct notifier_block *nfb,
1000                             unsigned long action,
1001                             void *hcpu)
1002 {
1003        int cpu = (long)hcpu;
1004        struct radix_tree_preload *rtp;
1005
1006        /* Free per-cpu pool of perloaded nodes */
1007        if (action == CPU_DEAD) {
1008                rtp = &per_cpu(radix_tree_preloads, cpu);
1009                while (rtp->nr) {
1010                        kmem_cache_free(radix_tree_node_cachep,
1011                                        rtp->nodes[rtp->nr-1]);
1012                        rtp->nodes[rtp->nr-1] = NULL;
1013                        rtp->nr--;
1014                }
1015        }
1016        return NOTIFY_OK;
1017 }
1018
1019 void __init radix_tree_init(void)
1020 {
1021         radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1022                         sizeof(struct radix_tree_node), 0,
1023                         SLAB_PANIC, radix_tree_node_ctor, NULL);
1024         radix_tree_init_maxindex();
1025         hotcpu_notifier(radix_tree_callback, 0);
1026 }