2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel does not always handle that gracefully.
59 could replace all the switch()es with a mempolicy_ops structure.
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
78 #include <linux/mempolicy.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
85 #define PDprintk(fmt...)
87 /* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
91 static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
127 return nodes_online(nodes);
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
135 unsigned long nlongs;
136 unsigned long endmask;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
156 if (get_user(t, nmask + k))
158 if (k == nlongs - 1) {
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
197 zl->zones[num++] = z;
203 zl->zones[num] = NULL;
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
210 struct mempolicy *policy;
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
236 policy->policy = mode;
240 /* Ensure all existing pages follow the policy. */
242 verify_pages(struct mm_struct *mm,
243 unsigned long addr, unsigned long end, unsigned long *nodes)
251 pgd = pgd_offset(mm, addr);
252 if (pgd_none(*pgd)) {
253 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
259 pud = pud_offset(pgd, addr);
260 if (pud_none(*pud)) {
261 addr = (addr + PUD_SIZE) & PUD_MASK;
264 pmd = pmd_offset(pud, addr);
265 if (pmd_none(*pmd)) {
266 addr = (addr + PMD_SIZE) & PMD_MASK;
270 pte = pte_offset_map(pmd, addr);
271 if (pte_present(*pte))
275 unsigned nid = page_to_nid(p);
276 if (!test_bit(nid, nodes))
284 /* Step 1: check the range */
285 static struct vm_area_struct *
286 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
287 unsigned long *nodes, unsigned long flags)
290 struct vm_area_struct *first, *vma, *prev;
292 first = find_vma(mm, start);
294 return ERR_PTR(-EFAULT);
296 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
297 if (!vma->vm_next && vma->vm_end < end)
298 return ERR_PTR(-EFAULT);
299 if (prev && prev->vm_end < vma->vm_start)
300 return ERR_PTR(-EFAULT);
301 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 err = verify_pages(vma->vm_mm,
303 vma->vm_start, vma->vm_end, nodes);
305 first = ERR_PTR(err);
314 /* Apply policy to a single VMA */
315 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
318 struct mempolicy *old = vma->vm_policy;
320 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
321 vma->vm_start, vma->vm_end, vma->vm_pgoff,
322 vma->vm_ops, vma->vm_file,
323 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
325 if (vma->vm_ops && vma->vm_ops->set_policy)
326 err = vma->vm_ops->set_policy(vma, new);
329 vma->vm_policy = new;
335 /* Step 2: apply policy to a range and do splits. */
336 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
337 unsigned long end, struct mempolicy *new)
339 struct vm_area_struct *next;
343 for (; vma && vma->vm_start < end; vma = next) {
345 if (vma->vm_start < start)
346 err = split_vma(vma->vm_mm, vma, start, 1);
347 if (!err && vma->vm_end > end)
348 err = split_vma(vma->vm_mm, vma, end, 0);
350 err = policy_vma(vma, new);
357 /* Change policy for a memory range */
358 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
360 unsigned long __user *nmask, unsigned long maxnode,
363 struct vm_area_struct *vma;
364 struct mm_struct *mm = current->mm;
365 struct mempolicy *new;
367 DECLARE_BITMAP(nodes, MAX_NUMNODES);
370 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
372 if (start & ~PAGE_MASK)
374 if (mode == MPOL_DEFAULT)
375 flags &= ~MPOL_MF_STRICT;
376 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
383 err = get_nodes(nodes, nmask, maxnode, mode);
387 new = mpol_new(mode, nodes);
391 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
394 down_write(&mm->mmap_sem);
395 vma = check_range(mm, start, end, nodes, flags);
398 err = mbind_range(vma, start, end, new);
399 up_write(&mm->mmap_sem);
404 /* Set the process memory policy */
405 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
406 unsigned long maxnode)
409 struct mempolicy *new;
410 DECLARE_BITMAP(nodes, MAX_NUMNODES);
414 err = get_nodes(nodes, nmask, maxnode, mode);
417 new = mpol_new(mode, nodes);
420 mpol_free(current->mempolicy);
421 current->mempolicy = new;
422 if (new && new->policy == MPOL_INTERLEAVE)
423 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
427 /* Fill a node bitmap for a policy */
428 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
432 bitmap_zero(nodes, MAX_NUMNODES);
435 for (i = 0; p->v.zonelist->zones[i]; i++)
436 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
440 case MPOL_INTERLEAVE:
441 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
444 /* or use current node instead of online map? */
445 if (p->v.preferred_node < 0)
446 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
448 __set_bit(p->v.preferred_node, nodes);
455 static int lookup_node(struct mm_struct *mm, unsigned long addr)
460 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
462 err = page_to_nid(p);
468 /* Copy a kernel node mask to user space */
469 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
470 void *nodes, unsigned nbytes)
472 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
475 if (copy > PAGE_SIZE)
477 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
481 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
484 /* Retrieve NUMA policy */
485 asmlinkage long sys_get_mempolicy(int __user *policy,
486 unsigned long __user *nmask,
487 unsigned long maxnode,
488 unsigned long addr, unsigned long flags)
491 struct mm_struct *mm = current->mm;
492 struct vm_area_struct *vma = NULL;
493 struct mempolicy *pol = current->mempolicy;
495 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
497 if (nmask != NULL && maxnode < MAX_NUMNODES)
499 if (flags & MPOL_F_ADDR) {
500 down_read(&mm->mmap_sem);
501 vma = find_vma_intersection(mm, addr, addr+1);
503 up_read(&mm->mmap_sem);
506 if (vma->vm_ops && vma->vm_ops->get_policy)
507 pol = vma->vm_ops->get_policy(vma, addr);
509 pol = vma->vm_policy;
514 pol = &default_policy;
516 if (flags & MPOL_F_NODE) {
517 if (flags & MPOL_F_ADDR) {
518 err = lookup_node(mm, addr);
522 } else if (pol == current->mempolicy &&
523 pol->policy == MPOL_INTERLEAVE) {
524 pval = current->il_next;
533 up_read(¤t->mm->mmap_sem);
537 if (policy && put_user(pval, policy))
542 DECLARE_BITMAP(nodes, MAX_NUMNODES);
543 get_zonemask(pol, nodes);
544 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
549 up_read(¤t->mm->mmap_sem);
555 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
556 compat_ulong_t __user *nmask,
557 compat_ulong_t maxnode,
558 compat_ulong_t addr, compat_ulong_t flags)
561 unsigned long __user *nm = NULL;
562 unsigned long nr_bits, alloc_size;
563 DECLARE_BITMAP(bm, MAX_NUMNODES);
565 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
566 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
569 nm = compat_alloc_user_space(alloc_size);
571 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
574 err = copy_from_user(bm, nm, alloc_size);
575 /* ensure entire bitmap is zeroed */
576 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
577 err |= compat_put_bitmap(nmask, bm, nr_bits);
583 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
584 compat_ulong_t maxnode)
587 unsigned long __user *nm = NULL;
588 unsigned long nr_bits, alloc_size;
589 DECLARE_BITMAP(bm, MAX_NUMNODES);
591 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
595 err = compat_get_bitmap(bm, nmask, nr_bits);
596 nm = compat_alloc_user_space(alloc_size);
597 err |= copy_to_user(nm, bm, alloc_size);
603 return sys_set_mempolicy(mode, nm, nr_bits+1);
606 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
607 compat_ulong_t mode, compat_ulong_t __user *nmask,
608 compat_ulong_t maxnode, compat_ulong_t flags)
611 unsigned long __user *nm = NULL;
612 unsigned long nr_bits, alloc_size;
613 DECLARE_BITMAP(bm, MAX_NUMNODES);
615 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
616 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
619 err = compat_get_bitmap(bm, nmask, nr_bits);
620 nm = compat_alloc_user_space(alloc_size);
621 err |= copy_to_user(nm, bm, alloc_size);
627 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
632 /* Return effective policy for a VMA */
633 static struct mempolicy *
634 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
636 struct mempolicy *pol = current->mempolicy;
639 if (vma->vm_ops && vma->vm_ops->get_policy)
640 pol = vma->vm_ops->get_policy(vma, addr);
641 else if (vma->vm_policy &&
642 vma->vm_policy->policy != MPOL_DEFAULT)
643 pol = vma->vm_policy;
646 pol = &default_policy;
650 /* Return a zonelist representing a mempolicy */
651 static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
655 switch (policy->policy) {
657 nd = policy->v.preferred_node;
662 /* Lower zones don't get a policy applied */
663 /* Careful: current->mems_allowed might have moved */
664 if (gfp >= policy_zone)
665 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
666 return policy->v.zonelist;
668 case MPOL_INTERLEAVE: /* should not happen */
676 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
679 /* Do dynamic interleaving for a process */
680 static unsigned interleave_nodes(struct mempolicy *policy)
683 struct task_struct *me = current;
686 BUG_ON(nid >= MAX_NUMNODES);
687 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
688 if (next >= MAX_NUMNODES)
689 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
694 /* Do static interleaving for a VMA with known offset. */
695 static unsigned offset_il_node(struct mempolicy *pol,
696 struct vm_area_struct *vma, unsigned long off)
698 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
699 unsigned target = (unsigned)off % nnodes;
705 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
707 } while (c <= target);
708 BUG_ON(nid >= MAX_NUMNODES);
709 BUG_ON(!test_bit(nid, pol->v.nodes));
713 /* Allocate a page in interleaved policy.
714 Own path because it needs to do special accounting. */
715 static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
720 BUG_ON(!node_online(nid));
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
731 * alloc_page_vma - Allocate a page for a VMA.
734 * %GFP_USER user allocation.
735 * %GFP_KERNEL kernel allocations,
736 * %GFP_HIGHMEM highmem/user allocations,
737 * %GFP_FS allocation should not call back into a file system.
738 * %GFP_ATOMIC don't sleep.
740 * @vma: Pointer to VMA or NULL if not available.
741 * @addr: Virtual Address of the allocation. Must be inside the VMA.
743 * This function allocates a page from the kernel page pool and applies
744 * a NUMA policy associated with the VMA or the current process.
745 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
746 * mm_struct of the VMA to prevent it from going away. Should be used for
747 * all allocations for pages that will be mapped into
748 * user space. Returns NULL when no page can be allocated.
750 * Should be called with the mmap_sem of the vma's mm held.
753 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
755 struct mempolicy *pol = get_vma_policy(vma, addr);
757 cpuset_update_current_mems_allowed();
759 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
763 BUG_ON(addr >= vma->vm_end);
764 BUG_ON(addr < vma->vm_start);
766 off += (addr - vma->vm_start) >> PAGE_SHIFT;
767 nid = offset_il_node(pol, vma, off);
769 /* fall back to process interleaving */
770 nid = interleave_nodes(pol);
772 return alloc_page_interleave(gfp, 0, nid);
774 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
778 * alloc_pages_current - Allocate pages.
781 * %GFP_USER user allocation,
782 * %GFP_KERNEL kernel allocation,
783 * %GFP_HIGHMEM highmem allocation,
784 * %GFP_FS don't call back into a file system.
785 * %GFP_ATOMIC don't sleep.
786 * @order: Power of two of allocation size in pages. 0 is a single page.
788 * Allocate a page from the kernel page pool. When not in
789 * interrupt context, apply the current process NUMA policy.
790 * Returns NULL when no page can be allocated.
792 * Don't call cpuset_update_current_mems_allowed() unless
793 * 1) it's ok to take cpuset_sem (can WAIT), and
794 * 2) allocating for current task (not interrupt).
796 struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
798 struct mempolicy *pol = current->mempolicy;
800 if ((gfp & __GFP_WAIT) && !in_interrupt())
801 cpuset_update_current_mems_allowed();
802 if (!pol || in_interrupt())
803 pol = &default_policy;
804 if (pol->policy == MPOL_INTERLEAVE)
805 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
806 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
808 EXPORT_SYMBOL(alloc_pages_current);
810 /* Slow path of a mempolicy copy */
811 struct mempolicy *__mpol_copy(struct mempolicy *old)
813 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
816 return ERR_PTR(-ENOMEM);
818 atomic_set(&new->refcnt, 1);
819 if (new->policy == MPOL_BIND) {
820 int sz = ksize(old->v.zonelist);
821 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
822 if (!new->v.zonelist) {
823 kmem_cache_free(policy_cache, new);
824 return ERR_PTR(-ENOMEM);
826 memcpy(new->v.zonelist, old->v.zonelist, sz);
831 /* Slow path of a mempolicy comparison */
832 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
836 if (a->policy != b->policy)
841 case MPOL_INTERLEAVE:
842 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
844 return a->v.preferred_node == b->v.preferred_node;
847 for (i = 0; a->v.zonelist->zones[i]; i++)
848 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
850 return b->v.zonelist->zones[i] == NULL;
858 /* Slow path of a mpol destructor. */
859 void __mpol_free(struct mempolicy *p)
861 if (!atomic_dec_and_test(&p->refcnt))
863 if (p->policy == MPOL_BIND)
864 kfree(p->v.zonelist);
865 p->policy = MPOL_DEFAULT;
866 kmem_cache_free(policy_cache, p);
870 * Hugetlb policy. Same as above, just works with node numbers instead of
874 /* Find first node suitable for an allocation */
875 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
877 struct mempolicy *pol = get_vma_policy(vma, addr);
879 switch (pol->policy) {
881 return numa_node_id();
883 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
884 case MPOL_INTERLEAVE:
885 return interleave_nodes(pol);
887 return pol->v.preferred_node >= 0 ?
888 pol->v.preferred_node : numa_node_id();
894 /* Find secondary valid nodes for an allocation */
895 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
897 struct mempolicy *pol = get_vma_policy(vma, addr);
899 switch (pol->policy) {
902 case MPOL_INTERLEAVE:
906 for (z = pol->v.zonelist->zones; *z; z++)
907 if ((*z)->zone_pgdat->node_id == nid)
918 * Shared memory backing store policy support.
920 * Remember policies even when nobody has shared memory mapped.
921 * The policies are kept in a Red-Black tree linked from the inode.
922 * They are protected by the sp->lock spinlock, which should be held
923 * for any accesses to the tree.
926 /* lookup first element intersecting start-end */
927 /* Caller holds sp->lock */
928 static struct sp_node *
929 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
931 struct rb_node *n = sp->root.rb_node;
934 struct sp_node *p = rb_entry(n, struct sp_node, nd);
938 else if (end <= p->start)
946 struct sp_node *w = NULL;
947 struct rb_node *prev = rb_prev(n);
950 w = rb_entry(prev, struct sp_node, nd);
955 return rb_entry(n, struct sp_node, nd);
958 /* Insert a new shared policy into the tree. */
959 /* Caller holds sp->lock */
960 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
962 struct rb_node **p = &sp->root.rb_node;
963 struct rb_node *parent = NULL;
968 nd = rb_entry(parent, struct sp_node, nd);
969 if (new->start < nd->start)
971 else if (new->end > nd->end)
976 rb_link_node(&new->nd, parent, p);
977 rb_insert_color(&new->nd, &sp->root);
978 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
979 new->policy ? new->policy->policy : 0);
982 /* Find shared policy intersecting idx */
984 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
986 struct mempolicy *pol = NULL;
989 if (!sp->root.rb_node)
991 spin_lock(&sp->lock);
992 sn = sp_lookup(sp, idx, idx+1);
994 mpol_get(sn->policy);
997 spin_unlock(&sp->lock);
1001 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1003 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1004 rb_erase(&n->nd, &sp->root);
1005 mpol_free(n->policy);
1006 kmem_cache_free(sn_cache, n);
1010 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1012 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1023 /* Replace a policy range. */
1024 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1025 unsigned long end, struct sp_node *new)
1027 struct sp_node *n, *new2 = NULL;
1030 spin_lock(&sp->lock);
1031 n = sp_lookup(sp, start, end);
1032 /* Take care of old policies in the same range. */
1033 while (n && n->start < end) {
1034 struct rb_node *next = rb_next(&n->nd);
1035 if (n->start >= start) {
1041 /* Old policy spanning whole new range. */
1044 spin_unlock(&sp->lock);
1045 new2 = sp_alloc(end, n->end, n->policy);
1051 sp_insert(sp, new2);
1059 n = rb_entry(next, struct sp_node, nd);
1063 spin_unlock(&sp->lock);
1065 mpol_free(new2->policy);
1066 kmem_cache_free(sn_cache, new2);
1071 int mpol_set_shared_policy(struct shared_policy *info,
1072 struct vm_area_struct *vma, struct mempolicy *npol)
1075 struct sp_node *new = NULL;
1076 unsigned long sz = vma_pages(vma);
1078 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1080 sz, npol? npol->policy : -1,
1081 npol ? npol->v.nodes[0] : -1);
1084 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1088 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1090 kmem_cache_free(sn_cache, new);
1094 /* Free a backing policy store on inode delete. */
1095 void mpol_free_shared_policy(struct shared_policy *p)
1098 struct rb_node *next;
1100 if (!p->root.rb_node)
1102 spin_lock(&p->lock);
1103 next = rb_first(&p->root);
1105 n = rb_entry(next, struct sp_node, nd);
1106 next = rb_next(&n->nd);
1107 mpol_free(n->policy);
1108 kmem_cache_free(sn_cache, n);
1110 spin_unlock(&p->lock);
1114 /* assumes fs == KERNEL_DS */
1115 void __init numa_policy_init(void)
1117 policy_cache = kmem_cache_create("numa_policy",
1118 sizeof(struct mempolicy),
1119 0, SLAB_PANIC, NULL, NULL);
1121 sn_cache = kmem_cache_create("shared_policy_node",
1122 sizeof(struct sp_node),
1123 0, SLAB_PANIC, NULL, NULL);
1125 /* Set interleaving policy for system init. This way not all
1126 the data structures allocated at system boot end up in node zero. */
1128 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1130 printk("numa_policy_init: interleaving failed\n");
1133 /* Reset policy of current process to default.
1134 * Assumes fs == KERNEL_DS */
1135 void numa_default_policy(void)
1137 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);