2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocation don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
78 #include <linux/mempolicy.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
/* Slab caches: one for struct mempolicy objects, one for shared-policy
   rb-tree nodes (struct sp_node). */
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
/* Debug tracing hook; compiled out (expands to nothing) in this build. */
85 #define PDprintk(fmt...)
87 /* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
/* System-wide fallback policy (local allocation). refcnt pinned at 1 so
   mpol_free() never releases it. */
91 struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
/* Snapshot the current online map, then require the request to be a
   subset of it. NOTE(review): fragment — the return statements between
   these checks are missing from this excerpt. */
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
/* Per-mode validation; other case labels are missing from this fragment. */
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
/* Non-default modes must name only online nodes. */
127 return nodes_online(nodes);
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
135 unsigned long nlongs;
136 unsigned long endmask;
139 bitmap_zero(nodes, MAX_NUMNODES);
/* A zero-length or absent mask is legal (means "no nodes"). */
140 if (maxnode == 0 || !nmask)
143 nlongs = BITS_TO_LONGS(maxnode);
/* endmask covers the valid bits of the final (possibly partial) word. */
144 if ((maxnode % BITS_PER_LONG) == 0)
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
156 if (get_user(t, nmask + k))
158 if (k == nlongs - 1) {
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
/* Mask off bits beyond maxnode in the last word. */
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
/* Worst case: every zone of every requested node, plus NULL terminator. */
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
/* Walk requested nodes; within each node take zones highest-first. */
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
/* Skip zones with no memory present. */
195 if (!z->present_pages)
197 zl->zones[num++] = z;
203 zl->zones[num] = NULL;
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
210 struct mempolicy *policy;
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
/* MPOL_DEFAULT is represented as a NULL policy pointer elsewhere;
   presumably returns NULL here — line missing from this fragment. */
213 if (mode == MPOL_DEFAULT)
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
/* Mode-specific payload stored in the union 'v'. */
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
/* Empty mask => -1, meaning "local node" for MPOL_PREFERRED. */
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
236 policy->policy = mode;
240 /* Ensure all existing pages follow the policy. */
241 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242 unsigned long addr, unsigned long end, unsigned long *nodes)
/* Walk PTEs under page_table_lock; flag any present page whose node is
   not in the allowed mask. NOTE(review): error-return lines missing. */
247 spin_lock(&mm->page_table_lock);
248 orig_pte = pte = pte_offset_map(pmd, addr);
253 if (!pte_present(*pte))
258 nid = pfn_to_nid(pfn);
259 if (!test_bit(nid, nodes))
261 } while (pte++, addr += PAGE_SIZE, addr != end);
263 spin_unlock(&mm->page_table_lock);
/* Walk the PMDs of one PUD, delegating each present PMD to check_pte_range. */
267 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes)
273 pmd = pmd_offset(pud, addr);
275 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd))
278 if (check_pte_range(mm, pmd, addr, next, nodes))
280 } while (pmd++, addr = next, addr != end);
/* Walk the PUDs of one PGD, delegating each present PUD to check_pmd_range. */
284 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
285 unsigned long addr, unsigned long end, unsigned long *nodes)
290 pud = pud_offset(pgd, addr);
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
295 if (check_pmd_range(mm, pud, addr, next, nodes))
297 } while (pud++, addr = next, addr != end);
/* Top of the 4-level page-table walk: iterate PGD entries for [addr,end). */
301 static inline int check_pgd_range(struct mm_struct *mm,
302 unsigned long addr, unsigned long end, unsigned long *nodes)
307 pgd = pgd_offset(mm, addr);
309 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd))
312 if (check_pud_range(mm, pgd, addr, next, nodes))
314 } while (pgd++, addr = next, addr != end);
318 /* Step 1: check the range */
/* Verify [start,end) is covered by a contiguous run of VMAs; with
   MPOL_MF_STRICT also verify existing pages obey the node mask.
   Returns the first VMA or an ERR_PTR. */
319 static struct vm_area_struct *
320 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 unsigned long *nodes, unsigned long flags)
324 struct vm_area_struct *first, *vma, *prev;
326 first = find_vma(mm, start);
328 return ERR_PTR(-EFAULT);
330 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
/* Any hole in the range is an error. */
331 if (!vma->vm_next && vma->vm_end < end)
332 return ERR_PTR(-EFAULT);
333 if (prev && prev->vm_end < vma->vm_start)
334 return ERR_PTR(-EFAULT);
/* hugetlb VMAs are skipped by the strict page check. */
335 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
336 unsigned long endvma = vma->vm_end;
339 if (vma->vm_start > start)
340 start = vma->vm_start;
341 err = check_pgd_range(vma->vm_mm,
342 start, endvma, nodes);
344 first = ERR_PTR(err);
353 /* Apply policy to a single VMA */
354 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
357 struct mempolicy *old = vma->vm_policy;
359 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
360 vma->vm_start, vma->vm_end, vma->vm_pgoff,
361 vma->vm_ops, vma->vm_file,
362 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
/* Mappings with a set_policy op (e.g. shared memory) store the policy
   themselves; otherwise it is attached to the VMA directly. */
364 if (vma->vm_ops && vma->vm_ops->set_policy)
365 err = vma->vm_ops->set_policy(vma, new);
368 vma->vm_policy = new;
374 /* Step 2: apply policy to a range and do splits. */
375 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
376 unsigned long end, struct mempolicy *new)
378 struct vm_area_struct *next;
382 for (; vma && vma->vm_start < end; vma = next) {
/* Split off the parts of the VMA outside [start,end) before applying. */
384 if (vma->vm_start < start)
385 err = split_vma(vma->vm_mm, vma, start, 1);
386 if (!err && vma->vm_end > end)
387 err = split_vma(vma->vm_mm, vma, end, 0);
389 err = policy_vma(vma, new);
396 /* Change policy for a memory range */
/* mbind(2) entry point: validate arguments, build the policy, then
   check and rebind the VMA range under mmap_sem held for write. */
397 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
399 unsigned long __user *nmask, unsigned long maxnode,
402 struct vm_area_struct *vma;
403 struct mm_struct *mm = current->mm;
404 struct mempolicy *new;
406 DECLARE_BITMAP(nodes, MAX_NUMNODES);
409 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
411 if (start & ~PAGE_MASK)
/* STRICT is meaningless for the default policy. */
413 if (mode == MPOL_DEFAULT)
414 flags &= ~MPOL_MF_STRICT;
/* Round length up to whole pages. */
415 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
422 err = get_nodes(nodes, nmask, maxnode, mode);
426 new = mpol_new(mode, nodes);
430 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
433 down_write(&mm->mmap_sem);
434 vma = check_range(mm, start, end, nodes, flags);
437 err = mbind_range(vma, start, end, new);
438 up_write(&mm->mmap_sem);
443 /* Set the process memory policy */
444 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
445 unsigned long maxnode)
448 struct mempolicy *new;
449 DECLARE_BITMAP(nodes, MAX_NUMNODES);
451 if (mode < 0 || mode > MPOL_MAX)
453 err = get_nodes(nodes, nmask, maxnode, mode);
456 new = mpol_new(mode, nodes);
/* Swap in the new policy and drop the reference to the old one. */
459 mpol_free(current->mempolicy);
460 current->mempolicy = new;
/* Restart the interleave cursor at the first allowed node. */
461 if (new && new->policy == MPOL_INTERLEAVE)
462 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
466 /* Fill a zone bitmap for a policy */
467 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
471 bitmap_zero(nodes, MAX_NUMNODES);
/* MPOL_BIND: derive the node set from the custom zonelist. */
474 for (i = 0; p->v.zonelist->zones[i]; i++)
475 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
479 case MPOL_INTERLEAVE:
480 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
483 /* or use current node instead of online map? */
/* preferred_node < 0 means "local", reported as all online nodes. */
484 if (p->v.preferred_node < 0)
485 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
487 __set_bit(p->v.preferred_node, nodes);
/* Return the node id of the page mapped at addr (faulting it in read-only
   via get_user_pages), or a negative error. */
494 static int lookup_node(struct mm_struct *mm, unsigned long addr)
499 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
501 err = page_to_nid(p);
507 /* Copy a kernel node mask to user space */
508 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
509 void *nodes, unsigned nbytes)
/* User buffer size in bytes, rounded to a 64-bit boundary. */
511 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
514 if (copy > PAGE_SIZE)
/* Zero the tail of the user buffer beyond the kernel mask. */
516 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
520 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
523 /* Retrieve NUMA policy */
/* get_mempolicy(2): report either the process policy or (MPOL_F_ADDR)
   the policy in effect at addr; with MPOL_F_NODE return a node number
   instead of a mode. FIX: 'up_read(&curren;t->...)' was mojibake for
   'up_read(&current->...)' — the '&curren;' HTML entity had swallowed
   the '&current' token; restored on both unlock paths. */
524 asmlinkage long sys_get_mempolicy(int __user *policy,
525 unsigned long __user *nmask,
526 unsigned long maxnode,
527 unsigned long addr, unsigned long flags)
530 struct mm_struct *mm = current->mm;
531 struct vm_area_struct *vma = NULL;
532 struct mempolicy *pol = current->mempolicy;
534 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
536 if (nmask != NULL && maxnode < MAX_NUMNODES)
538 if (flags & MPOL_F_ADDR) {
/* Hold mmap_sem across the VMA lookup and policy read. */
539 down_read(&mm->mmap_sem);
540 vma = find_vma_intersection(mm, addr, addr+1);
542 up_read(&mm->mmap_sem);
545 if (vma->vm_ops && vma->vm_ops->get_policy)
546 pol = vma->vm_ops->get_policy(vma, addr);
548 pol = vma->vm_policy;
/* No policy set: fall back to the system default. */
553 pol = &default_policy;
555 if (flags & MPOL_F_NODE) {
556 if (flags & MPOL_F_ADDR) {
557 err = lookup_node(mm, addr);
561 } else if (pol == current->mempolicy &&
562 pol->policy == MPOL_INTERLEAVE) {
/* Report the next interleave target for the process policy. */
563 pval = current->il_next;
572 up_read(&current->mm->mmap_sem);
576 if (policy && put_user(pval, policy))
581 DECLARE_BITMAP(nodes, MAX_NUMNODES);
582 get_zonemask(pol, nodes);
583 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
588 up_read(&current->mm->mmap_sem);
/* 32-bit compat wrapper: bounce the node mask through a native-width
   buffer in compat_alloc_user_space, then repack it as compat_ulongs. */
594 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
595 compat_ulong_t __user *nmask,
596 compat_ulong_t maxnode,
597 compat_ulong_t addr, compat_ulong_t flags)
600 unsigned long __user *nm = NULL;
601 unsigned long nr_bits, alloc_size;
602 DECLARE_BITMAP(bm, MAX_NUMNODES);
604 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
605 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
608 nm = compat_alloc_user_space(alloc_size);
610 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
613 err = copy_from_user(bm, nm, alloc_size);
614 /* ensure entire bitmap is zeroed */
615 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
616 err |= compat_put_bitmap(nmask, bm, nr_bits);
/* 32-bit compat wrapper: widen the compat node mask to native longs in
   user space, then call the native syscall. */
622 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
623 compat_ulong_t maxnode)
626 unsigned long __user *nm = NULL;
627 unsigned long nr_bits, alloc_size;
628 DECLARE_BITMAP(bm, MAX_NUMNODES);
630 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
631 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
634 err = compat_get_bitmap(bm, nmask, nr_bits);
635 nm = compat_alloc_user_space(alloc_size);
636 err |= copy_to_user(nm, bm, alloc_size);
642 return sys_set_mempolicy(mode, nm, nr_bits+1);
/* 32-bit compat wrapper for mbind(2); same mask-widening dance as
   compat_sys_set_mempolicy. */
645 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
646 compat_ulong_t mode, compat_ulong_t __user *nmask,
647 compat_ulong_t maxnode, compat_ulong_t flags)
650 unsigned long __user *nm = NULL;
651 unsigned long nr_bits, alloc_size;
652 DECLARE_BITMAP(bm, MAX_NUMNODES);
654 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
655 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
658 err = compat_get_bitmap(bm, nmask, nr_bits);
659 nm = compat_alloc_user_space(alloc_size);
660 err |= copy_to_user(nm, bm, alloc_size);
666 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
671 /* Return effective policy for a VMA */
/* Precedence: VMA get_policy op > non-default vma->vm_policy >
   task policy > system default. */
673 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
675 struct mempolicy *pol = task->mempolicy;
678 if (vma->vm_ops && vma->vm_ops->get_policy)
679 pol = vma->vm_ops->get_policy(vma, addr);
680 else if (vma->vm_policy &&
681 vma->vm_policy->policy != MPOL_DEFAULT)
682 pol = vma->vm_policy;
685 pol = &default_policy;
689 /* Return a zonelist representing a mempolicy */
690 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
694 switch (policy->policy) {
696 nd = policy->v.preferred_node;
701 /* Lower zones don't get a policy applied */
702 /* Careful: current->mems_allowed might have moved */
703 if (gfp_zone(gfp) >= policy_zone)
704 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
705 return policy->v.zonelist;
/* Interleave is handled by its own allocation path, not here. */
707 case MPOL_INTERLEAVE: /* should not happen */
/* Fall back to the standard per-node zonelist for nd. */
715 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
718 /* Do dynamic interleaving for a process */
/* Advance current->il_next round-robin through the policy's node mask,
   wrapping to the first set bit at the end. */
719 static unsigned interleave_nodes(struct mempolicy *policy)
722 struct task_struct *me = current;
725 BUG_ON(nid >= MAX_NUMNODES);
726 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
727 if (next >= MAX_NUMNODES)
728 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
733 /* Do static interleaving for a VMA with known offset. */
/* Map the page offset deterministically onto the (off % nnodes)-th set
   bit of the interleave mask, so repeated faults pick the same node. */
734 static unsigned offset_il_node(struct mempolicy *pol,
735 struct vm_area_struct *vma, unsigned long off)
737 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
738 unsigned target = (unsigned)off % nnodes;
744 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
746 } while (c <= target);
747 BUG_ON(nid >= MAX_NUMNODES);
748 BUG_ON(!test_bit(nid, pol->v.nodes));
752 /* Allocate a page in interleaved policy.
753 Own path because it needs to do special accounting. */
754 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
759 BUG_ON(!node_online(nid));
760 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
761 page = __alloc_pages(gfp, order, zl);
/* Count an interleave hit only when the page landed in the first
   (preferred) zone of the target node's zonelist. */
762 if (page && page_zone(page) == zl->zones[0]) {
763 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
770 * alloc_page_vma - Allocate a page for a VMA.
773 * %GFP_USER user allocation.
774 * %GFP_KERNEL kernel allocations,
775 * %GFP_HIGHMEM highmem/user allocations,
776 * %GFP_FS allocation should not call back into a file system.
777 * %GFP_ATOMIC don't sleep.
779 * @vma: Pointer to VMA or NULL if not available.
780 * @addr: Virtual Address of the allocation. Must be inside the VMA.
782 * This function allocates a page from the kernel page pool and applies
783 * a NUMA policy associated with the VMA or the current process.
784 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
785 * mm_struct of the VMA to prevent it from going away. Should be used for
786 * all allocations for pages that will be mapped into
787 * user space. Returns NULL when no page can be allocated.
789 * Should be called with the mm_sem of the vma hold.
792 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
794 struct mempolicy *pol = get_vma_policy(current, vma, addr);
796 cpuset_update_current_mems_allowed();
/* Interleave takes its own path: node chosen from the VMA offset when a
   VMA is given, else from the process round-robin cursor. */
798 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
802 BUG_ON(addr >= vma->vm_end);
803 BUG_ON(addr < vma->vm_start);
805 off += (addr - vma->vm_start) >> PAGE_SHIFT;
806 nid = offset_il_node(pol, vma, off);
808 /* fall back to process interleaving */
809 nid = interleave_nodes(pol);
811 return alloc_page_interleave(gfp, 0, nid);
813 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
817 * alloc_pages_current - Allocate pages.
820 * %GFP_USER user allocation,
821 * %GFP_KERNEL kernel allocation,
822 * %GFP_HIGHMEM highmem allocation,
823 * %GFP_FS don't call back into a file system.
824 * %GFP_ATOMIC don't sleep.
825 * @order: Power of two of allocation size in pages. 0 is a single page.
827 * Allocate a page from the kernel page pool. When not in
828 * interrupt context and apply the current process NUMA policy.
829 * Returns NULL when no page can be allocated.
831 * Don't call cpuset_update_current_mems_allowed() unless
832 * 1) it's ok to take cpuset_sem (can WAIT), and
833 * 2) allocating for current task (not interrupt).
835 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
837 struct mempolicy *pol = current->mempolicy;
839 if ((gfp & __GFP_WAIT) && !in_interrupt())
840 cpuset_update_current_mems_allowed();
/* Interrupts and policy-less tasks use the system default. */
841 if (!pol || in_interrupt())
842 pol = &default_policy;
843 if (pol->policy == MPOL_INTERLEAVE)
844 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
845 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
847 EXPORT_SYMBOL(alloc_pages_current);
849 /* Slow path of a mempolicy copy */
/* Duplicate a policy with a fresh refcount; MPOL_BIND also needs a deep
   copy of its private zonelist. */
850 struct mempolicy *__mpol_copy(struct mempolicy *old)
852 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
855 return ERR_PTR(-ENOMEM);
857 atomic_set(&new->refcnt, 1);
858 if (new->policy == MPOL_BIND) {
/* ksize() recovers the allocated size of the original zonelist. */
859 int sz = ksize(old->v.zonelist);
860 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
861 if (!new->v.zonelist) {
862 kmem_cache_free(policy_cache, new);
863 return ERR_PTR(-ENOMEM);
865 memcpy(new->v.zonelist, old->v.zonelist, sz);
870 /* Slow path of a mempolicy comparison */
871 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
875 if (a->policy != b->policy)
/* Same mode: compare the mode-specific payload. */
880 case MPOL_INTERLEAVE:
881 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
883 return a->v.preferred_node == b->v.preferred_node;
/* MPOL_BIND: element-wise zonelist comparison, both NULL-terminated. */
886 for (i = 0; a->v.zonelist->zones[i]; i++)
887 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
889 return b->v.zonelist->zones[i] == NULL;
897 /* Slow path of a mpol destructor. */
898 void __mpol_free(struct mempolicy *p)
/* Only the last reference actually frees. */
900 if (!atomic_dec_and_test(&p->refcnt))
902 if (p->policy == MPOL_BIND)
903 kfree(p->v.zonelist);
/* Poison the mode before returning the object to the cache. */
904 p->policy = MPOL_DEFAULT;
905 kmem_cache_free(policy_cache, p);
909 * Hugetlb policy. Same as above, just works with node numbers instead of
913 /* Find first node suitable for an allocation */
914 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
916 struct mempolicy *pol = get_vma_policy(current, vma, addr);
918 switch (pol->policy) {
/* Default: allocate locally. */
920 return numa_node_id();
/* Bind: first node of the custom zonelist. */
922 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
923 case MPOL_INTERLEAVE:
924 return interleave_nodes(pol);
/* Preferred: the chosen node, or local when -1. */
926 return pol->v.preferred_node >= 0 ?
927 pol->v.preferred_node : numa_node_id();
933 /* Find secondary valid nodes for an allocation */
934 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
936 struct mempolicy *pol = get_vma_policy(current, vma, addr);
938 switch (pol->policy) {
941 case MPOL_INTERLEAVE:
/* MPOL_BIND: nid is valid iff it appears in the custom zonelist. */
945 for (z = pol->v.zonelist->zones; *z; z++)
946 if ((*z)->zone_pgdat->node_id == nid)
957 * Shared memory backing store policy support.
959 * Remember policies even when nobody has shared memory mapped.
960 * The policies are kept in Red-Black tree linked from the inode.
961 * They are protected by the sp->lock spinlock, which should be held
962 * for any accesses to the tree.
965 /* lookup first element intersecting start-end */
966 /* Caller holds sp->lock */
967 static struct sp_node *
968 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
970 struct rb_node *n = sp->root.rb_node;
973 struct sp_node *p = rb_entry(n, struct sp_node, nd);
977 else if (end <= p->start)
/* Walk backwards to the leftmost node still intersecting [start,end). */
985 struct sp_node *w = NULL;
986 struct rb_node *prev = rb_prev(n);
989 w = rb_entry(prev, struct sp_node, nd);
994 return rb_entry(n, struct sp_node, nd);
997 /* Insert a new shared policy into the list. */
998 /* Caller holds sp->lock */
999 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1001 struct rb_node **p = &sp->root.rb_node;
1002 struct rb_node *parent = NULL;
/* Standard rb-tree descent keyed on the [start,end) interval. */
1007 nd = rb_entry(parent, struct sp_node, nd);
1008 if (new->start < nd->start)
1010 else if (new->end > nd->end)
1011 p = &(*p)->rb_right;
1015 rb_link_node(&new->nd, parent, p);
1016 rb_insert_color(&new->nd, &sp->root);
1017 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1018 new->policy ? new->policy->policy : 0);
1021 /* Find shared policy intersecting idx */
1023 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1025 struct mempolicy *pol = NULL;
/* Fast path: empty tree, no lock needed. */
1028 if (!sp->root.rb_node)
1030 spin_lock(&sp->lock);
1031 sn = sp_lookup(sp, idx, idx+1);
/* Take a reference while still under the lock. */
1033 mpol_get(sn->policy);
1036 spin_unlock(&sp->lock);
/* Unlink one node from the shared-policy tree, dropping its policy
   reference and freeing the node. Caller holds sp->lock. */
1040 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1042 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1043 rb_erase(&n->nd, &sp->root);
1044 mpol_free(n->policy);
1045 kmem_cache_free(sn_cache, n);
/* Allocate and initialize a shared-policy tree node for [start,end).
   NOTE(review): body lines missing from this fragment. */
1049 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1051 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1062 /* Replace a policy range. */
1063 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1064 unsigned long end, struct sp_node *new)
1066 struct sp_node *n, *new2 = NULL;
1069 spin_lock(&sp->lock);
1070 n = sp_lookup(sp, start, end);
1071 /* Take care of old policies in the same range. */
1072 while (n && n->start < end) {
1073 struct rb_node *next = rb_next(&n->nd);
1074 if (n->start >= start) {
1080 /* Old policy spanning whole new range. */
/* Must drop the lock to allocate; new2 covers the tail of the old
   range that survives past 'end'. */
1083 spin_unlock(&sp->lock);
1084 new2 = sp_alloc(end, n->end, n->policy);
1090 sp_insert(sp, new2);
1098 n = rb_entry(next, struct sp_node, nd);
1102 spin_unlock(&sp->lock);
/* Dispose of a tail node that ended up unused. */
1104 mpol_free(new2->policy);
1105 kmem_cache_free(sn_cache, new2);
/* Install npol for the range of pages covered by vma in the shared
   policy tree 'info'; npol == NULL presumably clears the range. */
1110 int mpol_set_shared_policy(struct shared_policy *info,
1111 struct vm_area_struct *vma, struct mempolicy *npol)
1114 struct sp_node *new = NULL;
1115 unsigned long sz = vma_pages(vma);
1117 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1119 sz, npol? npol->policy : -1,
1120 npol ? npol->v.nodes[0] : -1);
1123 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1127 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
/* On failure the unused node is released here. */
1129 kmem_cache_free(sn_cache, new);
1133 /* Free a backing policy store on inode delete. */
1134 void mpol_free_shared_policy(struct shared_policy *p)
1137 struct rb_node *next;
1139 if (!p->root.rb_node)
1141 spin_lock(&p->lock);
/* Tear down the whole tree left-to-right. */
1142 next = rb_first(&p->root);
1144 n = rb_entry(next, struct sp_node, nd);
1145 next = rb_next(&n->nd);
1146 rb_erase(&n->nd, &p->root);
1147 mpol_free(n->policy);
1148 kmem_cache_free(sn_cache, n);
1150 spin_unlock(&p->lock);
1153 /* assumes fs == KERNEL_DS */
/* Boot-time setup: create the two slab caches (SLAB_PANIC — boot fails
   on OOM here) and switch init to interleave so early allocations
   spread across nodes. */
1154 void __init numa_policy_init(void)
1156 policy_cache = kmem_cache_create("numa_policy",
1157 sizeof(struct mempolicy),
1158 0, SLAB_PANIC, NULL, NULL);
1160 sn_cache = kmem_cache_create("shared_policy_node",
1161 sizeof(struct sp_node),
1162 0, SLAB_PANIC, NULL, NULL);
1164 /* Set interleaving policy for system init. This way not all
1165 the data structures allocated at system boot end up in node zero. */
1167 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1169 printk("numa_policy_init: interleaving failed\n");
1172 /* Reset policy of current process to default.
1173 * Assumes fs == KERNEL_DS */
/* Used to drop the boot-time interleave policy once init is done. */
1174 void numa_default_policy(void)
1176 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);