/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy an process counter
 *                is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocation don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
78 #include <linux/mempolicy.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
85 #define PDprintk(fmt...)
87 /* Highest zone. An specific allocation for a zone below that is not
89 static int policy_zone;
91 static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
127 return nodes_online(nodes);
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
135 unsigned long nlongs;
136 unsigned long endmask;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
156 if (get_user(t, nmask + k))
158 if (k == nlongs - 1) {
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
197 zl->zones[num++] = z;
203 zl->zones[num] = NULL;
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
210 struct mempolicy *policy;
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
236 policy->policy = mode;
240 /* Ensure all existing pages follow the policy. */
241 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242 unsigned long addr, unsigned long end, unsigned long *nodes)
247 spin_lock(&mm->page_table_lock);
248 orig_pte = pte = pte_offset_map(pmd, addr);
253 if (!pte_present(*pte))
258 nid = pfn_to_nid(pfn);
259 if (!test_bit(nid, nodes))
261 } while (pte++, addr += PAGE_SIZE, addr != end);
263 spin_unlock(&mm->page_table_lock);
267 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes)
273 pmd = pmd_offset(pud, addr);
275 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd))
278 if (check_pte_range(mm, pmd, addr, next, nodes))
280 } while (pmd++, addr = next, addr != end);
284 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
285 unsigned long addr, unsigned long end, unsigned long *nodes)
290 pud = pud_offset(pgd, addr);
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
295 if (check_pmd_range(mm, pud, addr, next, nodes))
297 } while (pud++, addr = next, addr != end);
301 static inline int check_pgd_range(struct mm_struct *mm,
302 unsigned long addr, unsigned long end, unsigned long *nodes)
307 pgd = pgd_offset(mm, addr);
309 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd))
312 if (check_pud_range(mm, pgd, addr, next, nodes))
314 } while (pgd++, addr = next, addr != end);
318 /* Step 1: check the range */
319 static struct vm_area_struct *
320 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 unsigned long *nodes, unsigned long flags)
324 struct vm_area_struct *first, *vma, *prev;
326 first = find_vma(mm, start);
328 return ERR_PTR(-EFAULT);
330 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 if (!vma->vm_next && vma->vm_end < end)
332 return ERR_PTR(-EFAULT);
333 if (prev && prev->vm_end < vma->vm_start)
334 return ERR_PTR(-EFAULT);
335 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
336 err = check_pgd_range(vma->vm_mm,
337 vma->vm_start, vma->vm_end, nodes);
339 first = ERR_PTR(err);
348 /* Apply policy to a single VMA */
349 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
352 struct mempolicy *old = vma->vm_policy;
354 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
355 vma->vm_start, vma->vm_end, vma->vm_pgoff,
356 vma->vm_ops, vma->vm_file,
357 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
359 if (vma->vm_ops && vma->vm_ops->set_policy)
360 err = vma->vm_ops->set_policy(vma, new);
363 vma->vm_policy = new;
369 /* Step 2: apply policy to a range and do splits. */
370 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
371 unsigned long end, struct mempolicy *new)
373 struct vm_area_struct *next;
377 for (; vma && vma->vm_start < end; vma = next) {
379 if (vma->vm_start < start)
380 err = split_vma(vma->vm_mm, vma, start, 1);
381 if (!err && vma->vm_end > end)
382 err = split_vma(vma->vm_mm, vma, end, 0);
384 err = policy_vma(vma, new);
391 /* Change policy for a memory range */
392 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
394 unsigned long __user *nmask, unsigned long maxnode,
397 struct vm_area_struct *vma;
398 struct mm_struct *mm = current->mm;
399 struct mempolicy *new;
401 DECLARE_BITMAP(nodes, MAX_NUMNODES);
404 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
406 if (start & ~PAGE_MASK)
408 if (mode == MPOL_DEFAULT)
409 flags &= ~MPOL_MF_STRICT;
410 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
417 err = get_nodes(nodes, nmask, maxnode, mode);
421 new = mpol_new(mode, nodes);
425 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
428 down_write(&mm->mmap_sem);
429 vma = check_range(mm, start, end, nodes, flags);
432 err = mbind_range(vma, start, end, new);
433 up_write(&mm->mmap_sem);
438 /* Set the process memory policy */
439 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
440 unsigned long maxnode)
443 struct mempolicy *new;
444 DECLARE_BITMAP(nodes, MAX_NUMNODES);
448 err = get_nodes(nodes, nmask, maxnode, mode);
451 new = mpol_new(mode, nodes);
454 mpol_free(current->mempolicy);
455 current->mempolicy = new;
456 if (new && new->policy == MPOL_INTERLEAVE)
457 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
461 /* Fill a zone bitmap for a policy */
462 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
466 bitmap_zero(nodes, MAX_NUMNODES);
469 for (i = 0; p->v.zonelist->zones[i]; i++)
470 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
474 case MPOL_INTERLEAVE:
475 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
478 /* or use current node instead of online map? */
479 if (p->v.preferred_node < 0)
480 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
482 __set_bit(p->v.preferred_node, nodes);
489 static int lookup_node(struct mm_struct *mm, unsigned long addr)
494 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
496 err = page_to_nid(p);
502 /* Copy a kernel node mask to user space */
503 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
504 void *nodes, unsigned nbytes)
506 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
509 if (copy > PAGE_SIZE)
511 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
515 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
518 /* Retrieve NUMA policy */
519 asmlinkage long sys_get_mempolicy(int __user *policy,
520 unsigned long __user *nmask,
521 unsigned long maxnode,
522 unsigned long addr, unsigned long flags)
525 struct mm_struct *mm = current->mm;
526 struct vm_area_struct *vma = NULL;
527 struct mempolicy *pol = current->mempolicy;
529 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
531 if (nmask != NULL && maxnode < MAX_NUMNODES)
533 if (flags & MPOL_F_ADDR) {
534 down_read(&mm->mmap_sem);
535 vma = find_vma_intersection(mm, addr, addr+1);
537 up_read(&mm->mmap_sem);
540 if (vma->vm_ops && vma->vm_ops->get_policy)
541 pol = vma->vm_ops->get_policy(vma, addr);
543 pol = vma->vm_policy;
548 pol = &default_policy;
550 if (flags & MPOL_F_NODE) {
551 if (flags & MPOL_F_ADDR) {
552 err = lookup_node(mm, addr);
556 } else if (pol == current->mempolicy &&
557 pol->policy == MPOL_INTERLEAVE) {
558 pval = current->il_next;
567 up_read(¤t->mm->mmap_sem);
571 if (policy && put_user(pval, policy))
576 DECLARE_BITMAP(nodes, MAX_NUMNODES);
577 get_zonemask(pol, nodes);
578 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
583 up_read(¤t->mm->mmap_sem);
589 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
590 compat_ulong_t __user *nmask,
591 compat_ulong_t maxnode,
592 compat_ulong_t addr, compat_ulong_t flags)
595 unsigned long __user *nm = NULL;
596 unsigned long nr_bits, alloc_size;
597 DECLARE_BITMAP(bm, MAX_NUMNODES);
599 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
600 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
603 nm = compat_alloc_user_space(alloc_size);
605 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
608 err = copy_from_user(bm, nm, alloc_size);
609 /* ensure entire bitmap is zeroed */
610 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
611 err |= compat_put_bitmap(nmask, bm, nr_bits);
617 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
618 compat_ulong_t maxnode)
621 unsigned long __user *nm = NULL;
622 unsigned long nr_bits, alloc_size;
623 DECLARE_BITMAP(bm, MAX_NUMNODES);
625 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
626 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
629 err = compat_get_bitmap(bm, nmask, nr_bits);
630 nm = compat_alloc_user_space(alloc_size);
631 err |= copy_to_user(nm, bm, alloc_size);
637 return sys_set_mempolicy(mode, nm, nr_bits+1);
640 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
641 compat_ulong_t mode, compat_ulong_t __user *nmask,
642 compat_ulong_t maxnode, compat_ulong_t flags)
645 unsigned long __user *nm = NULL;
646 unsigned long nr_bits, alloc_size;
647 DECLARE_BITMAP(bm, MAX_NUMNODES);
649 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
650 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
653 err = compat_get_bitmap(bm, nmask, nr_bits);
654 nm = compat_alloc_user_space(alloc_size);
655 err |= copy_to_user(nm, bm, alloc_size);
661 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
666 /* Return effective policy for a VMA */
667 static struct mempolicy *
668 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
670 struct mempolicy *pol = current->mempolicy;
673 if (vma->vm_ops && vma->vm_ops->get_policy)
674 pol = vma->vm_ops->get_policy(vma, addr);
675 else if (vma->vm_policy &&
676 vma->vm_policy->policy != MPOL_DEFAULT)
677 pol = vma->vm_policy;
680 pol = &default_policy;
684 /* Return a zonelist representing a mempolicy */
685 static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
689 switch (policy->policy) {
691 nd = policy->v.preferred_node;
696 /* Lower zones don't get a policy applied */
697 /* Careful: current->mems_allowed might have moved */
698 if ((gfp & GFP_ZONEMASK) >= policy_zone)
699 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
700 return policy->v.zonelist;
702 case MPOL_INTERLEAVE: /* should not happen */
710 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
713 /* Do dynamic interleaving for a process */
714 static unsigned interleave_nodes(struct mempolicy *policy)
717 struct task_struct *me = current;
720 BUG_ON(nid >= MAX_NUMNODES);
721 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
722 if (next >= MAX_NUMNODES)
723 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
728 /* Do static interleaving for a VMA with known offset. */
729 static unsigned offset_il_node(struct mempolicy *pol,
730 struct vm_area_struct *vma, unsigned long off)
732 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
733 unsigned target = (unsigned)off % nnodes;
739 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
741 } while (c <= target);
742 BUG_ON(nid >= MAX_NUMNODES);
743 BUG_ON(!test_bit(nid, pol->v.nodes));
747 /* Allocate a page in interleaved policy.
748 Own path because it needs to do special accounting. */
749 static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
754 BUG_ON(!node_online(nid));
755 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
756 page = __alloc_pages(gfp, order, zl);
757 if (page && page_zone(page) == zl->zones[0]) {
758 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
765 * alloc_page_vma - Allocate a page for a VMA.
768 * %GFP_USER user allocation.
769 * %GFP_KERNEL kernel allocations,
770 * %GFP_HIGHMEM highmem/user allocations,
771 * %GFP_FS allocation should not call back into a file system.
772 * %GFP_ATOMIC don't sleep.
774 * @vma: Pointer to VMA or NULL if not available.
775 * @addr: Virtual Address of the allocation. Must be inside the VMA.
777 * This function allocates a page from the kernel page pool and applies
778 * a NUMA policy associated with the VMA or the current process.
779 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
780 * mm_struct of the VMA to prevent it from going away. Should be used for
781 * all allocations for pages that will be mapped into
782 * user space. Returns NULL when no page can be allocated.
784 * Should be called with the mm_sem of the vma hold.
787 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
789 struct mempolicy *pol = get_vma_policy(vma, addr);
791 cpuset_update_current_mems_allowed();
793 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
797 BUG_ON(addr >= vma->vm_end);
798 BUG_ON(addr < vma->vm_start);
800 off += (addr - vma->vm_start) >> PAGE_SHIFT;
801 nid = offset_il_node(pol, vma, off);
803 /* fall back to process interleaving */
804 nid = interleave_nodes(pol);
806 return alloc_page_interleave(gfp, 0, nid);
808 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
812 * alloc_pages_current - Allocate pages.
815 * %GFP_USER user allocation,
816 * %GFP_KERNEL kernel allocation,
817 * %GFP_HIGHMEM highmem allocation,
818 * %GFP_FS don't call back into a file system.
819 * %GFP_ATOMIC don't sleep.
820 * @order: Power of two of allocation size in pages. 0 is a single page.
822 * Allocate a page from the kernel page pool. When not in
823 * interrupt context and apply the current process NUMA policy.
824 * Returns NULL when no page can be allocated.
826 * Don't call cpuset_update_current_mems_allowed() unless
827 * 1) it's ok to take cpuset_sem (can WAIT), and
828 * 2) allocating for current task (not interrupt).
830 struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
832 struct mempolicy *pol = current->mempolicy;
834 if ((gfp & __GFP_WAIT) && !in_interrupt())
835 cpuset_update_current_mems_allowed();
836 if (!pol || in_interrupt())
837 pol = &default_policy;
838 if (pol->policy == MPOL_INTERLEAVE)
839 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
840 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
842 EXPORT_SYMBOL(alloc_pages_current);
844 /* Slow path of a mempolicy copy */
845 struct mempolicy *__mpol_copy(struct mempolicy *old)
847 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
850 return ERR_PTR(-ENOMEM);
852 atomic_set(&new->refcnt, 1);
853 if (new->policy == MPOL_BIND) {
854 int sz = ksize(old->v.zonelist);
855 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
856 if (!new->v.zonelist) {
857 kmem_cache_free(policy_cache, new);
858 return ERR_PTR(-ENOMEM);
860 memcpy(new->v.zonelist, old->v.zonelist, sz);
865 /* Slow path of a mempolicy comparison */
866 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
870 if (a->policy != b->policy)
875 case MPOL_INTERLEAVE:
876 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
878 return a->v.preferred_node == b->v.preferred_node;
881 for (i = 0; a->v.zonelist->zones[i]; i++)
882 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
884 return b->v.zonelist->zones[i] == NULL;
892 /* Slow path of a mpol destructor. */
893 void __mpol_free(struct mempolicy *p)
895 if (!atomic_dec_and_test(&p->refcnt))
897 if (p->policy == MPOL_BIND)
898 kfree(p->v.zonelist);
899 p->policy = MPOL_DEFAULT;
900 kmem_cache_free(policy_cache, p);
904 * Hugetlb policy. Same as above, just works with node numbers instead of
908 /* Find first node suitable for an allocation */
909 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
911 struct mempolicy *pol = get_vma_policy(vma, addr);
913 switch (pol->policy) {
915 return numa_node_id();
917 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
918 case MPOL_INTERLEAVE:
919 return interleave_nodes(pol);
921 return pol->v.preferred_node >= 0 ?
922 pol->v.preferred_node : numa_node_id();
928 /* Find secondary valid nodes for an allocation */
929 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
931 struct mempolicy *pol = get_vma_policy(vma, addr);
933 switch (pol->policy) {
936 case MPOL_INTERLEAVE:
940 for (z = pol->v.zonelist->zones; *z; z++)
941 if ((*z)->zone_pgdat->node_id == nid)
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
960 /* lookup first element intersecting start-end */
961 /* Caller holds sp->lock */
962 static struct sp_node *
963 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
965 struct rb_node *n = sp->root.rb_node;
968 struct sp_node *p = rb_entry(n, struct sp_node, nd);
972 else if (end <= p->start)
980 struct sp_node *w = NULL;
981 struct rb_node *prev = rb_prev(n);
984 w = rb_entry(prev, struct sp_node, nd);
989 return rb_entry(n, struct sp_node, nd);
992 /* Insert a new shared policy into the list. */
993 /* Caller holds sp->lock */
994 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
996 struct rb_node **p = &sp->root.rb_node;
997 struct rb_node *parent = NULL;
1002 nd = rb_entry(parent, struct sp_node, nd);
1003 if (new->start < nd->start)
1005 else if (new->end > nd->end)
1006 p = &(*p)->rb_right;
1010 rb_link_node(&new->nd, parent, p);
1011 rb_insert_color(&new->nd, &sp->root);
1012 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1013 new->policy ? new->policy->policy : 0);
1016 /* Find shared policy intersecting idx */
1018 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1020 struct mempolicy *pol = NULL;
1023 if (!sp->root.rb_node)
1025 spin_lock(&sp->lock);
1026 sn = sp_lookup(sp, idx, idx+1);
1028 mpol_get(sn->policy);
1031 spin_unlock(&sp->lock);
1035 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1037 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1038 rb_erase(&n->nd, &sp->root);
1039 mpol_free(n->policy);
1040 kmem_cache_free(sn_cache, n);
1044 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1046 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1057 /* Replace a policy range. */
1058 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1059 unsigned long end, struct sp_node *new)
1061 struct sp_node *n, *new2 = NULL;
1064 spin_lock(&sp->lock);
1065 n = sp_lookup(sp, start, end);
1066 /* Take care of old policies in the same range. */
1067 while (n && n->start < end) {
1068 struct rb_node *next = rb_next(&n->nd);
1069 if (n->start >= start) {
1075 /* Old policy spanning whole new range. */
1078 spin_unlock(&sp->lock);
1079 new2 = sp_alloc(end, n->end, n->policy);
1085 sp_insert(sp, new2);
1093 n = rb_entry(next, struct sp_node, nd);
1097 spin_unlock(&sp->lock);
1099 mpol_free(new2->policy);
1100 kmem_cache_free(sn_cache, new2);
1105 int mpol_set_shared_policy(struct shared_policy *info,
1106 struct vm_area_struct *vma, struct mempolicy *npol)
1109 struct sp_node *new = NULL;
1110 unsigned long sz = vma_pages(vma);
1112 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1114 sz, npol? npol->policy : -1,
1115 npol ? npol->v.nodes[0] : -1);
1118 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1122 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1124 kmem_cache_free(sn_cache, new);
1128 /* Free a backing policy store on inode delete. */
1129 void mpol_free_shared_policy(struct shared_policy *p)
1132 struct rb_node *next;
1134 if (!p->root.rb_node)
1136 spin_lock(&p->lock);
1137 next = rb_first(&p->root);
1139 n = rb_entry(next, struct sp_node, nd);
1140 next = rb_next(&n->nd);
1141 mpol_free(n->policy);
1142 kmem_cache_free(sn_cache, n);
1144 spin_unlock(&p->lock);
1148 /* assumes fs == KERNEL_DS */
1149 void __init numa_policy_init(void)
1151 policy_cache = kmem_cache_create("numa_policy",
1152 sizeof(struct mempolicy),
1153 0, SLAB_PANIC, NULL, NULL);
1155 sn_cache = kmem_cache_create("shared_policy_node",
1156 sizeof(struct sp_node),
1157 0, SLAB_PANIC, NULL, NULL);
1159 /* Set interleaving policy for system init. This way not all
1160 the data structures allocated at system boot end up in node zero. */
1162 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1164 printk("numa_policy_init: interleaving failed\n");
1167 /* Reset policy of current process to default.
1168 * Assumes fs == KERNEL_DS */
1169 void numa_default_policy(void)
1171 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);