/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/*
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
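
/*
 * Illustrative only (not part of this file): a minimal userspace sketch of
 * how these policies are requested, assuming libnuma-style wrappers for the
 * set_mempolicy(2) and mbind(2) system calls are available:
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long mask = 0x3;		// nodes 0 and 1
 *	void *p;
 *
 *	// process policy: interleave new allocations across nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// VMA policy: restrict this mapping to nodes 0 and 1, strictly
 *	p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 *
 * The VMA policy set by mbind() then takes precedence over the process
 * policy for faults in that mapping.
 */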
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
	int empty = nodes_empty(*nodes);

	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
	unsigned long nlongs;
	unsigned long endmask;

	if (maxnode == 0 || !nmask)
	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
	endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
			if (k == nlongs - 1) {
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	/* AK: shouldn't this error out instead? */
	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
	return mpol_check_policy(mode, nodes);
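
/*
 * Worked example (illustrative, assuming 64-bit longs): for maxnode = 72,
 * BITS_TO_LONGS(72) gives nlongs = 2 and endmask = (1UL << 8) - 1 = 0xff,
 * so after the copy only bits 64-71 of the user's second word survive in
 * nodes_addr(*nodes)[1]; if the caller passed more words than
 * BITS_TO_LONGS(MAX_NUMNODES), the excess words merely have to be zero.
 */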
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	for_each_node_mask(nd, *nodes) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
	zl->zones[num] = NULL;
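
/*
 * For example (illustrative): with *nodes = {0,2} and only ZONE_DMA and
 * ZONE_NORMAL populated on each node, the resulting list is ordered
 * node0/NORMAL, node0/DMA, node2/NORMAL, node2/DMA, NULL - i.e. all zones
 * of one allowed node, highest zone first, before moving to the next node.
 */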
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
	policy->policy = mode;
/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
	spin_lock(&vma->vm_mm->page_table_lock);
	orig_pte = pte = pte_offset_map(pmd, addr);
		if (!pte_present(*pte))
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
	} while (pte++, addr += PAGE_SIZE, addr != end);
	spin_unlock(&vma->vm_mm->page_table_lock);

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
	pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
		if (check_pte_range(vma, pmd, addr, next, nodes))
	} while (pmd++, addr = next, addr != end);

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
	pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
		if (check_pmd_range(vma, pud, addr, next, nodes))
	} while (pud++, addr = next, addr != end);

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
	pgd = pgd_offset(vma->vm_mm, addr);
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
		if (check_pud_range(vma, pgd, addr, next, nodes))
	} while (pgd++, addr = next, addr != end);
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
				first = ERR_PTR(err);
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
		vma->vm_policy = new;

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
	struct vm_area_struct *next;

	for (; vma && vma->vm_start < end; vma = next) {
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
			err = policy_vma(vma, new);
/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long __user *nmask, unsigned long maxnode,
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
	if (start & ~PAGE_MASK)
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;

	err = get_nodes(&nodes, nmask, maxnode, mode);

	new = mpol_new(mode, &nodes);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(nodes)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, &nodes, flags);
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
	struct mempolicy *new;

	if (mode < 0 || mode > MPOL_MAX)
	err = get_nodes(&nodes, nmask, maxnode, mode);
	new = mpol_new(mode, &nodes);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
	case MPOL_INTERLEAVE:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);

static int lookup_node(struct mm_struct *mm, unsigned long addr)
	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
		err = page_to_nid(p);

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > PAGE_SIZE)
	if (clear_user((char __user *)mask + nbytes, copy - nbytes))
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
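
/*
 * Worked example (illustrative, assuming 64-bit longs): with MAX_NUMNODES = 64
 * and a caller passing maxnode = 256, nbytes is 8 while copy starts out as
 * ALIGN(255, 64) / 8 = 32, so bytes 8-31 of the user buffer are cleared
 * before the kernel nodemask itself is copied out.
 */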
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
	if (nmask != NULL && maxnode < MAX_NUMNODES)
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
			up_read(&mm->mmap_sem);
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
			pol = vma->vm_policy;
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;

		up_read(&current->mm->mmap_sem);

	if (policy && put_user(pval, policy))

	get_zonemask(pol, &nodes);
	err = copy_nodes_to_user(nmask, maxnode, &nodes);

	up_read(&current->mm->mmap_sem);
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);

	return sys_set_mempolicy(mode, nm, nr_bits+1);

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
				 compat_ulong_t mode, compat_ulong_t __user *nmask,
				 compat_ulong_t maxnode, compat_ulong_t flags)
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = task->mempolicy;

		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
		pol = &default_policy;

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
	switch (policy->policy) {
		nd = policy->v.preferred_node;
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
	case MPOL_INTERLEAVE: /* should not happen */
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
	struct task_struct *me = current;

	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;

		nid = next_node(nid, pol->v.nodes);
	} while (c <= target);
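
/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} and off = 7,
 * nnodes is 3 and target is 7 % 3 = 1, so the loop above steps nid to node 0
 * and then to node 2 before stopping - i.e. the offset selects the
 * (target+1)-th node of the mask, independent of any per-process state.
 */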
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
/**
 * alloc_page_vma	- Allocate a page for a VMA.
 *
 *	%GFP_USER	user allocation.
 *	%GFP_KERNEL	kernel allocations,
 *	%GFP_HIGHMEM	highmem/user allocations,
 *	%GFP_FS		allocation should not call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		off += (addr - vma->vm_start) >> PAGE_SHIFT;
		nid = offset_il_node(pol, vma, off);
		/* fall back to process interleaving */
		nid = interleave_nodes(pol);
		return alloc_page_interleave(gfp, 0, nid);
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
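
/*
 * Typical use (illustrative): a fault handler that already holds mmap_sem
 * for read would allocate the page to be mapped with something like
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *
 * so the VMA's policy (or the process policy) picks the node, rather than
 * calling __alloc_pages() directly.
 */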
/**
 * alloc_pages_current - Allocate pages.
 *
 *	%GFP_USER	user allocation,
 *	%GFP_KERNEL	kernel allocation,
 *	%GFP_HIGHMEM	highmem allocation,
 *	%GFP_FS		don't call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));

EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

		return ERR_PTR(-ENOMEM);
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		memcpy(new->v.zonelist, old->v.zonelist, sz);

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
	if (a->policy != b->policy)
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
		return a->v.preferred_node == b->v.preferred_node;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
		return b->v.zonelist->zones[i] == NULL;

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
	if (!atomic_dec_and_test(&p->refcnt))
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
		return numa_node_id();
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_INTERLEAVE:
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
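
/*
 * Illustrative only: a shared-memory filesystem wires this up through its
 * vm_operations_struct, roughly
 *
 *	->set_policy(vma, new)  -> mpol_set_shared_policy(&info->policy, vma, new)
 *	->get_policy(vma, addr) -> mpol_shared_policy_lookup(&info->policy,
 *				       vma->vm_pgoff +
 *				       ((addr - vma->vm_start) >> PAGE_SHIFT))
 *
 * where info->policy is a struct shared_policy kept in the per-inode data,
 * so policies are indexed by file offset and survive unmapping.
 */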
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
	struct rb_node *n = sp->root.rb_node;

		struct sp_node *p = rb_entry(n, struct sp_node, nd);
		else if (end <= p->start)

		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
			w = rb_entry(prev, struct sp_node, nd);
	return rb_entry(n, struct sp_node, nd);
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;

		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
		else if (new->end > nd->end)
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
	struct mempolicy *pol = NULL;

	if (!sp->root.rb_node)
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
		mpol_get(sn->policy);
	spin_unlock(&sp->lock);

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
	struct sp_node *n, *new2 = NULL;

	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* Old policy spanning whole new range. */
				spin_unlock(&sp->lock);
				new2 = sp_alloc(end, n->end, n->policy);
				sp_insert(sp, new2);
		n = rb_entry(next, struct sp_node, nd);
	spin_unlock(&sp->lock);
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);

int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
		kmem_cache_free(sn_cache, new);
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
	struct rb_node *next;

	if (!p->root.rb_node)
	spin_lock(&p->lock);
	next = rb_first(&p->root);
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	spin_unlock(&p->lock);

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
		printk("numa_policy_init: interleaving failed\n");

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);