2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
* NUMA policy allows the user to give hints about which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
* for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
* The process policy is applied for most non-interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
* on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
kernel is not always graceful about that.
59 could replace all the switch()es with a mempolicy_ops structure.
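/*
 * Illustrative only, not part of this file: a rough userspace sketch of the
 * interface implemented below, assuming libnuma's <numaif.h> wrappers for
 * the set_mempolicy() and mbind() syscalls are available.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	size_t len = 1 << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	Interleave this process' future allocations over nodes 0 and 1:
 *		set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *	Restrict the mapping to the same nodes, checking existing pages:
 *		mbind(buf, len, MPOL_BIND, &nodes, sizeof(nodes) * 8,
 *		      MPOL_MF_STRICT);
 */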
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
85 #define PDprintk(fmt...)
/* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
91 struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Do sanity checking on a policy */
97 static int mpol_check_policy(int mode, nodemask_t *nodes)
99 int empty = nodes_empty(*nodes);
107 case MPOL_INTERLEAVE:
108 /* Preferred will only use the first bit, but allow
114 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
117 /* Copy a node mask from user space. */
118 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
119 unsigned long maxnode, int mode)
122 unsigned long nlongs;
123 unsigned long endmask;
127 if (maxnode == 0 || !nmask)
130 nlongs = BITS_TO_LONGS(maxnode);
131 if ((maxnode % BITS_PER_LONG) == 0)
134 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/* If the user specified more nodes than supported, just check
   that the unsupported part is all zero. */
138 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
139 if (nlongs > PAGE_SIZE/sizeof(long))
141 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
143 if (get_user(t, nmask + k))
145 if (k == nlongs - 1) {
151 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
155 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
157 nodes_addr(*nodes)[nlongs-1] &= endmask;
158 /* Update current mems_allowed */
159 cpuset_update_current_mems_allowed();
160 /* Ignore nodes not set in current->mems_allowed */
161 /* AK: shouldn't this error out instead? */
162 cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
163 return mpol_check_policy(mode, nodes);
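/*
 * Worked example (illustrative): on a 64-bit kernel a caller passing
 * maxnode == 5 gives nlongs == 1 and endmask == 0x1f, so only bits 0-4
 * of the copied word survive the masking above.
 */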
166 /* Generate a custom zonelist for the BIND policy. */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
172 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
173 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
177 for_each_node_mask(nd, *nodes) {
179 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
180 struct zone *z = &NODE_DATA(nd)->node_zones[k];
181 if (!z->present_pages)
183 zl->zones[num++] = z;
188 zl->zones[num] = NULL;
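/*
 * Sizing example (illustrative, assuming MAX_NR_ZONES == 3): a bind mask
 * covering two nodes allocates max == 1 + 3 * 2 == 7 zone pointers; zones
 * without present pages are skipped and the list is NULL terminated above.
 */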
192 /* Create a new policy */
193 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
195 struct mempolicy *policy;
197 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
198 if (mode == MPOL_DEFAULT)
200 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
202 return ERR_PTR(-ENOMEM);
203 atomic_set(&policy->refcnt, 1);
205 case MPOL_INTERLEAVE:
206 policy->v.nodes = *nodes;
209 policy->v.preferred_node = first_node(*nodes);
210 if (policy->v.preferred_node >= MAX_NUMNODES)
211 policy->v.preferred_node = -1;
214 policy->v.zonelist = bind_zonelist(nodes);
215 if (policy->v.zonelist == NULL) {
216 kmem_cache_free(policy_cache, policy);
217 return ERR_PTR(-ENOMEM);
221 policy->policy = mode;
225 /* Ensure all existing pages follow the policy. */
226 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
227 unsigned long addr, unsigned long end, nodemask_t *nodes)
232 spin_lock(&mm->page_table_lock);
233 orig_pte = pte = pte_offset_map(pmd, addr);
238 if (!pte_present(*pte))
243 nid = pfn_to_nid(pfn);
244 if (!node_isset(nid, *nodes))
246 } while (pte++, addr += PAGE_SIZE, addr != end);
248 spin_unlock(&mm->page_table_lock);
252 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
253 unsigned long addr, unsigned long end, nodemask_t *nodes)
258 pmd = pmd_offset(pud, addr);
260 next = pmd_addr_end(addr, end);
261 if (pmd_none_or_clear_bad(pmd))
263 if (check_pte_range(mm, pmd, addr, next, nodes))
265 } while (pmd++, addr = next, addr != end);
269 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
270 unsigned long addr, unsigned long end, nodemask_t *nodes)
275 pud = pud_offset(pgd, addr);
277 next = pud_addr_end(addr, end);
278 if (pud_none_or_clear_bad(pud))
280 if (check_pmd_range(mm, pud, addr, next, nodes))
282 } while (pud++, addr = next, addr != end);
286 static inline int check_pgd_range(struct mm_struct *mm,
287 unsigned long addr, unsigned long end, nodemask_t *nodes)
292 pgd = pgd_offset(mm, addr);
294 next = pgd_addr_end(addr, end);
295 if (pgd_none_or_clear_bad(pgd))
297 if (check_pud_range(mm, pgd, addr, next, nodes))
299 } while (pgd++, addr = next, addr != end);
303 /* Step 1: check the range */
304 static struct vm_area_struct *
305 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
306 nodemask_t *nodes, unsigned long flags)
309 struct vm_area_struct *first, *vma, *prev;
311 first = find_vma(mm, start);
313 return ERR_PTR(-EFAULT);
315 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
316 if (!vma->vm_next && vma->vm_end < end)
317 return ERR_PTR(-EFAULT);
318 if (prev && prev->vm_end < vma->vm_start)
319 return ERR_PTR(-EFAULT);
320 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
321 unsigned long endvma = vma->vm_end;
324 if (vma->vm_start > start)
325 start = vma->vm_start;
326 err = check_pgd_range(vma->vm_mm,
327 start, endvma, nodes);
329 first = ERR_PTR(err);
338 /* Apply policy to a single VMA */
339 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
342 struct mempolicy *old = vma->vm_policy;
344 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
345 vma->vm_start, vma->vm_end, vma->vm_pgoff,
346 vma->vm_ops, vma->vm_file,
347 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
349 if (vma->vm_ops && vma->vm_ops->set_policy)
350 err = vma->vm_ops->set_policy(vma, new);
353 vma->vm_policy = new;
359 /* Step 2: apply policy to a range and do splits. */
360 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
361 unsigned long end, struct mempolicy *new)
363 struct vm_area_struct *next;
367 for (; vma && vma->vm_start < end; vma = next) {
369 if (vma->vm_start < start)
370 err = split_vma(vma->vm_mm, vma, start, 1);
371 if (!err && vma->vm_end > end)
372 err = split_vma(vma->vm_mm, vma, end, 0);
374 err = policy_vma(vma, new);
381 /* Change policy for a memory range */
382 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
384 unsigned long __user *nmask, unsigned long maxnode,
387 struct vm_area_struct *vma;
388 struct mm_struct *mm = current->mm;
389 struct mempolicy *new;
394 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
396 if (start & ~PAGE_MASK)
398 if (mode == MPOL_DEFAULT)
399 flags &= ~MPOL_MF_STRICT;
400 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
407 err = get_nodes(&nodes, nmask, maxnode, mode);
411 new = mpol_new(mode, &nodes);
415 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
416 mode,nodes_addr(nodes)[0]);
418 down_write(&mm->mmap_sem);
419 vma = check_range(mm, start, end, &nodes, flags);
422 err = mbind_range(vma, start, end, new);
423 up_write(&mm->mmap_sem);
428 /* Set the process memory policy */
429 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
430 unsigned long maxnode)
433 struct mempolicy *new;
436 if (mode < 0 || mode > MPOL_MAX)
438 err = get_nodes(&nodes, nmask, maxnode, mode);
441 new = mpol_new(mode, &nodes);
444 mpol_free(current->mempolicy);
445 current->mempolicy = new;
446 if (new && new->policy == MPOL_INTERLEAVE)
447 current->il_next = first_node(new->v.nodes);
451 /* Fill a zone bitmap for a policy */
452 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
459 for (i = 0; p->v.zonelist->zones[i]; i++)
460 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
464 case MPOL_INTERLEAVE:
468 /* or use current node instead of online map? */
469 if (p->v.preferred_node < 0)
470 *nodes = node_online_map;
472 node_set(p->v.preferred_node, *nodes);
479 static int lookup_node(struct mm_struct *mm, unsigned long addr)
484 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
486 err = page_to_nid(p);
492 /* Copy a kernel node mask to user space */
493 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
496 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
497 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
500 if (copy > PAGE_SIZE)
502 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
506 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
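/*
 * Example (illustrative, assuming MAX_NUMNODES == 64 on a 64-bit kernel):
 * a caller passing maxnode == 1024 requests copy == ALIGN(1023, 64) / 8
 * == 128 bytes; the 120 bytes beyond the kernel's 8-byte nodemask are
 * cleared and only the first 8 bytes are copied out.
 */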
509 /* Retrieve NUMA policy */
510 asmlinkage long sys_get_mempolicy(int __user *policy,
511 unsigned long __user *nmask,
512 unsigned long maxnode,
513 unsigned long addr, unsigned long flags)
516 struct mm_struct *mm = current->mm;
517 struct vm_area_struct *vma = NULL;
518 struct mempolicy *pol = current->mempolicy;
520 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
522 if (nmask != NULL && maxnode < MAX_NUMNODES)
524 if (flags & MPOL_F_ADDR) {
525 down_read(&mm->mmap_sem);
526 vma = find_vma_intersection(mm, addr, addr+1);
528 up_read(&mm->mmap_sem);
531 if (vma->vm_ops && vma->vm_ops->get_policy)
532 pol = vma->vm_ops->get_policy(vma, addr);
534 pol = vma->vm_policy;
539 pol = &default_policy;
541 if (flags & MPOL_F_NODE) {
542 if (flags & MPOL_F_ADDR) {
543 err = lookup_node(mm, addr);
547 } else if (pol == current->mempolicy &&
548 pol->policy == MPOL_INTERLEAVE) {
549 pval = current->il_next;
up_read(&current->mm->mmap_sem);
562 if (policy && put_user(pval, policy))
568 get_zonemask(pol, &nodes);
569 err = copy_nodes_to_user(nmask, maxnode, &nodes);
up_read(&current->mm->mmap_sem);
580 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
581 compat_ulong_t __user *nmask,
582 compat_ulong_t maxnode,
583 compat_ulong_t addr, compat_ulong_t flags)
586 unsigned long __user *nm = NULL;
587 unsigned long nr_bits, alloc_size;
588 DECLARE_BITMAP(bm, MAX_NUMNODES);
590 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
591 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
594 nm = compat_alloc_user_space(alloc_size);
596 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
599 err = copy_from_user(bm, nm, alloc_size);
600 /* ensure entire bitmap is zeroed */
601 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
602 err |= compat_put_bitmap(nmask, bm, nr_bits);
608 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
609 compat_ulong_t maxnode)
612 unsigned long __user *nm = NULL;
613 unsigned long nr_bits, alloc_size;
614 DECLARE_BITMAP(bm, MAX_NUMNODES);
616 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
617 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
620 err = compat_get_bitmap(bm, nmask, nr_bits);
621 nm = compat_alloc_user_space(alloc_size);
622 err |= copy_to_user(nm, bm, alloc_size);
628 return sys_set_mempolicy(mode, nm, nr_bits+1);
631 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
632 compat_ulong_t mode, compat_ulong_t __user *nmask,
633 compat_ulong_t maxnode, compat_ulong_t flags)
636 unsigned long __user *nm = NULL;
637 unsigned long nr_bits, alloc_size;
640 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
641 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
644 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
645 nm = compat_alloc_user_space(alloc_size);
646 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
652 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
657 /* Return effective policy for a VMA */
659 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
661 struct mempolicy *pol = task->mempolicy;
664 if (vma->vm_ops && vma->vm_ops->get_policy)
665 pol = vma->vm_ops->get_policy(vma, addr);
666 else if (vma->vm_policy &&
667 vma->vm_policy->policy != MPOL_DEFAULT)
668 pol = vma->vm_policy;
671 pol = &default_policy;
675 /* Return a zonelist representing a mempolicy */
676 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
680 switch (policy->policy) {
682 nd = policy->v.preferred_node;
687 /* Lower zones don't get a policy applied */
688 /* Careful: current->mems_allowed might have moved */
689 if (gfp_zone(gfp) >= policy_zone)
690 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
691 return policy->v.zonelist;
693 case MPOL_INTERLEAVE: /* should not happen */
701 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
704 /* Do dynamic interleaving for a process */
705 static unsigned interleave_nodes(struct mempolicy *policy)
708 struct task_struct *me = current;
711 next = next_node(nid, policy->v.nodes);
712 if (next >= MAX_NUMNODES)
713 next = first_node(policy->v.nodes);
718 /* Do static interleaving for a VMA with known offset. */
719 static unsigned offset_il_node(struct mempolicy *pol,
720 struct vm_area_struct *vma, unsigned long off)
722 unsigned nnodes = nodes_weight(pol->v.nodes);
723 unsigned target = (unsigned)off % nnodes;
729 nid = next_node(nid, pol->v.nodes);
731 } while (c <= target);
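/*
 * Worked example (illustrative): for an interleave mask of nodes {0,2,3}
 * and off == 5, nnodes == 3 and target == 5 % 3 == 2, so the walk above
 * visits nodes 0, 2, 3 and settles on node 3.
 */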
735 /* Allocate a page in interleaved policy.
736 Own path because it needs to do special accounting. */
737 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
743 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
744 page = __alloc_pages(gfp, order, zl);
745 if (page && page_zone(page) == zl->zones[0]) {
746 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
753 * alloc_page_vma - Allocate a page for a VMA.
756 * %GFP_USER user allocation.
757 * %GFP_KERNEL kernel allocations,
758 * %GFP_HIGHMEM highmem/user allocations,
759 * %GFP_FS allocation should not call back into a file system.
760 * %GFP_ATOMIC don't sleep.
762 * @vma: Pointer to VMA or NULL if not available.
763 * @addr: Virtual Address of the allocation. Must be inside the VMA.
765 * This function allocates a page from the kernel page pool and applies
766 * a NUMA policy associated with the VMA or the current process.
767 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
768 * mm_struct of the VMA to prevent it from going away. Should be used for
769 * all allocations for pages that will be mapped into
770 * user space. Returns NULL when no page can be allocated.
* Should be called with the mmap_sem of the vma held.
775 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
777 struct mempolicy *pol = get_vma_policy(current, vma, addr);
779 cpuset_update_current_mems_allowed();
781 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
786 off += (addr - vma->vm_start) >> PAGE_SHIFT;
787 nid = offset_il_node(pol, vma, off);
789 /* fall back to process interleaving */
790 nid = interleave_nodes(pol);
792 return alloc_page_interleave(gfp, 0, nid);
794 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
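/*
 * Typical use (illustrative): fault handlers allocate user pages with
 * something like
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 * while holding down_read(&mm->mmap_sem), so the VMA policy is honoured.
 */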
798 * alloc_pages_current - Allocate pages.
801 * %GFP_USER user allocation,
802 * %GFP_KERNEL kernel allocation,
803 * %GFP_HIGHMEM highmem allocation,
804 * %GFP_FS don't call back into a file system.
805 * %GFP_ATOMIC don't sleep.
806 * @order: Power of two of allocation size in pages. 0 is a single page.
* Allocate a page from the kernel page pool. When not in
* interrupt context, the NUMA policy of the current process is applied.
810 * Returns NULL when no page can be allocated.
812 * Don't call cpuset_update_current_mems_allowed() unless
813 * 1) it's ok to take cpuset_sem (can WAIT), and
814 * 2) allocating for current task (not interrupt).
816 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
818 struct mempolicy *pol = current->mempolicy;
820 if ((gfp & __GFP_WAIT) && !in_interrupt())
821 cpuset_update_current_mems_allowed();
822 if (!pol || in_interrupt())
823 pol = &default_policy;
824 if (pol->policy == MPOL_INTERLEAVE)
825 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
826 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
828 EXPORT_SYMBOL(alloc_pages_current);
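/*
 * Note (illustrative): on CONFIG_NUMA kernels the generic alloc_pages()
 * helper in <linux/gfp.h> resolves to alloc_pages_current(), so ordinary
 * kernel allocations pick up the process policy installed above.
 */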
830 /* Slow path of a mempolicy copy */
831 struct mempolicy *__mpol_copy(struct mempolicy *old)
833 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
836 return ERR_PTR(-ENOMEM);
838 atomic_set(&new->refcnt, 1);
839 if (new->policy == MPOL_BIND) {
840 int sz = ksize(old->v.zonelist);
841 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
842 if (!new->v.zonelist) {
843 kmem_cache_free(policy_cache, new);
844 return ERR_PTR(-ENOMEM);
846 memcpy(new->v.zonelist, old->v.zonelist, sz);
851 /* Slow path of a mempolicy comparison */
852 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
856 if (a->policy != b->policy)
861 case MPOL_INTERLEAVE:
862 return nodes_equal(a->v.nodes, b->v.nodes);
864 return a->v.preferred_node == b->v.preferred_node;
867 for (i = 0; a->v.zonelist->zones[i]; i++)
868 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
870 return b->v.zonelist->zones[i] == NULL;
878 /* Slow path of a mpol destructor. */
879 void __mpol_free(struct mempolicy *p)
881 if (!atomic_dec_and_test(&p->refcnt))
883 if (p->policy == MPOL_BIND)
884 kfree(p->v.zonelist);
885 p->policy = MPOL_DEFAULT;
886 kmem_cache_free(policy_cache, p);
890 * Hugetlb policy. Same as above, just works with node numbers instead of
894 /* Find first node suitable for an allocation */
895 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
897 struct mempolicy *pol = get_vma_policy(current, vma, addr);
899 switch (pol->policy) {
901 return numa_node_id();
903 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
904 case MPOL_INTERLEAVE:
905 return interleave_nodes(pol);
907 return pol->v.preferred_node >= 0 ?
908 pol->v.preferred_node : numa_node_id();
914 /* Find secondary valid nodes for an allocation */
915 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
917 struct mempolicy *pol = get_vma_policy(current, vma, addr);
919 switch (pol->policy) {
922 case MPOL_INTERLEAVE:
926 for (z = pol->v.zonelist->zones; *z; z++)
927 if ((*z)->zone_pgdat->node_id == nid)
938 * Shared memory backing store policy support.
940 * Remember policies even when nobody has shared memory mapped.
941 * The policies are kept in Red-Black tree linked from the inode.
942 * They are protected by the sp->lock spinlock, which should be held
943 * for any accesses to the tree.
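/*
 * Example consumer (illustrative): tmpfs wires its vm_operations up to this
 * store, roughly
 *	vma->vm_ops->set_policy  -> mpol_set_shared_policy()
 *	vma->vm_ops->get_policy  -> mpol_shared_policy_lookup()
 * so a policy set on one mapping is seen by every other mapping of the file.
 */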
946 /* lookup first element intersecting start-end */
947 /* Caller holds sp->lock */
948 static struct sp_node *
949 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
951 struct rb_node *n = sp->root.rb_node;
954 struct sp_node *p = rb_entry(n, struct sp_node, nd);
958 else if (end <= p->start)
966 struct sp_node *w = NULL;
967 struct rb_node *prev = rb_prev(n);
970 w = rb_entry(prev, struct sp_node, nd);
975 return rb_entry(n, struct sp_node, nd);
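/*
 * Example (illustrative): with ranges [0,4) and [4,8) in the tree,
 * sp_lookup(sp, 2, 6) descends to an intersecting node and then walks
 * rb_prev() back to return the [0,4) entry, i.e. the first overlap;
 * callers continue with rb_next() for the rest of the range.
 */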
978 /* Insert a new shared policy into the list. */
979 /* Caller holds sp->lock */
980 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
982 struct rb_node **p = &sp->root.rb_node;
983 struct rb_node *parent = NULL;
988 nd = rb_entry(parent, struct sp_node, nd);
989 if (new->start < nd->start)
991 else if (new->end > nd->end)
996 rb_link_node(&new->nd, parent, p);
997 rb_insert_color(&new->nd, &sp->root);
998 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
999 new->policy ? new->policy->policy : 0);
1002 /* Find shared policy intersecting idx */
1004 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1006 struct mempolicy *pol = NULL;
1009 if (!sp->root.rb_node)
1011 spin_lock(&sp->lock);
1012 sn = sp_lookup(sp, idx, idx+1);
1014 mpol_get(sn->policy);
1017 spin_unlock(&sp->lock);
1021 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
PDprintk("deleting %lx-%lx\n", n->start, n->end);
1024 rb_erase(&n->nd, &sp->root);
1025 mpol_free(n->policy);
1026 kmem_cache_free(sn_cache, n);
1030 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1032 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1043 /* Replace a policy range. */
1044 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1045 unsigned long end, struct sp_node *new)
1047 struct sp_node *n, *new2 = NULL;
1050 spin_lock(&sp->lock);
1051 n = sp_lookup(sp, start, end);
1052 /* Take care of old policies in the same range. */
1053 while (n && n->start < end) {
1054 struct rb_node *next = rb_next(&n->nd);
1055 if (n->start >= start) {
1061 /* Old policy spanning whole new range. */
1064 spin_unlock(&sp->lock);
1065 new2 = sp_alloc(end, n->end, n->policy);
1071 sp_insert(sp, new2);
1079 n = rb_entry(next, struct sp_node, nd);
1083 spin_unlock(&sp->lock);
1085 mpol_free(new2->policy);
1086 kmem_cache_free(sn_cache, new2);
1091 int mpol_set_shared_policy(struct shared_policy *info,
1092 struct vm_area_struct *vma, struct mempolicy *npol)
1095 struct sp_node *new = NULL;
1096 unsigned long sz = vma_pages(vma);
1098 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1100 sz, npol? npol->policy : -1,
1101 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1104 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1108 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1110 kmem_cache_free(sn_cache, new);
1114 /* Free a backing policy store on inode delete. */
1115 void mpol_free_shared_policy(struct shared_policy *p)
1118 struct rb_node *next;
1120 if (!p->root.rb_node)
1122 spin_lock(&p->lock);
1123 next = rb_first(&p->root);
1125 n = rb_entry(next, struct sp_node, nd);
1126 next = rb_next(&n->nd);
1127 rb_erase(&n->nd, &p->root);
1128 mpol_free(n->policy);
1129 kmem_cache_free(sn_cache, n);
1131 spin_unlock(&p->lock);
1134 /* assumes fs == KERNEL_DS */
1135 void __init numa_policy_init(void)
1137 policy_cache = kmem_cache_create("numa_policy",
1138 sizeof(struct mempolicy),
1139 0, SLAB_PANIC, NULL, NULL);
1141 sn_cache = kmem_cache_create("shared_policy_node",
1142 sizeof(struct sp_node),
1143 0, SLAB_PANIC, NULL, NULL);
1145 /* Set interleaving policy for system init. This way not all
1146 the data structures allocated at system boot end up in node zero. */
1148 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1150 printk("numa_policy_init: interleaving failed\n");
1153 /* Reset policy of current process to default.
1154 * Assumes fs == KERNEL_DS */
1155 void numa_default_policy(void)
1157 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);