git.oblomov.eu Git - linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags: defined only in this file, carved out of the bits at and above MPOL_MF_INTERNAL */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache; /* slab cache that struct mempolicy objects are allocated from */
 102 static struct kmem_cache *sn_cache; /* NOTE(review): not used in this part of the file; presumably for shared-policy nodes -- confirm */
 103
 104 #define PDprintk(fmt...) /* debug print hook; expands to nothing, so its arguments are never compiled */
 105
 106 /* Highest zone. A specific allocation for a zone below that is not
 107    policied. */
 108 enum zone_type policy_zone = 0;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd;
 141         enum zone_type k;
 142
 143         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 144         max++;                  /* space for zlcache_ptr (see mmzone.h) */
 145         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 146         if (!zl)
 147                 return ERR_PTR(-ENOMEM);
 148         zl->zlcache_ptr = NULL;
 149         num = 0;
 150         /* First put in the highest zones from all nodes, then all the next
 151            lower zones etc. Avoid empty zones because the memory allocator
 152            doesn't like them. If you implement node hot removal you
 153            have to fix that. */
 154         k = policy_zone;
 155         while (1) {
 156                 for_each_node_mask(nd, *nodes) {
 157                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 158                         if (z->present_pages > 0)
 159                                 zl->zones[num++] = z;
 160                 }
 161                 if (k == 0)
 162                         break;
 163                 k--;
 164         }
 165         if (num == 0) {
 166                 kfree(zl);
 167                 return ERR_PTR(-EINVAL);
 168         }
 169         zl->zones[num] = NULL;
 170         return zl;
 171 }
 172
 173 /* Create a new policy */
 174 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 175 {
 176         struct mempolicy *policy;
 177
 178         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 179         if (mode == MPOL_DEFAULT)
 180                 return NULL;
 181         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 182         if (!policy)
 183                 return ERR_PTR(-ENOMEM);
 184         atomic_set(&policy->refcnt, 1);
 185         switch (mode) {
 186         case MPOL_INTERLEAVE:
 187                 policy->v.nodes = *nodes;
 188                 if (nodes_weight(*nodes) == 0) {
 189                         kmem_cache_free(policy_cache, policy);
 190                         return ERR_PTR(-EINVAL);
 191                 }
 192                 break;
 193         case MPOL_PREFERRED:
 194                 policy->v.preferred_node = first_node(*nodes);
 195                 if (policy->v.preferred_node >= MAX_NUMNODES)
 196                         policy->v.preferred_node = -1;
 197                 break;
 198         case MPOL_BIND:
 199                 policy->v.zonelist = bind_zonelist(nodes);
 200                 if (IS_ERR(policy->v.zonelist)) {
 201                         void *error_code = policy->v.zonelist;
 202                         kmem_cache_free(policy_cache, policy);
 203                         return error_code;
 204                 }
 205                 break;
 206         }
 207         policy->policy = mode;
 208         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 209         return policy;
 210 }
 211
 212 static void gather_stats(struct page *, void *, int pte_dirty);
 213 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 214                                 unsigned long flags);
 215
 216 /* Scan through pages checking if pages follow certain conditions. */
 217 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 218                 unsigned long addr, unsigned long end,
 219                 const nodemask_t *nodes, unsigned long flags,
 220                 void *private)
 221 {
 222         pte_t *orig_pte;
 223         pte_t *pte;
 224         spinlock_t *ptl;
 225
 226         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 227         do {
 228                 struct page *page;
 229                 int nid;
 230
 231                 if (!pte_present(*pte))
 232                         continue;
 233                 page = vm_normal_page(vma, addr, *pte);
 234                 if (!page)
 235                         continue;
 236                 /*
 237                  * The check for PageReserved here is important to avoid
 238                  * handling zero pages and other pages that may have been
 239                  * marked special by the system.
 240                  *
 241                  * If the PageReserved would not be checked here then f.e.
 242                  * the location of the zero page could have an influence
 243                  * on MPOL_MF_STRICT, zero pages would be counted for
 244                  * the per node stats, and there would be useless attempts
 245                  * to put zero pages on the migration list.
 246                  */
 247                 if (PageReserved(page))
 248                         continue;
 249                 nid = page_to_nid(page);
 250                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 251                         continue;
 252
 253                 if (flags & MPOL_MF_STATS)
 254                         gather_stats(page, private, pte_dirty(*pte));
 255                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 256                         migrate_page_add(page, private, flags);
 257                 else
 258                         break;
 259         } while (pte++, addr += PAGE_SIZE, addr != end);
 260         pte_unmap_unlock(orig_pte, ptl);
 261         return addr != end;
 262 }
 263
 264 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 265                 unsigned long addr, unsigned long end,
 266                 const nodemask_t *nodes, unsigned long flags,
 267                 void *private)
 268 {
 269         pmd_t *pmd;
 270         unsigned long next;
 271
 272         pmd = pmd_offset(pud, addr);
 273         do {
 274                 next = pmd_addr_end(addr, end);
 275                 if (pmd_none_or_clear_bad(pmd))
 276                         continue;
 277                 if (check_pte_range(vma, pmd, addr, next, nodes,
 278                                     flags, private))
 279                         return -EIO;
 280         } while (pmd++, addr = next, addr != end);
 281         return 0;
 282 }
 283
 284 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 285                 unsigned long addr, unsigned long end,
 286                 const nodemask_t *nodes, unsigned long flags,
 287                 void *private)
 288 {
 289         pud_t *pud;
 290         unsigned long next;
 291
 292         pud = pud_offset(pgd, addr);
 293         do {
 294                 next = pud_addr_end(addr, end);
 295                 if (pud_none_or_clear_bad(pud))
 296                         continue;
 297                 if (check_pmd_range(vma, pud, addr, next, nodes,
 298                                     flags, private))
 299                         return -EIO;
 300         } while (pud++, addr = next, addr != end);
 301         return 0;
 302 }
 303
 304 static inline int check_pgd_range(struct vm_area_struct *vma,
 305                 unsigned long addr, unsigned long end,
 306                 const nodemask_t *nodes, unsigned long flags,
 307                 void *private)
 308 {
 309         pgd_t *pgd;
 310         unsigned long next;
 311
 312         pgd = pgd_offset(vma->vm_mm, addr);
 313         do {
 314                 next = pgd_addr_end(addr, end);
 315                 if (pgd_none_or_clear_bad(pgd))
 316                         continue;
 317                 if (check_pud_range(vma, pgd, addr, next, nodes,
 318                                     flags, private))
 319                         return -EIO;
 320         } while (pgd++, addr = next, addr != end);
 321         return 0;
 322 }
 323
 324 /* Check if a vma is migratable */
 325 static inline int vma_migratable(struct vm_area_struct *vma)
 326 {
 327         if (vma->vm_flags & (
 328                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 329                 return 0;
 330         return 1;
 331 }
 332
 333 /*
 334  * Check if all pages in a range are on a set of nodes.
 335  * If pagelist != NULL then isolate pages from the LRU and
 336  * put them on the pagelist.
 337  */
 338 static struct vm_area_struct *
 339 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 340                 const nodemask_t *nodes, unsigned long flags, void *private)
 341 {
 342         int err;
 343         struct vm_area_struct *first, *vma, *prev;
 344
 345         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 346
 347                 err = migrate_prep();
 348                 if (err)
 349                         return ERR_PTR(err);
 350         }
 351
 352         first = find_vma(mm, start);
 353         if (!first)
 354                 return ERR_PTR(-EFAULT);
 355         prev = NULL;
 356         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 357                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 358                         if (!vma->vm_next && vma->vm_end < end)
 359                                 return ERR_PTR(-EFAULT);
 360                         if (prev && prev->vm_end < vma->vm_start)
 361                                 return ERR_PTR(-EFAULT);
 362                 }
 363                 if (!is_vm_hugetlb_page(vma) &&
 364                     ((flags & MPOL_MF_STRICT) ||
 365                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 366                                 vma_migratable(vma)))) {
 367                         unsigned long endvma = vma->vm_end;
 368
 369                         if (endvma > end)
 370                                 endvma = end;
 371                         if (vma->vm_start > start)
 372                                 start = vma->vm_start;
 373                         err = check_pgd_range(vma, start, endvma, nodes,
 374                                                 flags, private);
 375                         if (err) {
 376                                 first = ERR_PTR(err);
 377                                 break;
 378                         }
 379                 }
 380                 prev = vma;
 381         }
 382         return first;
 383 }
 384
 385 /* Apply policy to a single VMA */
 386 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 387 {
 388         int err = 0;
 389         struct mempolicy *old = vma->vm_policy;
 390
 391         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 392                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 393                  vma->vm_ops, vma->vm_file,
 394                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 395
 396         if (vma->vm_ops && vma->vm_ops->set_policy)
 397                 err = vma->vm_ops->set_policy(vma, new);
 398         if (!err) {
 399                 mpol_get(new);
 400                 vma->vm_policy = new;
 401                 mpol_free(old);
 402         }
 403         return err;
 404 }
 405
 406 /* Step 2: apply policy to a range and do splits. */
 407 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 408                        unsigned long end, struct mempolicy *new)
 409 {
 410         struct vm_area_struct *next;
 411         int err;
 412
 413         err = 0;
 414         for (; vma && vma->vm_start < end; vma = next) {
 415                 next = vma->vm_next;
 416                 if (vma->vm_start < start)
 417                         err = split_vma(vma->vm_mm, vma, start, 1);
 418                 if (!err && vma->vm_end > end)
 419                         err = split_vma(vma->vm_mm, vma, end, 0);
 420                 if (!err)
 421                         err = policy_vma(vma, new);
 422                 if (err)
 423                         break;
 424         }
 425         return err;
 426 }
 427
 428 static int contextualize_policy(int mode, nodemask_t *nodes)
 429 {
 430         if (!nodes)
 431                 return 0;
 432
 433         cpuset_update_task_memory_state();
 434         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 435                 return -EINVAL;
 436         return mpol_check_policy(mode, nodes);
 437 }
 438
 439
 440 /*
 441  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 442  * mempolicy.  Allows more rapid checking of this (combined perhaps
 443  * with other PF_* flag bits) on memory allocation hot code paths.
 444  *
 445  * If called from outside this file, the task 'p' should -only- be
 446  * a newly forked child not yet visible on the task list, because
 447  * manipulating the task flags of a visible task is not safe.
 448  *
 449  * The above limitation is why this routine has the funny name
 450  * mpol_fix_fork_child_flag().
 451  *
 452  * It is also safe to call this with a task pointer of current,
 453  * which the static wrapper mpol_set_task_struct_flag() does,
 454  * for use within this file.
 455  */
 456
 457 void mpol_fix_fork_child_flag(struct task_struct *p)
 458 {
 459         if (p->mempolicy)
 460                 p->flags |= PF_MEMPOLICY;
 461         else
 462                 p->flags &= ~PF_MEMPOLICY;
 463 }
 464
 465 static void mpol_set_task_struct_flag(void)
 466 {
 467         mpol_fix_fork_child_flag(current);
 468 }
 469
 470 /* Set the process memory policy */
 471 long do_set_mempolicy(int mode, nodemask_t *nodes)
 472 {
 473         struct mempolicy *new;
 474
 475         if (contextualize_policy(mode, nodes))
 476                 return -EINVAL;
 477         new = mpol_new(mode, nodes);
 478         if (IS_ERR(new))
 479                 return PTR_ERR(new);
 480         mpol_free(current->mempolicy);
 481         current->mempolicy = new;
 482         mpol_set_task_struct_flag();
 483         if (new && new->policy == MPOL_INTERLEAVE)
 484                 current->il_next = first_node(new->v.nodes);
 485         return 0;
 486 }
 487
 488 /* Fill a zone bitmap for a policy */
 489 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 490 {
 491         int i;
 492
 493         nodes_clear(*nodes);
 494         switch (p->policy) {
 495         case MPOL_BIND:
 496                 for (i = 0; p->v.zonelist->zones[i]; i++)
 497                         node_set(zone_to_nid(p->v.zonelist->zones[i]),
 498                                 *nodes);
 499                 break;
 500         case MPOL_DEFAULT:
 501                 break;
 502         case MPOL_INTERLEAVE:
 503                 *nodes = p->v.nodes;
 504                 break;
 505         case MPOL_PREFERRED:
 506                 /* or use current node instead of online map? */
 507                 if (p->v.preferred_node < 0)
 508                         *nodes = node_online_map;
 509                 else
 510                         node_set(p->v.preferred_node, *nodes);
 511                 break;
 512         default:
 513                 BUG();
 514         }
 515 }
 516
 517 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 518 {
 519         struct page *p;
 520         int err;
 521
 522         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 523         if (err >= 0) {
 524                 err = page_to_nid(p);
 525                 put_page(p);
 526         }
 527         return err;
 528 }
 529
 530 /* Retrieve NUMA policy */
 531 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 532                         unsigned long addr, unsigned long flags)
 533 {
 534         int err;
 535         struct mm_struct *mm = current->mm;
 536         struct vm_area_struct *vma = NULL;
 537         struct mempolicy *pol = current->mempolicy;
 538
 539         cpuset_update_task_memory_state();
 540         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 541                 return -EINVAL;
 542         if (flags & MPOL_F_ADDR) {
 543                 down_read(&mm->mmap_sem);
 544                 vma = find_vma_intersection(mm, addr, addr+1);
 545                 if (!vma) {
 546                         up_read(&mm->mmap_sem);
 547                         return -EFAULT;
 548                 }
 549                 if (vma->vm_ops && vma->vm_ops->get_policy)
 550                         pol = vma->vm_ops->get_policy(vma, addr);
 551                 else
 552                         pol = vma->vm_policy;
 553         } else if (addr)
 554                 return -EINVAL;
 555
 556         if (!pol)
 557                 pol = &default_policy;
 558
 559         if (flags & MPOL_F_NODE) {
 560                 if (flags & MPOL_F_ADDR) {
 561                         err = lookup_node(mm, addr);
 562                         if (err < 0)
 563                                 goto out;
 564                         *policy = err;
 565                 } else if (pol == current->mempolicy &&
 566                                 pol->policy == MPOL_INTERLEAVE) {
 567                         *policy = current->il_next;
 568                 } else {
 569                         err = -EINVAL;
 570                         goto out;
 571                 }
 572         } else
 573                 *policy = pol->policy;
 574
 575         if (vma) {
 576                 up_read(&current->mm->mmap_sem);
 577                 vma = NULL;
 578         }
 579
 580         err = 0;
 581         if (nmask)
 582                 get_zonemask(pol, nmask);
 583
 584  out:
 585         if (vma)
 586                 up_read(&current->mm->mmap_sem);
 587         return err;
 588 }
 589
 590 #ifdef CONFIG_MIGRATION
 591 /*
 592  * page migration
 593  */
 594 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 595                                 unsigned long flags)
 596 {
 597         /*
 598          * Avoid migrating a page that is shared with others.
 599          */
 600         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 601                 isolate_lru_page(page, pagelist);
 602 }
 603
 604 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 605 {
 606         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 607 }
 608
 609 /*
 610  * Migrate pages from one node to a target node.
 611  * Returns error or the number of pages not migrated.
 612  */
 613 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 614 {
 615         nodemask_t nmask;
 616         LIST_HEAD(pagelist);
 617         int err = 0;
 618
 619         nodes_clear(nmask);
 620         node_set(source, nmask);
 621
 622         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 623                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 624
 625         if (!list_empty(&pagelist))
 626                 err = migrate_pages(&pagelist, new_node_page, dest);
 627
 628         return err;
 629 }
 630
 631 /*
 632  * Move pages between the two nodesets so as to preserve the physical
 633  * layout as much as possible.
 634  *
 635  * Returns the number of page that could not be moved.
 636  */
 637 int do_migrate_pages(struct mm_struct *mm,
 638         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 639 {
 640         LIST_HEAD(pagelist);
 641         int busy = 0;
 642         int err = 0;
 643         nodemask_t tmp;
 644
 645         down_read(&mm->mmap_sem);
 646
 647         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 648         if (err)
 649                 goto out;
 650
 651 /*
 652  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 653  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 654  * bit in 'tmp', and return that <source, dest> pair for migration.
 655  * The pair of nodemasks 'to' and 'from' define the map.
 656  *
 657  * If no pair of bits is found that way, fallback to picking some
 658  * pair of 'source' and 'dest' bits that are not the same.  If the
 659  * 'source' and 'dest' bits are the same, this represents a node
 660  * that will be migrating to itself, so no pages need move.
 661  *
 662  * If no bits are left in 'tmp', or if all remaining bits left
 663  * in 'tmp' correspond to the same bit in 'to', return false
 664  * (nothing left to migrate).
 665  *
 666  * This lets us pick a pair of nodes to migrate between, such that
 667  * if possible the dest node is not already occupied by some other
 668  * source node, minimizing the risk of overloading the memory on a
 669  * node that would happen if we migrated incoming memory to a node
 670  * before migrating outgoing memory source that same node.
 671  *
 672  * A single scan of tmp is sufficient.  As we go, we remember the
 673  * most recent <s, d> pair that moved (s != d).  If we find a pair
 674  * that not only moved, but what's better, moved to an empty slot
 675  * (d is not set in tmp), then we break out then, with that pair.
 676  * Otherwise when we finish scannng from_tmp, we at least have the
 677  * most recent <s, d> pair that moved.  If we get all the way through
 678  * the scan of tmp without finding any node that moved, much less
 679  * moved to an empty node, then there is nothing left worth migrating.
 680  */
 681
 682         tmp = *from_nodes;
 683         while (!nodes_empty(tmp)) {
 684                 int s,d;
 685                 int source = -1;
 686                 int dest = 0;
 687
 688                 for_each_node_mask(s, tmp) {
 689                         d = node_remap(s, *from_nodes, *to_nodes);
 690                         if (s == d)
 691                                 continue;
 692
 693                         source = s;     /* Node moved. Memorize */
 694                         dest = d;
 695
 696                         /* dest not in remaining from nodes? */
 697                         if (!node_isset(dest, tmp))
 698                                 break;
 699                 }
 700                 if (source == -1)
 701                         break;
 702
 703                 node_clear(source, tmp);
 704                 err = migrate_to_node(mm, source, dest, flags);
 705                 if (err > 0)
 706                         busy += err;
 707                 if (err < 0)
 708                         break;
 709         }
 710 out:
 711         up_read(&mm->mmap_sem);
 712         if (err < 0)
 713                 return err;
 714         return busy;
 715
 716 }
 717
 718 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 719 {
 720         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 721
 722         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 723 }
 724 #else
 725
 726 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 727                                 unsigned long flags)
 728 {
 729 }
 730
 731 int do_migrate_pages(struct mm_struct *mm,
 732         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 733 {
 734         return -ENOSYS;
 735 }
 736
 737 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 738 {
 739         return NULL;
 740 }
 741 #endif
 742
 743 long do_mbind(unsigned long start, unsigned long len,
 744                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 745 {
 746         struct vm_area_struct *vma;
 747         struct mm_struct *mm = current->mm;
 748         struct mempolicy *new;
 749         unsigned long end;
 750         int err;
 751         LIST_HEAD(pagelist);
 752
 753         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 754                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 755             || mode > MPOL_MAX)
 756                 return -EINVAL;
 757         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 758                 return -EPERM;
 759
 760         if (start & ~PAGE_MASK)
 761                 return -EINVAL;
 762
 763         if (mode == MPOL_DEFAULT)
 764                 flags &= ~MPOL_MF_STRICT;
 765
 766         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 767         end = start + len;
 768
 769         if (end < start)
 770                 return -EINVAL;
 771         if (end == start)
 772                 return 0;
 773
 774         if (mpol_check_policy(mode, nmask))
 775                 return -EINVAL;
 776
 777         new = mpol_new(mode, nmask);
 778         if (IS_ERR(new))
 779                 return PTR_ERR(new);
 780
 781         /*
 782          * If we are using the default policy then operation
 783          * on discontinuous address spaces is okay after all
 784          */
 785         if (!new)
 786                 flags |= MPOL_MF_DISCONTIG_OK;
 787
 788         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 789                         mode,nodes_addr(nodes)[0]);
 790
 791         down_write(&mm->mmap_sem);
 792         vma = check_range(mm, start, end, nmask,
 793                           flags | MPOL_MF_INVERT, &pagelist);
 794
 795         err = PTR_ERR(vma);
 796         if (!IS_ERR(vma)) {
 797                 int nr_failed = 0;
 798
 799                 err = mbind_range(vma, start, end, new);
 800
 801                 if (!list_empty(&pagelist))
 802                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 803                                                 (unsigned long)vma);
 804
 805                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 806                         err = -EIO;
 807         }
 808
 809         up_write(&mm->mmap_sem);
 810         mpol_free(new);
 811         return err;
 812 }
 813
 814 /*
 815  * User space interface with variable sized bitmaps for nodelists.
 816  */
 817
 818 /* Copy a node mask from user space. */
 819 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 820                      unsigned long maxnode)
 821 {
 822         unsigned long k;
 823         unsigned long nlongs;
 824         unsigned long endmask;
 825
 826         --maxnode;
 827         nodes_clear(*nodes);
 828         if (maxnode == 0 || !nmask)
 829                 return 0;
 830         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 831                 return -EINVAL;
 832
 833         nlongs = BITS_TO_LONGS(maxnode);
 834         if ((maxnode % BITS_PER_LONG) == 0)
 835                 endmask = ~0UL;
 836         else
 837                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 838
 839         /* When the user specified more nodes than supported just check
 840            if the non supported part is all zero. */
 841         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 842                 if (nlongs > PAGE_SIZE/sizeof(long))
 843                         return -EINVAL;
 844                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 845                         unsigned long t;
 846                         if (get_user(t, nmask + k))
 847                                 return -EFAULT;
 848                         if (k == nlongs - 1) {
 849                                 if (t & endmask)
 850                                         return -EINVAL;
 851                         } else if (t)
 852                                 return -EINVAL;
 853                 }
 854                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 855                 endmask = ~0UL;
 856         }
 857
 858         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 859                 return -EFAULT;
 860         nodes_addr(*nodes)[nlongs-1] &= endmask;
 861         return 0;
 862 }
 863
 864 /* Copy a kernel node mask to user space */
 865 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 866                               nodemask_t *nodes)
 867 {
 868         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 869         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 870
 871         if (copy > nbytes) {
 872                 if (copy > PAGE_SIZE)
 873                         return -EINVAL;
 874                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 875                         return -EFAULT;
 876                 copy = nbytes;
 877         }
 878         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 879 }
 880
 881 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 882                         unsigned long mode,
 883                         unsigned long __user *nmask, unsigned long maxnode,
 884                         unsigned flags)
 885 {
 886         nodemask_t nodes;
 887         int err;
 888
 889         err = get_nodes(&nodes, nmask, maxnode);
 890         if (err)
 891                 return err;
 892 #ifdef CONFIG_CPUSETS
 893         /* Restrict the nodes to the allowed nodes in the cpuset */
 894         nodes_and(nodes, nodes, current->mems_allowed);
 895 #endif
 896         return do_mbind(start, len, mode, &nodes, flags);
 897 }
 898
 899 /* Set the process memory policy */
 900 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 901                 unsigned long maxnode)
 902 {
 903         int err;
 904         nodemask_t nodes;
 905
 906         if (mode < 0 || mode > MPOL_MAX)
 907                 return -EINVAL;
 908         err = get_nodes(&nodes, nmask, maxnode);
 909         if (err)
 910                 return err;
 911         return do_set_mempolicy(mode, &nodes);
 912 }
 913
 914 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 915                 const unsigned long __user *old_nodes,
 916                 const unsigned long __user *new_nodes)
 917 {
 918         struct mm_struct *mm;
 919         struct task_struct *task;
 920         nodemask_t old;
 921         nodemask_t new;
 922         nodemask_t task_nodes;
 923         int err;
 924
 925         err = get_nodes(&old, old_nodes, maxnode);
 926         if (err)
 927                 return err;
 928
 929         err = get_nodes(&new, new_nodes, maxnode);
 930         if (err)
 931                 return err;
 932
 933         /* Find the mm_struct */
 934         read_lock(&tasklist_lock);
 935         task = pid ? find_task_by_pid(pid) : current;
 936         if (!task) {
 937                 read_unlock(&tasklist_lock);
 938                 return -ESRCH;
 939         }
 940         mm = get_task_mm(task);
 941         read_unlock(&tasklist_lock);
 942
 943         if (!mm)
 944                 return -EINVAL;
 945
 946         /*
 947          * Check if this process has the right to modify the specified
 948          * process. The right exists if the process has administrative
 949          * capabilities, superuser privileges or the same
 950          * userid as the target process.
 951          */
 952         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 953             (current->uid != task->suid) && (current->uid != task->uid) &&
 954             !capable(CAP_SYS_NICE)) {
 955                 err = -EPERM;
 956                 goto out;
 957         }
 958
 959         task_nodes = cpuset_mems_allowed(task);
 960         /* Is the user allowed to access the target nodes? */
 961         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 962                 err = -EPERM;
 963                 goto out;
 964         }
 965
 966         err = security_task_movememory(task);
 967         if (err)
 968                 goto out;
 969
 970         err = do_migrate_pages(mm, &old, &new,
 971                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 972 out:
 973         mmput(mm);
 974         return err;
 975 }
 976
 977
 978 /* Retrieve NUMA policy */
 979 asmlinkage long sys_get_mempolicy(int __user *policy,
 980                                 unsigned long __user *nmask,
 981                                 unsigned long maxnode,
 982                                 unsigned long addr, unsigned long flags)
 983 {
 984         int err, pval;
 985         nodemask_t nodes;
 986
 987         if (nmask != NULL && maxnode < MAX_NUMNODES)
 988                 return -EINVAL;
 989
 990         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 991
 992         if (err)
 993                 return err;
 994
 995         if (policy && put_user(pval, policy))
 996                 return -EFAULT;
 997
 998         if (nmask)
 999                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1000
1001         return err;
1002 }
1003
1004 #ifdef CONFIG_COMPAT
1005
1006 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1007                                      compat_ulong_t __user *nmask,
1008                                      compat_ulong_t maxnode,
1009                                      compat_ulong_t addr, compat_ulong_t flags)
1010 {
1011         long err;
1012         unsigned long __user *nm = NULL;
1013         unsigned long nr_bits, alloc_size;
1014         DECLARE_BITMAP(bm, MAX_NUMNODES);
1015
1016         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1017         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1018
1019         if (nmask)
1020                 nm = compat_alloc_user_space(alloc_size);
1021
1022         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1023
1024         if (!err && nmask) {
1025                 err = copy_from_user(bm, nm, alloc_size);
1026                 /* ensure entire bitmap is zeroed */
1027                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1028                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1029         }
1030
1031         return err;
1032 }
1033
1034 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1035                                      compat_ulong_t maxnode)
1036 {
1037         long err = 0;
1038         unsigned long __user *nm = NULL;
1039         unsigned long nr_bits, alloc_size;
1040         DECLARE_BITMAP(bm, MAX_NUMNODES);
1041
1042         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1043         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1044
1045         if (nmask) {
1046                 err = compat_get_bitmap(bm, nmask, nr_bits);
1047                 nm = compat_alloc_user_space(alloc_size);
1048                 err |= copy_to_user(nm, bm, alloc_size);
1049         }
1050
1051         if (err)
1052                 return -EFAULT;
1053
1054         return sys_set_mempolicy(mode, nm, nr_bits+1);
1055 }
1056
1057 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1058                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1059                              compat_ulong_t maxnode, compat_ulong_t flags)
1060 {
1061         long err = 0;
1062         unsigned long __user *nm = NULL;
1063         unsigned long nr_bits, alloc_size;
1064         nodemask_t bm;
1065
1066         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1067         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1068
1069         if (nmask) {
1070                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1071                 nm = compat_alloc_user_space(alloc_size);
1072                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1073         }
1074
1075         if (err)
1076                 return -EFAULT;
1077
1078         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1079 }
1080
1081 #endif
1082
1083 /* Return effective policy for a VMA */
1084 static struct mempolicy * get_vma_policy(struct task_struct *task,
1085                 struct vm_area_struct *vma, unsigned long addr)
1086 {
1087         struct mempolicy *pol = task->mempolicy;
1088
1089         if (vma) {
1090                 if (vma->vm_ops && vma->vm_ops->get_policy)
1091                         pol = vma->vm_ops->get_policy(vma, addr);
1092                 else if (vma->vm_policy &&
1093                                 vma->vm_policy->policy != MPOL_DEFAULT)
1094                         pol = vma->vm_policy;
1095         }
1096         if (!pol)
1097                 pol = &default_policy;
1098         return pol;
1099 }
1100
1101 /* Return a zonelist representing a mempolicy */
1102 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1103 {
1104         int nd;
1105
1106         switch (policy->policy) {
1107         case MPOL_PREFERRED:
1108                 nd = policy->v.preferred_node;
1109                 if (nd < 0)
1110                         nd = numa_node_id();
1111                 break;
1112         case MPOL_BIND:
1113                 /* Lower zones don't get a policy applied */
1114                 /* Careful: current->mems_allowed might have moved */
1115                 if (gfp_zone(gfp) >= policy_zone)
1116                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1117                                 return policy->v.zonelist;
1118                 /*FALL THROUGH*/
1119         case MPOL_INTERLEAVE: /* should not happen */
1120         case MPOL_DEFAULT:
1121                 nd = numa_node_id();
1122                 break;
1123         default:
1124                 nd = 0;
1125                 BUG();
1126         }
1127         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1128 }
1129
1130 /* Do dynamic interleaving for a process */
1131 static unsigned interleave_nodes(struct mempolicy *policy)
1132 {
1133         unsigned nid, next;
1134         struct task_struct *me = current;
1135
1136         nid = me->il_next;
1137         next = next_node(nid, policy->v.nodes);
1138         if (next >= MAX_NUMNODES)
1139                 next = first_node(policy->v.nodes);
1140         me->il_next = next;
1141         return nid;
1142 }
1143
1144 /*
1145  * Depending on the memory policy provide a node from which to allocate the
1146  * next slab entry.
1147  */
1148 unsigned slab_node(struct mempolicy *policy)
1149 {
1150         int pol = policy ? policy->policy : MPOL_DEFAULT;
1151
1152         switch (pol) {
1153         case MPOL_INTERLEAVE:
1154                 return interleave_nodes(policy);
1155
1156         case MPOL_BIND:
1157                 /*
1158                  * Follow bind policy behavior and start allocation at the
1159                  * first node.
1160                  */
1161                 return zone_to_nid(policy->v.zonelist->zones[0]);
1162
1163         case MPOL_PREFERRED:
1164                 if (policy->v.preferred_node >= 0)
1165                         return policy->v.preferred_node;
1166                 /* Fall through */
1167
1168         default:
1169                 return numa_node_id();
1170         }
1171 }
1172
1173 /* Do static interleaving for a VMA with known offset. */
1174 static unsigned offset_il_node(struct mempolicy *pol,
1175                 struct vm_area_struct *vma, unsigned long off)
1176 {
1177         unsigned nnodes = nodes_weight(pol->v.nodes);
1178         unsigned target = (unsigned)off % nnodes;
1179         int c;
1180         int nid = -1;
1181
1182         c = 0;
1183         do {
1184                 nid = next_node(nid, pol->v.nodes);
1185                 c++;
1186         } while (c <= target);
1187         return nid;
1188 }
1189
1190 /* Determine a node number for interleave */
1191 static inline unsigned interleave_nid(struct mempolicy *pol,
1192                  struct vm_area_struct *vma, unsigned long addr, int shift)
1193 {
1194         if (vma) {
1195                 unsigned long off;
1196
1197                 /*
1198                  * for small pages, there is no difference between
1199                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1200                  * for huge pages, since vm_pgoff is in units of small
1201                  * pages, we need to shift off the always 0 bits to get
1202                  * a useful offset.
1203                  */
1204                 BUG_ON(shift < PAGE_SHIFT);
1205                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1206                 off += (addr - vma->vm_start) >> shift;
1207                 return offset_il_node(pol, vma, off);
1208         } else
1209                 return interleave_nodes(pol);
1210 }
1211
1212 #ifdef CONFIG_HUGETLBFS
1213 /* Return a zonelist suitable for a huge page allocation. */
1214 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1215 {
1216         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1217
1218         if (pol->policy == MPOL_INTERLEAVE) {
1219                 unsigned nid;
1220
1221                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1222                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1223         }
1224         return zonelist_policy(GFP_HIGHUSER, pol);
1225 }
1226 #endif
1227
1228 /* Allocate a page in interleaved policy.
1229    Own path because it needs to do special accounting. */
1230 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1231                                         unsigned nid)
1232 {
1233         struct zonelist *zl;
1234         struct page *page;
1235
1236         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1237         page = __alloc_pages(gfp, order, zl);
1238         if (page && page_zone(page) == zl->zones[0])
1239                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1240         return page;
1241 }
1242
1243 /**
1244  *      alloc_page_vma  - Allocate a page for a VMA.
1245  *
1246  *      @gfp:
1247  *      %GFP_USER    user allocation.
1248  *      %GFP_KERNEL  kernel allocations,
1249  *      %GFP_HIGHMEM highmem/user allocations,
1250  *      %GFP_FS      allocation should not call back into a file system.
1251  *      %GFP_ATOMIC  don't sleep.
1252  *
1253  *      @vma:  Pointer to VMA or NULL if not available.
1254  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1255  *
1256  *      This function allocates a page from the kernel page pool and applies
1257  *      a NUMA policy associated with the VMA or the current process.
1258  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1259  *      mm_struct of the VMA to prevent it from going away. Should be used for
1260  *      all allocations for pages that will be mapped into
1261  *      user space. Returns NULL when no page can be allocated.
1262  *
1263  *      Should be called with the mm_sem of the vma hold.
1264  */
1265 struct page *
1266 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1267 {
1268         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1269
1270         cpuset_update_task_memory_state();
1271
1272         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1273                 unsigned nid;
1274
1275                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1276                 return alloc_page_interleave(gfp, 0, nid);
1277         }
1278         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1279 }
1280
1281 /**
1282  *      alloc_pages_current - Allocate pages.
1283  *
1284  *      @gfp:
1285  *              %GFP_USER   user allocation,
1286  *              %GFP_KERNEL kernel allocation,
1287  *              %GFP_HIGHMEM highmem allocation,
1288  *              %GFP_FS     don't call back into a file system.
1289  *              %GFP_ATOMIC don't sleep.
1290  *      @order: Power of two of allocation size in pages. 0 is a single page.
1291  *
1292  *      Allocate a page from the kernel page pool.  When not in
1293  *      interrupt context and apply the current process NUMA policy.
1294  *      Returns NULL when no page can be allocated.
1295  *
1296  *      Don't call cpuset_update_task_memory_state() unless
1297  *      1) it's ok to take cpuset_sem (can WAIT), and
1298  *      2) allocating for current task (not interrupt).
1299  */
1300 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1301 {
1302         struct mempolicy *pol = current->mempolicy;
1303
1304         if ((gfp & __GFP_WAIT) && !in_interrupt())
1305                 cpuset_update_task_memory_state();
1306         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1307                 pol = &default_policy;
1308         if (pol->policy == MPOL_INTERLEAVE)
1309                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1310         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1311 }
1312 EXPORT_SYMBOL(alloc_pages_current);
1313
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
void *cpuset_being_rebound;
1322
1323 /* Slow path of a mempolicy copy */
1324 struct mempolicy *__mpol_copy(struct mempolicy *old)
1325 {
1326         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1327
1328         if (!new)
1329                 return ERR_PTR(-ENOMEM);
1330         if (current_cpuset_is_being_rebound()) {
1331                 nodemask_t mems = cpuset_mems_allowed(current);
1332                 mpol_rebind_policy(old, &mems);
1333         }
1334         *new = *old;
1335         atomic_set(&new->refcnt, 1);
1336         if (new->policy == MPOL_BIND) {
1337                 int sz = ksize(old->v.zonelist);
1338                 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1339                 if (!new->v.zonelist) {
1340                         kmem_cache_free(policy_cache, new);
1341                         return ERR_PTR(-ENOMEM);
1342                 }
1343         }
1344         return new;
1345 }
1346
1347 /* Slow path of a mempolicy comparison */
1348 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1349 {
1350         if (!a || !b)
1351                 return 0;
1352         if (a->policy != b->policy)
1353                 return 0;
1354         switch (a->policy) {
1355         case MPOL_DEFAULT:
1356                 return 1;
1357         case MPOL_INTERLEAVE:
1358                 return nodes_equal(a->v.nodes, b->v.nodes);
1359         case MPOL_PREFERRED:
1360                 return a->v.preferred_node == b->v.preferred_node;
1361         case MPOL_BIND: {
1362                 int i;
1363                 for (i = 0; a->v.zonelist->zones[i]; i++)
1364                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1365                                 return 0;
1366                 return b->v.zonelist->zones[i] == NULL;
1367         }
1368         default:
1369                 BUG();
1370                 return 0;
1371         }
1372 }
1373
1374 /* Slow path of a mpol destructor. */
1375 void __mpol_free(struct mempolicy *p)
1376 {
1377         if (!atomic_dec_and_test(&p->refcnt))
1378                 return;
1379         if (p->policy == MPOL_BIND)
1380                 kfree(p->v.zonelist);
1381         p->policy = MPOL_DEFAULT;
1382         kmem_cache_free(policy_cache, p);
1383 }
1384
1385 /*
1386  * Shared memory backing store policy support.
1387  *
1388  * Remember policies even when nobody has shared memory mapped.
1389  * The policies are kept in Red-Black tree linked from the inode.
1390  * They are protected by the sp->lock spinlock, which should be held
1391  * for any accesses to the tree.
1392  */
1393
1394 /* lookup first element intersecting start-end */
1395 /* Caller holds sp->lock */
1396 static struct sp_node *
1397 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1398 {
1399         struct rb_node *n = sp->root.rb_node;
1400
1401         while (n) {
1402                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1403
1404                 if (start >= p->end)
1405                         n = n->rb_right;
1406                 else if (end <= p->start)
1407                         n = n->rb_left;
1408                 else
1409                         break;
1410         }
1411         if (!n)
1412                 return NULL;
1413         for (;;) {
1414                 struct sp_node *w = NULL;
1415                 struct rb_node *prev = rb_prev(n);
1416                 if (!prev)
1417                         break;
1418                 w = rb_entry(prev, struct sp_node, nd);
1419                 if (w->end <= start)
1420                         break;
1421                 n = prev;
1422         }
1423         return rb_entry(n, struct sp_node, nd);
1424 }
1425
1426 /* Insert a new shared policy into the list. */
1427 /* Caller holds sp->lock */
1428 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1429 {
1430         struct rb_node **p = &sp->root.rb_node;
1431         struct rb_node *parent = NULL;
1432         struct sp_node *nd;
1433
1434         while (*p) {
1435                 parent = *p;
1436                 nd = rb_entry(parent, struct sp_node, nd);
1437                 if (new->start < nd->start)
1438                         p = &(*p)->rb_left;
1439                 else if (new->end > nd->end)
1440                         p = &(*p)->rb_right;
1441                 else
1442                         BUG();
1443         }
1444         rb_link_node(&new->nd, parent, p);
1445         rb_insert_color(&new->nd, &sp->root);
1446         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1447                  new->policy ? new->policy->policy : 0);
1448 }
1449
1450 /* Find shared policy intersecting idx */
1451 struct mempolicy *
1452 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1453 {
1454         struct mempolicy *pol = NULL;
1455         struct sp_node *sn;
1456
1457         if (!sp->root.rb_node)
1458                 return NULL;
1459         spin_lock(&sp->lock);
1460         sn = sp_lookup(sp, idx, idx+1);
1461         if (sn) {
1462                 mpol_get(sn->policy);
1463                 pol = sn->policy;
1464         }
1465         spin_unlock(&sp->lock);
1466         return pol;
1467 }
1468
1469 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1470 {
1471         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1472         rb_erase(&n->nd, &sp->root);
1473         mpol_free(n->policy);
1474         kmem_cache_free(sn_cache, n);
1475 }
1476
1477 struct sp_node *
1478 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1479 {
1480         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1481
1482         if (!n)
1483                 return NULL;
1484         n->start = start;
1485         n->end = end;
1486         mpol_get(pol);
1487         n->policy = pol;
1488         return n;
1489 }
1490
1491 /* Replace a policy range. */
1492 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1493                                  unsigned long end, struct sp_node *new)
1494 {
1495         struct sp_node *n, *new2 = NULL;
1496
1497 restart:
1498         spin_lock(&sp->lock);
1499         n = sp_lookup(sp, start, end);
1500         /* Take care of old policies in the same range. */
1501         while (n && n->start < end) {
1502                 struct rb_node *next = rb_next(&n->nd);
1503                 if (n->start >= start) {
1504                         if (n->end <= end)
1505                                 sp_delete(sp, n);
1506                         else
1507                                 n->start = end;
1508                 } else {
1509                         /* Old policy spanning whole new range. */
1510                         if (n->end > end) {
1511                                 if (!new2) {
1512                                         spin_unlock(&sp->lock);
1513                                         new2 = sp_alloc(end, n->end, n->policy);
1514                                         if (!new2)
1515                                                 return -ENOMEM;
1516                                         goto restart;
1517                                 }
1518                                 n->end = start;
1519                                 sp_insert(sp, new2);
1520                                 new2 = NULL;
1521                                 break;
1522                         } else
1523                                 n->end = start;
1524                 }
1525                 if (!next)
1526                         break;
1527                 n = rb_entry(next, struct sp_node, nd);
1528         }
1529         if (new)
1530                 sp_insert(sp, new);
1531         spin_unlock(&sp->lock);
1532         if (new2) {
1533                 mpol_free(new2->policy);
1534                 kmem_cache_free(sn_cache, new2);
1535         }
1536         return 0;
1537 }
1538
1539 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1540                                 nodemask_t *policy_nodes)
1541 {
1542         info->root = RB_ROOT;
1543         spin_lock_init(&info->lock);
1544
1545         if (policy != MPOL_DEFAULT) {
1546                 struct mempolicy *newpol;
1547
1548                 /* Falls back to MPOL_DEFAULT on any error */
1549                 newpol = mpol_new(policy, policy_nodes);
1550                 if (!IS_ERR(newpol)) {
1551                         /* Create pseudo-vma that contains just the policy */
1552                         struct vm_area_struct pvma;
1553
1554                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1555                         /* Policy covers entire file */
1556                         pvma.vm_end = TASK_SIZE;
1557                         mpol_set_shared_policy(info, &pvma, newpol);
1558                         mpol_free(newpol);
1559                 }
1560         }
1561 }
1562
1563 int mpol_set_shared_policy(struct shared_policy *info,
1564                         struct vm_area_struct *vma, struct mempolicy *npol)
1565 {
1566         int err;
1567         struct sp_node *new = NULL;
1568         unsigned long sz = vma_pages(vma);
1569
1570         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1571                  vma->vm_pgoff,
1572                  sz, npol? npol->policy : -1,
1573                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1574
1575         if (npol) {
1576                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1577                 if (!new)
1578                         return -ENOMEM;
1579         }
1580         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1581         if (err && new)
1582                 kmem_cache_free(sn_cache, new);
1583         return err;
1584 }
1585
1586 /* Free a backing policy store on inode delete. */
1587 void mpol_free_shared_policy(struct shared_policy *p)
1588 {
1589         struct sp_node *n;
1590         struct rb_node *next;
1591
1592         if (!p->root.rb_node)
1593                 return;
1594         spin_lock(&p->lock);
1595         next = rb_first(&p->root);
1596         while (next) {
1597                 n = rb_entry(next, struct sp_node, nd);
1598                 next = rb_next(&n->nd);
1599                 rb_erase(&n->nd, &p->root);
1600                 mpol_free(n->policy);
1601                 kmem_cache_free(sn_cache, n);
1602         }
1603         spin_unlock(&p->lock);
1604 }
1605
1606 /* assumes fs == KERNEL_DS */
1607 void __init numa_policy_init(void)
1608 {
1609         policy_cache = kmem_cache_create("numa_policy",
1610                                          sizeof(struct mempolicy),
1611                                          0, SLAB_PANIC, NULL, NULL);
1612
1613         sn_cache = kmem_cache_create("shared_policy_node",
1614                                      sizeof(struct sp_node),
1615                                      0, SLAB_PANIC, NULL, NULL);
1616
1617         /* Set interleaving policy for system init. This way not all
1618            the data structures allocated at system boot end up in node zero. */
1619
1620         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1621                 printk("numa_policy_init: interleaving failed\n");
1622 }
1623
1624 /* Reset policy of current process to default */
1625 void numa_default_policy(void)
1626 {
1627         do_set_mempolicy(MPOL_DEFAULT, NULL);
1628 }
1629
1630 /* Migrate a policy to a different set of nodes */
1631 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1632 {
1633         nodemask_t *mpolmask;
1634         nodemask_t tmp;
1635
1636         if (!pol)
1637                 return;
1638         mpolmask = &pol->cpuset_mems_allowed;
1639         if (nodes_equal(*mpolmask, *newmask))
1640                 return;
1641
1642         switch (pol->policy) {
1643         case MPOL_DEFAULT:
1644                 break;
1645         case MPOL_INTERLEAVE:
1646                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1647                 pol->v.nodes = tmp;
1648                 *mpolmask = *newmask;
1649                 current->il_next = node_remap(current->il_next,
1650                                                 *mpolmask, *newmask);
1651                 break;
1652         case MPOL_PREFERRED:
1653                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1654                                                 *mpolmask, *newmask);
1655                 *mpolmask = *newmask;
1656                 break;
1657         case MPOL_BIND: {
1658                 nodemask_t nodes;
1659                 struct zone **z;
1660                 struct zonelist *zonelist;
1661
1662                 nodes_clear(nodes);
1663                 for (z = pol->v.zonelist->zones; *z; z++)
1664                         node_set(zone_to_nid(*z), nodes);
1665                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1666                 nodes = tmp;
1667
1668                 zonelist = bind_zonelist(&nodes);
1669
1670                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1671                  * If that old zonelist has no remaining mems_allowed nodes,
1672                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1673                  */
1674
1675                 if (!IS_ERR(zonelist)) {
1676                         /* Good - got mem - substitute new zonelist */
1677                         kfree(pol->v.zonelist);
1678                         pol->v.zonelist = zonelist;
1679                 }
1680                 *mpolmask = *newmask;
1681                 break;
1682         }
1683         default:
1684                 BUG();
1685                 break;
1686         }
1687 }
1688
1689 /*
1690  * Wrapper for mpol_rebind_policy() that just requires task
1691  * pointer, and updates task mempolicy.
1692  */
1693
1694 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1695 {
1696         mpol_rebind_policy(tsk->mempolicy, new);
1697 }
1698
1699 /*
1700  * Rebind each vma in mm to new nodemask.
1701  *
1702  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1703  */
1704
1705 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1706 {
1707         struct vm_area_struct *vma;
1708
1709         down_write(&mm->mmap_sem);
1710         for (vma = mm->mmap; vma; vma = vma->vm_next)
1711                 mpol_rebind_policy(vma->vm_policy, new);
1712         up_write(&mm->mmap_sem);
1713 }
1714
1715 /*
1716  * Display pages allocated per node and memory policy via /proc.
1717  */
1718
/* Printable policy names; mpol_to_str() indexes this table by policy mode. */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1721
1722 /*
1723  * Convert a mempolicy into a string.
1724  * Returns the number of characters in buffer (if positive)
1725  * or an error (negative)
1726  */
1727 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1728 {
1729         char *p = buffer;
1730         int l;
1731         nodemask_t nodes;
1732         int mode = pol ? pol->policy : MPOL_DEFAULT;
1733
1734         switch (mode) {
1735         case MPOL_DEFAULT:
1736                 nodes_clear(nodes);
1737                 break;
1738
1739         case MPOL_PREFERRED:
1740                 nodes_clear(nodes);
1741                 node_set(pol->v.preferred_node, nodes);
1742                 break;
1743
1744         case MPOL_BIND:
1745                 get_zonemask(pol, &nodes);
1746                 break;
1747
1748         case MPOL_INTERLEAVE:
1749                 nodes = pol->v.nodes;
1750                 break;
1751
1752         default:
1753                 BUG();
1754                 return -EFAULT;
1755         }
1756
1757         l = strlen(policy_types[mode]);
1758         if (buffer + maxlen < p + l + 1)
1759                 return -ENOSPC;
1760
1761         strcpy(p, policy_types[mode]);
1762         p += l;
1763
1764         if (!nodes_empty(nodes)) {
1765                 if (buffer + maxlen < p + 2)
1766                         return -ENOSPC;
1767                 *p++ = '=';
1768                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1769         }
1770         return p - buffer;
1771 }
1772
/*
 * Per-vma page statistics accumulated by gather_stats() and printed by
 * show_numa_map().
 */
struct numa_maps {
        unsigned long pages;            /* every page passed to gather_stats() */
        unsigned long anon;             /* pages for which PageAnon() is set */
        unsigned long active;           /* pages for which PageActive() is set */
        unsigned long writeback;        /* pages for which PageWriteback() is set */
        unsigned long mapcount_max;     /* largest page_mapcount() seen */
        unsigned long dirty;            /* pages with a dirty pte or PageDirty() */
        unsigned long swapcache;        /* pages for which PageSwapCache() is set */
        unsigned long node[MAX_NUMNODES];       /* page count per page_to_nid() */
};
1783
1784 static void gather_stats(struct page *page, void *private, int pte_dirty)
1785 {
1786         struct numa_maps *md = private;
1787         int count = page_mapcount(page);
1788
1789         md->pages++;
1790         if (pte_dirty || PageDirty(page))
1791                 md->dirty++;
1792
1793         if (PageSwapCache(page))
1794                 md->swapcache++;
1795
1796         if (PageActive(page))
1797                 md->active++;
1798
1799         if (PageWriteback(page))
1800                 md->writeback++;
1801
1802         if (PageAnon(page))
1803                 md->anon++;
1804
1805         if (count > md->mapcount_max)
1806                 md->mapcount_max = count;
1807
1808         md->node[page_to_nid(page)]++;
1809 }
1810
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Account every populated huge page mapped in [start, end) of @vma into
 * @md, stepping through the range HPAGE_SIZE at a time.  Addresses with no
 * huge pte, an empty pte, or no backing page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                /*
                 * Read the entry once and use that snapshot for every
                 * decision below, so the page and its dirty state are
                 * taken from the same pte value.
                 */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* Built without CONFIG_HUGETLB_PAGE: nothing to account. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1844
1845 int show_numa_map(struct seq_file *m, void *v)
1846 {
1847         struct proc_maps_private *priv = m->private;
1848         struct vm_area_struct *vma = v;
1849         struct numa_maps *md;
1850         struct file *file = vma->vm_file;
1851         struct mm_struct *mm = vma->vm_mm;
1852         int n;
1853         char buffer[50];
1854
1855         if (!mm)
1856                 return 0;
1857
1858         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1859         if (!md)
1860                 return 0;
1861
1862         mpol_to_str(buffer, sizeof(buffer),
1863                             get_vma_policy(priv->task, vma, vma->vm_start));
1864
1865         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1866
1867         if (file) {
1868                 seq_printf(m, " file=");
1869                 seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1870         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1871                 seq_printf(m, " heap");
1872         } else if (vma->vm_start <= mm->start_stack &&
1873                         vma->vm_end >= mm->start_stack) {
1874                 seq_printf(m, " stack");
1875         }
1876
1877         if (is_vm_hugetlb_page(vma)) {
1878                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1879                 seq_printf(m, " huge");
1880         } else {
1881                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1882                                 &node_online_map, MPOL_MF_STATS, md);
1883         }
1884
1885         if (!md->pages)
1886                 goto out;
1887
1888         if (md->anon)
1889                 seq_printf(m," anon=%lu",md->anon);
1890
1891         if (md->dirty)
1892                 seq_printf(m," dirty=%lu",md->dirty);
1893
1894         if (md->pages != md->anon && md->pages != md->dirty)
1895                 seq_printf(m, " mapped=%lu", md->pages);
1896
1897         if (md->mapcount_max > 1)
1898                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1899
1900         if (md->swapcache)
1901                 seq_printf(m," swapcache=%lu", md->swapcache);
1902
1903         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1904                 seq_printf(m," active=%lu", md->active);
1905
1906         if (md->writeback)
1907                 seq_printf(m," writeback=%lu", md->writeback);
1908
1909         for_each_online_node(n)
1910                 if (md->node[n])
1911                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1912 out:
1913         seq_putc(m, '\n');
1914         kfree(md);
1915
1916         if (m->count < m->size)
1917                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1918         return 0;
1919 }
1920