git.oblomov.eu Git - linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 /* Highest zone. An specific allocation for a zone below that is not
 105    policied. */
 106 enum zone_type policy_zone = 0;
 107
 108 /*
 109  * run-time system-wide default policy => local allocation
 110  */
 111 struct mempolicy default_policy = {
 112         .refcnt = ATOMIC_INIT(1), /* never free it */
 113         .mode = MPOL_PREFERRED,
 114         .flags = MPOL_F_LOCAL,
 115 };
 116
 117 static const struct mempolicy_operations {
 118         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 119         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 120 } mpol_ops[MPOL_MAX];
 121
 122 /* Check that the nodemask contains at least one populated zone */
 123 static int is_valid_nodemask(const nodemask_t *nodemask)
 124 {
 125         int nd, k;
 126
 127         /* Check that there is something useful in this mask */
 128         k = policy_zone;
 129
 130         for_each_node_mask(nd, *nodemask) {
 131                 struct zone *z;
 132
 133                 for (k = 0; k <= policy_zone; k++) {
 134                         z = &NODE_DATA(nd)->node_zones[k];
 135                         if (z->present_pages > 0)
 136                                 return 1;
 137                 }
 138         }
 139
 140         return 0;
 141 }
 142
 143 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144 {
 145         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146 }
 147
 148 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                    const nodemask_t *rel)
 150 {
 151         nodemask_t tmp;
 152         nodes_fold(tmp, *orig, nodes_weight(*rel));
 153         nodes_onto(*ret, tmp, *rel);
 154 }
 155
 156 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157 {
 158         if (nodes_empty(*nodes))
 159                 return -EINVAL;
 160         pol->v.nodes = *nodes;
 161         return 0;
 162 }
 163
 164 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165 {
 166         if (!nodes)
 167                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168         else if (nodes_empty(*nodes))
 169                 return -EINVAL;                 /*  no allowed nodes */
 170         else
 171                 pol->v.preferred_node = first_node(*nodes);
 172         return 0;
 173 }
 174
 175 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176 {
 177         if (!is_valid_nodemask(nodes))
 178                 return -EINVAL;
 179         pol->v.nodes = *nodes;
 180         return 0;
 181 }
 182
 183 /* Create a new policy */
 184 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                   nodemask_t *nodes)
 186 {
 187         struct mempolicy *policy;
 188         nodemask_t cpuset_context_nmask;
 189         int ret;
 190
 191         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194         if (mode == MPOL_DEFAULT) {
 195                 if (nodes && !nodes_empty(*nodes))
 196                         return ERR_PTR(-EINVAL);
 197                 return NULL;    /* simply delete any existing policy */
 198         }
 199         VM_BUG_ON(!nodes);
 200
 201         /*
 202          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204          * All other modes require a valid pointer to a non-empty nodemask.
 205          */
 206         if (mode == MPOL_PREFERRED) {
 207                 if (nodes_empty(*nodes)) {
 208                         if (((flags & MPOL_F_STATIC_NODES) ||
 209                              (flags & MPOL_F_RELATIVE_NODES)))
 210                                 return ERR_PTR(-EINVAL);
 211                         nodes = NULL;   /* flag local alloc */
 212                 }
 213         } else if (nodes_empty(*nodes))
 214                 return ERR_PTR(-EINVAL);
 215         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216         if (!policy)
 217                 return ERR_PTR(-ENOMEM);
 218         atomic_set(&policy->refcnt, 1);
 219         policy->mode = mode;
 220         policy->flags = flags;
 221
 222         if (nodes) {
 223                 /*
 224                  * cpuset related setup doesn't apply to local allocation
 225                  */
 226                 cpuset_update_task_memory_state();
 227                 if (flags & MPOL_F_RELATIVE_NODES)
 228                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                                &cpuset_current_mems_allowed);
 230                 else
 231                         nodes_and(cpuset_context_nmask, *nodes,
 232                                   cpuset_current_mems_allowed);
 233                 if (mpol_store_user_nodemask(policy))
 234                         policy->w.user_nodemask = *nodes;
 235                 else
 236                         policy->w.cpuset_mems_allowed =
 237                                                 cpuset_mems_allowed(current);
 238         }
 239
 240         ret = mpol_ops[mode].create(policy,
 241                                 nodes ? &cpuset_context_nmask : NULL);
 242         if (ret < 0) {
 243                 kmem_cache_free(policy_cache, policy);
 244                 return ERR_PTR(ret);
 245         }
 246         return policy;
 247 }
 248
 249 /* Slow path of a mpol destructor. */
 250 void __mpol_put(struct mempolicy *p)
 251 {
 252         if (!atomic_dec_and_test(&p->refcnt))
 253                 return;
 254         kmem_cache_free(policy_cache, p);
 255 }
 256
 257 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 258 {
 259 }
 260
 261 static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                  const nodemask_t *nodes)
 263 {
 264         nodemask_t tmp;
 265
 266         if (pol->flags & MPOL_F_STATIC_NODES)
 267                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270         else {
 271                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                             *nodes);
 273                 pol->w.cpuset_mems_allowed = *nodes;
 274         }
 275
 276         pol->v.nodes = tmp;
 277         if (!node_isset(current->il_next, tmp)) {
 278                 current->il_next = next_node(current->il_next, tmp);
 279                 if (current->il_next >= MAX_NUMNODES)
 280                         current->il_next = first_node(tmp);
 281                 if (current->il_next >= MAX_NUMNODES)
 282                         current->il_next = numa_node_id();
 283         }
 284 }
 285
 286 static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                   const nodemask_t *nodes)
 288 {
 289         nodemask_t tmp;
 290
 291         if (pol->flags & MPOL_F_STATIC_NODES) {
 292                 int node = first_node(pol->w.user_nodemask);
 293
 294                 if (node_isset(node, *nodes)) {
 295                         pol->v.preferred_node = node;
 296                         pol->flags &= ~MPOL_F_LOCAL;
 297                 } else
 298                         pol->flags |= MPOL_F_LOCAL;
 299         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                 pol->v.preferred_node = first_node(tmp);
 302         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                    pol->w.cpuset_mems_allowed,
 305                                                    *nodes);
 306                 pol->w.cpuset_mems_allowed = *nodes;
 307         }
 308 }
 309
 310 /* Migrate a policy to a different set of nodes */
 311 static void mpol_rebind_policy(struct mempolicy *pol,
 312                                const nodemask_t *newmask)
 313 {
 314         if (!pol)
 315                 return;
 316         if (!mpol_store_user_nodemask(pol) &&
 317             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                 return;
 319         mpol_ops[pol->mode].rebind(pol, newmask);
 320 }
 321
 322 /*
 323  * Wrapper for mpol_rebind_policy() that just requires task
 324  * pointer, and updates task mempolicy.
 325  */
 326
 327 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328 {
 329         mpol_rebind_policy(tsk->mempolicy, new);
 330 }
 331
 332 /*
 333  * Rebind each vma in mm to new nodemask.
 334  *
 335  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336  */
 337
 338 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339 {
 340         struct vm_area_struct *vma;
 341
 342         down_write(&mm->mmap_sem);
 343         for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                 mpol_rebind_policy(vma->vm_policy, new);
 345         up_write(&mm->mmap_sem);
 346 }
 347
 348 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349         [MPOL_DEFAULT] = {
 350                 .rebind = mpol_rebind_default,
 351         },
 352         [MPOL_INTERLEAVE] = {
 353                 .create = mpol_new_interleave,
 354                 .rebind = mpol_rebind_nodemask,
 355         },
 356         [MPOL_PREFERRED] = {
 357                 .create = mpol_new_preferred,
 358                 .rebind = mpol_rebind_preferred,
 359         },
 360         [MPOL_BIND] = {
 361                 .create = mpol_new_bind,
 362                 .rebind = mpol_rebind_nodemask,
 363         },
 364 };
 365
 366 static void gather_stats(struct page *, void *, int pte_dirty);
 367 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                 unsigned long flags);
 369
 370 /* Scan through pages checking if pages follow certain conditions. */
 371 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                 unsigned long addr, unsigned long end,
 373                 const nodemask_t *nodes, unsigned long flags,
 374                 void *private)
 375 {
 376         pte_t *orig_pte;
 377         pte_t *pte;
 378         spinlock_t *ptl;
 379
 380         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381         do {
 382                 struct page *page;
 383                 int nid;
 384
 385                 if (!pte_present(*pte))
 386                         continue;
 387                 page = vm_normal_page(vma, addr, *pte);
 388                 if (!page)
 389                         continue;
 390                 /*
 391                  * The check for PageReserved here is important to avoid
 392                  * handling zero pages and other pages that may have been
 393                  * marked special by the system.
 394                  *
 395                  * If the PageReserved would not be checked here then f.e.
 396                  * the location of the zero page could have an influence
 397                  * on MPOL_MF_STRICT, zero pages would be counted for
 398                  * the per node stats, and there would be useless attempts
 399                  * to put zero pages on the migration list.
 400                  */
 401                 if (PageReserved(page))
 402                         continue;
 403                 nid = page_to_nid(page);
 404                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                         continue;
 406
 407                 if (flags & MPOL_MF_STATS)
 408                         gather_stats(page, private, pte_dirty(*pte));
 409                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                         migrate_page_add(page, private, flags);
 411                 else
 412                         break;
 413         } while (pte++, addr += PAGE_SIZE, addr != end);
 414         pte_unmap_unlock(orig_pte, ptl);
 415         return addr != end;
 416 }
 417
 418 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                 unsigned long addr, unsigned long end,
 420                 const nodemask_t *nodes, unsigned long flags,
 421                 void *private)
 422 {
 423         pmd_t *pmd;
 424         unsigned long next;
 425
 426         pmd = pmd_offset(pud, addr);
 427         do {
 428                 next = pmd_addr_end(addr, end);
 429                 if (pmd_none_or_clear_bad(pmd))
 430                         continue;
 431                 if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                     flags, private))
 433                         return -EIO;
 434         } while (pmd++, addr = next, addr != end);
 435         return 0;
 436 }
 437
 438 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pud_t *pud;
 444         unsigned long next;
 445
 446         pud = pud_offset(pgd, addr);
 447         do {
 448                 next = pud_addr_end(addr, end);
 449                 if (pud_none_or_clear_bad(pud))
 450                         continue;
 451                 if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pud++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pgd_range(struct vm_area_struct *vma,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pgd_t *pgd;
 464         unsigned long next;
 465
 466         pgd = pgd_offset(vma->vm_mm, addr);
 467         do {
 468                 next = pgd_addr_end(addr, end);
 469                 if (pgd_none_or_clear_bad(pgd))
 470                         continue;
 471                 if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pgd++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 /*
 479  * Check if all pages in a range are on a set of nodes.
 480  * If pagelist != NULL then isolate pages from the LRU and
 481  * put them on the pagelist.
 482  */
 483 static struct vm_area_struct *
 484 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                 const nodemask_t *nodes, unsigned long flags, void *private)
 486 {
 487         int err;
 488         struct vm_area_struct *first, *vma, *prev;
 489
 490         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                 err = migrate_prep();
 493                 if (err)
 494                         return ERR_PTR(err);
 495         }
 496
 497         first = find_vma(mm, start);
 498         if (!first)
 499                 return ERR_PTR(-EFAULT);
 500         prev = NULL;
 501         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                         if (!vma->vm_next && vma->vm_end < end)
 504                                 return ERR_PTR(-EFAULT);
 505                         if (prev && prev->vm_end < vma->vm_start)
 506                                 return ERR_PTR(-EFAULT);
 507                 }
 508                 if (!is_vm_hugetlb_page(vma) &&
 509                     ((flags & MPOL_MF_STRICT) ||
 510                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                 vma_migratable(vma)))) {
 512                         unsigned long endvma = vma->vm_end;
 513
 514                         if (endvma > end)
 515                                 endvma = end;
 516                         if (vma->vm_start > start)
 517                                 start = vma->vm_start;
 518                         err = check_pgd_range(vma, start, endvma, nodes,
 519                                                 flags, private);
 520                         if (err) {
 521                                 first = ERR_PTR(err);
 522                                 break;
 523                         }
 524                 }
 525                 prev = vma;
 526         }
 527         return first;
 528 }
 529
 530 /* Apply policy to a single VMA */
 531 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532 {
 533         int err = 0;
 534         struct mempolicy *old = vma->vm_policy;
 535
 536         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                  vma->vm_ops, vma->vm_file,
 539                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541         if (vma->vm_ops && vma->vm_ops->set_policy)
 542                 err = vma->vm_ops->set_policy(vma, new);
 543         if (!err) {
 544                 mpol_get(new);
 545                 vma->vm_policy = new;
 546                 mpol_put(old);
 547         }
 548         return err;
 549 }
 550
 551 /* Step 2: apply policy to a range and do splits. */
 552 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                        unsigned long end, struct mempolicy *new)
 554 {
 555         struct vm_area_struct *next;
 556         int err;
 557
 558         err = 0;
 559         for (; vma && vma->vm_start < end; vma = next) {
 560                 next = vma->vm_next;
 561                 if (vma->vm_start < start)
 562                         err = split_vma(vma->vm_mm, vma, start, 1);
 563                 if (!err && vma->vm_end > end)
 564                         err = split_vma(vma->vm_mm, vma, end, 0);
 565                 if (!err)
 566                         err = policy_vma(vma, new);
 567                 if (err)
 568                         break;
 569         }
 570         return err;
 571 }
 572
 573 /*
 574  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575  * mempolicy.  Allows more rapid checking of this (combined perhaps
 576  * with other PF_* flag bits) on memory allocation hot code paths.
 577  *
 578  * If called from outside this file, the task 'p' should -only- be
 579  * a newly forked child not yet visible on the task list, because
 580  * manipulating the task flags of a visible task is not safe.
 581  *
 582  * The above limitation is why this routine has the funny name
 583  * mpol_fix_fork_child_flag().
 584  *
 585  * It is also safe to call this with a task pointer of current,
 586  * which the static wrapper mpol_set_task_struct_flag() does,
 587  * for use within this file.
 588  */
 589
 590 void mpol_fix_fork_child_flag(struct task_struct *p)
 591 {
 592         if (p->mempolicy)
 593                 p->flags |= PF_MEMPOLICY;
 594         else
 595                 p->flags &= ~PF_MEMPOLICY;
 596 }
 597
 598 static void mpol_set_task_struct_flag(void)
 599 {
 600         mpol_fix_fork_child_flag(current);
 601 }
 602
 603 /* Set the process memory policy */
 604 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                              nodemask_t *nodes)
 606 {
 607         struct mempolicy *new;
 608         struct mm_struct *mm = current->mm;
 609
 610         new = mpol_new(mode, flags, nodes);
 611         if (IS_ERR(new))
 612                 return PTR_ERR(new);
 613
 614         /*
 615          * prevent changing our mempolicy while show_numa_maps()
 616          * is using it.
 617          * Note:  do_set_mempolicy() can be called at init time
 618          * with no 'mm'.
 619          */
 620         if (mm)
 621                 down_write(&mm->mmap_sem);
 622         mpol_put(current->mempolicy);
 623         current->mempolicy = new;
 624         mpol_set_task_struct_flag();
 625         if (new && new->mode == MPOL_INTERLEAVE &&
 626             nodes_weight(new->v.nodes))
 627                 current->il_next = first_node(new->v.nodes);
 628         if (mm)
 629                 up_write(&mm->mmap_sem);
 630
 631         return 0;
 632 }
 633
 634 /*
 635  * Return nodemask for policy for get_mempolicy() query
 636  */
 637 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638 {
 639         nodes_clear(*nodes);
 640         if (p == &default_policy)
 641                 return;
 642
 643         switch (p->mode) {
 644         case MPOL_BIND:
 645                 /* Fall through */
 646         case MPOL_INTERLEAVE:
 647                 *nodes = p->v.nodes;
 648                 break;
 649         case MPOL_PREFERRED:
 650                 if (!(p->flags & MPOL_F_LOCAL))
 651                         node_set(p->v.preferred_node, *nodes);
 652                 /* else return empty node mask for local allocation */
 653                 break;
 654         default:
 655                 BUG();
 656         }
 657 }
 658
 659 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660 {
 661         struct page *p;
 662         int err;
 663
 664         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665         if (err >= 0) {
 666                 err = page_to_nid(p);
 667                 put_page(p);
 668         }
 669         return err;
 670 }
 671
 672 /* Retrieve NUMA policy */
 673 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                              unsigned long addr, unsigned long flags)
 675 {
 676         int err;
 677         struct mm_struct *mm = current->mm;
 678         struct vm_area_struct *vma = NULL;
 679         struct mempolicy *pol = current->mempolicy;
 680
 681         cpuset_update_task_memory_state();
 682         if (flags &
 683                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                 return -EINVAL;
 685
 686         if (flags & MPOL_F_MEMS_ALLOWED) {
 687                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                         return -EINVAL;
 689                 *policy = 0;    /* just so it's initialized */
 690                 *nmask  = cpuset_current_mems_allowed;
 691                 return 0;
 692         }
 693
 694         if (flags & MPOL_F_ADDR) {
 695                 /*
 696                  * Do NOT fall back to task policy if the
 697                  * vma/shared policy at addr is NULL.  We
 698                  * want to return MPOL_DEFAULT in this case.
 699                  */
 700                 down_read(&mm->mmap_sem);
 701                 vma = find_vma_intersection(mm, addr, addr+1);
 702                 if (!vma) {
 703                         up_read(&mm->mmap_sem);
 704                         return -EFAULT;
 705                 }
 706                 if (vma->vm_ops && vma->vm_ops->get_policy)
 707                         pol = vma->vm_ops->get_policy(vma, addr);
 708                 else
 709                         pol = vma->vm_policy;
 710         } else if (addr)
 711                 return -EINVAL;
 712
 713         if (!pol)
 714                 pol = &default_policy;  /* indicates default behavior */
 715
 716         if (flags & MPOL_F_NODE) {
 717                 if (flags & MPOL_F_ADDR) {
 718                         err = lookup_node(mm, addr);
 719                         if (err < 0)
 720                                 goto out;
 721                         *policy = err;
 722                 } else if (pol == current->mempolicy &&
 723                                 pol->mode == MPOL_INTERLEAVE) {
 724                         *policy = current->il_next;
 725                 } else {
 726                         err = -EINVAL;
 727                         goto out;
 728                 }
 729         } else {
 730                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                 pol->mode;
 732                 /*
 733                  * Internal mempolicy flags must be masked off before exposing
 734                  * the policy to userspace.
 735                  */
 736                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 737         }
 738
 739         if (vma) {
 740                 up_read(&current->mm->mmap_sem);
 741                 vma = NULL;
 742         }
 743
 744         err = 0;
 745         if (nmask)
 746                 get_policy_nodemask(pol, nmask);
 747
 748  out:
 749         mpol_cond_put(pol);
 750         if (vma)
 751                 up_read(&current->mm->mmap_sem);
 752         return err;
 753 }
 754
 755 #ifdef CONFIG_MIGRATION
 756 /*
 757  * page migration
 758  */
 759 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 760                                 unsigned long flags)
 761 {
 762         /*
 763          * Avoid migrating a page that is shared with others.
 764          */
 765         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 766                 isolate_lru_page(page, pagelist);
 767 }
 768
 769 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 770 {
 771         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 772 }
 773
 774 /*
 775  * Migrate pages from one node to a target node.
 776  * Returns error or the number of pages not migrated.
 777  */
 778 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 779                            int flags)
 780 {
 781         nodemask_t nmask;
 782         LIST_HEAD(pagelist);
 783         int err = 0;
 784
 785         nodes_clear(nmask);
 786         node_set(source, nmask);
 787
 788         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 789                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 790
 791         if (!list_empty(&pagelist))
 792                 err = migrate_pages(&pagelist, new_node_page, dest);
 793
 794         return err;
 795 }
 796
 797 /*
 798  * Move pages between the two nodesets so as to preserve the physical
 799  * layout as much as possible.
 800  *
  801  * Returns the number of pages that could not be moved.
 802  */
 803 int do_migrate_pages(struct mm_struct *mm,
 804         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 805 {
 806         int busy = 0;
 807         int err = 0;
 808         nodemask_t tmp;
 809
 810         down_read(&mm->mmap_sem);
 811
 812         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 813         if (err)
 814                 goto out;
 815
 816 /*
 817  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 818  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 819  * bit in 'tmp', and return that <source, dest> pair for migration.
 820  * The pair of nodemasks 'to' and 'from' define the map.
 821  *
 822  * If no pair of bits is found that way, fallback to picking some
 823  * pair of 'source' and 'dest' bits that are not the same.  If the
 824  * 'source' and 'dest' bits are the same, this represents a node
 825  * that will be migrating to itself, so no pages need move.
 826  *
 827  * If no bits are left in 'tmp', or if all remaining bits left
 828  * in 'tmp' correspond to the same bit in 'to', return false
 829  * (nothing left to migrate).
 830  *
 831  * This lets us pick a pair of nodes to migrate between, such that
 832  * if possible the dest node is not already occupied by some other
 833  * source node, minimizing the risk of overloading the memory on a
 834  * node that would happen if we migrated incoming memory to a node
 835  * before migrating outgoing memory source that same node.
 836  *
 837  * A single scan of tmp is sufficient.  As we go, we remember the
 838  * most recent <s, d> pair that moved (s != d).  If we find a pair
 839  * that not only moved, but what's better, moved to an empty slot
 840  * (d is not set in tmp), then we break out then, with that pair.
 841  * Otherwise when we finish scannng from_tmp, we at least have the
 842  * most recent <s, d> pair that moved.  If we get all the way through
 843  * the scan of tmp without finding any node that moved, much less
 844  * moved to an empty node, then there is nothing left worth migrating.
 845  */
 846
 847         tmp = *from_nodes;
 848         while (!nodes_empty(tmp)) {
 849                 int s,d;
 850                 int source = -1;
 851                 int dest = 0;
 852
 853                 for_each_node_mask(s, tmp) {
 854                         d = node_remap(s, *from_nodes, *to_nodes);
 855                         if (s == d)
 856                                 continue;
 857
 858                         source = s;     /* Node moved. Memorize */
 859                         dest = d;
 860
 861                         /* dest not in remaining from nodes? */
 862                         if (!node_isset(dest, tmp))
 863                                 break;
 864                 }
 865                 if (source == -1)
 866                         break;
 867
 868                 node_clear(source, tmp);
 869                 err = migrate_to_node(mm, source, dest, flags);
 870                 if (err > 0)
 871                         busy += err;
 872                 if (err < 0)
 873                         break;
 874         }
 875 out:
 876         up_read(&mm->mmap_sem);
 877         if (err < 0)
 878                 return err;
 879         return busy;
 880
 881 }
 882
 883 /*
 884  * Allocate a new page for page migration based on vma policy.
 885  * Start assuming that page is mapped by vma pointed to by @private.
 886  * Search forward from there, if not.  N.B., this assumes that the
 887  * list of pages handed to migrate_pages()--which is how we get here--
 888  * is in virtual address order.
 889  */
 890 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 891 {
 892         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 893         unsigned long uninitialized_var(address);
 894
 895         while (vma) {
 896                 address = page_address_in_vma(page, vma);
 897                 if (address != -EFAULT)
 898                         break;
 899                 vma = vma->vm_next;
 900         }
 901
 902         /*
 903          * if !vma, alloc_page_vma() will use task or system default policy
 904          */
 905         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 906 }
 907 #else
 908
 909 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 910                                 unsigned long flags)
 911 {
 912 }
 913
 914 int do_migrate_pages(struct mm_struct *mm,
 915         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 916 {
 917         return -ENOSYS;
 918 }
 919
 920 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 921 {
 922         return NULL;
 923 }
 924 #endif
 925
 926 static long do_mbind(unsigned long start, unsigned long len,
 927                      unsigned short mode, unsigned short mode_flags,
 928                      nodemask_t *nmask, unsigned long flags)
 929 {
 930         struct vm_area_struct *vma;
 931         struct mm_struct *mm = current->mm;
 932         struct mempolicy *new;
 933         unsigned long end;
 934         int err;
 935         LIST_HEAD(pagelist);
 936
 937         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 938                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 939                 return -EINVAL;
 940         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 941                 return -EPERM;
 942
 943         if (start & ~PAGE_MASK)
 944                 return -EINVAL;
 945
 946         if (mode == MPOL_DEFAULT)
 947                 flags &= ~MPOL_MF_STRICT;
 948
 949         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 950         end = start + len;
 951
 952         if (end < start)
 953                 return -EINVAL;
 954         if (end == start)
 955                 return 0;
 956
 957         new = mpol_new(mode, mode_flags, nmask);
 958         if (IS_ERR(new))
 959                 return PTR_ERR(new);
 960
 961         /*
 962          * If we are using the default policy then operation
 963          * on discontinuous address spaces is okay after all
 964          */
 965         if (!new)
 966                 flags |= MPOL_MF_DISCONTIG_OK;
 967
 968         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 969                  start, start + len, mode, mode_flags,
 970                  nmask ? nodes_addr(*nmask)[0] : -1);
 971
 972         down_write(&mm->mmap_sem);
 973         vma = check_range(mm, start, end, nmask,
 974                           flags | MPOL_MF_INVERT, &pagelist);
 975
 976         err = PTR_ERR(vma);
 977         if (!IS_ERR(vma)) {
 978                 int nr_failed = 0;
 979
 980                 err = mbind_range(vma, start, end, new);
 981
 982                 if (!list_empty(&pagelist))
 983                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 984                                                 (unsigned long)vma);
 985
 986                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 987                         err = -EIO;
 988         }
 989
 990         up_write(&mm->mmap_sem);
 991         mpol_put(new);
 992         return err;
 993 }
 994
 995 /*
 996  * User space interface with variable sized bitmaps for nodelists.
 997  */
 998
 999 /* Copy a node mask from user space. */
1000 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1001                      unsigned long maxnode)
1002 {
1003         unsigned long k;
1004         unsigned long nlongs;
1005         unsigned long endmask;
1006
1007         --maxnode;
1008         nodes_clear(*nodes);
1009         if (maxnode == 0 || !nmask)
1010                 return 0;
1011         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1012                 return -EINVAL;
1013
1014         nlongs = BITS_TO_LONGS(maxnode);
1015         if ((maxnode % BITS_PER_LONG) == 0)
1016                 endmask = ~0UL;
1017         else
1018                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1019
1020         /* When the user specified more nodes than supported just check
1021            if the non supported part is all zero. */
1022         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1023                 if (nlongs > PAGE_SIZE/sizeof(long))
1024                         return -EINVAL;
1025                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1026                         unsigned long t;
1027                         if (get_user(t, nmask + k))
1028                                 return -EFAULT;
1029                         if (k == nlongs - 1) {
1030                                 if (t & endmask)
1031                                         return -EINVAL;
1032                         } else if (t)
1033                                 return -EINVAL;
1034                 }
1035                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1036                 endmask = ~0UL;
1037         }
1038
1039         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1040                 return -EFAULT;
1041         nodes_addr(*nodes)[nlongs-1] &= endmask;
1042         return 0;
1043 }
1044
1045 /* Copy a kernel node mask to user space */
1046 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1047                               nodemask_t *nodes)
1048 {
1049         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1050         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1051
1052         if (copy > nbytes) {
1053                 if (copy > PAGE_SIZE)
1054                         return -EINVAL;
1055                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1056                         return -EFAULT;
1057                 copy = nbytes;
1058         }
1059         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1060 }
1061
1062 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1063                         unsigned long mode,
1064                         unsigned long __user *nmask, unsigned long maxnode,
1065                         unsigned flags)
1066 {
1067         nodemask_t nodes;
1068         int err;
1069         unsigned short mode_flags;
1070
1071         mode_flags = mode & MPOL_MODE_FLAGS;
1072         mode &= ~MPOL_MODE_FLAGS;
1073         if (mode >= MPOL_MAX)
1074                 return -EINVAL;
1075         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1076             (mode_flags & MPOL_F_RELATIVE_NODES))
1077                 return -EINVAL;
1078         err = get_nodes(&nodes, nmask, maxnode);
1079         if (err)
1080                 return err;
1081         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1082 }
1083
1084 /* Set the process memory policy */
1085 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1086                 unsigned long maxnode)
1087 {
1088         int err;
1089         nodemask_t nodes;
1090         unsigned short flags;
1091
1092         flags = mode & MPOL_MODE_FLAGS;
1093         mode &= ~MPOL_MODE_FLAGS;
1094         if ((unsigned int)mode >= MPOL_MAX)
1095                 return -EINVAL;
1096         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1097                 return -EINVAL;
1098         err = get_nodes(&nodes, nmask, maxnode);
1099         if (err)
1100                 return err;
1101         return do_set_mempolicy(mode, flags, &nodes);
1102 }
1103
1104 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1105                 const unsigned long __user *old_nodes,
1106                 const unsigned long __user *new_nodes)
1107 {
1108         struct mm_struct *mm;
1109         struct task_struct *task;
1110         nodemask_t old;
1111         nodemask_t new;
1112         nodemask_t task_nodes;
1113         int err;
1114
1115         err = get_nodes(&old, old_nodes, maxnode);
1116         if (err)
1117                 return err;
1118
1119         err = get_nodes(&new, new_nodes, maxnode);
1120         if (err)
1121                 return err;
1122
1123         /* Find the mm_struct */
1124         read_lock(&tasklist_lock);
1125         task = pid ? find_task_by_vpid(pid) : current;
1126         if (!task) {
1127                 read_unlock(&tasklist_lock);
1128                 return -ESRCH;
1129         }
1130         mm = get_task_mm(task);
1131         read_unlock(&tasklist_lock);
1132
1133         if (!mm)
1134                 return -EINVAL;
1135
1136         /*
1137          * Check if this process has the right to modify the specified
1138          * process. The right exists if the process has administrative
1139          * capabilities, superuser privileges or the same
1140          * userid as the target process.
1141          */
1142         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1143             (current->uid != task->suid) && (current->uid != task->uid) &&
1144             !capable(CAP_SYS_NICE)) {
1145                 err = -EPERM;
1146                 goto out;
1147         }
1148
1149         task_nodes = cpuset_mems_allowed(task);
1150         /* Is the user allowed to access the target nodes? */
1151         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1152                 err = -EPERM;
1153                 goto out;
1154         }
1155
1156         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1157                 err = -EINVAL;
1158                 goto out;
1159         }
1160
1161         err = security_task_movememory(task);
1162         if (err)
1163                 goto out;
1164
1165         err = do_migrate_pages(mm, &old, &new,
1166                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1167 out:
1168         mmput(mm);
1169         return err;
1170 }
1171
1172
1173 /* Retrieve NUMA policy */
1174 asmlinkage long sys_get_mempolicy(int __user *policy,
1175                                 unsigned long __user *nmask,
1176                                 unsigned long maxnode,
1177                                 unsigned long addr, unsigned long flags)
1178 {
1179         int err;
1180         int uninitialized_var(pval);
1181         nodemask_t nodes;
1182
1183         if (nmask != NULL && maxnode < MAX_NUMNODES)
1184                 return -EINVAL;
1185
1186         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1187
1188         if (err)
1189                 return err;
1190
1191         if (policy && put_user(pval, policy))
1192                 return -EFAULT;
1193
1194         if (nmask)
1195                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1196
1197         return err;
1198 }
1199
1200 #ifdef CONFIG_COMPAT
1201
1202 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1203                                      compat_ulong_t __user *nmask,
1204                                      compat_ulong_t maxnode,
1205                                      compat_ulong_t addr, compat_ulong_t flags)
1206 {
1207         long err;
1208         unsigned long __user *nm = NULL;
1209         unsigned long nr_bits, alloc_size;
1210         DECLARE_BITMAP(bm, MAX_NUMNODES);
1211
1212         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1213         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1214
1215         if (nmask)
1216                 nm = compat_alloc_user_space(alloc_size);
1217
1218         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1219
1220         if (!err && nmask) {
1221                 err = copy_from_user(bm, nm, alloc_size);
1222                 /* ensure entire bitmap is zeroed */
1223                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1224                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1225         }
1226
1227         return err;
1228 }
1229
1230 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1231                                      compat_ulong_t maxnode)
1232 {
1233         long err = 0;
1234         unsigned long __user *nm = NULL;
1235         unsigned long nr_bits, alloc_size;
1236         DECLARE_BITMAP(bm, MAX_NUMNODES);
1237
1238         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1239         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1240
1241         if (nmask) {
1242                 err = compat_get_bitmap(bm, nmask, nr_bits);
1243                 nm = compat_alloc_user_space(alloc_size);
1244                 err |= copy_to_user(nm, bm, alloc_size);
1245         }
1246
1247         if (err)
1248                 return -EFAULT;
1249
1250         return sys_set_mempolicy(mode, nm, nr_bits+1);
1251 }
1252
1253 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1254                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1255                              compat_ulong_t maxnode, compat_ulong_t flags)
1256 {
1257         long err = 0;
1258         unsigned long __user *nm = NULL;
1259         unsigned long nr_bits, alloc_size;
1260         nodemask_t bm;
1261
1262         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1263         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1264
1265         if (nmask) {
1266                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1267                 nm = compat_alloc_user_space(alloc_size);
1268                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1269         }
1270
1271         if (err)
1272                 return -EFAULT;
1273
1274         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1275 }
1276
1277 #endif
1278
1279 /*
1280  * get_vma_policy(@task, @vma, @addr)
1281  * @task - task for fallback if vma policy == default
1282  * @vma   - virtual memory area whose policy is sought
1283  * @addr  - address in @vma for shared policy lookup
1284  *
1285  * Returns effective policy for a VMA at specified address.
1286  * Falls back to @task or system default policy, as necessary.
1287  * Current or other task's task mempolicy and non-shared vma policies
1288  * are protected by the task's mmap_sem, which must be held for read by
1289  * the caller.
1290  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1291  * count--added by the get_policy() vm_op, as appropriate--to protect against
1292  * freeing by another task.  It is the caller's responsibility to free the
1293  * extra reference for shared policies.
1294  */
1295 static struct mempolicy *get_vma_policy(struct task_struct *task,
1296                 struct vm_area_struct *vma, unsigned long addr)
1297 {
1298         struct mempolicy *pol = task->mempolicy;
1299
1300         if (vma) {
1301                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1302                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1303                                                                         addr);
1304                         if (vpol)
1305                                 pol = vpol;
1306                 } else if (vma->vm_policy)
1307                         pol = vma->vm_policy;
1308         }
1309         if (!pol)
1310                 pol = &default_policy;
1311         return pol;
1312 }
1313
1314 /*
1315  * Return a nodemask representing a mempolicy for filtering nodes for
1316  * page allocation
1317  */
1318 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1319 {
1320         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1321         if (unlikely(policy->mode == MPOL_BIND) &&
1322                         gfp_zone(gfp) >= policy_zone &&
1323                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1324                 return &policy->v.nodes;
1325
1326         return NULL;
1327 }
1328
1329 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1330 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1331 {
1332         int nd = numa_node_id();
1333
1334         switch (policy->mode) {
1335         case MPOL_PREFERRED:
1336                 if (!(policy->flags & MPOL_F_LOCAL))
1337                         nd = policy->v.preferred_node;
1338                 break;
1339         case MPOL_BIND:
1340                 /*
1341                  * Normally, MPOL_BIND allocations are node-local within the
1342                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1343                  * current node is part of the mask, we use the zonelist for
1344                  * the first node in the mask instead.
1345                  */
1346                 if (unlikely(gfp & __GFP_THISNODE) &&
1347                                 unlikely(!node_isset(nd, policy->v.nodes)))
1348                         nd = first_node(policy->v.nodes);
1349                 break;
1350         case MPOL_INTERLEAVE: /* should not happen */
1351                 break;
1352         default:
1353                 BUG();
1354         }
1355         return node_zonelist(nd, gfp);
1356 }
1357
1358 /* Do dynamic interleaving for a process */
1359 static unsigned interleave_nodes(struct mempolicy *policy)
1360 {
1361         unsigned nid, next;
1362         struct task_struct *me = current;
1363
1364         nid = me->il_next;
1365         next = next_node(nid, policy->v.nodes);
1366         if (next >= MAX_NUMNODES)
1367                 next = first_node(policy->v.nodes);
1368         if (next < MAX_NUMNODES)
1369                 me->il_next = next;
1370         return nid;
1371 }
1372
1373 /*
1374  * Depending on the memory policy provide a node from which to allocate the
1375  * next slab entry.
1376  * @policy must be protected by freeing by the caller.  If @policy is
1377  * the current task's mempolicy, this protection is implicit, as only the
1378  * task can change it's policy.  The system default policy requires no
1379  * such protection.
1380  */
1381 unsigned slab_node(struct mempolicy *policy)
1382 {
1383         if (!policy || policy->flags & MPOL_F_LOCAL)
1384                 return numa_node_id();
1385
1386         switch (policy->mode) {
1387         case MPOL_PREFERRED:
1388                 /*
1389                  * handled MPOL_F_LOCAL above
1390                  */
1391                 return policy->v.preferred_node;
1392
1393         case MPOL_INTERLEAVE:
1394                 return interleave_nodes(policy);
1395
1396         case MPOL_BIND: {
1397                 /*
1398                  * Follow bind policy behavior and start allocation at the
1399                  * first node.
1400                  */
1401                 struct zonelist *zonelist;
1402                 struct zone *zone;
1403                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1404                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1405                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1406                                                         &policy->v.nodes,
1407                                                         &zone);
1408                 return zone->node;
1409         }
1410
1411         default:
1412                 BUG();
1413         }
1414 }
1415
1416 /* Do static interleaving for a VMA with known offset. */
1417 static unsigned offset_il_node(struct mempolicy *pol,
1418                 struct vm_area_struct *vma, unsigned long off)
1419 {
1420         unsigned nnodes = nodes_weight(pol->v.nodes);
1421         unsigned target;
1422         int c;
1423         int nid = -1;
1424
1425         if (!nnodes)
1426                 return numa_node_id();
1427         target = (unsigned int)off % nnodes;
1428         c = 0;
1429         do {
1430                 nid = next_node(nid, pol->v.nodes);
1431                 c++;
1432         } while (c <= target);
1433         return nid;
1434 }
1435
1436 /* Determine a node number for interleave */
1437 static inline unsigned interleave_nid(struct mempolicy *pol,
1438                  struct vm_area_struct *vma, unsigned long addr, int shift)
1439 {
1440         if (vma) {
1441                 unsigned long off;
1442
1443                 /*
1444                  * for small pages, there is no difference between
1445                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1446                  * for huge pages, since vm_pgoff is in units of small
1447                  * pages, we need to shift off the always 0 bits to get
1448                  * a useful offset.
1449                  */
1450                 BUG_ON(shift < PAGE_SHIFT);
1451                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1452                 off += (addr - vma->vm_start) >> shift;
1453                 return offset_il_node(pol, vma, off);
1454         } else
1455                 return interleave_nodes(pol);
1456 }
1457
1458 #ifdef CONFIG_HUGETLBFS
1459 /*
1460  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1461  * @vma = virtual memory area whose policy is sought
1462  * @addr = address in @vma for shared policy lookup and interleave policy
1463  * @gfp_flags = for requested zone
1464  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1465  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1466  *
1467  * Returns a zonelist suitable for a huge page allocation and a pointer
1468  * to the struct mempolicy for conditional unref after allocation.
1469  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1470  * @nodemask for filtering the zonelist.
1471  */
1472 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1473                                 gfp_t gfp_flags, struct mempolicy **mpol,
1474                                 nodemask_t **nodemask)
1475 {
1476         struct zonelist *zl;
1477
1478         *mpol = get_vma_policy(current, vma, addr);
1479         *nodemask = NULL;       /* assume !MPOL_BIND */
1480
1481         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1482                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1483                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1484         } else {
1485                 zl = policy_zonelist(gfp_flags, *mpol);
1486                 if ((*mpol)->mode == MPOL_BIND)
1487                         *nodemask = &(*mpol)->v.nodes;
1488         }
1489         return zl;
1490 }
1491 #endif
1492
1493 /* Allocate a page in interleaved policy.
1494    Own path because it needs to do special accounting. */
1495 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1496                                         unsigned nid)
1497 {
1498         struct zonelist *zl;
1499         struct page *page;
1500
1501         zl = node_zonelist(nid, gfp);
1502         page = __alloc_pages(gfp, order, zl);
1503         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1504                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1505         return page;
1506 }
1507
1508 /**
1509  *      alloc_page_vma  - Allocate a page for a VMA.
1510  *
1511  *      @gfp:
1512  *      %GFP_USER    user allocation.
1513  *      %GFP_KERNEL  kernel allocations,
1514  *      %GFP_HIGHMEM highmem/user allocations,
1515  *      %GFP_FS      allocation should not call back into a file system.
1516  *      %GFP_ATOMIC  don't sleep.
1517  *
1518  *      @vma:  Pointer to VMA or NULL if not available.
1519  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1520  *
1521  *      This function allocates a page from the kernel page pool and applies
1522  *      a NUMA policy associated with the VMA or the current process.
1523  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1524  *      mm_struct of the VMA to prevent it from going away. Should be used for
1525  *      all allocations for pages that will be mapped into
1526  *      user space. Returns NULL when no page can be allocated.
1527  *
1528  *      Should be called with the mm_sem of the vma hold.
1529  */
1530 struct page *
1531 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1532 {
1533         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1534         struct zonelist *zl;
1535
1536         cpuset_update_task_memory_state();
1537
1538         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1539                 unsigned nid;
1540
1541                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1542                 mpol_cond_put(pol);
1543                 return alloc_page_interleave(gfp, 0, nid);
1544         }
1545         zl = policy_zonelist(gfp, pol);
1546         if (unlikely(mpol_needs_cond_ref(pol))) {
1547                 /*
1548                  * slow path: ref counted shared policy
1549                  */
1550                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1551                                                 zl, policy_nodemask(gfp, pol));
1552                 __mpol_put(pol);
1553                 return page;
1554         }
1555         /*
1556          * fast path:  default or task policy
1557          */
1558         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1559 }
1560
1561 /**
1562  *      alloc_pages_current - Allocate pages.
1563  *
1564  *      @gfp:
1565  *              %GFP_USER   user allocation,
1566  *              %GFP_KERNEL kernel allocation,
1567  *              %GFP_HIGHMEM highmem allocation,
1568  *              %GFP_FS     don't call back into a file system.
1569  *              %GFP_ATOMIC don't sleep.
1570  *      @order: Power of two of allocation size in pages. 0 is a single page.
1571  *
1572  *      Allocate a page from the kernel page pool.  When not in
1573  *      interrupt context and apply the current process NUMA policy.
1574  *      Returns NULL when no page can be allocated.
1575  *
1576  *      Don't call cpuset_update_task_memory_state() unless
1577  *      1) it's ok to take cpuset_sem (can WAIT), and
1578  *      2) allocating for current task (not interrupt).
1579  */
1580 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1581 {
1582         struct mempolicy *pol = current->mempolicy;
1583
1584         if ((gfp & __GFP_WAIT) && !in_interrupt())
1585                 cpuset_update_task_memory_state();
1586         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1587                 pol = &default_policy;
1588
1589         /*
1590          * No reference counting needed for current->mempolicy
1591          * nor system default_policy
1592          */
1593         if (pol->mode == MPOL_INTERLEAVE)
1594                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1595         return __alloc_pages_nodemask(gfp, order,
1596                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1597 }
1598 EXPORT_SYMBOL(alloc_pages_current);
1599
1600 /*
1601  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1602  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1603  * with the mems_allowed returned by cpuset_mems_allowed().  This
1604  * keeps mempolicies cpuset relative after its cpuset moves.  See
1605  * further kernel/cpuset.c update_nodemask().
1606  */
1607
1608 /* Slow path of a mempolicy duplicate */
1609 struct mempolicy *__mpol_dup(struct mempolicy *old)
1610 {
1611         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1612
1613         if (!new)
1614                 return ERR_PTR(-ENOMEM);
1615         if (current_cpuset_is_being_rebound()) {
1616                 nodemask_t mems = cpuset_mems_allowed(current);
1617                 mpol_rebind_policy(old, &mems);
1618         }
1619         *new = *old;
1620         atomic_set(&new->refcnt, 1);
1621         return new;
1622 }
1623
1624 /*
1625  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1626  * eliminate the * MPOL_F_* flags that require conditional ref and
1627  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1628  * after return.  Use the returned value.
1629  *
1630  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1631  * policy lookup, even if the policy needs/has extra ref on lookup.
1632  * shmem_readahead needs this.
1633  */
1634 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1635                                                 struct mempolicy *frompol)
1636 {
1637         if (!mpol_needs_cond_ref(frompol))
1638                 return frompol;
1639
1640         *tompol = *frompol;
1641         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1642         __mpol_put(frompol);
1643         return tompol;
1644 }
1645
1646 static int mpol_match_intent(const struct mempolicy *a,
1647                              const struct mempolicy *b)
1648 {
1649         if (a->flags != b->flags)
1650                 return 0;
1651         if (!mpol_store_user_nodemask(a))
1652                 return 1;
1653         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1654 }
1655
1656 /* Slow path of a mempolicy comparison */
1657 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1658 {
1659         if (!a || !b)
1660                 return 0;
1661         if (a->mode != b->mode)
1662                 return 0;
1663         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1664                 return 0;
1665         switch (a->mode) {
1666         case MPOL_BIND:
1667                 /* Fall through */
1668         case MPOL_INTERLEAVE:
1669                 return nodes_equal(a->v.nodes, b->v.nodes);
1670         case MPOL_PREFERRED:
1671                 return a->v.preferred_node == b->v.preferred_node &&
1672                         a->flags == b->flags;
1673         default:
1674                 BUG();
1675                 return 0;
1676         }
1677 }
1678
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
1687
1688 /* lookup first element intersecting start-end */
1689 /* Caller holds sp->lock */
1690 static struct sp_node *
1691 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1692 {
1693         struct rb_node *n = sp->root.rb_node;
1694
1695         while (n) {
1696                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1697
1698                 if (start >= p->end)
1699                         n = n->rb_right;
1700                 else if (end <= p->start)
1701                         n = n->rb_left;
1702                 else
1703                         break;
1704         }
1705         if (!n)
1706                 return NULL;
1707         for (;;) {
1708                 struct sp_node *w = NULL;
1709                 struct rb_node *prev = rb_prev(n);
1710                 if (!prev)
1711                         break;
1712                 w = rb_entry(prev, struct sp_node, nd);
1713                 if (w->end <= start)
1714                         break;
1715                 n = prev;
1716         }
1717         return rb_entry(n, struct sp_node, nd);
1718 }
1719
1720 /* Insert a new shared policy into the list. */
1721 /* Caller holds sp->lock */
1722 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1723 {
1724         struct rb_node **p = &sp->root.rb_node;
1725         struct rb_node *parent = NULL;
1726         struct sp_node *nd;
1727
1728         while (*p) {
1729                 parent = *p;
1730                 nd = rb_entry(parent, struct sp_node, nd);
1731                 if (new->start < nd->start)
1732                         p = &(*p)->rb_left;
1733                 else if (new->end > nd->end)
1734                         p = &(*p)->rb_right;
1735                 else
1736                         BUG();
1737         }
1738         rb_link_node(&new->nd, parent, p);
1739         rb_insert_color(&new->nd, &sp->root);
1740         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1741                  new->policy ? new->policy->mode : 0);
1742 }
1743
1744 /* Find shared policy intersecting idx */
1745 struct mempolicy *
1746 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1747 {
1748         struct mempolicy *pol = NULL;
1749         struct sp_node *sn;
1750
1751         if (!sp->root.rb_node)
1752                 return NULL;
1753         spin_lock(&sp->lock);
1754         sn = sp_lookup(sp, idx, idx+1);
1755         if (sn) {
1756                 mpol_get(sn->policy);
1757                 pol = sn->policy;
1758         }
1759         spin_unlock(&sp->lock);
1760         return pol;
1761 }
1762
1763 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1764 {
1765         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1766         rb_erase(&n->nd, &sp->root);
1767         mpol_put(n->policy);
1768         kmem_cache_free(sn_cache, n);
1769 }
1770
1771 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1772                                 struct mempolicy *pol)
1773 {
1774         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1775
1776         if (!n)
1777                 return NULL;
1778         n->start = start;
1779         n->end = end;
1780         mpol_get(pol);
1781         pol->flags |= MPOL_F_SHARED;    /* for unref */
1782         n->policy = pol;
1783         return n;
1784 }
1785
1786 /* Replace a policy range. */
1787 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1788                                  unsigned long end, struct sp_node *new)
1789 {
1790         struct sp_node *n, *new2 = NULL;
1791
1792 restart:
1793         spin_lock(&sp->lock);
1794         n = sp_lookup(sp, start, end);
1795         /* Take care of old policies in the same range. */
1796         while (n && n->start < end) {
1797                 struct rb_node *next = rb_next(&n->nd);
1798                 if (n->start >= start) {
1799                         if (n->end <= end)
1800                                 sp_delete(sp, n);
1801                         else
1802                                 n->start = end;
1803                 } else {
1804                         /* Old policy spanning whole new range. */
1805                         if (n->end > end) {
1806                                 if (!new2) {
1807                                         spin_unlock(&sp->lock);
1808                                         new2 = sp_alloc(end, n->end, n->policy);
1809                                         if (!new2)
1810                                                 return -ENOMEM;
1811                                         goto restart;
1812                                 }
1813                                 n->end = start;
1814                                 sp_insert(sp, new2);
1815                                 new2 = NULL;
1816                                 break;
1817                         } else
1818                                 n->end = start;
1819                 }
1820                 if (!next)
1821                         break;
1822                 n = rb_entry(next, struct sp_node, nd);
1823         }
1824         if (new)
1825                 sp_insert(sp, new);
1826         spin_unlock(&sp->lock);
1827         if (new2) {
1828                 mpol_put(new2->policy);
1829                 kmem_cache_free(sn_cache, new2);
1830         }
1831         return 0;
1832 }
1833
1834 /**
1835  * mpol_shared_policy_init - initialize shared policy for inode
1836  * @sp: pointer to inode shared policy
1837  * @mpol:  struct mempolicy to install
1838  *
1839  * Install non-NULL @mpol in inode's shared policy rb-tree.
1840  * On entry, the current task has a reference on a non-NULL @mpol.
1841  * This must be released on exit.
1842  */
1843 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1844 {
1845         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1846         spin_lock_init(&sp->lock);
1847
1848         if (mpol) {
1849                 struct vm_area_struct pvma;
1850                 struct mempolicy *new;
1851
1852                 /* contextualize the tmpfs mount point mempolicy */
1853                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1854                 mpol_put(mpol); /* drop our ref on sb mpol */
1855                 if (IS_ERR(new))
1856                         return;         /* no valid nodemask intersection */
1857
1858                 /* Create pseudo-vma that contains just the policy */
1859                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1860                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1861                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1862                 mpol_put(new);                  /* drop initial ref */
1863         }
1864 }
1865
1866 int mpol_set_shared_policy(struct shared_policy *info,
1867                         struct vm_area_struct *vma, struct mempolicy *npol)
1868 {
1869         int err;
1870         struct sp_node *new = NULL;
1871         unsigned long sz = vma_pages(vma);
1872
1873         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1874                  vma->vm_pgoff,
1875                  sz, npol ? npol->mode : -1,
1876                  npol ? npol->flags : -1,
1877                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1878
1879         if (npol) {
1880                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1881                 if (!new)
1882                         return -ENOMEM;
1883         }
1884         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1885         if (err && new)
1886                 kmem_cache_free(sn_cache, new);
1887         return err;
1888 }
1889
1890 /* Free a backing policy store on inode delete. */
1891 void mpol_free_shared_policy(struct shared_policy *p)
1892 {
1893         struct sp_node *n;
1894         struct rb_node *next;
1895
1896         if (!p->root.rb_node)
1897                 return;
1898         spin_lock(&p->lock);
1899         next = rb_first(&p->root);
1900         while (next) {
1901                 n = rb_entry(next, struct sp_node, nd);
1902                 next = rb_next(&n->nd);
1903                 rb_erase(&n->nd, &p->root);
1904                 mpol_put(n->policy);
1905                 kmem_cache_free(sn_cache, n);
1906         }
1907         spin_unlock(&p->lock);
1908 }
1909
1910 /* assumes fs == KERNEL_DS */
1911 void __init numa_policy_init(void)
1912 {
1913         nodemask_t interleave_nodes;
1914         unsigned long largest = 0;
1915         int nid, prefer = 0;
1916
1917         policy_cache = kmem_cache_create("numa_policy",
1918                                          sizeof(struct mempolicy),
1919                                          0, SLAB_PANIC, NULL);
1920
1921         sn_cache = kmem_cache_create("shared_policy_node",
1922                                      sizeof(struct sp_node),
1923                                      0, SLAB_PANIC, NULL);
1924
1925         /*
1926          * Set interleaving policy for system init. Interleaving is only
1927          * enabled across suitably sized nodes (default is >= 16MB), or
1928          * fall back to the largest node if they're all smaller.
1929          */
1930         nodes_clear(interleave_nodes);
1931         for_each_node_state(nid, N_HIGH_MEMORY) {
1932                 unsigned long total_pages = node_present_pages(nid);
1933
1934                 /* Preserve the largest node */
1935                 if (largest < total_pages) {
1936                         largest = total_pages;
1937                         prefer = nid;
1938                 }
1939
1940                 /* Interleave this node? */
1941                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1942                         node_set(nid, interleave_nodes);
1943         }
1944
1945         /* All too small, use the largest */
1946         if (unlikely(nodes_empty(interleave_nodes)))
1947                 node_set(prefer, interleave_nodes);
1948
1949         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1950                 printk("numa_policy_init: interleaving failed\n");
1951 }
1952
1953 /* Reset policy of current process to default */
1954 void numa_default_policy(void)
1955 {
1956         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1957 }
1958
1959 /*
1960  * Parse and format mempolicy from/to strings
1961  */
1962
1963 /*
1964  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
1965  * Used only for mpol_parse_str() and mpol_to_str()
1966  */
1967 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1968 static const char * const policy_types[] =
1969         { "default", "prefer", "bind", "interleave", "local" };
1970
1971
1972 #ifdef CONFIG_TMPFS
1973 /**
1974  * mpol_parse_str - parse string to mempolicy
1975  * @str:  string containing mempolicy to parse
1976  * @mpol:  pointer to struct mempolicy pointer, returned on success.
1977  * @no_context:  flag whether to "contextualize" the mempolicy
1978  *
1979  * Format of input:
1980  *      <mode>[=<flags>][:<nodelist>]
1981  *
1982  * if @no_context is true, save the input nodemask in w.user_nodemask in
1983  * the returned mempolicy.  This will be used to "clone" the mempolicy in
1984  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1985  * mount option.  Note that if 'static' or 'relative' mode flags were
1986  * specified, the input nodemask will already have been saved.  Saving
1987  * it again is redundant, but safe.
1988  *
1989  * On success, returns 0, else 1
1990  */
1991 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1992 {
1993         struct mempolicy *new = NULL;
1994         unsigned short uninitialized_var(mode);
1995         unsigned short uninitialized_var(mode_flags);
1996         nodemask_t nodes;
1997         char *nodelist = strchr(str, ':');
1998         char *flags = strchr(str, '=');
1999         int i;
2000         int err = 1;
2001
2002         if (nodelist) {
2003                 /* NUL-terminate mode or flags string */
2004                 *nodelist++ = '\0';
2005                 if (nodelist_parse(nodelist, nodes))
2006                         goto out;
2007                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2008                         goto out;
2009         } else
2010                 nodes_clear(nodes);
2011
2012         if (flags)
2013                 *flags++ = '\0';        /* terminate mode string */
2014
2015         for (i = 0; i <= MPOL_LOCAL; i++) {
2016                 if (!strcmp(str, policy_types[i])) {
2017                         mode = i;
2018                         break;
2019                 }
2020         }
2021         if (i > MPOL_LOCAL)
2022                 goto out;
2023
2024         switch (mode) {
2025         case MPOL_PREFERRED:
2026                 /*
2027                  * Insist on a nodelist of one node only
2028                  */
2029                 if (nodelist) {
2030                         char *rest = nodelist;
2031                         while (isdigit(*rest))
2032                                 rest++;
2033                         if (!*rest)
2034                                 err = 0;
2035                 }
2036                 break;
2037         case MPOL_INTERLEAVE:
2038                 /*
2039                  * Default to online nodes with memory if no nodelist
2040                  */
2041                 if (!nodelist)
2042                         nodes = node_states[N_HIGH_MEMORY];
2043                 err = 0;
2044                 break;
2045         case MPOL_LOCAL:
2046                 /*
2047                  * Don't allow a nodelist;  mpol_new() checks flags
2048                  */
2049                 if (nodelist)
2050                         goto out;
2051                 mode = MPOL_PREFERRED;
2052                 break;
2053
2054         /*
2055          * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2056          * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2057          */
2058         }
2059
2060         mode_flags = 0;
2061         if (flags) {
2062                 /*
2063                  * Currently, we only support two mutually exclusive
2064                  * mode flags.
2065                  */
2066                 if (!strcmp(flags, "static"))
2067                         mode_flags |= MPOL_F_STATIC_NODES;
2068                 else if (!strcmp(flags, "relative"))
2069                         mode_flags |= MPOL_F_RELATIVE_NODES;
2070                 else
2071                         err = 1;
2072         }
2073
2074         new = mpol_new(mode, mode_flags, &nodes);
2075         if (IS_ERR(new))
2076                 err = 1;
2077         else if (no_context)
2078                 new->w.user_nodemask = nodes;   /* save for contextualization */
2079
2080 out:
2081         /* Restore string for error message */
2082         if (nodelist)
2083                 *--nodelist = ':';
2084         if (flags)
2085                 *--flags = '=';
2086         if (!err)
2087                 *mpol = new;
2088         return err;
2089 }
2090 #endif /* CONFIG_TMPFS */
2091
2092 /**
2093  * mpol_to_str - format a mempolicy structure for printing
2094  * @buffer:  to contain formatted mempolicy string
2095  * @maxlen:  length of @buffer
2096  * @pol:  pointer to mempolicy to be formatted
2097  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2098  *
2099  * Convert a mempolicy into a string.
2100  * Returns the number of characters in buffer (if positive)
2101  * or an error (negative)
2102  */
2103 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2104 {
2105         char *p = buffer;
2106         int l;
2107         nodemask_t nodes;
2108         unsigned short mode;
2109         unsigned short flags = pol ? pol->flags : 0;
2110
2111         /*
2112          * Sanity check:  room for longest mode, flag and some nodes
2113          */
2114         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2115
2116         if (!pol || pol == &default_policy)
2117                 mode = MPOL_DEFAULT;
2118         else
2119                 mode = pol->mode;
2120
2121         switch (mode) {
2122         case MPOL_DEFAULT:
2123                 nodes_clear(nodes);
2124                 break;
2125
2126         case MPOL_PREFERRED:
2127                 nodes_clear(nodes);
2128                 if (flags & MPOL_F_LOCAL)
2129                         mode = MPOL_LOCAL;      /* pseudo-policy */
2130                 else
2131                         node_set(pol->v.preferred_node, nodes);
2132                 break;
2133
2134         case MPOL_BIND:
2135                 /* Fall through */
2136         case MPOL_INTERLEAVE:
2137                 if (no_context)
2138                         nodes = pol->w.user_nodemask;
2139                 else
2140                         nodes = pol->v.nodes;
2141                 break;
2142
2143         default:
2144                 BUG();
2145         }
2146
2147         l = strlen(policy_types[mode]);
2148         if (buffer + maxlen < p + l + 1)
2149                 return -ENOSPC;
2150
2151         strcpy(p, policy_types[mode]);
2152         p += l;
2153
2154         if (flags & MPOL_MODE_FLAGS) {
2155                 if (buffer + maxlen < p + 2)
2156                         return -ENOSPC;
2157                 *p++ = '=';
2158
2159                 /*
2160                  * Currently, the only defined flags are mutually exclusive
2161                  */
2162                 if (flags & MPOL_F_STATIC_NODES)
2163                         p += snprintf(p, buffer + maxlen - p, "static");
2164                 else if (flags & MPOL_F_RELATIVE_NODES)
2165                         p += snprintf(p, buffer + maxlen - p, "relative");
2166         }
2167
2168         if (!nodes_empty(nodes)) {
2169                 if (buffer + maxlen < p + 2)
2170                         return -ENOSPC;
2171                 *p++ = ':';
2172                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2173         }
2174         return p - buffer;
2175 }
2176
2177 struct numa_maps {
2178         unsigned long pages;
2179         unsigned long anon;
2180         unsigned long active;
2181         unsigned long writeback;
2182         unsigned long mapcount_max;
2183         unsigned long dirty;
2184         unsigned long swapcache;
2185         unsigned long node[MAX_NUMNODES];
2186 };
2187
2188 static void gather_stats(struct page *page, void *private, int pte_dirty)
2189 {
2190         struct numa_maps *md = private;
2191         int count = page_mapcount(page);
2192
2193         md->pages++;
2194         if (pte_dirty || PageDirty(page))
2195                 md->dirty++;
2196
2197         if (PageSwapCache(page))
2198                 md->swapcache++;
2199
2200         if (PageActive(page))
2201                 md->active++;
2202
2203         if (PageWriteback(page))
2204                 md->writeback++;
2205
2206         if (PageAnon(page))
2207                 md->anon++;
2208
2209         if (count > md->mapcount_max)
2210                 md->mapcount_max = count;
2211
2212         md->node[page_to_nid(page)]++;
2213 }
2214
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [start, end) of a hugetlb VMA in huge-page-sized steps and pass the
 * page behind every populated huge pte to gather_stats().
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long step = huge_page_size(h);
	unsigned long addr;

	for (addr = start; addr < end; addr += step) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm,
						addr & huge_page_mask(h));

		if (ptep) {
			pte_t pte = *ptep;

			if (!pte_none(pte)) {
				struct page *page = pte_page(pte);

				if (page)
					gather_stats(page, md,
						     pte_dirty(*ptep));
			}
		}
	}
}
#else
/* Without hugetlb support there is nothing to walk. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
2251
2252 /*
2253  * Display pages allocated per node and memory policy via /proc.
2254  */
2255 int show_numa_map(struct seq_file *m, void *v)
2256 {
2257         struct proc_maps_private *priv = m->private;
2258         struct vm_area_struct *vma = v;
2259         struct numa_maps *md;
2260         struct file *file = vma->vm_file;
2261         struct mm_struct *mm = vma->vm_mm;
2262         struct mempolicy *pol;
2263         int n;
2264         char buffer[50];
2265
2266         if (!mm)
2267                 return 0;
2268
2269         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2270         if (!md)
2271                 return 0;
2272
2273         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2274         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2275         mpol_cond_put(pol);
2276
2277         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2278
2279         if (file) {
2280                 seq_printf(m, " file=");
2281                 seq_path(m, &file->f_path, "\n\t= ");
2282         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2283                 seq_printf(m, " heap");
2284         } else if (vma->vm_start <= mm->start_stack &&
2285                         vma->vm_end >= mm->start_stack) {
2286                 seq_printf(m, " stack");
2287         }
2288
2289         if (is_vm_hugetlb_page(vma)) {
2290                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2291                 seq_printf(m, " huge");
2292         } else {
2293                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2294                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2295         }
2296
2297         if (!md->pages)
2298                 goto out;
2299
2300         if (md->anon)
2301                 seq_printf(m," anon=%lu",md->anon);
2302
2303         if (md->dirty)
2304                 seq_printf(m," dirty=%lu",md->dirty);
2305
2306         if (md->pages != md->anon && md->pages != md->dirty)
2307                 seq_printf(m, " mapped=%lu", md->pages);
2308
2309         if (md->mapcount_max > 1)
2310                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2311
2312         if (md->swapcache)
2313                 seq_printf(m," swapcache=%lu", md->swapcache);
2314
2315         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2316                 seq_printf(m," active=%lu", md->active);
2317
2318         if (md->writeback)
2319                 seq_printf(m," writeback=%lu", md->writeback);
2320
2321         for_each_node_state(n, N_HIGH_MEMORY)
2322                 if (md->node[n])
2323                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2324 out:
2325         seq_putc(m, '\n');
2326         kfree(md);
2327
2328         if (m->count < m->size)
2329                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2330         return 0;
2331 }