git.oblomov.eu Git - linux-2.6/blob - mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapops.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/rmap.h>
  24 #include <linux/topology.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27
  28 #include "internal.h"
  29
  30 /* The maximum number of pages to take off the LRU for migration */
  31 #define MIGRATE_CHUNK_SIZE 256
  32
  33 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  34
  35 /*
  36  * Isolate one page from the LRU lists. If successful put it onto
  37  * the indicated list with elevated page count.
  38  *
  39  * Result:
  40  *  -EBUSY: page not on LRU list
  41  *  0: page removed from LRU list and added to the specified list.
  42  */
  43 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  44 {
  45         int ret = -EBUSY;
  46
  47         if (PageLRU(page)) {
  48                 struct zone *zone = page_zone(page);
  49
  50                 spin_lock_irq(&zone->lru_lock);
  51                 if (PageLRU(page)) {
  52                         ret = 0;
  53                         get_page(page);
  54                         ClearPageLRU(page);
  55                         if (PageActive(page))
  56                                 del_page_from_active_list(zone, page);
  57                         else
  58                                 del_page_from_inactive_list(zone, page);
  59                         list_add_tail(&page->lru, pagelist);
  60                 }
  61                 spin_unlock_irq(&zone->lru_lock);
  62         }
  63         return ret;
  64 }
  65
  66 /*
  67  * migrate_prep() needs to be called after we have compiled the list of pages
  68  * to be migrated using isolate_lru_page() but before we begin a series of calls
  69  * to migrate_pages().
  70  */
  71 int migrate_prep(void)
  72 {
  73         /*
  74          * Clear the LRU lists so pages can be isolated.
  75          * Note that pages may be moved off the LRU after we have
  76          * drained them. Those pages will fail to migrate like other
  77          * pages that may be busy.
  78          */
  79         lru_add_drain_all();
  80
  81         return 0;
  82 }
  83
  84 static inline void move_to_lru(struct page *page)
  85 {
  86         list_del(&page->lru);
  87         if (PageActive(page)) {
  88                 /*
  89                  * lru_cache_add_active checks that
  90                  * the PG_active bit is off.
  91                  */
  92                 ClearPageActive(page);
  93                 lru_cache_add_active(page);
  94         } else {
  95                 lru_cache_add(page);
  96         }
  97         put_page(page);
  98 }
  99
 100 /*
 101  * Add isolated pages on the list back to the LRU.
 102  *
 103  * returns the number of pages put back.
 104  */
 105 int putback_lru_pages(struct list_head *l)
 106 {
 107         struct page *page;
 108         struct page *page2;
 109         int count = 0;
 110
 111         list_for_each_entry_safe(page, page2, l, lru) {
 112                 move_to_lru(page);
 113                 count++;
 114         }
 115         return count;
 116 }
 117
 118 static inline int is_swap_pte(pte_t pte)
 119 {
 120         return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
 121 }
 122
 123 /*
 124  * Restore a potential migration pte to a working pte entry
 125  */
 126 static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
 127                 struct page *old, struct page *new)
 128 {
 129         struct mm_struct *mm = vma->vm_mm;
 130         swp_entry_t entry;
 131         pgd_t *pgd;
 132         pud_t *pud;
 133         pmd_t *pmd;
 134         pte_t *ptep, pte;
 135         spinlock_t *ptl;
 136
 137         pgd = pgd_offset(mm, addr);
 138         if (!pgd_present(*pgd))
 139                 return;
 140
 141         pud = pud_offset(pgd, addr);
 142         if (!pud_present(*pud))
 143                 return;
 144
 145         pmd = pmd_offset(pud, addr);
 146         if (!pmd_present(*pmd))
 147                 return;
 148
 149         ptep = pte_offset_map(pmd, addr);
 150
 151         if (!is_swap_pte(*ptep)) {
 152                 pte_unmap(ptep);
 153                 return;
 154         }
 155
 156         ptl = pte_lockptr(mm, pmd);
 157         spin_lock(ptl);
 158         pte = *ptep;
 159         if (!is_swap_pte(pte))
 160                 goto out;
 161
 162         entry = pte_to_swp_entry(pte);
 163
 164         if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 165                 goto out;
 166
 167         get_page(new);
 168         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 169         if (is_write_migration_entry(entry))
 170                 pte = pte_mkwrite(pte);
 171         set_pte_at(mm, addr, ptep, pte);
 172         page_add_anon_rmap(new, vma, addr);
 173 out:
 174         pte_unmap_unlock(ptep, ptl);
 175 }
 176
 177 /*
 178  * Get rid of all migration entries and replace them by
 179  * references to the indicated page.
 180  *
 181  * Must hold mmap_sem lock on at least one of the vmas containing
 182  * the page so that the anon_vma cannot vanish.
 183  */
 184 static void remove_migration_ptes(struct page *old, struct page *new)
 185 {
 186         struct anon_vma *anon_vma;
 187         struct vm_area_struct *vma;
 188         unsigned long mapping;
 189
 190         mapping = (unsigned long)new->mapping;
 191
 192         if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 193                 return;
 194
 195         /*
 196          * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
 197          */
 198         anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 199         spin_lock(&anon_vma->lock);
 200
 201         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 202                 remove_migration_pte(vma, page_address_in_vma(new, vma),
 203                                         old, new);
 204
 205         spin_unlock(&anon_vma->lock);
 206 }
 207
 208 /*
 209  * Something used the pte of a page under migration. We need to
 210  * get to the page and wait until migration is finished.
 211  * When we return from this function the fault will be retried.
 212  *
 213  * This function is called from do_swap_page().
 214  */
 215 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 216                                 unsigned long address)
 217 {
 218         pte_t *ptep, pte;
 219         spinlock_t *ptl;
 220         swp_entry_t entry;
 221         struct page *page;
 222
 223         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 224         pte = *ptep;
 225         if (!is_swap_pte(pte))
 226                 goto out;
 227
 228         entry = pte_to_swp_entry(pte);
 229         if (!is_migration_entry(entry))
 230                 goto out;
 231
 232         page = migration_entry_to_page(entry);
 233
 234         get_page(page);
 235         pte_unmap_unlock(ptep, ptl);
 236         wait_on_page_locked(page);
 237         put_page(page);
 238         return;
 239 out:
 240         pte_unmap_unlock(ptep, ptl);
 241 }
 242
 243 /*
 244  * Replace the page in the mapping.
 245  *
 246  * The number of remaining references must be:
 247  * 1 for anonymous pages without a mapping
 248  * 2 for pages with a mapping
 249  * 3 for pages with a mapping and PagePrivate set.
 250  */
 251 static int migrate_page_move_mapping(struct address_space *mapping,
 252                 struct page *newpage, struct page *page)
 253 {
 254         struct page **radix_pointer;
 255
 256         if (!mapping) {
 257                 /* Anonymous page */
 258                 if (page_count(page) != 1)
 259                         return -EAGAIN;
 260                 return 0;
 261         }
 262
 263         write_lock_irq(&mapping->tree_lock);
 264
 265         radix_pointer = (struct page **)radix_tree_lookup_slot(
 266                                                 &mapping->page_tree,
 267                                                 page_index(page));
 268
 269         if (page_count(page) != 2 + !!PagePrivate(page) ||
 270                         *radix_pointer != page) {
 271                 write_unlock_irq(&mapping->tree_lock);
 272                 return -EAGAIN;
 273         }
 274
 275         /*
 276          * Now we know that no one else is looking at the page.
 277          */
 278         get_page(newpage);
 279 #ifdef CONFIG_SWAP
 280         if (PageSwapCache(page)) {
 281                 SetPageSwapCache(newpage);
 282                 set_page_private(newpage, page_private(page));
 283         }
 284 #endif
 285
 286         *radix_pointer = newpage;
 287         __put_page(page);
 288         write_unlock_irq(&mapping->tree_lock);
 289
 290         return 0;
 291 }
 292
 293 /*
 294  * Copy the page to its new location
 295  */
 296 static void migrate_page_copy(struct page *newpage, struct page *page)
 297 {
 298         copy_highpage(newpage, page);
 299
 300         if (PageError(page))
 301                 SetPageError(newpage);
 302         if (PageReferenced(page))
 303                 SetPageReferenced(newpage);
 304         if (PageUptodate(page))
 305                 SetPageUptodate(newpage);
 306         if (PageActive(page))
 307                 SetPageActive(newpage);
 308         if (PageChecked(page))
 309                 SetPageChecked(newpage);
 310         if (PageMappedToDisk(page))
 311                 SetPageMappedToDisk(newpage);
 312
 313         if (PageDirty(page)) {
 314                 clear_page_dirty_for_io(page);
 315                 set_page_dirty(newpage);
 316         }
 317
 318 #ifdef CONFIG_SWAP
 319         ClearPageSwapCache(page);
 320 #endif
 321         ClearPageActive(page);
 322         ClearPagePrivate(page);
 323         set_page_private(page, 0);
 324         page->mapping = NULL;
 325
 326         /*
 327          * If any waiters have accumulated on the new page then
 328          * wake them up.
 329          */
 330         if (PageWriteback(newpage))
 331                 end_page_writeback(newpage);
 332 }
 333
 334 /************************************************************
 335  *                    Migration functions
 336  ***********************************************************/
 337
 338 /* Always fail migration. Used for mappings that are not movable */
 339 int fail_migrate_page(struct address_space *mapping,
 340                         struct page *newpage, struct page *page)
 341 {
 342         return -EIO;
 343 }
 344 EXPORT_SYMBOL(fail_migrate_page);
 345
 346 /*
 347  * Common logic to directly migrate a single page suitable for
 348  * pages that do not use PagePrivate.
 349  *
 350  * Pages are locked upon entry and exit.
 351  */
 352 int migrate_page(struct address_space *mapping,
 353                 struct page *newpage, struct page *page)
 354 {
 355         int rc;
 356
 357         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 358
 359         rc = migrate_page_move_mapping(mapping, newpage, page);
 360
 361         if (rc)
 362                 return rc;
 363
 364         migrate_page_copy(newpage, page);
 365         return 0;
 366 }
 367 EXPORT_SYMBOL(migrate_page);
 368
 369 /*
 370  * Migration function for pages with buffers. This function can only be used
 371  * if the underlying filesystem guarantees that no other references to "page"
 372  * exist.
 373  */
 374 int buffer_migrate_page(struct address_space *mapping,
 375                 struct page *newpage, struct page *page)
 376 {
 377         struct buffer_head *bh, *head;
 378         int rc;
 379
 380         if (!page_has_buffers(page))
 381                 return migrate_page(mapping, newpage, page);
 382
 383         head = page_buffers(page);
 384
 385         rc = migrate_page_move_mapping(mapping, newpage, page);
 386
 387         if (rc)
 388                 return rc;
 389
 390         bh = head;
 391         do {
 392                 get_bh(bh);
 393                 lock_buffer(bh);
 394                 bh = bh->b_this_page;
 395
 396         } while (bh != head);
 397
 398         ClearPagePrivate(page);
 399         set_page_private(newpage, page_private(page));
 400         set_page_private(page, 0);
 401         put_page(page);
 402         get_page(newpage);
 403
 404         bh = head;
 405         do {
 406                 set_bh_page(bh, newpage, bh_offset(bh));
 407                 bh = bh->b_this_page;
 408
 409         } while (bh != head);
 410
 411         SetPagePrivate(newpage);
 412
 413         migrate_page_copy(newpage, page);
 414
 415         bh = head;
 416         do {
 417                 unlock_buffer(bh);
 418                 put_bh(bh);
 419                 bh = bh->b_this_page;
 420
 421         } while (bh != head);
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(buffer_migrate_page);
 426
 427 static int fallback_migrate_page(struct address_space *mapping,
 428         struct page *newpage, struct page *page)
 429 {
 430         /*
 431          * Default handling if a filesystem does not provide
 432          * a migration function. We can only migrate clean
 433          * pages so try to write out any dirty pages first.
 434          */
 435         if (PageDirty(page)) {
 436                 switch (pageout(page, mapping)) {
 437                 case PAGE_KEEP:
 438                 case PAGE_ACTIVATE:
 439                         return -EAGAIN;
 440
 441                 case PAGE_SUCCESS:
 442                         /* Relock since we lost the lock */
 443                         lock_page(page);
 444                         /* Must retry since page state may have changed */
 445                         return -EAGAIN;
 446
 447                 case PAGE_CLEAN:
 448                         ; /* try to migrate the page below */
 449                 }
 450         }
 451
 452         /*
 453          * Buffers may be managed in a filesystem specific way.
 454          * We must have no buffers or drop them.
 455          */
 456         if (page_has_buffers(page) &&
 457             !try_to_release_page(page, GFP_KERNEL))
 458                 return -EAGAIN;
 459
 460         return migrate_page(mapping, newpage, page);
 461 }
 462
 463 /*
 464  * migrate_pages
 465  *
 466  * Two lists are passed to this function. The first list
 467  * contains the pages isolated from the LRU to be migrated.
 468  * The second list contains new pages that the pages isolated
 469  * can be moved to.
 470  *
 471  * The function returns after 10 attempts or if no pages
 472  * are movable anymore because to has become empty
 473  * or no retryable pages exist anymore.
 474  *
 475  * Return: Number of pages not migrated when "to" ran empty.
 476  */
 477 int migrate_pages(struct list_head *from, struct list_head *to,
 478                   struct list_head *moved, struct list_head *failed)
 479 {
 480         int retry;
 481         int nr_failed = 0;
 482         int pass = 0;
 483         struct page *page;
 484         struct page *page2;
 485         int swapwrite = current->flags & PF_SWAPWRITE;
 486         int rc;
 487
 488         if (!swapwrite)
 489                 current->flags |= PF_SWAPWRITE;
 490
 491 redo:
 492         retry = 0;
 493
 494         list_for_each_entry_safe(page, page2, from, lru) {
 495                 struct page *newpage = NULL;
 496                 struct address_space *mapping;
 497
 498                 cond_resched();
 499
 500                 rc = 0;
 501                 if (page_count(page) == 1)
 502                         /* page was freed from under us. So we are done. */
 503                         goto next;
 504
 505                 if (to && list_empty(to))
 506                         break;
 507
 508                 /*
 509                  * Skip locked pages during the first two passes to give the
 510                  * functions holding the lock time to release the page. Later we
 511                  * use lock_page() to have a higher chance of acquiring the
 512                  * lock.
 513                  */
 514                 rc = -EAGAIN;
 515                 if (pass > 2)
 516                         lock_page(page);
 517                 else
 518                         if (TestSetPageLocked(page))
 519                                 goto next;
 520
 521                 /*
 522                  * Only wait on writeback if we have already done a pass where
 523                  * we we may have triggered writeouts for lots of pages.
 524                  */
 525                 if (pass > 0)
 526                         wait_on_page_writeback(page);
 527                 else
 528                         if (PageWriteback(page))
 529                                 goto unlock_page;
 530
 531                 /*
 532                  * Establish migration ptes or remove ptes
 533                  */
 534                 rc = -EPERM;
 535                 if (try_to_unmap(page, 1) == SWAP_FAIL)
 536                         /* A vma has VM_LOCKED set -> permanent failure */
 537                         goto unlock_page;
 538
 539                 rc = -EAGAIN;
 540                 if (page_mapped(page))
 541                         goto unlock_page;
 542
 543                 newpage = lru_to_page(to);
 544                 lock_page(newpage);
 545                 /* Prepare mapping for the new page.*/
 546                 newpage->index = page->index;
 547                 newpage->mapping = page->mapping;
 548
 549                 /*
 550                  * Pages are properly locked and writeback is complete.
 551                  * Try to migrate the page.
 552                  */
 553                 mapping = page_mapping(page);
 554                 if (!mapping)
 555                         rc = migrate_page(mapping, newpage, page);
 556
 557                 else if (mapping->a_ops->migratepage)
 558                         /*
 559                          * Most pages have a mapping and most filesystems
 560                          * should provide a migration function. Anonymous
 561                          * pages are part of swap space which also has its
 562                          * own migration function. This is the most common
 563                          * path for page migration.
 564                          */
 565                         rc = mapping->a_ops->migratepage(mapping,
 566                                                         newpage, page);
 567                 else
 568                         rc = fallback_migrate_page(mapping, newpage, page);
 569
 570                 if (!rc)
 571                         remove_migration_ptes(page, newpage);
 572
 573                 unlock_page(newpage);
 574
 575 unlock_page:
 576                 if (rc)
 577                         remove_migration_ptes(page, page);
 578
 579                 unlock_page(page);
 580
 581 next:
 582                 if (rc) {
 583                         if (newpage)
 584                                 newpage->mapping = NULL;
 585
 586                         if (rc == -EAGAIN)
 587                                 retry++;
 588                         else {
 589                                 /* Permanent failure */
 590                                 list_move(&page->lru, failed);
 591                                 nr_failed++;
 592                         }
 593                 } else {
 594                         if (newpage) {
 595                                 /* Successful migration. Return page to LRU */
 596                                 move_to_lru(newpage);
 597                         }
 598                         list_move(&page->lru, moved);
 599                 }
 600         }
 601         if (retry && pass++ < 10)
 602                 goto redo;
 603
 604         if (!swapwrite)
 605                 current->flags &= ~PF_SWAPWRITE;
 606
 607         return nr_failed + retry;
 608 }
 609
 610 /*
 611  * Migrate the list 'pagelist' of pages to a certain destination.
 612  *
 613  * Specify destination with either non-NULL vma or dest_node >= 0
 614  * Return the number of pages not migrated or error code
 615  */
 616 int migrate_pages_to(struct list_head *pagelist,
 617                         struct vm_area_struct *vma, int dest)
 618 {
 619         LIST_HEAD(newlist);
 620         LIST_HEAD(moved);
 621         LIST_HEAD(failed);
 622         int err = 0;
 623         unsigned long offset = 0;
 624         int nr_pages;
 625         struct page *page;
 626         struct list_head *p;
 627
 628 redo:
 629         nr_pages = 0;
 630         list_for_each(p, pagelist) {
 631                 if (vma) {
 632                         /*
 633                          * The address passed to alloc_page_vma is used to
 634                          * generate the proper interleave behavior. We fake
 635                          * the address here by an increasing offset in order
 636                          * to get the proper distribution of pages.
 637                          *
 638                          * No decision has been made as to which page
 639                          * a certain old page is moved to so we cannot
 640                          * specify the correct address.
 641                          */
 642                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 643                                         offset + vma->vm_start);
 644                         offset += PAGE_SIZE;
 645                 }
 646                 else
 647                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 648
 649                 if (!page) {
 650                         err = -ENOMEM;
 651                         goto out;
 652                 }
 653                 list_add_tail(&page->lru, &newlist);
 654                 nr_pages++;
 655                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 656                         break;
 657         }
 658         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 659
 660         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 661
 662         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 663                 goto redo;
 664 out:
 665         /* Return leftover allocated pages */
 666         while (!list_empty(&newlist)) {
 667                 page = list_entry(newlist.next, struct page, lru);
 668                 list_del(&page->lru);
 669                 __free_page(page);
 670         }
 671         list_splice(&failed, pagelist);
 672         if (err < 0)
 673                 return err;
 674
 675         /* Calculate number of leftover pages */
 676         nr_pages = 0;
 677         list_for_each(p, pagelist)
 678                 nr_pages++;
 679         return nr_pages;
 680 }