1 /******************************************************************************
 
   4  * Xen balloon driver - enables returning/claiming memory to/from Xen.
 
   6  * Copyright (c) 2003, B Dragovic
 
   7  * Copyright (c) 2003-2004, M Williamson, K Fraser
 
   8  * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 
  10  * This program is free software; you can redistribute it and/or
 
  11  * modify it under the terms of the GNU General Public License version 2
 
  12  * as published by the Free Software Foundation; or, when distributed
 
  13  * separately from the Linux kernel or incorporated into other
 
  14  * software packages, subject to the following license:
 
  16  * Permission is hereby granted, free of charge, to any person obtaining a copy
 
  17  * of this source file (the "Software"), to deal in the Software without
 
  18  * restriction, including without limitation the rights to use, copy, modify,
 
  19  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 
  20  * and to permit persons to whom the Software is furnished to do so, subject to
 
  21  * the following conditions:
 
  23  * The above copyright notice and this permission notice shall be included in
 
  24  * all copies or substantial portions of the Software.
 
  26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 
  27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 
  28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 
  29  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 
  30  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 
  31  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 
  35 #include <linux/kernel.h>
 
  36 #include <linux/module.h>
 
  37 #include <linux/sched.h>
 
  38 #include <linux/errno.h>
 
  40 #include <linux/bootmem.h>
 
  41 #include <linux/pagemap.h>
 
  42 #include <linux/highmem.h>
 
  43 #include <linux/mutex.h>
 
  44 #include <linux/highmem.h>
 
  45 #include <linux/list.h>
 
  46 #include <linux/sysdev.h>
 
  48 #include <asm/xen/hypervisor.h>
 
  50 #include <asm/pgalloc.h>
 
  51 #include <asm/pgtable.h>
 
  52 #include <asm/uaccess.h>
 
  55 #include <xen/interface/memory.h>
 
  56 #include <xen/balloon.h>
 
  57 #include <xen/xenbus.h>
 
  58 #include <xen/features.h>
 
  61 #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
 
  63 #define BALLOON_CLASS_NAME "xen_memory"
 
  /*
   * Bookkeeping for the balloon driver. Every counter is a page count.
   */
  struct balloon_stats {
          /* The driver works towards current_pages == target_pages. */
          unsigned long current_pages;
          unsigned long target_pages;
          /* Xen may impose a hard limit; once it is hit it is remembered here. */
          unsigned long hard_limit;
          /*
           * Other drivers may change the memory reservation on their own, but
           * they must tell the balloon driver so the hard limit is not hit.
           */
          unsigned long driver_pages;
          /* Pages currently held in the low-memory and high-memory balloons. */
          unsigned long balloon_low;
          unsigned long balloon_high;
  };
 
  81 static DEFINE_MUTEX(balloon_mutex);
 
  83 static struct sys_device balloon_sysdev;
 
  85 static int register_balloon(struct sys_device *sysdev);
 
  88  * Protects atomic reservation decrease/increase against concurrent increases.
 
  89  * Also protects non-atomic updates of current_pages and driver_pages, and
 
  92 static DEFINE_SPINLOCK(balloon_lock);
 
  94 static struct balloon_stats balloon_stats;
 
  96 /* We increase/decrease in batches which fit in a page */
 
  97 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
  99 /* VM /proc information for memory */
 
 100 extern unsigned long totalram_pages;
 
 102 #ifdef CONFIG_HIGHMEM
 
 103 extern unsigned long totalhigh_pages;
 
 104 #define inc_totalhigh_pages() (totalhigh_pages++)
 
 105 #define dec_totalhigh_pages() (totalhigh_pages--)
 
 107 #define inc_totalhigh_pages() do {} while(0)
 
 108 #define dec_totalhigh_pages() do {} while(0)
 
 111 /* List of ballooned pages, threaded through the mem_map array. */
 
 112 static LIST_HEAD(ballooned_pages);
 
 114 /* Main work function, always executed in process context. */
 
 115 static void balloon_process(struct work_struct *work);
 
 116 static DECLARE_WORK(balloon_worker, balloon_process);
 
 117 static struct timer_list balloon_timer;
 
 119 /* When ballooning out (allocating memory to return to Xen) we don't really
 
 120    want the kernel to try too hard since that can trigger the oom killer. */
 
 121 #define GFP_BALLOON \
 
 122         (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
 
 /*
  * scrub_page: clear the contents of @page when the kernel is built with
  * CONFIG_XEN_SCRUB_PAGES; without that option this function does nothing.
  */
 static void scrub_page(struct page *page)
 {
 #ifdef CONFIG_XEN_SCRUB_PAGES
         /*
          * clear_highpage() takes care of mapping and unmapping the page
          * itself, so no separate highmem/lowmem branches are needed here.
          */
         clear_highpage(page);
 #endif
 }
 
 138 /* balloon_append: add the given page to the balloon. */
 
 139 static void balloon_append(struct page *page)
 
 141         /* Lowmem is re-populated first, so highmem pages go at list tail. */
 
 142         if (PageHighMem(page)) {
 
 143                 list_add_tail(&page->lru, &ballooned_pages);
 
 144                 balloon_stats.balloon_high++;
 
 145                 dec_totalhigh_pages();
 
 147                 list_add(&page->lru, &ballooned_pages);
 
 148                 balloon_stats.balloon_low++;
 
 152 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
 
 153 static struct page *balloon_retrieve(void)
 
 157         if (list_empty(&ballooned_pages))
 
 160         page = list_entry(ballooned_pages.next, struct page, lru);
 
 161         list_del(&page->lru);
 
 163         if (PageHighMem(page)) {
 
 164                 balloon_stats.balloon_high--;
 
 165                 inc_totalhigh_pages();
 
 168                 balloon_stats.balloon_low--;
 
 173 static struct page *balloon_first_page(void)
 
 175         if (list_empty(&ballooned_pages))
 
 177         return list_entry(ballooned_pages.next, struct page, lru);
 
 180 static struct page *balloon_next_page(struct page *page)
 
 182         struct list_head *next = page->lru.next;
 
 183         if (next == &ballooned_pages)
 
 185         return list_entry(next, struct page, lru);
 
 188 static void balloon_alarm(unsigned long unused)
 
 190         schedule_work(&balloon_worker);
 
 193 static unsigned long current_target(void)
 
 195         unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
 
 198                      balloon_stats.current_pages +
 
 199                      balloon_stats.balloon_low +
 
 200                      balloon_stats.balloon_high);
 
 /*
  * increase_reservation: take up to nr_pages pages back out of the balloon.
  *
  * The request is clamped to the capacity of frame_list. Under balloon_lock,
  * the PFNs of the first nr_pages ballooned pages are written to frame_list
  * and handed to Xen with XENMEM_populate_physmap. Each page is then
  * retrieved from the balloon, its P2M entry is set from frame_list,
  * low-memory pages (pfn < max_low_pfn) are mapped back in through
  * HYPERVISOR_update_va_mapping, and its reserved flag and page count are
  * reset. Finally current_pages and totalram_pages grow by nr_pages.
  *
  * On the "hit the Xen hard limit" path the rc extents are handed back with
  * XENMEM_decrease_reservation and hard_limit is recomputed.
  * NOTE(review): the conditions guarding that path are not visible in this
  * excerpt - confirm against the full source.
  */
 205 static int increase_reservation(unsigned long nr_pages)
 
 207         unsigned long  pfn, i, flags;
 
 210         struct xen_memory_reservation reservation = {
 
 216         if (nr_pages > ARRAY_SIZE(frame_list))
 
 217                 nr_pages = ARRAY_SIZE(frame_list);
 
 219         spin_lock_irqsave(&balloon_lock, flags);
 
 221         page = balloon_first_page();
 
 222         for (i = 0; i < nr_pages; i++) {
 
 223                 BUG_ON(page == NULL);
 
 /* NOTE(review): the doubled ';' below is a harmless empty statement; worth removing. */
 224                 frame_list[i] = page_to_pfn(page);;
 
 225                 page = balloon_next_page(page);
 
 228         set_xen_guest_handle(reservation.extent_start, frame_list);
 
 229         reservation.nr_extents   = nr_pages;
 
 230         rc = HYPERVISOR_memory_op(
 
 231                 XENMEM_populate_physmap, &reservation);
 
 236                         /* We hit the Xen hard limit: reprobe. */
 
 237                         reservation.nr_extents = rc;
 
 238                         ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 
 243                         balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
 
 244                                                     balloon_stats.driver_pages);
 
 248         for (i = 0; i < nr_pages; i++) {
 
 249                 page = balloon_retrieve();
 
 250                 BUG_ON(page == NULL);
 
 252                 pfn = page_to_pfn(page);
 
 253                 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
 
 254                        phys_to_machine_mapping_valid(pfn));
 
 256                 set_phys_to_machine(pfn, frame_list[i]);
 
 258                 /* Link back into the page tables if not highmem. */
 
 259                 if (pfn < max_low_pfn) {
 
 261                         ret = HYPERVISOR_update_va_mapping(
 
 262                                 (unsigned long)__va(pfn << PAGE_SHIFT),
 
 263                                 mfn_pte(frame_list[i], PAGE_KERNEL),
 
 268                 /* Relinquish the page back to the allocator. */
 
 269                 ClearPageReserved(page);
 
 270                 init_page_count(page);
 
 274         balloon_stats.current_pages += nr_pages;
 
 275         totalram_pages = balloon_stats.current_pages;
 
 278         spin_unlock_irqrestore(&balloon_lock, flags);
 
 /*
  * decrease_reservation: give up to nr_pages pages to Xen.
  *
  * The request is clamped to the capacity of frame_list. Pages are obtained
  * with alloc_page(GFP_BALLOON) and the machine frame of each one
  * (pfn_to_mfn) is recorded in frame_list. Then, under balloon_lock, every
  * recorded frame has its P2M entry invalidated and its page appended to the
  * balloon, the whole batch is released with XENMEM_decrease_reservation
  * (anything other than nr_pages released is a BUG), and current_pages and
  * totalram_pages shrink by nr_pages.
  *
  * NOTE(review): the handling of a failed alloc_page() is not visible in
  * this excerpt - confirm against the full source.
  */
 283 static int decrease_reservation(unsigned long nr_pages)
 
 285         unsigned long  pfn, i, flags;
 
 289         struct xen_memory_reservation reservation = {
 
 295         if (nr_pages > ARRAY_SIZE(frame_list))
 
 296                 nr_pages = ARRAY_SIZE(frame_list);
 
 298         for (i = 0; i < nr_pages; i++) {
 
 299                 if ((page = alloc_page(GFP_BALLOON)) == NULL) {
 
 305                 pfn = page_to_pfn(page);
 
 306                 frame_list[i] = pfn_to_mfn(pfn);
 
 311         /* Ensure that ballooned highmem pages don't have kmaps. */
 
 315         spin_lock_irqsave(&balloon_lock, flags);
 
 317         /* No more mappings: invalidate P2M and add to balloon. */
 
 318         for (i = 0; i < nr_pages; i++) {
 
 319                 pfn = mfn_to_pfn(frame_list[i]);
 
 320                 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 
 321                 balloon_append(pfn_to_page(pfn));
 
 324         set_xen_guest_handle(reservation.extent_start, frame_list);
 
 325         reservation.nr_extents   = nr_pages;
 
 326         ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
 
 327         BUG_ON(ret != nr_pages);
 
 329         balloon_stats.current_pages -= nr_pages;
 
 330         totalram_pages = balloon_stats.current_pages;
 
 332         spin_unlock_irqrestore(&balloon_lock, flags);
 
 338  * We avoid multiple worker processes conflicting via the balloon mutex.
 
 339  * We may of course race updates of the target counts (which are protected
 
 340  * by the balloon lock), or with changes to the Xen hard limit, but we will
 
 341  * recover from these in time.
 
 /*
  * balloon_process: work function behind balloon_worker.
  *
  * With balloon_mutex held it loops: credit is the difference between
  * current_target() and current_pages, and the loop calls
  * increase_reservation(credit) or decrease_reservation(-credit). The loop
  * ends once credit is zero or a call returned non-zero (need_sleep).
  * If the target is still not met afterwards, balloon_timer is re-armed to
  * fire HZ jiffies from now so the work is retried.
  */
 343 static void balloon_process(struct work_struct *work)
 
 348         mutex_lock(&balloon_mutex);
 
 351                 credit = current_target() - balloon_stats.current_pages;
 
 353                         need_sleep = (increase_reservation(credit) != 0);
 
 355                         need_sleep = (decrease_reservation(-credit) != 0);
 
 357 #ifndef CONFIG_PREEMPT
 
 361         } while ((credit != 0) && !need_sleep);
 
 363         /* Schedule more work if there is some still to be done. */
 
 364         if (current_target() != balloon_stats.current_pages)
 
 365                 mod_timer(&balloon_timer, jiffies + HZ);
 
 367         mutex_unlock(&balloon_mutex);
 
 370 /* Resets the Xen limit, sets new target, and kicks off processing. */
 
 371 static void balloon_set_new_target(unsigned long target)
 
 373         /* No need for lock. Not read-modify-write updates. */
 
 374         balloon_stats.hard_limit   = ~0UL;
 
 375         balloon_stats.target_pages = target;
 
 376         schedule_work(&balloon_worker);
 
 379 static struct xenbus_watch target_watch =
 
 381         .node = "memory/target"
 
 /*
  * watch_target: callback installed on target_watch.
  *
  * Reads "memory/target" with xenbus_scanf() as an unsigned long long and
  * passes it on to balloon_set_new_target(), shifted right by
  * (PAGE_SHIFT - 10) to turn KiB into pages.
  */
 384 /* React to a change in the target key */
 
 385 static void watch_target(struct xenbus_watch *watch,
 
 386                          const char **vec, unsigned int len)
 
 388         unsigned long long new_target;
 
 391         err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
 
 393                 /* This is ok (for domain0 at least) - so just return */
 
 397         /* The given memory/target value is in KiB, so it needs converting to
 
 398          * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 
 400         balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
 
 /*
  * balloon_init_watcher: notifier callback (installed as
  * xenstore_notifier.notifier_call) that registers target_watch with xenbus
  * and logs "Failed to set balloon watcher" on the error path.
  * NOTE(review): the remaining parameters and the return value are not
  * visible in this excerpt.
  */
 403 static int balloon_init_watcher(struct notifier_block *notifier,
 
 409         err = register_xenbus_watch(&target_watch);
 
 411                 printk(KERN_ERR "Failed to set balloon watcher\n");
 
 416 static struct notifier_block xenstore_notifier;
 
 /*
  * balloon_init: driver initialisation.
  *
  * After checking is_running_on_xen(), it seeds balloon_stats:
  * current_pages is min(xen_start_info->nr_pages, max_pfn), target_pages
  * starts equal to it, both balloons are empty and there is no hard limit.
  * It then prepares balloon_timer with balloon_alarm as its handler,
  * registers the sysdev, appends every non-reserved page between
  * xen_start_info->nr_pages and max_pfn to the balloon, and finally hooks
  * watch_target / balloon_init_watcher into xenstore.
  */
 418 static int __init balloon_init(void)
 
 423         if (!is_running_on_xen())
 
 426         pr_info("xen_balloon: Initialising balloon driver.\n");
 
 428         balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
 
 429         totalram_pages   = balloon_stats.current_pages;
 
 430         balloon_stats.target_pages  = balloon_stats.current_pages;
 
 431         balloon_stats.balloon_low   = 0;
 
 432         balloon_stats.balloon_high  = 0;
 
 433         balloon_stats.driver_pages  = 0UL;
 
 434         balloon_stats.hard_limit    = ~0UL;
 
 436         init_timer(&balloon_timer);
 
 437         balloon_timer.data = 0;
 
 438         balloon_timer.function = balloon_alarm;
 
 /* NOTE(review): the return value of register_balloon() is ignored here. */
 440         register_balloon(&balloon_sysdev);
 
 442         /* Initialise the balloon with excess memory space. */
 
 443         for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
 
 444                 page = pfn_to_page(pfn);
 
 445                 if (!PageReserved(page))
 
 446                         balloon_append(page);
 
 449         target_watch.callback = watch_target;
 
 450         xenstore_notifier.notifier_call = balloon_init_watcher;
 
 452         register_xenstore_notifier(&xenstore_notifier);
 
 457 subsys_initcall(balloon_init);
 
 /*
  * balloon_exit: module exit hook. Currently a placeholder - nothing is
  * torn down yet.
  */
 static void balloon_exit(void)
 {
         /* XXX - release balloon here */
 }
 
 465 module_exit(balloon_exit);
 
 467 static void balloon_update_driver_allowance(long delta)
 
 471         spin_lock_irqsave(&balloon_lock, flags);
 
 472         balloon_stats.driver_pages += delta;
 
 473         spin_unlock_irqrestore(&balloon_lock, flags);
 
 /*
  * dealloc_pte_fn: per-PTE callback (passed to apply_to_page_range() by
  * alloc_empty_pages_and_pagevec()).
  *
  * It reads the machine frame out of *pte, clears the PTE in init_mm,
  * invalidates the P2M entry for @addr and returns that single frame to Xen
  * with XENMEM_decrease_reservation. @pmd_page and @data are not used by
  * the visible lines.
  */
 476 static int dealloc_pte_fn(
 
 477         pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
 
 479         unsigned long mfn = pte_mfn(*pte);
 
 481         struct xen_memory_reservation reservation = {
 
 486         set_xen_guest_handle(reservation.extent_start, &mfn);
 
 487         set_pte_at(&init_mm, addr, pte, __pte_ma(0ull));
 
 488         set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
 
 489         ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
 
 /*
  * alloc_empty_pages_and_pagevec: build a vector of nr_pages pages whose
  * backing frames have been handed to Xen.
  *
  * The vector is kmalloc'ed and each entry is filled with alloc_page().
  * Per page, under balloon_lock, the backing frame is released: with
  * XENFEAT_auto_translated_physmap by XENMEM_decrease_reservation on the
  * page's PFN, otherwise by running dealloc_pte_fn over the page's kernel
  * mapping with apply_to_page_range(). current_pages and totalram_pages
  * are decremented for each page, and the balloon worker is queued at the
  * end.
  *
  * NOTE(review): the allocation-failure checks and the exact set of pages
  * appended to the balloon on the error path are not visible in this
  * excerpt - confirm against the full source.
  */
 494 static struct page **alloc_empty_pages_and_pagevec(int nr_pages)
 
 496         unsigned long vaddr, flags;
 
 497         struct page *page, **pagevec;
 
 500         pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
 
 504         for (i = 0; i < nr_pages; i++) {
 
 505                 page = pagevec[i] = alloc_page(GFP_KERNEL);
 
 509                 vaddr = (unsigned long)page_address(page);
 
 513                 spin_lock_irqsave(&balloon_lock, flags);
 
 515                 if (xen_feature(XENFEAT_auto_translated_physmap)) {
 
 516                         unsigned long gmfn = page_to_pfn(page);
 
 517                         struct xen_memory_reservation reservation = {
 
 522                         set_xen_guest_handle(reservation.extent_start, &gmfn);
 
 523                         ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 
 526                                 ret = 0; /* success */
 
 528                         ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
 
 529                                                   dealloc_pte_fn, NULL);
 
 533                         spin_unlock_irqrestore(&balloon_lock, flags);
 
 538                 totalram_pages = --balloon_stats.current_pages;
 
 540                 spin_unlock_irqrestore(&balloon_lock, flags);
 
 544         schedule_work(&balloon_worker);
 
 549         spin_lock_irqsave(&balloon_lock, flags);
 
 551                 balloon_append(pagevec[i]);
 
 552         spin_unlock_irqrestore(&balloon_lock, flags);
 
 /*
  * free_empty_pages_and_pagevec: give the nr_pages pages in @pagevec to the
  * balloon.
  *
  * Under balloon_lock every page must have a page count of exactly 1
  * (anything else is a BUG) and is appended to the balloon; afterwards the
  * balloon worker is queued.
  */
 558 static void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
 
 566         spin_lock_irqsave(&balloon_lock, flags);
 
 567         for (i = 0; i < nr_pages; i++) {
 
 568                 BUG_ON(page_count(pagevec[i]) != 1);
 
 569                 balloon_append(pagevec[i]);
 
 571         spin_unlock_irqrestore(&balloon_lock, flags);
 
 575         schedule_work(&balloon_worker);
 
 578 static void balloon_release_driver_page(struct page *page)
 
 582         spin_lock_irqsave(&balloon_lock, flags);
 
 583         balloon_append(page);
 
 584         balloon_stats.driver_pages--;
 
 585         spin_unlock_irqrestore(&balloon_lock, flags);
 
 587         schedule_work(&balloon_worker);
 
 /*
  * BALLOON_SHOW(name, format, args...): define show_<name>(), which
  * sprintf()s @format with @args into the sysfs buffer, together with a
  * read-only (S_IRUGO) SYSDEV_ATTR named <name> that uses it.
  */
 591 #define BALLOON_SHOW(name, format, args...)                     \
 
 592         static ssize_t show_##name(struct sys_device *dev,      \
 
 595                 return sprintf(buf, format, ##args);            \
 
 597         static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
 
 599 BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
 
 600 BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
 
 601 BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
 
 602 BALLOON_SHOW(hard_limit_kb,
 
 603              (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
 
 604              (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
 
 605 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
 
 607 static ssize_t show_target_kb(struct sys_device *dev, char *buf)
 
 609         return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
 
 612 static ssize_t store_target_kb(struct sys_device *dev,
 
 613                                struct sysdev_attribute *attr,
 
 617         char memstring[64], *endchar;
 
 618         unsigned long long target_bytes;
 
 620         if (!capable(CAP_SYS_ADMIN))
 
 624                 return -EBADMSG; /* runt */
 
 625         if (count > sizeof(memstring))
 
 626                 return -EFBIG;   /* too long */
 
 627         strcpy(memstring, buf);
 
 629         target_bytes = memparse(memstring, &endchar);
 
 630         balloon_set_new_target(target_bytes >> PAGE_SHIFT);
 
 635 static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
 
 636                    show_target_kb, store_target_kb);
 
 638 static struct sysdev_attribute *balloon_attrs[] = {
 
 642 static struct attribute *balloon_info_attrs[] = {
 
 643         &attr_current_kb.attr,
 
 646         &attr_hard_limit_kb.attr,
 
 647         &attr_driver_kb.attr,
 
 651 static struct attribute_group balloon_info_group = {
 
 653         .attrs = balloon_info_attrs,
 
 656 static struct sysdev_class balloon_sysdev_class = {
 
 657         .name = BALLOON_CLASS_NAME,
 
 /*
  * register_balloon: expose the balloon through sysfs.
  *
  * In order: registers balloon_sysdev_class, attaches it to @sysdev and
  * registers the device, creates one file per entry of balloon_attrs, and
  * creates the balloon_info_group attribute group. The failure path removes
  * attribute files and unregisters the device and the class again.
  * NOTE(review): the error checks and the return statements are not visible
  * in this excerpt.
  */
 660 static int register_balloon(struct sys_device *sysdev)
 
 664         error = sysdev_class_register(&balloon_sysdev_class);
 
 669         sysdev->cls = &balloon_sysdev_class;
 
 671         error = sysdev_register(sysdev);
 
 673                 sysdev_class_unregister(&balloon_sysdev_class);
 
 677         for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
 
 678                 error = sysdev_create_file(sysdev, balloon_attrs[i]);
 
 683         error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
 
 691                 sysdev_remove_file(sysdev, balloon_attrs[i]);
 
 692         sysdev_unregister(sysdev);
 
 693         sysdev_class_unregister(&balloon_sysdev_class);
 
 697 static void unregister_balloon(struct sys_device *sysdev)
 
 701         sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
 
 702         for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
 
 703                 sysdev_remove_file(sysdev, balloon_attrs[i]);
 
 704         sysdev_unregister(sysdev);
 
 705         sysdev_class_unregister(&balloon_sysdev_class);
 
 708 static void balloon_sysfs_exit(void)
 
 710         unregister_balloon(&balloon_sysdev);
 
 713 MODULE_LICENSE("GPL");