[PATCH] KVM: cpu hotplug support
drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
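/*
 * A minimal userspace sketch of how this driver is driven (illustrative
 * only; it assumes the /dev/kvm character device and the KVM_CREATE_VCPU
 * and KVM_RUN ioctls as declared in this era's <linux/kvm.h>).  Each
 * open() of /dev/kvm creates one VM; vcpus and guest memory are then set
 * up through further ioctls on that file descriptor:
 *
 *     #include <fcntl.h>
 *     #include <sys/ioctl.h>
 *     #include <linux/kvm.h>
 *
 *     int vm_fd = open("/dev/kvm", O_RDWR);     // one VM per open()
 *     ioctl(vm_fd, KVM_CREATE_VCPU, 0);         // create vcpu slot 0
 *     // ... KVM_SET_MEMORY_REGION, KVM_SET_REGS, then KVM_RUN ...
 */
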
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <asm/processor.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <asm/msr.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <asm/uaccess.h>
31 #include <linux/reboot.h>
32 #include <asm/io.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <asm/desc.h>
37 #include <linux/cpu.h>
38
39 #include "x86_emulate.h"
40 #include "segment_descriptor.h"
41
42 MODULE_AUTHOR("Qumranet");
43 MODULE_LICENSE("GPL");
44
45 static DEFINE_SPINLOCK(kvm_lock);
46 static LIST_HEAD(vm_list);
47
48 struct kvm_arch_ops *kvm_arch_ops;
49 struct kvm_stat kvm_stat;
50 EXPORT_SYMBOL_GPL(kvm_stat);
51
52 static struct kvm_stats_debugfs_item {
53         const char *name;
54         u32 *data;
55         struct dentry *dentry;
56 } debugfs_entries[] = {
57         { "pf_fixed", &kvm_stat.pf_fixed },
58         { "pf_guest", &kvm_stat.pf_guest },
59         { "tlb_flush", &kvm_stat.tlb_flush },
60         { "invlpg", &kvm_stat.invlpg },
61         { "exits", &kvm_stat.exits },
62         { "io_exits", &kvm_stat.io_exits },
63         { "mmio_exits", &kvm_stat.mmio_exits },
64         { "signal_exits", &kvm_stat.signal_exits },
65         { "irq_window", &kvm_stat.irq_window_exits },
66         { "halt_exits", &kvm_stat.halt_exits },
67         { "request_irq", &kvm_stat.request_irq_exits },
68         { "irq_exits", &kvm_stat.irq_exits },
69         { NULL, NULL }
70 };
71
72 static struct dentry *debugfs_dir;
73
74 #define MAX_IO_MSRS 256
75
76 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
77 #define LMSW_GUEST_MASK 0x0eULL
78 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
79 #define CR8_RESEVED_BITS (~0x0fULL)
80 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
81
82 #ifdef CONFIG_X86_64
83 /* LDT or TSS descriptor in the GDT. 16 bytes. */
84 struct segment_descriptor_64 {
85         struct segment_descriptor s;
86         u32 base_higher;
87         u32 pad_zero;
88 };
89
90 #endif
91
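/*
 * Return the base address encoded in the host descriptor that @selector
 * refers to, walking the GDT (or the LDT when selector bit 2 is set) via
 * SGDT/SLDT.  On x86-64, system descriptors (LDT/TSS) are 16 bytes, so
 * the extra high base bits are folded in as well.
 */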
92 unsigned long segment_base(u16 selector)
93 {
94         struct descriptor_table gdt;
95         struct segment_descriptor *d;
96         unsigned long table_base;
97         typedef unsigned long ul;
98         unsigned long v;
99
100         if (selector == 0)
101                 return 0;
102
103         asm ("sgdt %0" : "=m"(gdt));
104         table_base = gdt.base;
105
106         if (selector & 4) {           /* from ldt */
107                 u16 ldt_selector;
108
109                 asm ("sldt %0" : "=g"(ldt_selector));
110                 table_base = segment_base(ldt_selector);
111         }
112         d = (struct segment_descriptor *)(table_base + (selector & ~7));
113         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
114 #ifdef CONFIG_X86_64
115         if (d->system == 0
116             && (d->type == 2 || d->type == 9 || d->type == 11))
117                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
118 #endif
119         return v;
120 }
121 EXPORT_SYMBOL_GPL(segment_base);
122
123 static inline int valid_vcpu(int n)
124 {
125         return likely(n >= 0 && n < KVM_MAX_VCPUS);
126 }
127
128 int kvm_read_guest(struct kvm_vcpu *vcpu,
129                              gva_t addr,
130                              unsigned long size,
131                              void *dest)
132 {
133         unsigned char *host_buf = dest;
134         unsigned long req_size = size;
135
136         while (size) {
137                 hpa_t paddr;
138                 unsigned now;
139                 unsigned offset;
140                 hva_t guest_buf;
141
142                 paddr = gva_to_hpa(vcpu, addr);
143
144                 if (is_error_hpa(paddr))
145                         break;
146
147                 guest_buf = (hva_t)kmap_atomic(
148                                         pfn_to_page(paddr >> PAGE_SHIFT),
149                                         KM_USER0);
150                 offset = addr & ~PAGE_MASK;
151                 guest_buf |= offset;
152                 now = min(size, PAGE_SIZE - offset);
153                 memcpy(host_buf, (void*)guest_buf, now);
154                 host_buf += now;
155                 addr += now;
156                 size -= now;
157                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
158         }
159         return req_size - size;
160 }
161 EXPORT_SYMBOL_GPL(kvm_read_guest);
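
/*
 * Callers hand kvm_read_guest()/kvm_write_guest() a guest *virtual*
 * address; the loop above walks it page by page through gva_to_hpa() and
 * copies as much as is mapped, returning the number of bytes actually
 * transferred.  A hypothetical in-kernel caller (sketch only; the names
 * here are made up for illustration):
 *
 *     struct example_desc d;
 *     if (kvm_read_guest(vcpu, gva, sizeof(d), &d) != sizeof(d))
 *             return -EFAULT;        // guest range partially unmapped
 */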
162
163 int kvm_write_guest(struct kvm_vcpu *vcpu,
164                              gva_t addr,
165                              unsigned long size,
166                              void *data)
167 {
168         unsigned char *host_buf = data;
169         unsigned long req_size = size;
170
171         while (size) {
172                 hpa_t paddr;
173                 unsigned now;
174                 unsigned offset;
175                 hva_t guest_buf;
176
177                 paddr = gva_to_hpa(vcpu, addr);
178
179                 if (is_error_hpa(paddr))
180                         break;
181
182                 guest_buf = (hva_t)kmap_atomic(
183                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
184                 offset = addr & ~PAGE_MASK;
185                 guest_buf |= offset;
186                 now = min(size, PAGE_SIZE - offset);
187                 memcpy((void*)guest_buf, host_buf, now);
188                 host_buf += now;
189                 addr += now;
190                 size -= now;
191                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
192         }
193         return req_size - size;
194 }
195 EXPORT_SYMBOL_GPL(kvm_write_guest);
196
197 static int vcpu_slot(struct kvm_vcpu *vcpu)
198 {
199         return vcpu - vcpu->kvm->vcpus;
200 }
201
202 /*
203  * Switches to the specified vcpu, until a matching vcpu_put().
204  */
205 static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
206 {
207         struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
208
209         mutex_lock(&vcpu->mutex);
210         if (unlikely(!vcpu->vmcs)) {
211                 mutex_unlock(&vcpu->mutex);
212                 return NULL;
213         }
214         return kvm_arch_ops->vcpu_load(vcpu);
215 }
216
217 static void vcpu_put(struct kvm_vcpu *vcpu)
218 {
219         kvm_arch_ops->vcpu_put(vcpu);
220         mutex_unlock(&vcpu->mutex);
221 }
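
/*
 * The pairing above is the locking pattern used throughout this file:
 * vcpu_load() takes the vcpu mutex and returns NULL if that slot was
 * never created (no vmcs), so callers must check the result before
 * touching vcpu state, e.g.:
 *
 *     struct kvm_vcpu *vcpu = vcpu_load(kvm, slot);
 *     if (!vcpu)
 *             return -ENOENT;
 *     // ... operate on vcpu ...
 *     vcpu_put(vcpu);
 */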
222
223 static int kvm_dev_open(struct inode *inode, struct file *filp)
224 {
225         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
226         int i;
227
228         if (!kvm)
229                 return -ENOMEM;
230
231         spin_lock_init(&kvm->lock);
232         INIT_LIST_HEAD(&kvm->active_mmu_pages);
233         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
234                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
235
236                 mutex_init(&vcpu->mutex);
237                 vcpu->cpu = -1;
238                 vcpu->kvm = kvm;
239                 vcpu->mmu.root_hpa = INVALID_PAGE;
240                 INIT_LIST_HEAD(&vcpu->free_pages);
241                 spin_lock(&kvm_lock);
242                 list_add(&kvm->vm_list, &vm_list);
243                 spin_unlock(&kvm_lock);
244         }
245         filp->private_data = kvm;
246         return 0;
247 }
248
249 /*
250  * Free any memory in @free but not in @dont.
251  */
252 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
253                                   struct kvm_memory_slot *dont)
254 {
255         int i;
256
257         if (!dont || free->phys_mem != dont->phys_mem)
258                 if (free->phys_mem) {
259                         for (i = 0; i < free->npages; ++i)
260                                 if (free->phys_mem[i])
261                                         __free_page(free->phys_mem[i]);
262                         vfree(free->phys_mem);
263                 }
264
265         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
266                 vfree(free->dirty_bitmap);
267
268         free->phys_mem = NULL;
269         free->npages = 0;
270         free->dirty_bitmap = NULL;
271 }
272
273 static void kvm_free_physmem(struct kvm *kvm)
274 {
275         int i;
276
277         for (i = 0; i < kvm->nmemslots; ++i)
278                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
279 }
280
281 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
282 {
283         if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu)))
284                 return;
285
286         kvm_mmu_destroy(vcpu);
287         vcpu_put(vcpu);
288         kvm_arch_ops->vcpu_free(vcpu);
289 }
290
291 static void kvm_free_vcpus(struct kvm *kvm)
292 {
293         unsigned int i;
294
295         for (i = 0; i < KVM_MAX_VCPUS; ++i)
296                 kvm_free_vcpu(&kvm->vcpus[i]);
297 }
298
299 static int kvm_dev_release(struct inode *inode, struct file *filp)
300 {
301         struct kvm *kvm = filp->private_data;
302
303         spin_lock(&kvm_lock);
304         list_del(&kvm->vm_list);
305         spin_unlock(&kvm_lock);
306         kvm_free_vcpus(kvm);
307         kvm_free_physmem(kvm);
308         kfree(kvm);
309         return 0;
310 }
311
312 static void inject_gp(struct kvm_vcpu *vcpu)
313 {
314         kvm_arch_ops->inject_gp(vcpu, 0);
315 }
316
317 /*
318  * Load the PAE pdptrs.  Return true if they are all valid.
319  */
320 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
321 {
322         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
323         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
324         int i;
325         u64 pdpte;
326         u64 *pdpt;
327         int ret;
328         struct kvm_memory_slot *memslot;
329
330         spin_lock(&vcpu->kvm->lock);
331         memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
332         /* FIXME: !memslot - emulate? 0xff? */
333         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
334
335         ret = 1;
336         for (i = 0; i < 4; ++i) {
337                 pdpte = pdpt[offset + i];
338                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
339                         ret = 0;
340                         goto out;
341                 }
342         }
343
344         for (i = 0; i < 4; ++i)
345                 vcpu->pdptrs[i] = pdpt[offset + i];
346
347 out:
348         kunmap_atomic(pdpt, KM_USER0);
349         spin_unlock(&vcpu->kvm->lock);
350
351         return ret;
352 }
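
/*
 * The offset arithmetic above deserves a worked example: in PAE mode the
 * PDPT is a 32-byte, 32-byte-aligned table selected by cr3 bits 5-11.
 * With, say, cr3 = 0x12345060:
 *
 *     pdpt_gfn = 0x12345060 >> PAGE_SHIFT   = 0x12345
 *     offset   = ((0x060) >> 5) << 2        = 12        (u64 index)
 *
 * so the four PDPTEs are pdpt[12]..pdpt[15], i.e. bytes 0x60-0x7f of the
 * mapped page.  Each entry must either be non-present or have all bits in
 * the reserved mask 0xfffffff0000001e6 clear.
 */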
353
354 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
355 {
356         if (cr0 & CR0_RESEVED_BITS) {
357                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
358                        cr0, vcpu->cr0);
359                 inject_gp(vcpu);
360                 return;
361         }
362
363         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
364                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
365                 inject_gp(vcpu);
366                 return;
367         }
368
369         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
370                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
371                        "and a clear PE flag\n");
372                 inject_gp(vcpu);
373                 return;
374         }
375
376         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
377 #ifdef CONFIG_X86_64
378                 if ((vcpu->shadow_efer & EFER_LME)) {
379                         int cs_db, cs_l;
380
381                         if (!is_pae(vcpu)) {
382                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
383                                        "in long mode while PAE is disabled\n");
384                                 inject_gp(vcpu);
385                                 return;
386                         }
387                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
388                         if (cs_l) {
389                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
390                                        "in long mode while CS.L == 1\n");
391                                 inject_gp(vcpu);
392                                 return;
393
394                         }
395                 } else
396 #endif
397                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
398                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
399                                "reserved bits\n");
400                         inject_gp(vcpu);
401                         return;
402                 }
403
404         }
405
406         kvm_arch_ops->set_cr0(vcpu, cr0);
407         vcpu->cr0 = cr0;
408
409         spin_lock(&vcpu->kvm->lock);
410         kvm_mmu_reset_context(vcpu);
411         spin_unlock(&vcpu->kvm->lock);
412         return;
413 }
414 EXPORT_SYMBOL_GPL(set_cr0);
415
416 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
417 {
418         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
419         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
420 }
421 EXPORT_SYMBOL_GPL(lmsw);
422
423 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
424 {
425         if (cr4 & CR4_RESEVED_BITS) {
426                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
427                 inject_gp(vcpu);
428                 return;
429         }
430
431         if (is_long_mode(vcpu)) {
432                 if (!(cr4 & CR4_PAE_MASK)) {
433                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
434                                "in long mode\n");
435                         inject_gp(vcpu);
436                         return;
437                 }
438         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
439                    && !load_pdptrs(vcpu, vcpu->cr3)) {
440                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
441                 inject_gp(vcpu);
442         }
443
444         if (cr4 & CR4_VMXE_MASK) {
445                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
446                 inject_gp(vcpu);
447                 return;
448         }
449         kvm_arch_ops->set_cr4(vcpu, cr4);
450         spin_lock(&vcpu->kvm->lock);
451         kvm_mmu_reset_context(vcpu);
452         spin_unlock(&vcpu->kvm->lock);
453 }
454 EXPORT_SYMBOL_GPL(set_cr4);
455
456 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
457 {
458         if (is_long_mode(vcpu)) {
459                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
460                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
461                         inject_gp(vcpu);
462                         return;
463                 }
464         } else {
465                 if (cr3 & CR3_RESEVED_BITS) {
466                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
467                         inject_gp(vcpu);
468                         return;
469                 }
470                 if (is_paging(vcpu) && is_pae(vcpu) &&
471                     !load_pdptrs(vcpu, cr3)) {
472                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
473                                "reserved bits\n");
474                         inject_gp(vcpu);
475                         return;
476                 }
477         }
478
479         vcpu->cr3 = cr3;
480         spin_lock(&vcpu->kvm->lock);
481         /*
482          * Does the new cr3 value map to physical memory? (Note, we
483          * catch an invalid cr3 even in real-mode, because it would
484          * cause trouble later on when we turn on paging anyway.)
485          *
486          * A real CPU would silently accept an invalid cr3 and would
487          * attempt to use it - with largely undefined (and often hard
488          * to debug) behavior on the guest side.
489          */
490         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
491                 inject_gp(vcpu);
492         else
493                 vcpu->mmu.new_cr3(vcpu);
494         spin_unlock(&vcpu->kvm->lock);
495 }
496 EXPORT_SYMBOL_GPL(set_cr3);
497
498 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
499 {
500         if (cr8 & CR8_RESEVED_BITS) {
501                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
502                 inject_gp(vcpu);
503                 return;
504         }
505         vcpu->cr8 = cr8;
506 }
507 EXPORT_SYMBOL_GPL(set_cr8);
508
509 void fx_init(struct kvm_vcpu *vcpu)
510 {
511         struct __attribute__ ((__packed__)) fx_image_s {
512                 u16 control;    /* fcw */
513                 u16 status;     /* fsw */
514                 u16 tag;        /* ftw */
515                 u16 opcode;     /* fop */
516                 u64 ip;         /* fpu ip */
517                 u64 operand;    /* fpu dp */
518                 u32 mxcsr;
519                 u32 mxcsr_mask;
520
521         } *fx_image;
522
523         fx_save(vcpu->host_fx_image);
524         fpu_init();
525         fx_save(vcpu->guest_fx_image);
526         fx_restore(vcpu->host_fx_image);
527
528         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
529         fx_image->mxcsr = 0x1f80;
530         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
531                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
532 }
533 EXPORT_SYMBOL_GPL(fx_init);
534
535 /*
536  * Creates some virtual cpus.  Good luck creating more than one.
537  */
538 static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
539 {
540         int r;
541         struct kvm_vcpu *vcpu;
542
543         r = -EINVAL;
544         if (!valid_vcpu(n))
545                 goto out;
546
547         vcpu = &kvm->vcpus[n];
548
549         mutex_lock(&vcpu->mutex);
550
551         if (vcpu->vmcs) {
552                 mutex_unlock(&vcpu->mutex);
553                 return -EEXIST;
554         }
555
556         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
557                                            FX_IMAGE_ALIGN);
558         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
559
560         r = kvm_arch_ops->vcpu_create(vcpu);
561         if (r < 0)
562                 goto out_free_vcpus;
563
564         r = kvm_mmu_create(vcpu);
565         if (r < 0)
566                 goto out_free_vcpus;
567
568         kvm_arch_ops->vcpu_load(vcpu);
569         r = kvm_mmu_setup(vcpu);
570         if (r >= 0)
571                 r = kvm_arch_ops->vcpu_setup(vcpu);
572         vcpu_put(vcpu);
573
574         if (r < 0)
575                 goto out_free_vcpus;
576
577         return 0;
578
579 out_free_vcpus:
580         kvm_free_vcpu(vcpu);
581         mutex_unlock(&vcpu->mutex);
582 out:
583         return r;
584 }
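
/*
 * From userspace this is reached through the KVM_CREATE_VCPU ioctl on the
 * VM file descriptor, with the vcpu slot number as the argument (sketch
 * only, assuming the ioctl definition from this era's <linux/kvm.h>):
 *
 *     if (ioctl(vm_fd, KVM_CREATE_VCPU, 0) < 0)
 *             perror("KVM_CREATE_VCPU");    // -EEXIST if already created
 */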
585
586 /*
587  * Allocate some memory and give it an address in the guest physical address
588  * space.
589  *
590  * Discontiguous memory is allowed, mostly for framebuffers.
591  */
592 static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
593                                            struct kvm_memory_region *mem)
594 {
595         int r;
596         gfn_t base_gfn;
597         unsigned long npages;
598         unsigned long i;
599         struct kvm_memory_slot *memslot;
600         struct kvm_memory_slot old, new;
601         int memory_config_version;
602
603         r = -EINVAL;
604         /* General sanity checks */
605         if (mem->memory_size & (PAGE_SIZE - 1))
606                 goto out;
607         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
608                 goto out;
609         if (mem->slot >= KVM_MEMORY_SLOTS)
610                 goto out;
611         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
612                 goto out;
613
614         memslot = &kvm->memslots[mem->slot];
615         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
616         npages = mem->memory_size >> PAGE_SHIFT;
617
618         if (!npages)
619                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
620
621 raced:
622         spin_lock(&kvm->lock);
623
624         memory_config_version = kvm->memory_config_version;
625         new = old = *memslot;
626
627         new.base_gfn = base_gfn;
628         new.npages = npages;
629         new.flags = mem->flags;
630
631         /* Disallow changing a memory slot's size. */
632         r = -EINVAL;
633         if (npages && old.npages && npages != old.npages)
634                 goto out_unlock;
635
636         /* Check for overlaps */
637         r = -EEXIST;
638         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
639                 struct kvm_memory_slot *s = &kvm->memslots[i];
640
641                 if (s == memslot)
642                         continue;
643                 if (!((base_gfn + npages <= s->base_gfn) ||
644                       (base_gfn >= s->base_gfn + s->npages)))
645                         goto out_unlock;
646         }
647         /*
648          * Do memory allocations outside lock.  memory_config_version will
649          * detect any races.
650          */
651         spin_unlock(&kvm->lock);
652
653         /* Deallocate if slot is being removed */
654         if (!npages)
655                 new.phys_mem = NULL;
656
657         /* Free page dirty bitmap if unneeded */
658         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
659                 new.dirty_bitmap = NULL;
660
661         r = -ENOMEM;
662
663         /* Allocate if a slot is being created */
664         if (npages && !new.phys_mem) {
665                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
666
667                 if (!new.phys_mem)
668                         goto out_free;
669
670                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
671                 for (i = 0; i < npages; ++i) {
672                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
673                                                      | __GFP_ZERO);
674                         if (!new.phys_mem[i])
675                                 goto out_free;
676                         new.phys_mem[i]->private = 0;
677                 }
678         }
679
680         /* Allocate page dirty bitmap if needed */
681         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
682                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
683
684                 new.dirty_bitmap = vmalloc(dirty_bytes);
685                 if (!new.dirty_bitmap)
686                         goto out_free;
687                 memset(new.dirty_bitmap, 0, dirty_bytes);
688         }
689
690         spin_lock(&kvm->lock);
691
692         if (memory_config_version != kvm->memory_config_version) {
693                 spin_unlock(&kvm->lock);
694                 kvm_free_physmem_slot(&new, &old);
695                 goto raced;
696         }
697
698         r = -EAGAIN;
699         if (kvm->busy)
700                 goto out_unlock;
701
702         if (mem->slot >= kvm->nmemslots)
703                 kvm->nmemslots = mem->slot + 1;
704
705         *memslot = new;
706         ++kvm->memory_config_version;
707
708         spin_unlock(&kvm->lock);
709
710         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
711                 struct kvm_vcpu *vcpu;
712
713                 vcpu = vcpu_load(kvm, i);
714                 if (!vcpu)
715                         continue;
716                 kvm_mmu_reset_context(vcpu);
717                 vcpu_put(vcpu);
718         }
719
720         kvm_free_physmem_slot(&old, &new);
721         return 0;
722
723 out_unlock:
724         spin_unlock(&kvm->lock);
725 out_free:
726         kvm_free_physmem_slot(&new, &old);
727 out:
728         return r;
729 }
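
/*
 * A hypothetical userspace sketch of populating a slot (assumes the
 * KVM_SET_MEMORY_REGION ioctl and struct kvm_memory_region layout from
 * this era's <linux/kvm.h>; the field names below match the ones used in
 * the handler above):
 *
 *     struct kvm_memory_region mem = {
 *             .slot            = 0,
 *             .flags           = KVM_MEM_LOG_DIRTY_PAGES,  // optional
 *             .guest_phys_addr = 0,
 *             .memory_size     = 16 << 20,                 // 16 MB
 *     };
 *     if (ioctl(vm_fd, KVM_SET_MEMORY_REGION, &mem) < 0)
 *             perror("KVM_SET_MEMORY_REGION");
 */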
730
731 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
732 {
733         spin_lock(&vcpu->kvm->lock);
734         kvm_mmu_slot_remove_write_access(vcpu, slot);
735         spin_unlock(&vcpu->kvm->lock);
736 }
737
738 /*
739  * Get (and clear) the dirty memory log for a memory slot.
740  */
741 static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
742                                        struct kvm_dirty_log *log)
743 {
744         struct kvm_memory_slot *memslot;
745         int r, i;
746         int n;
747         int cleared;
748         unsigned long any = 0;
749
750         spin_lock(&kvm->lock);
751
752         /*
753          * Prevent changes to guest memory configuration even while the lock
754          * is not taken.
755          */
756         ++kvm->busy;
757         spin_unlock(&kvm->lock);
758         r = -EINVAL;
759         if (log->slot >= KVM_MEMORY_SLOTS)
760                 goto out;
761
762         memslot = &kvm->memslots[log->slot];
763         r = -ENOENT;
764         if (!memslot->dirty_bitmap)
765                 goto out;
766
767         n = ALIGN(memslot->npages, 8) / 8;
768
769         for (i = 0; !any && i < n; ++i)
770                 any = memslot->dirty_bitmap[i];
771
772         r = -EFAULT;
773         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
774                 goto out;
775
776
777         if (any) {
778                 cleared = 0;
779                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
780                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
781
782                         if (!vcpu)
783                                 continue;
784                         if (!cleared) {
785                                 do_remove_write_access(vcpu, log->slot);
786                                 memset(memslot->dirty_bitmap, 0, n);
787                                 cleared = 1;
788                         }
789                         kvm_arch_ops->tlb_flush(vcpu);
790                         vcpu_put(vcpu);
791                 }
792         }
793
794         r = 0;
795
796 out:
797         spin_lock(&kvm->lock);
798         --kvm->busy;
799         spin_unlock(&kvm->lock);
800         return r;
801 }
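
/*
 * Userspace retrieves (and implicitly clears) the log with the
 * KVM_GET_DIRTY_LOG ioctl; the bitmap buffer must hold one bit per page
 * in the slot (sketch only, struct layout assumed from this era's
 * <linux/kvm.h>):
 *
 *     unsigned long bitmap[(npages + 63) / 64];   // one bit per page
 *     struct kvm_dirty_log log = {
 *             .slot         = 0,
 *             .dirty_bitmap = bitmap,
 *     };
 *     ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */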
802
803 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
804 {
805         int i;
806
807         for (i = 0; i < kvm->nmemslots; ++i) {
808                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
809
810                 if (gfn >= memslot->base_gfn
811                     && gfn < memslot->base_gfn + memslot->npages)
812                         return memslot;
813         }
814         return NULL;
815 }
816 EXPORT_SYMBOL_GPL(gfn_to_memslot);
817
818 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
819 {
820         int i;
821         struct kvm_memory_slot *memslot = NULL;
822         unsigned long rel_gfn;
823
824         for (i = 0; i < kvm->nmemslots; ++i) {
825                 memslot = &kvm->memslots[i];
826
827                 if (gfn >= memslot->base_gfn
828                     && gfn < memslot->base_gfn + memslot->npages) {
829
830                         if (!memslot || !memslot->dirty_bitmap)
831                                 return;
832
833                         rel_gfn = gfn - memslot->base_gfn;
834
835                         /* avoid RMW */
836                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
837                                 set_bit(rel_gfn, memslot->dirty_bitmap);
838                         return;
839                 }
840         }
841 }
842
843 static int emulator_read_std(unsigned long addr,
844                              unsigned long *val,
845                              unsigned int bytes,
846                              struct x86_emulate_ctxt *ctxt)
847 {
848         struct kvm_vcpu *vcpu = ctxt->vcpu;
849         void *data = val;
850
851         while (bytes) {
852                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
853                 unsigned offset = addr & (PAGE_SIZE-1);
854                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
855                 unsigned long pfn;
856                 struct kvm_memory_slot *memslot;
857                 void *page;
858
859                 if (gpa == UNMAPPED_GVA)
860                         return X86EMUL_PROPAGATE_FAULT;
861                 pfn = gpa >> PAGE_SHIFT;
862                 memslot = gfn_to_memslot(vcpu->kvm, pfn);
863                 if (!memslot)
864                         return X86EMUL_UNHANDLEABLE;
865                 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
866
867                 memcpy(data, page + offset, tocopy);
868
869                 kunmap_atomic(page, KM_USER0);
870
871                 bytes -= tocopy;
872                 data += tocopy;
873                 addr += tocopy;
874         }
875
876         return X86EMUL_CONTINUE;
877 }
878
879 static int emulator_write_std(unsigned long addr,
880                               unsigned long val,
881                               unsigned int bytes,
882                               struct x86_emulate_ctxt *ctxt)
883 {
884         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
885                addr, bytes);
886         return X86EMUL_UNHANDLEABLE;
887 }
888
889 static int emulator_read_emulated(unsigned long addr,
890                                   unsigned long *val,
891                                   unsigned int bytes,
892                                   struct x86_emulate_ctxt *ctxt)
893 {
894         struct kvm_vcpu *vcpu = ctxt->vcpu;
895
896         if (vcpu->mmio_read_completed) {
897                 memcpy(val, vcpu->mmio_data, bytes);
898                 vcpu->mmio_read_completed = 0;
899                 return X86EMUL_CONTINUE;
900         } else if (emulator_read_std(addr, val, bytes, ctxt)
901                    == X86EMUL_CONTINUE)
902                 return X86EMUL_CONTINUE;
903         else {
904                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
905                 if (gpa == UNMAPPED_GVA)
906                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
907                 vcpu->mmio_needed = 1;
908                 vcpu->mmio_phys_addr = gpa;
909                 vcpu->mmio_size = bytes;
910                 vcpu->mmio_is_write = 0;
911
912                 return X86EMUL_UNHANDLEABLE;
913         }
914 }
915
916 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
917                                unsigned long val, int bytes)
918 {
919         struct kvm_memory_slot *m;
920         struct page *page;
921         void *virt;
922
923         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
924                 return 0;
925         m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
926         if (!m)
927                 return 0;
928         page = gfn_to_page(m, gpa >> PAGE_SHIFT);
929         kvm_mmu_pre_write(vcpu, gpa, bytes);
930         virt = kmap_atomic(page, KM_USER0);
931         memcpy(virt + offset_in_page(gpa), &val, bytes);
932         kunmap_atomic(virt, KM_USER0);
933         kvm_mmu_post_write(vcpu, gpa, bytes);
934         return 1;
935 }
936
937 static int emulator_write_emulated(unsigned long addr,
938                                    unsigned long val,
939                                    unsigned int bytes,
940                                    struct x86_emulate_ctxt *ctxt)
941 {
942         struct kvm_vcpu *vcpu = ctxt->vcpu;
943         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
944
945         if (gpa == UNMAPPED_GVA)
946                 return X86EMUL_PROPAGATE_FAULT;
947
948         if (emulator_write_phys(vcpu, gpa, val, bytes))
949                 return X86EMUL_CONTINUE;
950
951         vcpu->mmio_needed = 1;
952         vcpu->mmio_phys_addr = gpa;
953         vcpu->mmio_size = bytes;
954         vcpu->mmio_is_write = 1;
955         memcpy(vcpu->mmio_data, &val, bytes);
956
957         return X86EMUL_CONTINUE;
958 }
959
960 static int emulator_cmpxchg_emulated(unsigned long addr,
961                                      unsigned long old,
962                                      unsigned long new,
963                                      unsigned int bytes,
964                                      struct x86_emulate_ctxt *ctxt)
965 {
966         static int reported;
967
968         if (!reported) {
969                 reported = 1;
970                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
971         }
972         return emulator_write_emulated(addr, new, bytes, ctxt);
973 }
974
975 #ifdef CONFIG_X86_32
976
977 static int emulator_cmpxchg8b_emulated(unsigned long addr,
978                                        unsigned long old_lo,
979                                        unsigned long old_hi,
980                                        unsigned long new_lo,
981                                        unsigned long new_hi,
982                                        struct x86_emulate_ctxt *ctxt)
983 {
984         static int reported;
985         int r;
986
987         if (!reported) {
988                 reported = 1;
989                 printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
990         }
991         r = emulator_write_emulated(addr, new_lo, 4, ctxt);
992         if (r != X86EMUL_CONTINUE)
993                 return r;
994         return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
995 }
996
997 #endif
998
999 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1000 {
1001         return kvm_arch_ops->get_segment_base(vcpu, seg);
1002 }
1003
1004 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1005 {
1006         return X86EMUL_CONTINUE;
1007 }
1008
1009 int emulate_clts(struct kvm_vcpu *vcpu)
1010 {
1011         unsigned long cr0;
1012
1013         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1014         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1015         kvm_arch_ops->set_cr0(vcpu, cr0);
1016         return X86EMUL_CONTINUE;
1017 }
1018
1019 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1020 {
1021         struct kvm_vcpu *vcpu = ctxt->vcpu;
1022
1023         switch (dr) {
1024         case 0 ... 3:
1025                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1026                 return X86EMUL_CONTINUE;
1027         default:
1028                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1029                        __FUNCTION__, dr);
1030                 return X86EMUL_UNHANDLEABLE;
1031         }
1032 }
1033
1034 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1035 {
1036         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1037         int exception;
1038
1039         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1040         if (exception) {
1041                 /* FIXME: better handling */
1042                 return X86EMUL_UNHANDLEABLE;
1043         }
1044         return X86EMUL_CONTINUE;
1045 }
1046
1047 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1048 {
1049         static int reported;
1050         u8 opcodes[4];
1051         unsigned long rip = ctxt->vcpu->rip;
1052         unsigned long rip_linear;
1053
1054         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1055
1056         if (reported)
1057                 return;
1058
1059         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1060
1061         printk(KERN_ERR "emulation failed but !mmio_needed?"
1062                " rip %lx %02x %02x %02x %02x\n",
1063                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1064         reported = 1;
1065 }
1066
1067 struct x86_emulate_ops emulate_ops = {
1068         .read_std            = emulator_read_std,
1069         .write_std           = emulator_write_std,
1070         .read_emulated       = emulator_read_emulated,
1071         .write_emulated      = emulator_write_emulated,
1072         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1073 #ifdef CONFIG_X86_32
1074         .cmpxchg8b_emulated  = emulator_cmpxchg8b_emulated,
1075 #endif
1076 };
1077
1078 int emulate_instruction(struct kvm_vcpu *vcpu,
1079                         struct kvm_run *run,
1080                         unsigned long cr2,
1081                         u16 error_code)
1082 {
1083         struct x86_emulate_ctxt emulate_ctxt;
1084         int r;
1085         int cs_db, cs_l;
1086
1087         kvm_arch_ops->cache_regs(vcpu);
1088
1089         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1090
1091         emulate_ctxt.vcpu = vcpu;
1092         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1093         emulate_ctxt.cr2 = cr2;
1094         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1095                 ? X86EMUL_MODE_REAL : cs_l
1096                 ? X86EMUL_MODE_PROT64 : cs_db
1097                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1098
1099         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1100                 emulate_ctxt.cs_base = 0;
1101                 emulate_ctxt.ds_base = 0;
1102                 emulate_ctxt.es_base = 0;
1103                 emulate_ctxt.ss_base = 0;
1104         } else {
1105                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1106                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1107                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1108                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1109         }
1110
1111         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1112         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1113
1114         vcpu->mmio_is_write = 0;
1115         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1116
1117         if ((r || vcpu->mmio_is_write) && run) {
1118                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1119                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1120                 run->mmio.len = vcpu->mmio_size;
1121                 run->mmio.is_write = vcpu->mmio_is_write;
1122         }
1123
1124         if (r) {
1125                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1126                         return EMULATE_DONE;
1127                 if (!vcpu->mmio_needed) {
1128                         report_emulation_failure(&emulate_ctxt);
1129                         return EMULATE_FAIL;
1130                 }
1131                 return EMULATE_DO_MMIO;
1132         }
1133
1134         kvm_arch_ops->decache_regs(vcpu);
1135         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1136
1137         if (vcpu->mmio_is_write)
1138                 return EMULATE_DO_MMIO;
1139
1140         return EMULATE_DONE;
1141 }
1142 EXPORT_SYMBOL_GPL(emulate_instruction);
1143
1144 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1145 {
1146         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1147 }
1148
1149 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1150 {
1151         struct descriptor_table dt = { limit, base };
1152
1153         kvm_arch_ops->set_gdt(vcpu, &dt);
1154 }
1155
1156 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1157 {
1158         struct descriptor_table dt = { limit, base };
1159
1160         kvm_arch_ops->set_idt(vcpu, &dt);
1161 }
1162
1163 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1164                    unsigned long *rflags)
1165 {
1166         lmsw(vcpu, msw);
1167         *rflags = kvm_arch_ops->get_rflags(vcpu);
1168 }
1169
1170 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1171 {
1172         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1173         switch (cr) {
1174         case 0:
1175                 return vcpu->cr0;
1176         case 2:
1177                 return vcpu->cr2;
1178         case 3:
1179                 return vcpu->cr3;
1180         case 4:
1181                 return vcpu->cr4;
1182         default:
1183                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1184                 return 0;
1185         }
1186 }
1187
1188 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1189                      unsigned long *rflags)
1190 {
1191         switch (cr) {
1192         case 0:
1193                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1194                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1195                 break;
1196         case 2:
1197                 vcpu->cr2 = val;
1198                 break;
1199         case 3:
1200                 set_cr3(vcpu, val);
1201                 break;
1202         case 4:
1203                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1204                 break;
1205         default:
1206                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1207         }
1208 }
1209
1210 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1211 {
1212         u64 data;
1213
1214         switch (msr) {
1215         case 0xc0010010: /* SYSCFG */
1216         case 0xc0010015: /* HWCR */
1217         case MSR_IA32_PLATFORM_ID:
1218         case MSR_IA32_P5_MC_ADDR:
1219         case MSR_IA32_P5_MC_TYPE:
1220         case MSR_IA32_MC0_CTL:
1221         case MSR_IA32_MCG_STATUS:
1222         case MSR_IA32_MCG_CAP:
1223         case MSR_IA32_MC0_MISC:
1224         case MSR_IA32_MC0_MISC+4:
1225         case MSR_IA32_MC0_MISC+8:
1226         case MSR_IA32_MC0_MISC+12:
1227         case MSR_IA32_MC0_MISC+16:
1228         case MSR_IA32_UCODE_REV:
1229         case MSR_IA32_PERF_STATUS:
1230                 /* MTRR registers */
1231         case 0xfe:
1232         case 0x200 ... 0x2ff:
1233                 data = 0;
1234                 break;
1235         case 0xcd: /* fsb frequency */
1236                 data = 3;
1237                 break;
1238         case MSR_IA32_APICBASE:
1239                 data = vcpu->apic_base;
1240                 break;
1241         case MSR_IA32_MISC_ENABLE:
1242                 data = vcpu->ia32_misc_enable_msr;
1243                 break;
1244 #ifdef CONFIG_X86_64
1245         case MSR_EFER:
1246                 data = vcpu->shadow_efer;
1247                 break;
1248 #endif
1249         default:
1250                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1251                 return 1;
1252         }
1253         *pdata = data;
1254         return 0;
1255 }
1256 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1257
1258 /*
1259  * Reads an msr value (of 'msr_index') into 'pdata'.
1260  * Returns 0 on success, non-0 otherwise.
1261  * Assumes vcpu_load() was already called.
1262  */
1263 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1264 {
1265         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1266 }
1267
1268 #ifdef CONFIG_X86_64
1269
1270 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1271 {
1272         if (efer & EFER_RESERVED_BITS) {
1273                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1274                        efer);
1275                 inject_gp(vcpu);
1276                 return;
1277         }
1278
1279         if (is_paging(vcpu)
1280             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1281                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1282                 inject_gp(vcpu);
1283                 return;
1284         }
1285
1286         kvm_arch_ops->set_efer(vcpu, efer);
1287
1288         efer &= ~EFER_LMA;
1289         efer |= vcpu->shadow_efer & EFER_LMA;
1290
1291         vcpu->shadow_efer = efer;
1292 }
1293
1294 #endif
1295
1296 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1297 {
1298         switch (msr) {
1299 #ifdef CONFIG_X86_64
1300         case MSR_EFER:
1301                 set_efer(vcpu, data);
1302                 break;
1303 #endif
1304         case MSR_IA32_MC0_STATUS:
1305                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1306                        __FUNCTION__, data);
1307                 break;
1308         case MSR_IA32_UCODE_REV:
1309         case MSR_IA32_UCODE_WRITE:
1310         case 0x200 ... 0x2ff: /* MTRRs */
1311                 break;
1312         case MSR_IA32_APICBASE:
1313                 vcpu->apic_base = data;
1314                 break;
1315         case MSR_IA32_MISC_ENABLE:
1316                 vcpu->ia32_misc_enable_msr = data;
1317                 break;
1318         default:
1319                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1320                 return 1;
1321         }
1322         return 0;
1323 }
1324 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1325
1326 /*
1327  * Writes an MSR value into the appropriate "register".
1328  * Returns 0 on success, non-0 otherwise.
1329  * Assumes vcpu_load() was already called.
1330  */
1331 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1332 {
1333         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1334 }
1335
1336 void kvm_resched(struct kvm_vcpu *vcpu)
1337 {
1338         vcpu_put(vcpu);
1339         cond_resched();
1340         /* Cannot fail - no vcpu unplug yet. */
1341         vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1342 }
1343 EXPORT_SYMBOL_GPL(kvm_resched);
1344
1345 void load_msrs(struct vmx_msr_entry *e, int n)
1346 {
1347         int i;
1348
1349         for (i = 0; i < n; ++i)
1350                 wrmsrl(e[i].index, e[i].data);
1351 }
1352 EXPORT_SYMBOL_GPL(load_msrs);
1353
1354 void save_msrs(struct vmx_msr_entry *e, int n)
1355 {
1356         int i;
1357
1358         for (i = 0; i < n; ++i)
1359                 rdmsrl(e[i].index, e[i].data);
1360 }
1361 EXPORT_SYMBOL_GPL(save_msrs);
1362
1363 static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1364 {
1365         struct kvm_vcpu *vcpu;
1366         int r;
1367
1368         if (!valid_vcpu(kvm_run->vcpu))
1369                 return -EINVAL;
1370
1371         vcpu = vcpu_load(kvm, kvm_run->vcpu);
1372         if (!vcpu)
1373                 return -ENOENT;
1374
1375         /* re-sync apic's tpr */
1376         vcpu->cr8 = kvm_run->cr8;
1377
1378         if (kvm_run->emulated) {
1379                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1380                 kvm_run->emulated = 0;
1381         }
1382
1383         if (kvm_run->mmio_completed) {
1384                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1385                 vcpu->mmio_read_completed = 1;
1386         }
1387
1388         vcpu->mmio_needed = 0;
1389
1390         r = kvm_arch_ops->run(vcpu, kvm_run);
1391
1392         vcpu_put(vcpu);
1393         return r;
1394 }
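
/*
 * The mmio_completed/emulated handshake above implies a userspace loop
 * roughly like the following (a sketch, not a complete monitor; it uses
 * the struct kvm_run fields touched in this function plus an
 * exit_reason/KVM_EXIT_MMIO pair assumed from this era's <linux/kvm.h>,
 * and the handle_mmio_* helpers are placeholders):
 *
 *     struct kvm_run run = { .vcpu = 0 };
 *     for (;;) {
 *             ioctl(vm_fd, KVM_RUN, &run);
 *             if (run.exit_reason == KVM_EXIT_MMIO) {
 *                     if (run.mmio.is_write)
 *                             handle_mmio_write(run.mmio.phys_addr,
 *                                               run.mmio.data,
 *                                               run.mmio.len);
 *                     else {
 *                             handle_mmio_read(run.mmio.phys_addr,
 *                                              run.mmio.data,
 *                                              run.mmio.len);
 *                             run.mmio_completed = 1;  // feed data back
 *                     }
 *             }
 *             // ... other exit reasons elided ...
 *     }
 */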
1395
1396 static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1397 {
1398         struct kvm_vcpu *vcpu;
1399
1400         if (!valid_vcpu(regs->vcpu))
1401                 return -EINVAL;
1402
1403         vcpu = vcpu_load(kvm, regs->vcpu);
1404         if (!vcpu)
1405                 return -ENOENT;
1406
1407         kvm_arch_ops->cache_regs(vcpu);
1408
1409         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1410         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1411         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1412         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1413         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1414         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1415         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1416         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1417 #ifdef CONFIG_X86_64
1418         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1419         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1420         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1421         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1422         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1423         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1424         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1425         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1426 #endif
1427
1428         regs->rip = vcpu->rip;
1429         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1430
1431         /*
1432          * Don't leak debug flags in case they were set for guest debugging
1433          */
1434         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1435                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1436
1437         vcpu_put(vcpu);
1438
1439         return 0;
1440 }
1441
1442 static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1443 {
1444         struct kvm_vcpu *vcpu;
1445
1446         if (!valid_vcpu(regs->vcpu))
1447                 return -EINVAL;
1448
1449         vcpu = vcpu_load(kvm, regs->vcpu);
1450         if (!vcpu)
1451                 return -ENOENT;
1452
1453         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1454         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1455         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1456         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1457         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1458         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1459         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1460         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1461 #ifdef CONFIG_X86_64
1462         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1463         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1464         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1465         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1466         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1467         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1468         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1469         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1470 #endif
1471
1472         vcpu->rip = regs->rip;
1473         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1474
1475         kvm_arch_ops->decache_regs(vcpu);
1476
1477         vcpu_put(vcpu);
1478
1479         return 0;
1480 }
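
/*
 * KVM_GET_REGS/KVM_SET_REGS give userspace a read-modify-write view of
 * the general purpose register file; the vcpu is selected by the vcpu
 * field inside the structure (sketch, layout assumed from this era's
 * <linux/kvm.h>):
 *
 *     struct kvm_regs regs = { .vcpu = 0 };
 *     ioctl(vm_fd, KVM_GET_REGS, &regs);
 *     regs.rip = 0x7c00;                  // e.g. point at a boot sector
 *     ioctl(vm_fd, KVM_SET_REGS, &regs);
 */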
1481
1482 static void get_segment(struct kvm_vcpu *vcpu,
1483                         struct kvm_segment *var, int seg)
1484 {
1485         return kvm_arch_ops->get_segment(vcpu, var, seg);
1486 }
1487
1488 static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1489 {
1490         struct kvm_vcpu *vcpu;
1491         struct descriptor_table dt;
1492
1493         if (!valid_vcpu(sregs->vcpu))
1494                 return -EINVAL;
1495         vcpu = vcpu_load(kvm, sregs->vcpu);
1496         if (!vcpu)
1497                 return -ENOENT;
1498
1499         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1500         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1501         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1502         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1503         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1504         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1505
1506         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1507         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1508
1509         kvm_arch_ops->get_idt(vcpu, &dt);
1510         sregs->idt.limit = dt.limit;
1511         sregs->idt.base = dt.base;
1512         kvm_arch_ops->get_gdt(vcpu, &dt);
1513         sregs->gdt.limit = dt.limit;
1514         sregs->gdt.base = dt.base;
1515
1516         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1517         sregs->cr0 = vcpu->cr0;
1518         sregs->cr2 = vcpu->cr2;
1519         sregs->cr3 = vcpu->cr3;
1520         sregs->cr4 = vcpu->cr4;
1521         sregs->cr8 = vcpu->cr8;
1522         sregs->efer = vcpu->shadow_efer;
1523         sregs->apic_base = vcpu->apic_base;
1524
1525         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1526                sizeof sregs->interrupt_bitmap);
1527
1528         vcpu_put(vcpu);
1529
1530         return 0;
1531 }
1532
1533 static void set_segment(struct kvm_vcpu *vcpu,
1534                         struct kvm_segment *var, int seg)
1535 {
1536         return kvm_arch_ops->set_segment(vcpu, var, seg);
1537 }
1538
1539 static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1540 {
1541         struct kvm_vcpu *vcpu;
1542         int mmu_reset_needed = 0;
1543         int i;
1544         struct descriptor_table dt;
1545
1546         if (!valid_vcpu(sregs->vcpu))
1547                 return -EINVAL;
1548         vcpu = vcpu_load(kvm, sregs->vcpu);
1549         if (!vcpu)
1550                 return -ENOENT;
1551
1552         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1553         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1554         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1555         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1556         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1557         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1558
1559         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1560         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1561
1562         dt.limit = sregs->idt.limit;
1563         dt.base = sregs->idt.base;
1564         kvm_arch_ops->set_idt(vcpu, &dt);
1565         dt.limit = sregs->gdt.limit;
1566         dt.base = sregs->gdt.base;
1567         kvm_arch_ops->set_gdt(vcpu, &dt);
1568
1569         vcpu->cr2 = sregs->cr2;
1570         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1571         vcpu->cr3 = sregs->cr3;
1572
1573         vcpu->cr8 = sregs->cr8;
1574
1575         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1576 #ifdef CONFIG_X86_64
1577         kvm_arch_ops->set_efer(vcpu, sregs->efer);
1578 #endif
1579         vcpu->apic_base = sregs->apic_base;
1580
1581         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1582
1583         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1584         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1585
1586         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1587         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1588         if (!is_long_mode(vcpu) && is_pae(vcpu))
1589                 load_pdptrs(vcpu, vcpu->cr3);
1590
1591         if (mmu_reset_needed)
1592                 kvm_mmu_reset_context(vcpu);
1593
1594         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1595                sizeof vcpu->irq_pending);
1596         vcpu->irq_summary = 0;
1597         for (i = 0; i < NR_IRQ_WORDS; ++i)
1598                 if (vcpu->irq_pending[i])
1599                         __set_bit(i, &vcpu->irq_summary);
1600
1601         vcpu_put(vcpu);
1602
1603         return 0;
1604 }
1605
1606 /*
1607  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1608  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1609  *
1610  * This list is modified at module load time to reflect the
1611  * capabilities of the host cpu.
1612  */
1613 static u32 msrs_to_save[] = {
1614         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1615         MSR_K6_STAR,
1616 #ifdef CONFIG_X86_64
1617         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1618 #endif
1619         MSR_IA32_TIME_STAMP_COUNTER,
1620 };
1621
1622 static unsigned num_msrs_to_save;
1623
1624 static u32 emulated_msrs[] = {
1625         MSR_IA32_MISC_ENABLE,
1626 };
1627
1628 static __init void kvm_init_msr_list(void)
1629 {
1630         u32 dummy[2];
1631         unsigned i, j;
1632
1633         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1634                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1635                         continue;
1636                 if (j < i)
1637                         msrs_to_save[j] = msrs_to_save[i];
1638                 j++;
1639         }
1640         num_msrs_to_save = j;
1641 }
1642
1643 /*
1644  * Adapt set_msr() to msr_io()'s calling convention
1645  */
1646 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1647 {
1648         return set_msr(vcpu, index, *data);
1649 }
1650
1651 /*
1652  * Read or write a bunch of msrs. All parameters are kernel addresses.
1653  *
1654  * @return number of msrs set successfully.
1655  */
1656 static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1657                     struct kvm_msr_entry *entries,
1658                     int (*do_msr)(struct kvm_vcpu *vcpu,
1659                                   unsigned index, u64 *data))
1660 {
1661         struct kvm_vcpu *vcpu;
1662         int i;
1663
1664         if (!valid_vcpu(msrs->vcpu))
1665                 return -EINVAL;
1666
1667         vcpu = vcpu_load(kvm, msrs->vcpu);
1668         if (!vcpu)
1669                 return -ENOENT;
1670
1671         for (i = 0; i < msrs->nmsrs; ++i)
1672                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1673                         break;
1674
1675         vcpu_put(vcpu);
1676
1677         return i;
1678 }
1679
1680 /*
1681  * Read or write a batch of MSRs. Parameters are user addresses.
1682  *
1683  * @return number of MSRs processed successfully, or a negative errno.
1684  */
1685 static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1686                   int (*do_msr)(struct kvm_vcpu *vcpu,
1687                                 unsigned index, u64 *data),
1688                   int writeback)
1689 {
1690         struct kvm_msrs msrs;
1691         struct kvm_msr_entry *entries;
1692         int r, n;
1693         unsigned size;
1694
1695         r = -EFAULT;
1696         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1697                 goto out;
1698
1699         r = -E2BIG;
1700         if (msrs.nmsrs >= MAX_IO_MSRS)
1701                 goto out;
1702
1703         r = -ENOMEM;
1704         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1705         entries = vmalloc(size);
1706         if (!entries)
1707                 goto out;
1708
1709         r = -EFAULT;
1710         if (copy_from_user(entries, user_msrs->entries, size))
1711                 goto out_free;
1712
1713         r = n = __msr_io(kvm, &msrs, entries, do_msr);
1714         if (r < 0)
1715                 goto out_free;
1716
1717         r = -EFAULT;
1718         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1719                 goto out_free;
1720
1721         r = n;
1722
1723 out_free:
1724         vfree(entries);
1725 out:
1726         return r;
1727 }
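
/*
 * Illustrative userspace sketch (not part of this file): reading one MSR
 * (0x10, the TSC, which is in msrs_to_save above) from vcpu 0 through
 * KVM_GET_MSRS, which funnels into msr_io()/__msr_io() above.  It assumes
 * vcpu 0 has already been created with KVM_CREATE_VCPU and that the struct
 * layout matches this era's <linux/kvm.h>.  The ioctl returns the number of
 * MSRs processed, so anything less than nmsrs indicates a partial failure.
 *
 *      #include <stdio.h>
 *      #include <stdlib.h>
 *      #include <fcntl.h>
 *      #include <sys/ioctl.h>
 *      #include <linux/kvm.h>
 *
 *      int main(void)
 *      {
 *              int fd = open("/dev/kvm", O_RDWR);
 *              struct kvm_msrs *msrs;
 *              int n;
 *
 *              if (fd < 0)
 *                      return 1;
 *              msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
 *              msrs->vcpu = 0;
 *              msrs->nmsrs = 1;
 *              msrs->entries[0].index = 0x10;
 *              n = ioctl(fd, KVM_GET_MSRS, msrs);
 *              if (n == 1)
 *                      printf("TSC: %llu\n",
 *                             (unsigned long long)msrs->entries[0].data);
 *              return 0;
 *      }
 */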
1728
1729 /*
1730  * Translate a guest virtual address to a guest physical address.
1731  */
1732 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1733 {
1734         unsigned long vaddr = tr->linear_address;
1735         struct kvm_vcpu *vcpu;
1736         gpa_t gpa;
1737
1738         vcpu = vcpu_load(kvm, tr->vcpu);
1739         if (!vcpu)
1740                 return -ENOENT;
1741         spin_lock(&kvm->lock);
1742         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1743         tr->physical_address = gpa;
1744         tr->valid = gpa != UNMAPPED_GVA;
1745         tr->writeable = 1;
1746         tr->usermode = 0;
1747         spin_unlock(&kvm->lock);
1748         vcpu_put(vcpu);
1749
1750         return 0;
1751 }
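
/*
 * Illustrative userspace sketch (not part of this file): using KVM_TRANSLATE
 * to run a guest virtual address through the handler above.  kvm_fd is
 * assumed to be an open /dev/kvm descriptor with vcpu 0 already created, the
 * sample address 0xc0100000 is arbitrary, and only the struct kvm_translation
 * fields touched by the handler are used.
 *
 *      struct kvm_translation tr = {
 *              .linear_address = 0xc0100000,
 *              .vcpu = 0,
 *      };
 *
 *      if (ioctl(kvm_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *              printf("gpa = 0x%llx\n",
 *                     (unsigned long long)tr.physical_address);
 */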
1752
1753 static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1754 {
1755         struct kvm_vcpu *vcpu;
1756
1757         if (!valid_vcpu(irq->vcpu))
1758                 return -EINVAL;
1759         if (irq->irq < 0 || irq->irq >= 256)
1760                 return -EINVAL;
1761         vcpu = vcpu_load(kvm, irq->vcpu);
1762         if (!vcpu)
1763                 return -ENOENT;
1764
1765         set_bit(irq->irq, vcpu->irq_pending);
1766         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1767
1768         vcpu_put(vcpu);
1769
1770         return 0;
1771 }
1772
1773 static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1774                                      struct kvm_debug_guest *dbg)
1775 {
1776         struct kvm_vcpu *vcpu;
1777         int r;
1778
1779         if (!valid_vcpu(dbg->vcpu))
1780                 return -EINVAL;
1781         vcpu = vcpu_load(kvm, dbg->vcpu);
1782         if (!vcpu)
1783                 return -ENOENT;
1784
1785         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1786
1787         vcpu_put(vcpu);
1788
1789         return r;
1790 }
1791
1792 static long kvm_dev_ioctl(struct file *filp,
1793                           unsigned int ioctl, unsigned long arg)
1794 {
1795         struct kvm *kvm = filp->private_data;
1796         void __user *argp = (void __user *)arg;
1797         int r = -EINVAL;
1798
1799         switch (ioctl) {
1800         case KVM_GET_API_VERSION:
1801                 r = KVM_API_VERSION;
1802                 break;
1803         case KVM_CREATE_VCPU: {
1804                 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1805                 if (r)
1806                         goto out;
1807                 break;
1808         }
1809         case KVM_RUN: {
1810                 struct kvm_run kvm_run;
1811
1812                 r = -EFAULT;
1813                 if (copy_from_user(&kvm_run, argp, sizeof kvm_run))
1814                         goto out;
1815                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1816                 if (r < 0 && r != -EINTR)
1817                         goto out;
1818                 if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) {
1819                         r = -EFAULT;
1820                         goto out;
1821                 }
1822                 break;
1823         }
1824         case KVM_GET_REGS: {
1825                 struct kvm_regs kvm_regs;
1826
1827                 r = -EFAULT;
1828                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1829                         goto out;
1830                 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1831                 if (r)
1832                         goto out;
1833                 r = -EFAULT;
1834                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
1835                         goto out;
1836                 r = 0;
1837                 break;
1838         }
1839         case KVM_SET_REGS: {
1840                 struct kvm_regs kvm_regs;
1841
1842                 r = -EFAULT;
1843                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1844                         goto out;
1845                 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1846                 if (r)
1847                         goto out;
1848                 r = 0;
1849                 break;
1850         }
1851         case KVM_GET_SREGS: {
1852                 struct kvm_sregs kvm_sregs;
1853
1854                 r = -EFAULT;
1855                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1856                         goto out;
1857                 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1858                 if (r)
1859                         goto out;
1860                 r = -EFAULT;
1861                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
1862                         goto out;
1863                 r = 0;
1864                 break;
1865         }
1866         case KVM_SET_SREGS: {
1867                 struct kvm_sregs kvm_sregs;
1868
1869                 r = -EFAULT;
1870                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1871                         goto out;
1872                 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1873                 if (r)
1874                         goto out;
1875                 r = 0;
1876                 break;
1877         }
1878         case KVM_TRANSLATE: {
1879                 struct kvm_translation tr;
1880
1881                 r = -EFAULT;
1882                 if (copy_from_user(&tr, argp, sizeof tr))
1883                         goto out;
1884                 r = kvm_dev_ioctl_translate(kvm, &tr);
1885                 if (r)
1886                         goto out;
1887                 r = -EFAULT;
1888                 if (copy_to_user(argp, &tr, sizeof tr))
1889                         goto out;
1890                 r = 0;
1891                 break;
1892         }
1893         case KVM_INTERRUPT: {
1894                 struct kvm_interrupt irq;
1895
1896                 r = -EFAULT;
1897                 if (copy_from_user(&irq, argp, sizeof irq))
1898                         goto out;
1899                 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1900                 if (r)
1901                         goto out;
1902                 r = 0;
1903                 break;
1904         }
1905         case KVM_DEBUG_GUEST: {
1906                 struct kvm_debug_guest dbg;
1907
1908                 r = -EFAULT;
1909                 if (copy_from_user(&dbg, argp, sizeof dbg))
1910                         goto out;
1911                 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1912                 if (r)
1913                         goto out;
1914                 r = 0;
1915                 break;
1916         }
1917         case KVM_SET_MEMORY_REGION: {
1918                 struct kvm_memory_region kvm_mem;
1919
1920                 r = -EFAULT;
1921                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1922                         goto out;
1923                 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1924                 if (r)
1925                         goto out;
1926                 break;
1927         }
1928         case KVM_GET_DIRTY_LOG: {
1929                 struct kvm_dirty_log log;
1930
1931                 r = -EFAULT;
1932                 if (copy_from_user(&log, argp, sizeof log))
1933                         goto out;
1934                 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1935                 if (r)
1936                         goto out;
1937                 break;
1938         }
1939         case KVM_GET_MSRS:
1940                 r = msr_io(kvm, argp, get_msr, 1);
1941                 break;
1942         case KVM_SET_MSRS:
1943                 r = msr_io(kvm, argp, do_set_msr, 0);
1944                 break;
1945         case KVM_GET_MSR_INDEX_LIST: {
1946                 struct kvm_msr_list __user *user_msr_list = argp;
1947                 struct kvm_msr_list msr_list;
1948                 unsigned n;
1949
1950                 r = -EFAULT;
1951                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1952                         goto out;
1953                 n = msr_list.nmsrs;
1954                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1955                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1956                         goto out;
1957                 r = -E2BIG;
1958                 if (n < msr_list.nmsrs)
1959                         goto out;
1960                 r = -EFAULT;
1961                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1962                                  num_msrs_to_save * sizeof(u32)))
1963                         goto out;
1964                 /* the emulated MSR indices follow the saved ones */
1965                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1966                                  &emulated_msrs,
1967                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1968                         goto out;
1969                 r = 0;
1970                 break;
1971         }
1972         default:
1973                 ;
1974         }
1975 out:
1976         return r;
1977 }
1978
1979 static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1980                                    unsigned long address,
1981                                    int *type)
1982 {
1983         struct kvm *kvm = vma->vm_file->private_data;
1984         unsigned long pgoff;
1985         struct kvm_memory_slot *slot;
1986         struct page *page;
1987
1988         *type = VM_FAULT_MINOR;
1989         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1990         slot = gfn_to_memslot(kvm, pgoff);
1991         if (!slot)
1992                 return NOPAGE_SIGBUS;
1993         page = gfn_to_page(slot, pgoff);
1994         if (!page)
1995                 return NOPAGE_SIGBUS;
1996         get_page(page);
1997         return page;
1998 }
1999
2000 static struct vm_operations_struct kvm_dev_vm_ops = {
2001         .nopage = kvm_dev_nopage,
2002 };
2003
2004 static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
2005 {
2006         vma->vm_ops = &kvm_dev_vm_ops;
2007         return 0;
2008 }
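
/*
 * Illustrative userspace sketch (not part of this file): because
 * kvm_dev_nopage() treats the file offset as a guest physical address,
 * guest memory that was registered with KVM_SET_MEMORY_REGION can be
 * mapped and filled directly.  kvm_fd, the 1MB region starting at guest
 * physical address 0, and the boot_image buffer are all assumptions made
 * for the sake of the example.
 *
 *      void *guest_ram = mmap(NULL, 0x100000, PROT_READ | PROT_WRITE,
 *                             MAP_SHARED, kvm_fd, 0);
 *
 *      if (guest_ram != MAP_FAILED)
 *              memcpy(guest_ram, boot_image, boot_image_size);
 */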
2009
2010 static struct file_operations kvm_chardev_ops = {
2011         .open           = kvm_dev_open,
2012         .release        = kvm_dev_release,
2013         .unlocked_ioctl = kvm_dev_ioctl,
2014         .compat_ioctl   = kvm_dev_ioctl,
2015         .mmap           = kvm_dev_mmap,
2016 };
2017
2018 static struct miscdevice kvm_dev = {
2019         MISC_DYNAMIC_MINOR,
2020         "kvm",
2021         &kvm_chardev_ops,
2022 };
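
/*
 * Illustrative userspace sketch (not part of this file): the misc device
 * above is what backs /dev/kvm.  A minimal client opens it, checks that it
 * speaks the same API revision, and creates a vcpu; all further ioctls in
 * this file are issued against the same descriptor.
 *
 *      int fd = open("/dev/kvm", O_RDWR);
 *
 *      if (fd < 0 || ioctl(fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *              exit(1);
 *      if (ioctl(fd, KVM_CREATE_VCPU, 0) < 0)
 *              exit(1);
 */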
2023
2024 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2025                        void *v)
2026 {
2027         if (val == SYS_RESTART) {
2028                 /*
2029                  * Some BIOSes hang on reboot if the CPU is still in
2030                  * VMX root mode, so disable hardware virtualization first.
2031                  */
2032                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2033                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2034         }
2035         return NOTIFY_OK;
2036 }
2037
2038 static struct notifier_block kvm_reboot_notifier = {
2039         .notifier_call = kvm_reboot,
2040         .priority = 0,
2041 };
2042
2043 /*
2044  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2045  * cached on it.
2046  */
2047 static void decache_vcpus_on_cpu(int cpu)
2048 {
2049         struct kvm *vm;
2050         struct kvm_vcpu *vcpu;
2051         int i;
2052
2053         spin_lock(&kvm_lock);
2054         list_for_each_entry(vm, &vm_list, vm_list)
2055                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2056                         vcpu = &vm->vcpus[i];
2057                         /*
2058                          * If the vcpu is locked, then it is running on some
2059                          * other cpu and therefore it is not cached on the
2060                          * cpu in question.
2061                          *
2062                          * If it's not locked, check the last cpu it executed
2063                          * on.
2064                          */
2065                         if (mutex_trylock(&vcpu->mutex)) {
2066                                 if (vcpu->cpu == cpu) {
2067                                         kvm_arch_ops->vcpu_decache(vcpu);
2068                                         vcpu->cpu = -1;
2069                                 }
2070                                 mutex_unlock(&vcpu->mutex);
2071                         }
2072                 }
2073         spin_unlock(&kvm_lock);
2074 }
2075
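/*
 * React to CPU hotplug events: hardware virtualization is disabled on a
 * cpu that is going away (after flushing any vcpu state cached on it via
 * decache_vcpus_on_cpu()) and enabled on a cpu that is coming up.
 */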
2076 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2077                            void *v)
2078 {
2079         int cpu = (long)v;
2080
2081         switch (val) {
2082         case CPU_DEAD:
2083         case CPU_UP_CANCELED:
2084                 decache_vcpus_on_cpu(cpu);
2085                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
2086                                          NULL, 0, 1);
2087                 break;
2088         case CPU_UP_PREPARE:
2089                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
2090                                          NULL, 0, 1);
2091                 break;
2092         }
2093         return NOTIFY_OK;
2094 }
2095
2096 static struct notifier_block kvm_cpu_notifier = {
2097         .notifier_call = kvm_cpu_hotplug,
2098         .priority = 20, /* must be > scheduler priority */
2099 };
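
/*
 * Illustrative sketch (not part of this file): the notifier above is
 * exercised by ordinary CPU hotplug, for example by offlining and then
 * re-onlining a cpu through sysfs.  Offlining ends in CPU_DEAD, which
 * decaches vcpus and disables hardware virtualization on that cpu;
 * re-onlining passes through CPU_UP_PREPARE, which enables it again.
 * Assumes CONFIG_HOTPLUG_CPU and that cpu1 exists and may be taken down.
 *
 *      #include <fcntl.h>
 *      #include <unistd.h>
 *
 *      static void set_cpu1_online(const char *val)
 *      {
 *              int fd = open("/sys/devices/system/cpu/cpu1/online", O_WRONLY);
 *
 *              if (fd >= 0) {
 *                      write(fd, val, 1);
 *                      close(fd);
 *              }
 *      }
 *
 *      int main(void)
 *      {
 *              set_cpu1_online("0");
 *              set_cpu1_online("1");
 *              return 0;
 *      }
 */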
2100
2101 static __init void kvm_init_debug(void)
2102 {
2103         struct kvm_stats_debugfs_item *p;
2104
2105         debugfs_dir = debugfs_create_dir("kvm", NULL);
2106         for (p = debugfs_entries; p->name; ++p)
2107                 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
2108                                                p->data);
2109 }
2110
2111 static void kvm_exit_debug(void)
2112 {
2113         struct kvm_stats_debugfs_item *p;
2114
2115         for (p = debugfs_entries; p->name; ++p)
2116                 debugfs_remove(p->dentry);
2117         debugfs_remove(debugfs_dir);
2118 }
2119
2120 hpa_t bad_page_address;
2121
2122 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
2123 {
2124         int r;
2125
2126         if (kvm_arch_ops) {
2127                 printk(KERN_ERR "kvm: already loaded the other module\n");
2128                 return -EEXIST;
2129         }
2130
2131         if (!ops->cpu_has_kvm_support()) {
2132                 printk(KERN_ERR "kvm: no hardware support\n");
2133                 return -EOPNOTSUPP;
2134         }
2135         if (ops->disabled_by_bios()) {
2136                 printk(KERN_ERR "kvm: disabled by bios\n");
2137                 return -EOPNOTSUPP;
2138         }
2139
2140         kvm_arch_ops = ops;
2141
2142         r = kvm_arch_ops->hardware_setup();
2143         if (r < 0)
2144                 return r;
2145
2146         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2147         r = register_cpu_notifier(&kvm_cpu_notifier);
2148         if (r)
2149                 goto out_free_1;
2150         register_reboot_notifier(&kvm_reboot_notifier);
2151
2152         kvm_chardev_ops.owner = module;
2153
2154         r = misc_register(&kvm_dev);
2155         if (r) {
2156                 printk(KERN_ERR "kvm: misc device register failed\n");
2157                 goto out_free;
2158         }
2159
2160         return r;
2161
2162 out_free:
2163         unregister_reboot_notifier(&kvm_reboot_notifier);
2164         unregister_cpu_notifier(&kvm_cpu_notifier);
2165 out_free_1:
2166         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2167         kvm_arch_ops->hardware_unsetup();
2168         return r;
2169 }
2170
2171 void kvm_exit_arch(void)
2172 {
2173         misc_deregister(&kvm_dev);
2174
2175         unregister_reboot_notifier(&kvm_reboot_notifier);
2176         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2177         kvm_arch_ops->hardware_unsetup();
2178         kvm_arch_ops = NULL;
2179 }
2180
2181 static __init int kvm_init(void)
2182 {
2183         static struct page *bad_page;
2184         int r = 0;
2185
2186         kvm_init_debug();
2187
2188         kvm_init_msr_list();
2189
2190         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
2191                 r = -ENOMEM;
2192                 goto out;
2193         }
2194
2195         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
2196         memset(__va(bad_page_address), 0, PAGE_SIZE);
2197
2198         return r;
2199
2200 out:
2201         kvm_exit_debug();
2202         return r;
2203 }
2204
2205 static __exit void kvm_exit(void)
2206 {
2207         kvm_exit_debug();
2208         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
2209 }
2210
2211 module_init(kvm_init)
2212 module_exit(kvm_exit)
2213
2214 EXPORT_SYMBOL_GPL(kvm_init_arch);
2215 EXPORT_SYMBOL_GPL(kvm_exit_arch);