KVM: Dynamically allocate vcpus
[linux-2.6] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86_emulate.h"
20 #include "segment_descriptor.h"
21
22 #include <linux/kvm.h>
23 #include <linux/module.h>
24 #include <linux/errno.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <linux/reboot.h>
31 #include <linux/debugfs.h>
32 #include <linux/highmem.h>
33 #include <linux/file.h>
34 #include <linux/sysdev.h>
35 #include <linux/cpu.h>
36 #include <linux/sched.h>
37 #include <linux/cpumask.h>
38 #include <linux/smp.h>
39 #include <linux/anon_inodes.h>
40
41 #include <asm/processor.h>
42 #include <asm/msr.h>
43 #include <asm/io.h>
44 #include <asm/uaccess.h>
45 #include <asm/desc.h>
46
47 MODULE_AUTHOR("Qumranet");
48 MODULE_LICENSE("GPL");
49
50 static DEFINE_SPINLOCK(kvm_lock);
51 static LIST_HEAD(vm_list);
52
53 static cpumask_t cpus_hardware_enabled;
54
55 struct kvm_arch_ops *kvm_arch_ops;
56
57 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
58
59 static struct kvm_stats_debugfs_item {
60         const char *name;
61         int offset;
62         struct dentry *dentry;
63 } debugfs_entries[] = {
64         { "pf_fixed", STAT_OFFSET(pf_fixed) },
65         { "pf_guest", STAT_OFFSET(pf_guest) },
66         { "tlb_flush", STAT_OFFSET(tlb_flush) },
67         { "invlpg", STAT_OFFSET(invlpg) },
68         { "exits", STAT_OFFSET(exits) },
69         { "io_exits", STAT_OFFSET(io_exits) },
70         { "mmio_exits", STAT_OFFSET(mmio_exits) },
71         { "signal_exits", STAT_OFFSET(signal_exits) },
72         { "irq_window", STAT_OFFSET(irq_window_exits) },
73         { "halt_exits", STAT_OFFSET(halt_exits) },
74         { "request_irq", STAT_OFFSET(request_irq_exits) },
75         { "irq_exits", STAT_OFFSET(irq_exits) },
76         { "light_exits", STAT_OFFSET(light_exits) },
77         { "efer_reload", STAT_OFFSET(efer_reload) },
78         { NULL }
79 };
80
81 static struct dentry *debugfs_dir;
82
83 #define MAX_IO_MSRS 256
84
85 #define CR0_RESERVED_BITS                                               \
86         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
87                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
88                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
89 #define CR4_RESERVED_BITS                                               \
90         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
91                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
92                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
93                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
94
95 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
96 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
97
98 #ifdef CONFIG_X86_64
99 /* LDT or TSS descriptor in the GDT. 16 bytes. */
100 struct segment_descriptor_64 {
101         struct segment_descriptor s;
102         u32 base_higher;
103         u32 pad_zero;
104 };
105
106 #endif
107
108 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
109                            unsigned long arg);
110
111 unsigned long segment_base(u16 selector)
112 {
113         struct descriptor_table gdt;
114         struct segment_descriptor *d;
115         unsigned long table_base;
116         typedef unsigned long ul;
117         unsigned long v;
118
119         if (selector == 0)
120                 return 0;
121
122         asm ("sgdt %0" : "=m"(gdt));
123         table_base = gdt.base;
124
125         if (selector & 4) {           /* from ldt */
126                 u16 ldt_selector;
127
128                 asm ("sldt %0" : "=g"(ldt_selector));
129                 table_base = segment_base(ldt_selector);
130         }
131         d = (struct segment_descriptor *)(table_base + (selector & ~7));
132         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
133 #ifdef CONFIG_X86_64
134         if (d->system == 0
135             && (d->type == 2 || d->type == 9 || d->type == 11))
136                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
137 #endif
138         return v;
139 }
140 EXPORT_SYMBOL_GPL(segment_base);
141
142 static inline int valid_vcpu(int n)
143 {
144         return likely(n >= 0 && n < KVM_MAX_VCPUS);
145 }
146
147 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
148                    void *dest)
149 {
150         unsigned char *host_buf = dest;
151         unsigned long req_size = size;
152
153         while (size) {
154                 hpa_t paddr;
155                 unsigned now;
156                 unsigned offset;
157                 hva_t guest_buf;
158
159                 paddr = gva_to_hpa(vcpu, addr);
160
161                 if (is_error_hpa(paddr))
162                         break;
163
164                 guest_buf = (hva_t)kmap_atomic(
165                                         pfn_to_page(paddr >> PAGE_SHIFT),
166                                         KM_USER0);
167                 offset = addr & ~PAGE_MASK;
168                 guest_buf |= offset;
169                 now = min(size, PAGE_SIZE - offset);
170                 memcpy(host_buf, (void *)guest_buf, now);
171                 host_buf += now;
172                 addr += now;
173                 size -= now;
174                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
175         }
176         return req_size - size;
177 }
178 EXPORT_SYMBOL_GPL(kvm_read_guest);
179
180 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
181                     void *data)
182 {
183         unsigned char *host_buf = data;
184         unsigned long req_size = size;
185
186         while (size) {
187                 hpa_t paddr;
188                 unsigned now;
189                 unsigned offset;
190                 hva_t guest_buf;
191                 gfn_t gfn;
192
193                 paddr = gva_to_hpa(vcpu, addr);
194
195                 if (is_error_hpa(paddr))
196                         break;
197
198                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
199                 mark_page_dirty(vcpu->kvm, gfn);
200                 guest_buf = (hva_t)kmap_atomic(
201                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
202                 offset = addr & ~PAGE_MASK;
203                 guest_buf |= offset;
204                 now = min(size, PAGE_SIZE - offset);
205                 memcpy((void *)guest_buf, host_buf, now);
206                 host_buf += now;
207                 addr += now;
208                 size -= now;
209                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
210         }
211         return req_size - size;
212 }
213 EXPORT_SYMBOL_GPL(kvm_write_guest);
214
215 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
216 {
217         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
218                 return;
219
220         vcpu->guest_fpu_loaded = 1;
221         fx_save(vcpu->host_fx_image);
222         fx_restore(vcpu->guest_fx_image);
223 }
224 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
225
226 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
227 {
228         if (!vcpu->guest_fpu_loaded)
229                 return;
230
231         vcpu->guest_fpu_loaded = 0;
232         fx_save(vcpu->guest_fx_image);
233         fx_restore(vcpu->host_fx_image);
234 }
235 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
236
237 /*
238  * Switches to the specified vcpu, until a matching vcpu_put().
239  */
240 static void vcpu_load(struct kvm_vcpu *vcpu)
241 {
242         mutex_lock(&vcpu->mutex);
243         kvm_arch_ops->vcpu_load(vcpu);
244 }
245
246 static void vcpu_put(struct kvm_vcpu *vcpu)
247 {
248         kvm_arch_ops->vcpu_put(vcpu);
249         mutex_unlock(&vcpu->mutex);
250 }
251
252 static void ack_flush(void *_completed)
253 {
254         atomic_t *completed = _completed;
255
256         atomic_inc(completed);
257 }
258
259 void kvm_flush_remote_tlbs(struct kvm *kvm)
260 {
261         int i, cpu, needed;
262         cpumask_t cpus;
263         struct kvm_vcpu *vcpu;
264         atomic_t completed;
265
266         atomic_set(&completed, 0);
267         cpus_clear(cpus);
268         needed = 0;
269         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
270                 vcpu = kvm->vcpus[i];
271                 if (!vcpu)
272                         continue;
273                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
274                         continue;
275                 cpu = vcpu->cpu;
276                 if (cpu != -1 && cpu != raw_smp_processor_id())
277                         if (!cpu_isset(cpu, cpus)) {
278                                 cpu_set(cpu, cpus);
279                                 ++needed;
280                         }
281         }
282
283         /*
284          * We really want smp_call_function_mask() here.  But that's not
285          * available, so ipi all cpus in parallel and wait for them
286          * to complete.
287          */
288         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
289                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
290         while (atomic_read(&completed) != needed) {
291                 cpu_relax();
292                 barrier();
293         }
294 }
295
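/*
 * Illustrative sketch (not part of this file): the comment above wishes for
 * smp_call_function_mask().  If such an API existed with a
 * (mask, func, info, wait) signature -- an assumption, not something this
 * kernel provides -- the manual fan-out and completion counter could
 * collapse to roughly:
 *
 *	if (!cpus_empty(cpus))
 *		smp_call_function_mask(cpus, ack_flush, &completed, 1);
 *
 * since wait=1 would already guarantee that every IPI has finished.
 */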
296 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
297 {
298         struct page *page;
299         int r;
300
301         mutex_init(&vcpu->mutex);
302         vcpu->cpu = -1;
303         vcpu->mmu.root_hpa = INVALID_PAGE;
304         vcpu->kvm = kvm;
305         vcpu->vcpu_id = id;
306
307         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
308         if (!page) {
309                 r = -ENOMEM;
310                 goto fail;
311         }
312         vcpu->run = page_address(page);
313
314         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
315         if (!page) {
316                 r = -ENOMEM;
317                 goto fail_free_run;
318         }
319         vcpu->pio_data = page_address(page);
320
321         vcpu->host_fx_image = (char *)ALIGN((hva_t)vcpu->fx_buf,
322                                            FX_IMAGE_ALIGN);
323         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
324
325         r = kvm_mmu_create(vcpu);
326         if (r < 0)
327                 goto fail_free_pio_data;
328
329         return 0;
330
331 fail_free_pio_data:
332         free_page((unsigned long)vcpu->pio_data);
333 fail_free_run:
334         free_page((unsigned long)vcpu->run);
335 fail:
336         return -ENOMEM;
337 }
338 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
339
340 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
341 {
342         kvm_mmu_destroy(vcpu);
343         free_page((unsigned long)vcpu->pio_data);
344         free_page((unsigned long)vcpu->run);
345 }
346 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
347
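/*
 * Illustrative sketch (not part of this file): with vcpus now allocated
 * dynamically, an arch backend is expected to embed struct kvm_vcpu in its
 * own per-vcpu structure and bracket its lifetime with kvm_vcpu_init() and
 * kvm_vcpu_uninit().  The type and function names below are hypothetical.
 */
struct example_arch_vcpu {
	struct kvm_vcpu vcpu;
	/* arch-specific state (VMCS/VMCB pointers, MSR shadows, ...) */
};

static struct kvm_vcpu *example_vcpu_create(struct kvm *kvm, unsigned id)
{
	struct example_arch_vcpu *v;
	int r;

	v = kzalloc(sizeof(*v), GFP_KERNEL);
	if (!v)
		return ERR_PTR(-ENOMEM);
	r = kvm_vcpu_init(&v->vcpu, kvm, id);
	if (r) {
		kfree(v);
		return ERR_PTR(r);
	}
	return &v->vcpu;
}

static void example_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct example_arch_vcpu *v =
		container_of(vcpu, struct example_arch_vcpu, vcpu);

	kvm_vcpu_uninit(vcpu);
	kfree(v);
}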
348 static struct kvm *kvm_create_vm(void)
349 {
350         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
351
352         if (!kvm)
353                 return ERR_PTR(-ENOMEM);
354
355         kvm_io_bus_init(&kvm->pio_bus);
356         spin_lock_init(&kvm->lock);
357         INIT_LIST_HEAD(&kvm->active_mmu_pages);
358         kvm_io_bus_init(&kvm->mmio_bus);
359         spin_lock(&kvm_lock);
360         list_add(&kvm->vm_list, &vm_list);
361         spin_unlock(&kvm_lock);
362         return kvm;
363 }
364
365 static int kvm_dev_open(struct inode *inode, struct file *filp)
366 {
367         return 0;
368 }
369
370 /*
371  * Free any memory in @free but not in @dont.
372  */
373 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
374                                   struct kvm_memory_slot *dont)
375 {
376         int i;
377
378         if (!dont || free->phys_mem != dont->phys_mem)
379                 if (free->phys_mem) {
380                         for (i = 0; i < free->npages; ++i)
381                                 if (free->phys_mem[i])
382                                         __free_page(free->phys_mem[i]);
383                         vfree(free->phys_mem);
384                 }
385
386         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
387                 vfree(free->dirty_bitmap);
388
389         free->phys_mem = NULL;
390         free->npages = 0;
391         free->dirty_bitmap = NULL;
392 }
393
394 static void kvm_free_physmem(struct kvm *kvm)
395 {
396         int i;
397
398         for (i = 0; i < kvm->nmemslots; ++i)
399                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
400 }
401
402 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
403 {
404         int i;
405
406         for (i = 0; i < 2; ++i)
407                 if (vcpu->pio.guest_pages[i]) {
408                         __free_page(vcpu->pio.guest_pages[i]);
409                         vcpu->pio.guest_pages[i] = NULL;
410                 }
411 }
412
413 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
414 {
415         vcpu_load(vcpu);
416         kvm_mmu_unload(vcpu);
417         vcpu_put(vcpu);
418 }
419
420 static void kvm_free_vcpus(struct kvm *kvm)
421 {
422         unsigned int i;
423
424         /*
425          * Unpin any mmu pages first.
426          */
427         for (i = 0; i < KVM_MAX_VCPUS; ++i)
428                 if (kvm->vcpus[i])
429                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
430         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
431                 if (kvm->vcpus[i]) {
432                         kvm_arch_ops->vcpu_free(kvm->vcpus[i]);
433                         kvm->vcpus[i] = NULL;
434                 }
435         }
436
437 }
438
439 static int kvm_dev_release(struct inode *inode, struct file *filp)
440 {
441         return 0;
442 }
443
444 static void kvm_destroy_vm(struct kvm *kvm)
445 {
446         spin_lock(&kvm_lock);
447         list_del(&kvm->vm_list);
448         spin_unlock(&kvm_lock);
449         kvm_io_bus_destroy(&kvm->pio_bus);
450         kvm_io_bus_destroy(&kvm->mmio_bus);
451         kvm_free_vcpus(kvm);
452         kvm_free_physmem(kvm);
453         kfree(kvm);
454 }
455
456 static int kvm_vm_release(struct inode *inode, struct file *filp)
457 {
458         struct kvm *kvm = filp->private_data;
459
460         kvm_destroy_vm(kvm);
461         return 0;
462 }
463
464 static void inject_gp(struct kvm_vcpu *vcpu)
465 {
466         kvm_arch_ops->inject_gp(vcpu, 0);
467 }
468
469 /*
470  * Load the pae pdptrs.  Return true if they are all valid.
471  */
472 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
473 {
474         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
475         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
476         int i;
477         u64 *pdpt;
478         int ret;
479         struct page *page;
480         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
481
482         spin_lock(&vcpu->kvm->lock);
483         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
484         if (!page) {
485                 ret = 0;
486                 goto out;
487         }
488
489         pdpt = kmap_atomic(page, KM_USER0);
490         memcpy(pdpte, pdpt+offset, sizeof(pdpte));
491         kunmap_atomic(pdpt, KM_USER0);
492
493         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
494                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
495                         ret = 0;
496                         goto out;
497                 }
498         }
499         ret = 1;
500
501         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
502 out:
503         spin_unlock(&vcpu->kvm->lock);
504
505         return ret;
506 }
507
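/*
 * Illustrative note (not part of this file): for a present PDPTE, the mask
 * 0xfffffff0000001e6 above flags the architecturally reserved bits 1-2 and
 * 5-8, plus bits 63:36 (i.e. the check assumes a 36-bit physical address
 * width).  Any such bit set makes load_pdptrs() report failure, and the
 * callers respond by injecting #GP.
 */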
508 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
509 {
510         if (cr0 & CR0_RESERVED_BITS) {
511                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
512                        cr0, vcpu->cr0);
513                 inject_gp(vcpu);
514                 return;
515         }
516
517         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
518                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
519                 inject_gp(vcpu);
520                 return;
521         }
522
523         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
524                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
525                        "and a clear PE flag\n");
526                 inject_gp(vcpu);
527                 return;
528         }
529
530         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
531 #ifdef CONFIG_X86_64
532                 if ((vcpu->shadow_efer & EFER_LME)) {
533                         int cs_db, cs_l;
534
535                         if (!is_pae(vcpu)) {
536                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
537                                        "in long mode while PAE is disabled\n");
538                                 inject_gp(vcpu);
539                                 return;
540                         }
541                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
542                         if (cs_l) {
543                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
544                                        "in long mode while CS.L == 1\n");
545                                 inject_gp(vcpu);
546                                 return;
547
548                         }
549                 } else
550 #endif
551                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
552                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
553                                "reserved bits\n");
554                         inject_gp(vcpu);
555                         return;
556                 }
557
558         }
559
560         kvm_arch_ops->set_cr0(vcpu, cr0);
561         vcpu->cr0 = cr0;
562
563         spin_lock(&vcpu->kvm->lock);
564         kvm_mmu_reset_context(vcpu);
565         spin_unlock(&vcpu->kvm->lock);
566         return;
567 }
568 EXPORT_SYMBOL_GPL(set_cr0);
569
570 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
571 {
572         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
573 }
574 EXPORT_SYMBOL_GPL(lmsw);
575
576 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
577 {
578         if (cr4 & CR4_RESERVED_BITS) {
579                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
580                 inject_gp(vcpu);
581                 return;
582         }
583
584         if (is_long_mode(vcpu)) {
585                 if (!(cr4 & X86_CR4_PAE)) {
586                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
587                                "in long mode\n");
588                         inject_gp(vcpu);
589                         return;
590                 }
591         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
592                    && !load_pdptrs(vcpu, vcpu->cr3)) {
593                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
594                 inject_gp(vcpu);
595                 return;
596         }
597
598         if (cr4 & X86_CR4_VMXE) {
599                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
600                 inject_gp(vcpu);
601                 return;
602         }
603         kvm_arch_ops->set_cr4(vcpu, cr4);
604         spin_lock(&vcpu->kvm->lock);
605         kvm_mmu_reset_context(vcpu);
606         spin_unlock(&vcpu->kvm->lock);
607 }
608 EXPORT_SYMBOL_GPL(set_cr4);
609
610 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
611 {
612         if (is_long_mode(vcpu)) {
613                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
614                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
615                         inject_gp(vcpu);
616                         return;
617                 }
618         } else {
619                 if (is_pae(vcpu)) {
620                         if (cr3 & CR3_PAE_RESERVED_BITS) {
621                                 printk(KERN_DEBUG
622                                        "set_cr3: #GP, reserved bits\n");
623                                 inject_gp(vcpu);
624                                 return;
625                         }
626                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
627                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
628                                        "reserved bits\n");
629                                 inject_gp(vcpu);
630                                 return;
631                         }
632                 } else {
633                         if (cr3 & CR3_NONPAE_RESERVED_BITS) {
634                                 printk(KERN_DEBUG
635                                        "set_cr3: #GP, reserved bits\n");
636                                 inject_gp(vcpu);
637                                 return;
638                         }
639                 }
640         }
641
642         vcpu->cr3 = cr3;
643         spin_lock(&vcpu->kvm->lock);
644         /*
645          * Does the new cr3 value map to physical memory? (Note, we
646          * catch an invalid cr3 even in real-mode, because it would
647          * cause trouble later on when we turn on paging anyway.)
648          *
649          * A real CPU would silently accept an invalid cr3 and would
650          * attempt to use it - with largely undefined (and often hard
651          * to debug) behavior on the guest side.
652          */
653         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
654                 inject_gp(vcpu);
655         else
656                 vcpu->mmu.new_cr3(vcpu);
657         spin_unlock(&vcpu->kvm->lock);
658 }
659 EXPORT_SYMBOL_GPL(set_cr3);
660
661 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
662 {
663         if (cr8 & CR8_RESERVED_BITS) {
664                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
665                 inject_gp(vcpu);
666                 return;
667         }
668         vcpu->cr8 = cr8;
669 }
670 EXPORT_SYMBOL_GPL(set_cr8);
671
672 void fx_init(struct kvm_vcpu *vcpu)
673 {
674         struct __attribute__ ((__packed__)) fx_image_s {
675                 u16 control;    /* fcw */
676                 u16 status;     /* fsw */
677                 u16 tag;        /* ftw */
678                 u16 opcode;     /* fop */
679                 u64 ip;         /* fpu ip */
680                 u64 operand;    /* fpu dp */
681                 u32 mxcsr;
682                 u32 mxcsr_mask;
683
684         } *fx_image;
685
686         fx_save(vcpu->host_fx_image);
687         fpu_init();
688         fx_save(vcpu->guest_fx_image);
689         fx_restore(vcpu->host_fx_image);
690
691         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
692         fx_image->mxcsr = 0x1f80;
693         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
694                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
695 }
696 EXPORT_SYMBOL_GPL(fx_init);
697
698 /*
699  * Allocate some memory and give it an address in the guest physical address
700  * space.
701  *
702  * Discontiguous memory is allowed, mostly for framebuffers.
703  */
704 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
705                                           struct kvm_memory_region *mem)
706 {
707         int r;
708         gfn_t base_gfn;
709         unsigned long npages;
710         unsigned long i;
711         struct kvm_memory_slot *memslot;
712         struct kvm_memory_slot old, new;
713         int memory_config_version;
714
715         r = -EINVAL;
716         /* General sanity checks */
717         if (mem->memory_size & (PAGE_SIZE - 1))
718                 goto out;
719         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
720                 goto out;
721         if (mem->slot >= KVM_MEMORY_SLOTS)
722                 goto out;
723         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
724                 goto out;
725
726         memslot = &kvm->memslots[mem->slot];
727         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
728         npages = mem->memory_size >> PAGE_SHIFT;
729
730         if (!npages)
731                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
732
733 raced:
734         spin_lock(&kvm->lock);
735
736         memory_config_version = kvm->memory_config_version;
737         new = old = *memslot;
738
739         new.base_gfn = base_gfn;
740         new.npages = npages;
741         new.flags = mem->flags;
742
743         /* Disallow changing a memory slot's size. */
744         r = -EINVAL;
745         if (npages && old.npages && npages != old.npages)
746                 goto out_unlock;
747
748         /* Check for overlaps */
749         r = -EEXIST;
750         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
751                 struct kvm_memory_slot *s = &kvm->memslots[i];
752
753                 if (s == memslot)
754                         continue;
755                 if (!((base_gfn + npages <= s->base_gfn) ||
756                       (base_gfn >= s->base_gfn + s->npages)))
757                         goto out_unlock;
758         }
759         /*
760          * Do memory allocations outside the lock.  memory_config_version will
761          * detect any races.
762          */
763         spin_unlock(&kvm->lock);
764
765         /* Deallocate if slot is being removed */
766         if (!npages)
767                 new.phys_mem = NULL;
768
769         /* Free page dirty bitmap if unneeded */
770         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
771                 new.dirty_bitmap = NULL;
772
773         r = -ENOMEM;
774
775         /* Allocate if a slot is being created */
776         if (npages && !new.phys_mem) {
777                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
778
779                 if (!new.phys_mem)
780                         goto out_free;
781
782                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
783                 for (i = 0; i < npages; ++i) {
784                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
785                                                      | __GFP_ZERO);
786                         if (!new.phys_mem[i])
787                                 goto out_free;
788                         set_page_private(new.phys_mem[i], 0);
789                 }
790         }
791
792         /* Allocate page dirty bitmap if needed */
793         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
794                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
795
796                 new.dirty_bitmap = vmalloc(dirty_bytes);
797                 if (!new.dirty_bitmap)
798                         goto out_free;
799                 memset(new.dirty_bitmap, 0, dirty_bytes);
800         }
801
802         spin_lock(&kvm->lock);
803
804         if (memory_config_version != kvm->memory_config_version) {
805                 spin_unlock(&kvm->lock);
806                 kvm_free_physmem_slot(&new, &old);
807                 goto raced;
808         }
809
810         r = -EAGAIN;
811         if (kvm->busy)
812                 goto out_unlock;
813
814         if (mem->slot >= kvm->nmemslots)
815                 kvm->nmemslots = mem->slot + 1;
816
817         *memslot = new;
818         ++kvm->memory_config_version;
819
820         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
821         kvm_flush_remote_tlbs(kvm);
822
823         spin_unlock(&kvm->lock);
824
825         kvm_free_physmem_slot(&old, &new);
826         return 0;
827
828 out_unlock:
829         spin_unlock(&kvm->lock);
830 out_free:
831         kvm_free_physmem_slot(&new, &old);
832 out:
833         return r;
834 }
835
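/*
 * Illustrative sketch (not part of this file): how a userspace VMM might
 * drive the handler above via the KVM_SET_MEMORY_REGION ioctl on a VM fd
 * obtained from KVM_CREATE_VM.  Struct and ioctl names follow
 * <linux/kvm.h> of this era; error handling is left to the caller.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_set_slot(int vm_fd, __u32 slot,
			    __u64 guest_phys_addr, __u64 size)
{
	struct kvm_memory_region region = {
		.slot = slot,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* optional: ask for dirty tracking */
		.guest_phys_addr = guest_phys_addr,	/* must be page aligned */
		.memory_size = size,			/* page aligned; 0 removes the slot */
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_REGION, &region);
}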
836 /*
837  * Get (and clear) the dirty memory log for a memory slot.
838  */
839 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
840                                       struct kvm_dirty_log *log)
841 {
842         struct kvm_memory_slot *memslot;
843         int r, i;
844         int n;
845         unsigned long any = 0;
846
847         spin_lock(&kvm->lock);
848
849         /*
850          * Prevent changes to guest memory configuration even while the lock
851          * is not taken.
852          */
853         ++kvm->busy;
854         spin_unlock(&kvm->lock);
855         r = -EINVAL;
856         if (log->slot >= KVM_MEMORY_SLOTS)
857                 goto out;
858
859         memslot = &kvm->memslots[log->slot];
860         r = -ENOENT;
861         if (!memslot->dirty_bitmap)
862                 goto out;
863
864         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
865
866         for (i = 0; !any && i < n/sizeof(long); ++i)
867                 any = memslot->dirty_bitmap[i];
868
869         r = -EFAULT;
870         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
871                 goto out;
872
873         spin_lock(&kvm->lock);
874         kvm_mmu_slot_remove_write_access(kvm, log->slot);
875         kvm_flush_remote_tlbs(kvm);
876         memset(memslot->dirty_bitmap, 0, n);
877         spin_unlock(&kvm->lock);
878
879         r = 0;
880
881 out:
882         spin_lock(&kvm->lock);
883         --kvm->busy;
884         spin_unlock(&kvm->lock);
885         return r;
886 }
887
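/*
 * Illustrative sketch (not part of this file): retrieving the dirty bitmap
 * from userspace with KVM_GET_DIRTY_LOG.  The caller must supply a buffer
 * of at least ALIGN(npages, BITS_PER_LONG) / 8 bytes, matching the
 * kernel-side sizing above; names follow <linux/kvm.h> of this era.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_get_dirty_log(int vm_fd, __u32 slot, void *bitmap)
{
	struct kvm_dirty_log log = { .slot = slot };

	log.dirty_bitmap = bitmap;	/* one bit per page in the slot */
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}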
888 /*
889  * Set a new alias region.  Aliases map a portion of physical memory into
890  * another portion.  This is useful for memory windows, for example the PC
891  * VGA region.
892  */
893 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
894                                          struct kvm_memory_alias *alias)
895 {
896         int r, n;
897         struct kvm_mem_alias *p;
898
899         r = -EINVAL;
900         /* General sanity checks */
901         if (alias->memory_size & (PAGE_SIZE - 1))
902                 goto out;
903         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
904                 goto out;
905         if (alias->slot >= KVM_ALIAS_SLOTS)
906                 goto out;
907         if (alias->guest_phys_addr + alias->memory_size
908             < alias->guest_phys_addr)
909                 goto out;
910         if (alias->target_phys_addr + alias->memory_size
911             < alias->target_phys_addr)
912                 goto out;
913
914         spin_lock(&kvm->lock);
915
916         p = &kvm->aliases[alias->slot];
917         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
918         p->npages = alias->memory_size >> PAGE_SHIFT;
919         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
920
921         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
922                 if (kvm->aliases[n - 1].npages)
923                         break;
924         kvm->naliases = n;
925
926         kvm_mmu_zap_all(kvm);
927
928         spin_unlock(&kvm->lock);
929
930         return 0;
931
932 out:
933         return r;
934 }
935
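/*
 * Illustrative sketch (not part of this file): installing an alias from
 * userspace with KVM_SET_MEMORY_ALIAS, e.g. to remap the PC VGA window
 * mentioned above.  Struct and ioctl names follow <linux/kvm.h> of this
 * era; the target address is an assumption made up for the example.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_alias_vga(int vm_fd)
{
	struct kvm_memory_alias alias = {
		.slot = 0,
		.guest_phys_addr = 0xa0000,	/* VGA window */
		.memory_size = 0x20000,
		.target_phys_addr = 0x100a0000,	/* hypothetical backing area */
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias);
}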
936 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
937 {
938         int i;
939         struct kvm_mem_alias *alias;
940
941         for (i = 0; i < kvm->naliases; ++i) {
942                 alias = &kvm->aliases[i];
943                 if (gfn >= alias->base_gfn
944                     && gfn < alias->base_gfn + alias->npages)
945                         return alias->target_gfn + gfn - alias->base_gfn;
946         }
947         return gfn;
948 }
949
950 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
951 {
952         int i;
953
954         for (i = 0; i < kvm->nmemslots; ++i) {
955                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
956
957                 if (gfn >= memslot->base_gfn
958                     && gfn < memslot->base_gfn + memslot->npages)
959                         return memslot;
960         }
961         return NULL;
962 }
963
964 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
965 {
966         gfn = unalias_gfn(kvm, gfn);
967         return __gfn_to_memslot(kvm, gfn);
968 }
969
970 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
971 {
972         struct kvm_memory_slot *slot;
973
974         gfn = unalias_gfn(kvm, gfn);
975         slot = __gfn_to_memslot(kvm, gfn);
976         if (!slot)
977                 return NULL;
978         return slot->phys_mem[gfn - slot->base_gfn];
979 }
980 EXPORT_SYMBOL_GPL(gfn_to_page);
981
982 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
983 {
984         int i;
985         struct kvm_memory_slot *memslot;
986         unsigned long rel_gfn;
987
988         for (i = 0; i < kvm->nmemslots; ++i) {
989                 memslot = &kvm->memslots[i];
990
991                 if (gfn >= memslot->base_gfn
992                     && gfn < memslot->base_gfn + memslot->npages) {
993
994                         if (!memslot->dirty_bitmap)
995                                 return;
996
997                         rel_gfn = gfn - memslot->base_gfn;
998
999                         /* avoid RMW */
1000                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1001                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1002                         return;
1003                 }
1004         }
1005 }
1006
1007 static int emulator_read_std(unsigned long addr,
1008                              void *val,
1009                              unsigned int bytes,
1010                              struct x86_emulate_ctxt *ctxt)
1011 {
1012         struct kvm_vcpu *vcpu = ctxt->vcpu;
1013         void *data = val;
1014
1015         while (bytes) {
1016                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1017                 unsigned offset = addr & (PAGE_SIZE-1);
1018                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1019                 unsigned long pfn;
1020                 struct page *page;
1021                 void *page_virt;
1022
1023                 if (gpa == UNMAPPED_GVA)
1024                         return X86EMUL_PROPAGATE_FAULT;
1025                 pfn = gpa >> PAGE_SHIFT;
1026                 page = gfn_to_page(vcpu->kvm, pfn);
1027                 if (!page)
1028                         return X86EMUL_UNHANDLEABLE;
1029                 page_virt = kmap_atomic(page, KM_USER0);
1030
1031                 memcpy(data, page_virt + offset, tocopy);
1032
1033                 kunmap_atomic(page_virt, KM_USER0);
1034
1035                 bytes -= tocopy;
1036                 data += tocopy;
1037                 addr += tocopy;
1038         }
1039
1040         return X86EMUL_CONTINUE;
1041 }
1042
1043 static int emulator_write_std(unsigned long addr,
1044                               const void *val,
1045                               unsigned int bytes,
1046                               struct x86_emulate_ctxt *ctxt)
1047 {
1048         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1049                addr, bytes);
1050         return X86EMUL_UNHANDLEABLE;
1051 }
1052
1053 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1054                                                 gpa_t addr)
1055 {
1056         /*
1057          * Note that it's important to have this wrapper function because
1058          * in the very near future we will be checking for MMIOs against
1059          * the LAPIC as well as the general MMIO bus.
1060          */
1061         return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1062 }
1063
1064 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1065                                                gpa_t addr)
1066 {
1067         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1068 }
1069
1070 static int emulator_read_emulated(unsigned long addr,
1071                                   void *val,
1072                                   unsigned int bytes,
1073                                   struct x86_emulate_ctxt *ctxt)
1074 {
1075         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1076         struct kvm_io_device *mmio_dev;
1077         gpa_t                 gpa;
1078
1079         if (vcpu->mmio_read_completed) {
1080                 memcpy(val, vcpu->mmio_data, bytes);
1081                 vcpu->mmio_read_completed = 0;
1082                 return X86EMUL_CONTINUE;
1083         } else if (emulator_read_std(addr, val, bytes, ctxt)
1084                    == X86EMUL_CONTINUE)
1085                 return X86EMUL_CONTINUE;
1086
1087         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1088         if (gpa == UNMAPPED_GVA)
1089                 return X86EMUL_PROPAGATE_FAULT;
1090
1091         /*
1092          * Is this MMIO handled locally?
1093          */
1094         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1095         if (mmio_dev) {
1096                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1097                 return X86EMUL_CONTINUE;
1098         }
1099
1100         vcpu->mmio_needed = 1;
1101         vcpu->mmio_phys_addr = gpa;
1102         vcpu->mmio_size = bytes;
1103         vcpu->mmio_is_write = 0;
1104
1105         return X86EMUL_UNHANDLEABLE;
1106 }
1107
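/*
 * Illustrative note (not part of this file): when no in-kernel device
 * claims the address, the read cannot complete here.  mmio_needed makes
 * emulate_instruction() return EMULATE_DO_MMIO with a KVM_EXIT_MMIO exit;
 * userspace performs the access, fills run->mmio.data, and on the next
 * KVM_RUN the retried instruction picks the value up through
 * mmio_read_completed above.
 */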
1108 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1109                                const void *val, int bytes)
1110 {
1111         struct page *page;
1112         void *virt;
1113
1114         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1115                 return 0;
1116         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1117         if (!page)
1118                 return 0;
1119         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1120         virt = kmap_atomic(page, KM_USER0);
1121         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1122         memcpy(virt + offset_in_page(gpa), val, bytes);
1123         kunmap_atomic(virt, KM_USER0);
1124         return 1;
1125 }
1126
1127 static int emulator_write_emulated_onepage(unsigned long addr,
1128                                            const void *val,
1129                                            unsigned int bytes,
1130                                            struct x86_emulate_ctxt *ctxt)
1131 {
1132         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1133         struct kvm_io_device *mmio_dev;
1134         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1135
1136         if (gpa == UNMAPPED_GVA) {
1137                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1138                 return X86EMUL_PROPAGATE_FAULT;
1139         }
1140
1141         if (emulator_write_phys(vcpu, gpa, val, bytes))
1142                 return X86EMUL_CONTINUE;
1143
1144         /*
1145          * Is this MMIO handled locally?
1146          */
1147         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1148         if (mmio_dev) {
1149                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1150                 return X86EMUL_CONTINUE;
1151         }
1152
1153         vcpu->mmio_needed = 1;
1154         vcpu->mmio_phys_addr = gpa;
1155         vcpu->mmio_size = bytes;
1156         vcpu->mmio_is_write = 1;
1157         memcpy(vcpu->mmio_data, val, bytes);
1158
1159         return X86EMUL_CONTINUE;
1160 }
1161
1162 static int emulator_write_emulated(unsigned long addr,
1163                                    const void *val,
1164                                    unsigned int bytes,
1165                                    struct x86_emulate_ctxt *ctxt)
1166 {
1167         /* Crossing a page boundary? */
1168         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1169                 int rc, now;
1170
1171                 now = -addr & ~PAGE_MASK;
1172                 rc = emulator_write_emulated_onepage(addr, val, now, ctxt);
1173                 if (rc != X86EMUL_CONTINUE)
1174                         return rc;
1175                 addr += now;
1176                 val += now;
1177                 bytes -= now;
1178         }
1179         return emulator_write_emulated_onepage(addr, val, bytes, ctxt);
1180 }
1181
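/*
 * Illustrative note (not part of this file): "now = -addr & ~PAGE_MASK"
 * is the number of bytes remaining in addr's page.  For example, with 4K
 * pages, addr = 0x1ffe and bytes = 4 gives now = 2, so two bytes go to the
 * first page and the rest to the following one.
 */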
1182 static int emulator_cmpxchg_emulated(unsigned long addr,
1183                                      const void *old,
1184                                      const void *new,
1185                                      unsigned int bytes,
1186                                      struct x86_emulate_ctxt *ctxt)
1187 {
1188         static int reported;
1189
1190         if (!reported) {
1191                 reported = 1;
1192                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1193         }
1194         return emulator_write_emulated(addr, new, bytes, ctxt);
1195 }
1196
1197 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1198 {
1199         return kvm_arch_ops->get_segment_base(vcpu, seg);
1200 }
1201
1202 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1203 {
1204         return X86EMUL_CONTINUE;
1205 }
1206
1207 int emulate_clts(struct kvm_vcpu *vcpu)
1208 {
1209         unsigned long cr0;
1210
1211         cr0 = vcpu->cr0 & ~X86_CR0_TS;
1212         kvm_arch_ops->set_cr0(vcpu, cr0);
1213         return X86EMUL_CONTINUE;
1214 }
1215
1216 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1217 {
1218         struct kvm_vcpu *vcpu = ctxt->vcpu;
1219
1220         switch (dr) {
1221         case 0 ... 3:
1222                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1223                 return X86EMUL_CONTINUE;
1224         default:
1225                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1226                        __FUNCTION__, dr);
1227                 return X86EMUL_UNHANDLEABLE;
1228         }
1229 }
1230
1231 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1232 {
1233         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1234         int exception;
1235
1236         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1237         if (exception) {
1238                 /* FIXME: better handling */
1239                 return X86EMUL_UNHANDLEABLE;
1240         }
1241         return X86EMUL_CONTINUE;
1242 }
1243
1244 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1245 {
1246         static int reported;
1247         u8 opcodes[4];
1248         unsigned long rip = ctxt->vcpu->rip;
1249         unsigned long rip_linear;
1250
1251         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1252
1253         if (reported)
1254                 return;
1255
1256         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1257
1258         printk(KERN_ERR "emulation failed but !mmio_needed?"
1259                " rip %lx %02x %02x %02x %02x\n",
1260                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1261         reported = 1;
1262 }
1263
1264 struct x86_emulate_ops emulate_ops = {
1265         .read_std            = emulator_read_std,
1266         .write_std           = emulator_write_std,
1267         .read_emulated       = emulator_read_emulated,
1268         .write_emulated      = emulator_write_emulated,
1269         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1270 };
1271
1272 int emulate_instruction(struct kvm_vcpu *vcpu,
1273                         struct kvm_run *run,
1274                         unsigned long cr2,
1275                         u16 error_code)
1276 {
1277         struct x86_emulate_ctxt emulate_ctxt;
1278         int r;
1279         int cs_db, cs_l;
1280
1281         vcpu->mmio_fault_cr2 = cr2;
1282         kvm_arch_ops->cache_regs(vcpu);
1283
1284         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1285
1286         emulate_ctxt.vcpu = vcpu;
1287         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1288         emulate_ctxt.cr2 = cr2;
1289         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1290                 ? X86EMUL_MODE_REAL : cs_l
1291                 ? X86EMUL_MODE_PROT64 : cs_db
1292                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1293
1294         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1295                 emulate_ctxt.cs_base = 0;
1296                 emulate_ctxt.ds_base = 0;
1297                 emulate_ctxt.es_base = 0;
1298                 emulate_ctxt.ss_base = 0;
1299         } else {
1300                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1301                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1302                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1303                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1304         }
1305
1306         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1307         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1308
1309         vcpu->mmio_is_write = 0;
1310         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1311
1312         if ((r || vcpu->mmio_is_write) && run) {
1313                 run->exit_reason = KVM_EXIT_MMIO;
1314                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1315                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1316                 run->mmio.len = vcpu->mmio_size;
1317                 run->mmio.is_write = vcpu->mmio_is_write;
1318         }
1319
1320         if (r) {
1321                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1322                         return EMULATE_DONE;
1323                 if (!vcpu->mmio_needed) {
1324                         report_emulation_failure(&emulate_ctxt);
1325                         return EMULATE_FAIL;
1326                 }
1327                 return EMULATE_DO_MMIO;
1328         }
1329
1330         kvm_arch_ops->decache_regs(vcpu);
1331         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1332
1333         if (vcpu->mmio_is_write) {
1334                 vcpu->mmio_needed = 0;
1335                 return EMULATE_DO_MMIO;
1336         }
1337
1338         return EMULATE_DONE;
1339 }
1340 EXPORT_SYMBOL_GPL(emulate_instruction);
1341
1342 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1343 {
1344         if (vcpu->irq_summary)
1345                 return 1;
1346
1347         vcpu->run->exit_reason = KVM_EXIT_HLT;
1348         ++vcpu->stat.halt_exits;
1349         return 0;
1350 }
1351 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1352
1353 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1354 {
1355         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1356
1357         kvm_arch_ops->cache_regs(vcpu);
1358         ret = -KVM_EINVAL;
1359 #ifdef CONFIG_X86_64
1360         if (is_long_mode(vcpu)) {
1361                 nr = vcpu->regs[VCPU_REGS_RAX];
1362                 a0 = vcpu->regs[VCPU_REGS_RDI];
1363                 a1 = vcpu->regs[VCPU_REGS_RSI];
1364                 a2 = vcpu->regs[VCPU_REGS_RDX];
1365                 a3 = vcpu->regs[VCPU_REGS_RCX];
1366                 a4 = vcpu->regs[VCPU_REGS_R8];
1367                 a5 = vcpu->regs[VCPU_REGS_R9];
1368         } else
1369 #endif
1370         {
1371                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1372                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1373                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1374                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1375                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1376                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1377                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1378         }
1379         switch (nr) {
1380         default:
1381                 run->hypercall.args[0] = a0;
1382                 run->hypercall.args[1] = a1;
1383                 run->hypercall.args[2] = a2;
1384                 run->hypercall.args[3] = a3;
1385                 run->hypercall.args[4] = a4;
1386                 run->hypercall.args[5] = a5;
1387                 run->hypercall.ret = ret;
1388                 run->hypercall.longmode = is_long_mode(vcpu);
1389                 kvm_arch_ops->decache_regs(vcpu);
1390                 return 0;
1391         }
1392         vcpu->regs[VCPU_REGS_RAX] = ret;
1393         kvm_arch_ops->decache_regs(vcpu);
1394         return 1;
1395 }
1396 EXPORT_SYMBOL_GPL(kvm_hypercall);
1397
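/*
 * Illustrative note (not part of this file): the register convention
 * implemented above is
 *
 *	         nr    a0    a1    a2    a3    a4    a5    ret
 *	64-bit:  rax   rdi   rsi   rdx   rcx   r8    r9    rax
 *	32-bit:  ebx   eax   ecx   edx   esi   edi   ebp   eax
 *
 * Unrecognized hypercall numbers are forwarded to userspace through
 * run->hypercall with ret preset to -KVM_EINVAL.
 */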
1398 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1399 {
1400         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1401 }
1402
1403 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1404 {
1405         struct descriptor_table dt = { limit, base };
1406
1407         kvm_arch_ops->set_gdt(vcpu, &dt);
1408 }
1409
1410 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1411 {
1412         struct descriptor_table dt = { limit, base };
1413
1414         kvm_arch_ops->set_idt(vcpu, &dt);
1415 }
1416
1417 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1418                    unsigned long *rflags)
1419 {
1420         lmsw(vcpu, msw);
1421         *rflags = kvm_arch_ops->get_rflags(vcpu);
1422 }
1423
1424 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1425 {
1426         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1427         switch (cr) {
1428         case 0:
1429                 return vcpu->cr0;
1430         case 2:
1431                 return vcpu->cr2;
1432         case 3:
1433                 return vcpu->cr3;
1434         case 4:
1435                 return vcpu->cr4;
1436         default:
1437                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1438                 return 0;
1439         }
1440 }
1441
1442 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1443                      unsigned long *rflags)
1444 {
1445         switch (cr) {
1446         case 0:
1447                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1448                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1449                 break;
1450         case 2:
1451                 vcpu->cr2 = val;
1452                 break;
1453         case 3:
1454                 set_cr3(vcpu, val);
1455                 break;
1456         case 4:
1457                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1458                 break;
1459         default:
1460                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1461         }
1462 }
1463
1464 /*
1465  * Register the para guest with the host:
1466  */
1467 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1468 {
1469         struct kvm_vcpu_para_state *para_state;
1470         hpa_t para_state_hpa, hypercall_hpa;
1471         struct page *para_state_page;
1472         unsigned char *hypercall;
1473         gpa_t hypercall_gpa;
1474
1475         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1476         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1477
1478         /*
1479          * Needs to be page aligned:
1480          */
1481         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1482                 goto err_gp;
1483
1484         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1485         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1486         if (is_error_hpa(para_state_hpa))
1487                 goto err_gp;
1488
1489         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1490         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1491         para_state = kmap(para_state_page);
1492
1493         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1494         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1495
1496         para_state->host_version = KVM_PARA_API_VERSION;
1497         /*
1498          * We cannot support guests that try to register themselves
1499          * with a newer API version than the host supports:
1500          */
1501         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1502                 para_state->ret = -KVM_EINVAL;
1503                 goto err_kunmap_skip;
1504         }
1505
1506         hypercall_gpa = para_state->hypercall_gpa;
1507         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1508         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1509         if (is_error_hpa(hypercall_hpa)) {
1510                 para_state->ret = -KVM_EINVAL;
1511                 goto err_kunmap_skip;
1512         }
1513
1514         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1515         vcpu->para_state_page = para_state_page;
1516         vcpu->para_state_gpa = para_state_gpa;
1517         vcpu->hypercall_gpa = hypercall_gpa;
1518
1519         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1520         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1521                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1522         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1523         kunmap_atomic(hypercall, KM_USER1);
1524
1525         para_state->ret = 0;
1526 err_kunmap_skip:
1527         kunmap(para_state_page);
1528         return 0;
1529 err_gp:
1530         return 1;
1531 }
1532
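/*
 * Illustrative note (not part of this file): the guest-side counterpart,
 * as implied by this function and the MSR_KVM_API_MAGIC case further down,
 * is roughly: the guest prepares a page-aligned struct kvm_vcpu_para_state
 * (guest_version, size, hypercall_gpa), writes its guest-physical address
 * to MSR_KVM_API_MAGIC with wrmsr, and then inspects host_version and ret
 * to see whether registration succeeded; the host has meanwhile patched a
 * hypercall stub at hypercall_gpa.
 */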
1533 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1534 {
1535         u64 data;
1536
1537         switch (msr) {
1538         case 0xc0010010: /* SYSCFG */
1539         case 0xc0010015: /* HWCR */
1540         case MSR_IA32_PLATFORM_ID:
1541         case MSR_IA32_P5_MC_ADDR:
1542         case MSR_IA32_P5_MC_TYPE:
1543         case MSR_IA32_MC0_CTL:
1544         case MSR_IA32_MCG_STATUS:
1545         case MSR_IA32_MCG_CAP:
1546         case MSR_IA32_MC0_MISC:
1547         case MSR_IA32_MC0_MISC+4:
1548         case MSR_IA32_MC0_MISC+8:
1549         case MSR_IA32_MC0_MISC+12:
1550         case MSR_IA32_MC0_MISC+16:
1551         case MSR_IA32_UCODE_REV:
1552         case MSR_IA32_PERF_STATUS:
1553         case MSR_IA32_EBL_CR_POWERON:
1554                 /* MTRR registers */
1555         case 0xfe:
1556         case 0x200 ... 0x2ff:
1557                 data = 0;
1558                 break;
1559         case 0xcd: /* fsb frequency */
1560                 data = 3;
1561                 break;
1562         case MSR_IA32_APICBASE:
1563                 data = vcpu->apic_base;
1564                 break;
1565         case MSR_IA32_MISC_ENABLE:
1566                 data = vcpu->ia32_misc_enable_msr;
1567                 break;
1568 #ifdef CONFIG_X86_64
1569         case MSR_EFER:
1570                 data = vcpu->shadow_efer;
1571                 break;
1572 #endif
1573         default:
1574                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1575                 return 1;
1576         }
1577         *pdata = data;
1578         return 0;
1579 }
1580 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1581
1582 /*
1583  * Reads an msr value (of 'msr_index') into 'pdata'.
1584  * Returns 0 on success, non-0 otherwise.
1585  * Assumes vcpu_load() was already called.
1586  */
1587 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1588 {
1589         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1590 }
1591
1592 #ifdef CONFIG_X86_64
1593
1594 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1595 {
1596         if (efer & EFER_RESERVED_BITS) {
1597                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1598                        efer);
1599                 inject_gp(vcpu);
1600                 return;
1601         }
1602
1603         if (is_paging(vcpu)
1604             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1605                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1606                 inject_gp(vcpu);
1607                 return;
1608         }
1609
1610         kvm_arch_ops->set_efer(vcpu, efer);
1611
1612         efer &= ~EFER_LMA;
1613         efer |= vcpu->shadow_efer & EFER_LMA;
1614
1615         vcpu->shadow_efer = efer;
1616 }
1617
1618 #endif
1619
1620 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1621 {
1622         switch (msr) {
1623 #ifdef CONFIG_X86_64
1624         case MSR_EFER:
1625                 set_efer(vcpu, data);
1626                 break;
1627 #endif
1628         case MSR_IA32_MC0_STATUS:
1629                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1630                        __FUNCTION__, data);
1631                 break;
1632         case MSR_IA32_MCG_STATUS:
1633                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1634                         __FUNCTION__, data);
1635                 break;
1636         case MSR_IA32_UCODE_REV:
1637         case MSR_IA32_UCODE_WRITE:
1638         case 0x200 ... 0x2ff: /* MTRRs */
1639                 break;
1640         case MSR_IA32_APICBASE:
1641                 vcpu->apic_base = data;
1642                 break;
1643         case MSR_IA32_MISC_ENABLE:
1644                 vcpu->ia32_misc_enable_msr = data;
1645                 break;
1646         /*
1647          * This is the 'probe whether the host is KVM' logic:
1648          */
1649         case MSR_KVM_API_MAGIC:
1650                 return vcpu_register_para(vcpu, data);
1651
1652         default:
1653                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1654                 return 1;
1655         }
1656         return 0;
1657 }
1658 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
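/*
 * Illustrative sketch (not part of this file): the MSR_KVM_API_MAGIC case
 * above is what a paravirtualized guest triggers when it probes for a KVM
 * host.  Assuming the usual wrmsr_safe() helper, a guest-side probe could
 * look roughly like this, passing the physical address of its para_state
 * page as the msr value:
 *
 *        static int kvm_para_host_present(unsigned long para_state_pa)
 *        {
 *                (a #GP, i.e. a non-zero return, means no KVM paravirt host)
 *                return wrmsr_safe(MSR_KVM_API_MAGIC,
 *                                  (u32)para_state_pa,
 *                                  (u32)(para_state_pa >> 32)) == 0;
 *        }
 */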
1659
1660 /*
1661  * Writes msr value into the appropriate "register".
1662  * Returns 0 on success, non-0 otherwise.
1663  * Assumes vcpu_load() was already called.
1664  */
1665 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1666 {
1667         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1668 }
1669
1670 void kvm_resched(struct kvm_vcpu *vcpu)
1671 {
1672         if (!need_resched())
1673                 return;
1674         vcpu_put(vcpu);
1675         cond_resched();
1676         vcpu_load(vcpu);
1677 }
1678 EXPORT_SYMBOL_GPL(kvm_resched);
1679
1680 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1681 {
1682         int i;
1683         u32 function;
1684         struct kvm_cpuid_entry *e, *best;
1685
1686         kvm_arch_ops->cache_regs(vcpu);
1687         function = vcpu->regs[VCPU_REGS_RAX];
1688         vcpu->regs[VCPU_REGS_RAX] = 0;
1689         vcpu->regs[VCPU_REGS_RBX] = 0;
1690         vcpu->regs[VCPU_REGS_RCX] = 0;
1691         vcpu->regs[VCPU_REGS_RDX] = 0;
1692         best = NULL;
1693         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1694                 e = &vcpu->cpuid_entries[i];
1695                 if (e->function == function) {
1696                         best = e;
1697                         break;
1698                 }
1699                 /*
1700                  * Both basic or both extended?
1701                  */
1702                 if (((e->function ^ function) & 0x80000000) == 0)
1703                         if (!best || e->function > best->function)
1704                                 best = e;
1705         }
1706         if (best) {
1707                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1708                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1709                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1710                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1711         }
1712         kvm_arch_ops->decache_regs(vcpu);
1713         kvm_arch_ops->skip_emulated_instruction(vcpu);
1714 }
1715 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1716
1717 static int pio_copy_data(struct kvm_vcpu *vcpu)
1718 {
1719         void *p = vcpu->pio_data;
1720         void *q;
1721         unsigned bytes;
1722         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1723
1724         kvm_arch_ops->vcpu_put(vcpu);
1725         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1726                  PAGE_KERNEL);
1727         if (!q) {
1728                 kvm_arch_ops->vcpu_load(vcpu);
1729                 free_pio_guest_pages(vcpu);
1730                 return -ENOMEM;
1731         }
1732         q += vcpu->pio.guest_page_offset;
1733         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1734         if (vcpu->pio.in)
1735                 memcpy(q, p, bytes);
1736         else
1737                 memcpy(p, q, bytes);
1738         q -= vcpu->pio.guest_page_offset;
1739         vunmap(q);
1740         kvm_arch_ops->vcpu_load(vcpu);
1741         free_pio_guest_pages(vcpu);
1742         return 0;
1743 }
1744
1745 static int complete_pio(struct kvm_vcpu *vcpu)
1746 {
1747         struct kvm_pio_request *io = &vcpu->pio;
1748         long delta;
1749         int r;
1750
1751         kvm_arch_ops->cache_regs(vcpu);
1752
1753         if (!io->string) {
1754                 if (io->in)
1755                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1756                                io->size);
1757         } else {
1758                 if (io->in) {
1759                         r = pio_copy_data(vcpu);
1760                         if (r) {
1761                                 kvm_arch_ops->cache_regs(vcpu);
1762                                 return r;
1763                         }
1764                 }
1765
1766                 delta = 1;
1767                 if (io->rep) {
1768                         delta *= io->cur_count;
1769                         /*
1770                          * The size of the register should really depend on
1771                          * current address size.
1772                          */
1773                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1774                 }
1775                 if (io->down)
1776                         delta = -delta;
1777                 delta *= io->size;
1778                 if (io->in)
1779                         vcpu->regs[VCPU_REGS_RDI] += delta;
1780                 else
1781                         vcpu->regs[VCPU_REGS_RSI] += delta;
1782         }
1783
1784         kvm_arch_ops->decache_regs(vcpu);
1785
1786         io->count -= io->cur_count;
1787         io->cur_count = 0;
1788
1789         if (!io->count)
1790                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1791         return 0;
1792 }
1793
1794 static void kernel_pio(struct kvm_io_device *pio_dev,
1795                        struct kvm_vcpu *vcpu,
1796                        void *pd)
1797 {
1798         /* TODO: String I/O for in-kernel devices */
1799
1800         if (vcpu->pio.in)
1801                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1802                                   vcpu->pio.size,
1803                                   pd);
1804         else
1805                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1806                                    vcpu->pio.size,
1807                                    pd);
1808 }
1809
1810 static void pio_string_write(struct kvm_io_device *pio_dev,
1811                              struct kvm_vcpu *vcpu)
1812 {
1813         struct kvm_pio_request *io = &vcpu->pio;
1814         void *pd = vcpu->pio_data;
1815         int i;
1816
1817         for (i = 0; i < io->cur_count; i++) {
1818                 kvm_iodevice_write(pio_dev, io->port,
1819                                    io->size,
1820                                    pd);
1821                 pd += io->size;
1822         }
1823 }
1824
1825 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1826                   int size, unsigned long count, int string, int down,
1827                   gva_t address, int rep, unsigned port)
1828 {
1829         unsigned now, in_page;
1830         int i, ret = 0;
1831         int nr_pages = 1;
1832         struct page *page;
1833         struct kvm_io_device *pio_dev;
1834
1835         vcpu->run->exit_reason = KVM_EXIT_IO;
1836         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1837         vcpu->run->io.size = size;
1838         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1839         vcpu->run->io.count = count;
1840         vcpu->run->io.port = port;
1841         vcpu->pio.count = count;
1842         vcpu->pio.cur_count = count;
1843         vcpu->pio.size = size;
1844         vcpu->pio.in = in;
1845         vcpu->pio.port = port;
1846         vcpu->pio.string = string;
1847         vcpu->pio.down = down;
1848         vcpu->pio.guest_page_offset = offset_in_page(address);
1849         vcpu->pio.rep = rep;
1850
1851         pio_dev = vcpu_find_pio_dev(vcpu, port);
1852         if (!string) {
1853                 kvm_arch_ops->cache_regs(vcpu);
1854                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1855                 kvm_arch_ops->decache_regs(vcpu);
1856                 if (pio_dev) {
1857                         kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1858                         complete_pio(vcpu);
1859                         return 1;
1860                 }
1861                 return 0;
1862         }
1863
1864         if (!count) {
1865                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1866                 return 1;
1867         }
1868
1869         now = min(count, PAGE_SIZE / size);
1870
1871         if (!down)
1872                 in_page = PAGE_SIZE - offset_in_page(address);
1873         else
1874                 in_page = offset_in_page(address) + size;
1875         now = min(count, (unsigned long)in_page / size);
1876         if (!now) {
1877                 /*
1878                  * String I/O straddles page boundary.  Pin two guest pages
1879                  * so that we satisfy atomicity constraints.  Do just one
1880                  * transaction to avoid complexity.
1881                  */
1882                 nr_pages = 2;
1883                 now = 1;
1884         }
1885         if (down) {
1886                 /*
1887                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1888                  */
1889                 printk(KERN_ERR "kvm: guest string pio down\n");
1890                 inject_gp(vcpu);
1891                 return 1;
1892         }
1893         vcpu->run->io.count = now;
1894         vcpu->pio.cur_count = now;
1895
1896         for (i = 0; i < nr_pages; ++i) {
1897                 spin_lock(&vcpu->kvm->lock);
1898                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1899                 if (page)
1900                         get_page(page);
1901                 vcpu->pio.guest_pages[i] = page;
1902                 spin_unlock(&vcpu->kvm->lock);
1903                 if (!page) {
1904                         inject_gp(vcpu);
1905                         free_pio_guest_pages(vcpu);
1906                         return 1;
1907                 }
1908         }
1909
1910         if (!vcpu->pio.in) {
1911                 /* string PIO write */
1912                 ret = pio_copy_data(vcpu);
1913                 if (ret >= 0 && pio_dev) {
1914                         pio_string_write(pio_dev, vcpu);
1915                         complete_pio(vcpu);
1916                         if (vcpu->pio.count == 0)
1917                                 ret = 1;
1918                 }
1919         } else if (pio_dev)
1920                 printk(KERN_ERR "no string pio read support yet, "
1921                        "port %x size %d count %ld\n",
1922                         port, size, count);
1923
1924         return ret;
1925 }
1926 EXPORT_SYMBOL_GPL(kvm_setup_pio);
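/*
 * Note on the return value (an informal sketch of the contract, inferred
 * from the code above): a return of 1 means the port I/O was completed in
 * the kernel (or trivially skipped) and the guest can be resumed; a return
 * of 0 means vcpu->run now carries a KVM_EXIT_IO request and the arch exit
 * handler should bounce back to userspace, roughly:
 *
 *        (hypothetical caller; the real VMX/SVM handlers first decode the
 *         exit qualification into these arguments)
 *
 *        return kvm_setup_pio(vcpu, kvm_run, in, size, count,
 *                             string, down, address, rep, port);
 */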
1927
1928 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1929 {
1930         int r;
1931         sigset_t sigsaved;
1932
1933         vcpu_load(vcpu);
1934
1935         if (vcpu->sigset_active)
1936                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1937
1938         /* re-sync apic's tpr */
1939         vcpu->cr8 = kvm_run->cr8;
1940
1941         if (vcpu->pio.cur_count) {
1942                 r = complete_pio(vcpu);
1943                 if (r)
1944                         goto out;
1945         }
1946
1947         if (vcpu->mmio_needed) {
1948                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1949                 vcpu->mmio_read_completed = 1;
1950                 vcpu->mmio_needed = 0;
1951                 r = emulate_instruction(vcpu, kvm_run,
1952                                         vcpu->mmio_fault_cr2, 0);
1953                 if (r == EMULATE_DO_MMIO) {
1954                         /*
1955                          * Read-modify-write.  Back to userspace.
1956                          */
1957                         r = 0;
1958                         goto out;
1959                 }
1960         }
1961
1962         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1963                 kvm_arch_ops->cache_regs(vcpu);
1964                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1965                 kvm_arch_ops->decache_regs(vcpu);
1966         }
1967
1968         r = kvm_arch_ops->run(vcpu, kvm_run);
1969
1970 out:
1971         if (vcpu->sigset_active)
1972                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1973
1974         vcpu_put(vcpu);
1975         return r;
1976 }
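/*
 * Userspace-side sketch of the loop this ioctl serves (illustrative only;
 * vcpu_fd, run and handle_pio() are assumptions, not part of this file):
 *
 *        struct kvm_run *run;        (mmap()ed from the vcpu fd, see
 *                                     kvm_vcpu_mmap() below)
 *
 *        for (;;) {
 *                ioctl(vcpu_fd, KVM_RUN, 0);
 *                if (run->exit_reason == KVM_EXIT_IO)
 *                        handle_pio(run->io.port, run->io.direction,
 *                                   run->io.size, run->io.count,
 *                                   (char *)run + run->io.data_offset);
 *        }
 */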
1977
1978 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1979                                    struct kvm_regs *regs)
1980 {
1981         vcpu_load(vcpu);
1982
1983         kvm_arch_ops->cache_regs(vcpu);
1984
1985         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1986         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1987         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1988         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1989         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1990         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1991         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1992         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1993 #ifdef CONFIG_X86_64
1994         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1995         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1996         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1997         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1998         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1999         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2000         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2001         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2002 #endif
2003
2004         regs->rip = vcpu->rip;
2005         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
2006
2007         /*
2008          * Don't leak debug flags in case they were set for guest debugging
2009          */
2010         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2011                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2012
2013         vcpu_put(vcpu);
2014
2015         return 0;
2016 }
2017
2018 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2019                                    struct kvm_regs *regs)
2020 {
2021         vcpu_load(vcpu);
2022
2023         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2024         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2025         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2026         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2027         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2028         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2029         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2030         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2031 #ifdef CONFIG_X86_64
2032         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2033         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2034         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2035         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2036         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2037         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2038         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2039         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2040 #endif
2041
2042         vcpu->rip = regs->rip;
2043         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2044
2045         kvm_arch_ops->decache_regs(vcpu);
2046
2047         vcpu_put(vcpu);
2048
2049         return 0;
2050 }
2051
2052 static void get_segment(struct kvm_vcpu *vcpu,
2053                         struct kvm_segment *var, int seg)
2054 {
2055         return kvm_arch_ops->get_segment(vcpu, var, seg);
2056 }
2057
2058 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2059                                     struct kvm_sregs *sregs)
2060 {
2061         struct descriptor_table dt;
2062
2063         vcpu_load(vcpu);
2064
2065         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2066         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2067         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2068         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2069         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2070         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2071
2072         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2073         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2074
2075         kvm_arch_ops->get_idt(vcpu, &dt);
2076         sregs->idt.limit = dt.limit;
2077         sregs->idt.base = dt.base;
2078         kvm_arch_ops->get_gdt(vcpu, &dt);
2079         sregs->gdt.limit = dt.limit;
2080         sregs->gdt.base = dt.base;
2081
2082         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2083         sregs->cr0 = vcpu->cr0;
2084         sregs->cr2 = vcpu->cr2;
2085         sregs->cr3 = vcpu->cr3;
2086         sregs->cr4 = vcpu->cr4;
2087         sregs->cr8 = vcpu->cr8;
2088         sregs->efer = vcpu->shadow_efer;
2089         sregs->apic_base = vcpu->apic_base;
2090
2091         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2092                sizeof sregs->interrupt_bitmap);
2093
2094         vcpu_put(vcpu);
2095
2096         return 0;
2097 }
2098
2099 static void set_segment(struct kvm_vcpu *vcpu,
2100                         struct kvm_segment *var, int seg)
2101 {
2102         return kvm_arch_ops->set_segment(vcpu, var, seg);
2103 }
2104
2105 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2106                                     struct kvm_sregs *sregs)
2107 {
2108         int mmu_reset_needed = 0;
2109         int i;
2110         struct descriptor_table dt;
2111
2112         vcpu_load(vcpu);
2113
2114         dt.limit = sregs->idt.limit;
2115         dt.base = sregs->idt.base;
2116         kvm_arch_ops->set_idt(vcpu, &dt);
2117         dt.limit = sregs->gdt.limit;
2118         dt.base = sregs->gdt.base;
2119         kvm_arch_ops->set_gdt(vcpu, &dt);
2120
2121         vcpu->cr2 = sregs->cr2;
2122         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2123         vcpu->cr3 = sregs->cr3;
2124
2125         vcpu->cr8 = sregs->cr8;
2126
2127         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2128 #ifdef CONFIG_X86_64
2129         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2130 #endif
2131         vcpu->apic_base = sregs->apic_base;
2132
2133         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2134
2135         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2136         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2137
2138         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2139         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2140         if (!is_long_mode(vcpu) && is_pae(vcpu))
2141                 load_pdptrs(vcpu, vcpu->cr3);
2142
2143         if (mmu_reset_needed)
2144                 kvm_mmu_reset_context(vcpu);
2145
2146         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2147                sizeof vcpu->irq_pending);
2148         vcpu->irq_summary = 0;
2149         for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2150                 if (vcpu->irq_pending[i])
2151                         __set_bit(i, &vcpu->irq_summary);
2152
2153         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2154         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2155         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2156         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2157         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2158         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2159
2160         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2161         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2162
2163         vcpu_put(vcpu);
2164
2165         return 0;
2166 }
2167
2168 /*
2169  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
2170  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2171  *
2172  * This list is modified at module load time to reflect the
2173  * capabilities of the host cpu.
2174  */
2175 static u32 msrs_to_save[] = {
2176         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2177         MSR_K6_STAR,
2178 #ifdef CONFIG_X86_64
2179         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2180 #endif
2181         MSR_IA32_TIME_STAMP_COUNTER,
2182 };
2183
2184 static unsigned num_msrs_to_save;
2185
2186 static u32 emulated_msrs[] = {
2187         MSR_IA32_MISC_ENABLE,
2188 };
2189
2190 static __init void kvm_init_msr_list(void)
2191 {
2192         u32 dummy[2];
2193         unsigned i, j;
2194
2195         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2196                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2197                         continue;
2198                 if (j < i)
2199                         msrs_to_save[j] = msrs_to_save[i];
2200                 j++;
2201         }
2202         num_msrs_to_save = j;
2203 }
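/*
 * Userspace sketch of consuming this list through KVM_GET_MSR_INDEX_LIST
 * (illustrative; assumes an open /dev/kvm fd and <linux/kvm.h>):
 *
 *        struct kvm_msr_list probe = { .nmsrs = 0 }, *list;
 *
 *        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);  (fails with E2BIG,
 *                                                         but nmsrs is filled in)
 *        list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
 *        list->nmsrs = probe.nmsrs;
 *        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);    (indices now valid)
 */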
2204
2205 /*
2206  * Adapt set_msr() to msr_io()'s calling convention
2207  */
2208 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2209 {
2210         return kvm_set_msr(vcpu, index, *data);
2211 }
2212
2213 /*
2214  * Read or write a bunch of msrs. All parameters are kernel addresses.
2215  *
2216  * @return number of msrs set successfully.
2217  */
2218 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2219                     struct kvm_msr_entry *entries,
2220                     int (*do_msr)(struct kvm_vcpu *vcpu,
2221                                   unsigned index, u64 *data))
2222 {
2223         int i;
2224
2225         vcpu_load(vcpu);
2226
2227         for (i = 0; i < msrs->nmsrs; ++i)
2228                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2229                         break;
2230
2231         vcpu_put(vcpu);
2232
2233         return i;
2234 }
2235
2236 /*
2237  * Read or write a bunch of msrs. Parameters are user addresses.
2238  *
2239  * @return number of msrs set successfully.
2240  */
2241 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2242                   int (*do_msr)(struct kvm_vcpu *vcpu,
2243                                 unsigned index, u64 *data),
2244                   int writeback)
2245 {
2246         struct kvm_msrs msrs;
2247         struct kvm_msr_entry *entries;
2248         int r, n;
2249         unsigned size;
2250
2251         r = -EFAULT;
2252         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2253                 goto out;
2254
2255         r = -E2BIG;
2256         if (msrs.nmsrs >= MAX_IO_MSRS)
2257                 goto out;
2258
2259         r = -ENOMEM;
2260         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2261         entries = vmalloc(size);
2262         if (!entries)
2263                 goto out;
2264
2265         r = -EFAULT;
2266         if (copy_from_user(entries, user_msrs->entries, size))
2267                 goto out_free;
2268
2269         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2270         if (r < 0)
2271                 goto out_free;
2272
2273         r = -EFAULT;
2274         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2275                 goto out_free;
2276
2277         r = n;
2278
2279 out_free:
2280         vfree(entries);
2281 out:
2282         return r;
2283 }
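/*
 * Userspace sketch of a matching KVM_SET_MSRS call (illustrative; the
 * wrapper struct and the chosen MSR are assumptions made for the example):
 *
 *        struct {
 *                struct kvm_msrs hdr;
 *                struct kvm_msr_entry entries[1];
 *        } one = {
 *                .hdr.nmsrs = 1,
 *                .entries[0] = { .index = MSR_IA32_SYSENTER_CS, .data = 0 },
 *        };
 *
 *        int done = ioctl(vcpu_fd, KVM_SET_MSRS, &one);  (returns the number
 *                                                         of msrs processed)
 */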
2284
2285 /*
2286  * Translate a guest virtual address to a guest physical address.
2287  */
2288 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2289                                     struct kvm_translation *tr)
2290 {
2291         unsigned long vaddr = tr->linear_address;
2292         gpa_t gpa;
2293
2294         vcpu_load(vcpu);
2295         spin_lock(&vcpu->kvm->lock);
2296         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2297         tr->physical_address = gpa;
2298         tr->valid = gpa != UNMAPPED_GVA;
2299         tr->writeable = 1;
2300         tr->usermode = 0;
2301         spin_unlock(&vcpu->kvm->lock);
2302         vcpu_put(vcpu);
2303
2304         return 0;
2305 }
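/*
 * Userspace sketch (illustrative): resolving a guest-virtual address from
 * the monitor side; gva stands for whatever address the caller wants
 * translated.
 *
 *        struct kvm_translation tr = { .linear_address = gva };
 *
 *        if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *                printf("gpa for %lx is %llx\n", gva, tr.physical_address);
 */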
2306
2307 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2308                                     struct kvm_interrupt *irq)
2309 {
2310         if (irq->irq < 0 || irq->irq >= 256)
2311                 return -EINVAL;
2312         vcpu_load(vcpu);
2313
2314         set_bit(irq->irq, vcpu->irq_pending);
2315         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2316
2317         vcpu_put(vcpu);
2318
2319         return 0;
2320 }
2321
2322 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2323                                       struct kvm_debug_guest *dbg)
2324 {
2325         int r;
2326
2327         vcpu_load(vcpu);
2328
2329         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2330
2331         vcpu_put(vcpu);
2332
2333         return r;
2334 }
2335
2336 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2337                                     unsigned long address,
2338                                     int *type)
2339 {
2340         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2341         unsigned long pgoff;
2342         struct page *page;
2343
2344         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2345         if (pgoff == 0)
2346                 page = virt_to_page(vcpu->run);
2347         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2348                 page = virt_to_page(vcpu->pio_data);
2349         else
2350                 return NOPAGE_SIGBUS;
2351         get_page(page);
2352         if (type != NULL)
2353                 *type = VM_FAULT_MINOR;
2354
2355         return page;
2356 }
2357
2358 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2359         .nopage = kvm_vcpu_nopage,
2360 };
2361
2362 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2363 {
2364         vma->vm_ops = &kvm_vcpu_vm_ops;
2365         return 0;
2366 }
2367
2368 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2369 {
2370         struct kvm_vcpu *vcpu = filp->private_data;
2371
2372         fput(vcpu->kvm->filp);
2373         return 0;
2374 }
2375
2376 static struct file_operations kvm_vcpu_fops = {
2377         .release        = kvm_vcpu_release,
2378         .unlocked_ioctl = kvm_vcpu_ioctl,
2379         .compat_ioctl   = kvm_vcpu_ioctl,
2380         .mmap           = kvm_vcpu_mmap,
2381 };
2382
2383 /*
2384  * Allocates an inode for the vcpu.
2385  */
2386 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2387 {
2388         int fd, r;
2389         struct inode *inode;
2390         struct file *file;
2391
2392         r = anon_inode_getfd(&fd, &inode, &file,
2393                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2394         if (r)
2395                 return r;
2396         atomic_inc(&vcpu->kvm->filp->f_count);
2397         return fd;
2398 }
2399
2400 /*
2401  * Creates some virtual cpus.  Good luck creating more than one.
2402  */
2403 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2404 {
2405         int r;
2406         struct kvm_vcpu *vcpu;
2407
2408         if (!valid_vcpu(n))
2409                 return -EINVAL;
2410
2411         vcpu = kvm_arch_ops->vcpu_create(kvm, n);
2412         if (IS_ERR(vcpu))
2413                 return PTR_ERR(vcpu);
2414
2415         vcpu_load(vcpu);
2416         r = kvm_mmu_setup(vcpu);
2417         vcpu_put(vcpu);
2418         if (r < 0)
2419                 goto free_vcpu;
2420
2421         spin_lock(&kvm->lock);
2422         if (kvm->vcpus[n]) {
2423                 r = -EEXIST;
2424                 spin_unlock(&kvm->lock);
2425                 goto mmu_unload;
2426         }
2427         kvm->vcpus[n] = vcpu;
2428         spin_unlock(&kvm->lock);
2429
2430         /* Now it's all set up, let userspace reach it */
2431         r = create_vcpu_fd(vcpu);
2432         if (r < 0)
2433                 goto unlink;
2434         return r;
2435
2436 unlink:
2437         spin_lock(&kvm->lock);
2438         kvm->vcpus[n] = NULL;
2439         spin_unlock(&kvm->lock);
2440
2441 mmu_unload:
2442         vcpu_load(vcpu);
2443         kvm_mmu_unload(vcpu);
2444         vcpu_put(vcpu);
2445
2446 free_vcpu:
2447         kvm_arch_ops->vcpu_free(vcpu);
2448         return r;
2449 }
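/*
 * Userspace sketch of creating a vcpu and mapping its shared area
 * (illustrative; error handling omitted):
 *
 *        int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *        long map_sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *        struct kvm_run *run = mmap(NULL, map_sz, PROT_READ | PROT_WRITE,
 *                                   MAP_SHARED, vcpu_fd, 0);
 *
 * Page 0 of that mapping is the kvm_run structure and page
 * KVM_PIO_PAGE_OFFSET is the PIO data page, as served by kvm_vcpu_nopage()
 * above.
 */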
2450
2451 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2452 {
2453         u64 efer;
2454         int i;
2455         struct kvm_cpuid_entry *e, *entry;
2456
2457         rdmsrl(MSR_EFER, efer);
2458         entry = NULL;
2459         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2460                 e = &vcpu->cpuid_entries[i];
2461                 if (e->function == 0x80000001) {
2462                         entry = e;
2463                         break;
2464                 }
2465         }
2466         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2467                 entry->edx &= ~(1 << 20);
2468                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2469         }
2470 }
2471
2472 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2473                                     struct kvm_cpuid *cpuid,
2474                                     struct kvm_cpuid_entry __user *entries)
2475 {
2476         int r;
2477
2478         r = -E2BIG;
2479         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2480                 goto out;
2481         r = -EFAULT;
2482         if (copy_from_user(&vcpu->cpuid_entries, entries,
2483                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2484                 goto out;
2485         vcpu->cpuid_nent = cpuid->nent;
2486         cpuid_fix_nx_cap(vcpu);
2487         return 0;
2488
2489 out:
2490         return r;
2491 }
2492
2493 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2494 {
2495         if (sigset) {
2496                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2497                 vcpu->sigset_active = 1;
2498                 vcpu->sigset = *sigset;
2499         } else
2500                 vcpu->sigset_active = 0;
2501         return 0;
2502 }
2503
2504 /*
2505  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2506  * we have asm/x86/processor.h
2507  */
2508 struct fxsave {
2509         u16     cwd;
2510         u16     swd;
2511         u16     twd;
2512         u16     fop;
2513         u64     rip;
2514         u64     rdp;
2515         u32     mxcsr;
2516         u32     mxcsr_mask;
2517         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2518 #ifdef CONFIG_X86_64
2519         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2520 #else
2521         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2522 #endif
2523 };
2524
2525 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2526 {
2527         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2528
2529         vcpu_load(vcpu);
2530
2531         memcpy(fpu->fpr, fxsave->st_space, 128);
2532         fpu->fcw = fxsave->cwd;
2533         fpu->fsw = fxsave->swd;
2534         fpu->ftwx = fxsave->twd;
2535         fpu->last_opcode = fxsave->fop;
2536         fpu->last_ip = fxsave->rip;
2537         fpu->last_dp = fxsave->rdp;
2538         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2539
2540         vcpu_put(vcpu);
2541
2542         return 0;
2543 }
2544
2545 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2546 {
2547         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2548
2549         vcpu_load(vcpu);
2550
2551         memcpy(fxsave->st_space, fpu->fpr, 128);
2552         fxsave->cwd = fpu->fcw;
2553         fxsave->swd = fpu->fsw;
2554         fxsave->twd = fpu->ftwx;
2555         fxsave->fop = fpu->last_opcode;
2556         fxsave->rip = fpu->last_ip;
2557         fxsave->rdp = fpu->last_dp;
2558         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2559
2560         vcpu_put(vcpu);
2561
2562         return 0;
2563 }
2564
2565 static long kvm_vcpu_ioctl(struct file *filp,
2566                            unsigned int ioctl, unsigned long arg)
2567 {
2568         struct kvm_vcpu *vcpu = filp->private_data;
2569         void __user *argp = (void __user *)arg;
2570         int r = -EINVAL;
2571
2572         switch (ioctl) {
2573         case KVM_RUN:
2574                 r = -EINVAL;
2575                 if (arg)
2576                         goto out;
2577                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2578                 break;
2579         case KVM_GET_REGS: {
2580                 struct kvm_regs kvm_regs;
2581
2582                 memset(&kvm_regs, 0, sizeof kvm_regs);
2583                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2584                 if (r)
2585                         goto out;
2586                 r = -EFAULT;
2587                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2588                         goto out;
2589                 r = 0;
2590                 break;
2591         }
2592         case KVM_SET_REGS: {
2593                 struct kvm_regs kvm_regs;
2594
2595                 r = -EFAULT;
2596                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2597                         goto out;
2598                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2599                 if (r)
2600                         goto out;
2601                 r = 0;
2602                 break;
2603         }
2604         case KVM_GET_SREGS: {
2605                 struct kvm_sregs kvm_sregs;
2606
2607                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2608                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2609                 if (r)
2610                         goto out;
2611                 r = -EFAULT;
2612                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2613                         goto out;
2614                 r = 0;
2615                 break;
2616         }
2617         case KVM_SET_SREGS: {
2618                 struct kvm_sregs kvm_sregs;
2619
2620                 r = -EFAULT;
2621                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2622                         goto out;
2623                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2624                 if (r)
2625                         goto out;
2626                 r = 0;
2627                 break;
2628         }
2629         case KVM_TRANSLATE: {
2630                 struct kvm_translation tr;
2631
2632                 r = -EFAULT;
2633                 if (copy_from_user(&tr, argp, sizeof tr))
2634                         goto out;
2635                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2636                 if (r)
2637                         goto out;
2638                 r = -EFAULT;
2639                 if (copy_to_user(argp, &tr, sizeof tr))
2640                         goto out;
2641                 r = 0;
2642                 break;
2643         }
2644         case KVM_INTERRUPT: {
2645                 struct kvm_interrupt irq;
2646
2647                 r = -EFAULT;
2648                 if (copy_from_user(&irq, argp, sizeof irq))
2649                         goto out;
2650                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2651                 if (r)
2652                         goto out;
2653                 r = 0;
2654                 break;
2655         }
2656         case KVM_DEBUG_GUEST: {
2657                 struct kvm_debug_guest dbg;
2658
2659                 r = -EFAULT;
2660                 if (copy_from_user(&dbg, argp, sizeof dbg))
2661                         goto out;
2662                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2663                 if (r)
2664                         goto out;
2665                 r = 0;
2666                 break;
2667         }
2668         case KVM_GET_MSRS:
2669                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2670                 break;
2671         case KVM_SET_MSRS:
2672                 r = msr_io(vcpu, argp, do_set_msr, 0);
2673                 break;
2674         case KVM_SET_CPUID: {
2675                 struct kvm_cpuid __user *cpuid_arg = argp;
2676                 struct kvm_cpuid cpuid;
2677
2678                 r = -EFAULT;
2679                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2680                         goto out;
2681                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2682                 if (r)
2683                         goto out;
2684                 break;
2685         }
2686         case KVM_SET_SIGNAL_MASK: {
2687                 struct kvm_signal_mask __user *sigmask_arg = argp;
2688                 struct kvm_signal_mask kvm_sigmask;
2689                 sigset_t sigset, *p;
2690
2691                 p = NULL;
2692                 if (argp) {
2693                         r = -EFAULT;
2694                         if (copy_from_user(&kvm_sigmask, argp,
2695                                            sizeof kvm_sigmask))
2696                                 goto out;
2697                         r = -EINVAL;
2698                         if (kvm_sigmask.len != sizeof sigset)
2699                                 goto out;
2700                         r = -EFAULT;
2701                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2702                                            sizeof sigset))
2703                                 goto out;
2704                         p = &sigset;
2705                 }
2706                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2707                 break;
2708         }
2709         case KVM_GET_FPU: {
2710                 struct kvm_fpu fpu;
2711
2712                 memset(&fpu, 0, sizeof fpu);
2713                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2714                 if (r)
2715                         goto out;
2716                 r = -EFAULT;
2717                 if (copy_to_user(argp, &fpu, sizeof fpu))
2718                         goto out;
2719                 r = 0;
2720                 break;
2721         }
2722         case KVM_SET_FPU: {
2723                 struct kvm_fpu fpu;
2724
2725                 r = -EFAULT;
2726                 if (copy_from_user(&fpu, argp, sizeof fpu))
2727                         goto out;
2728                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2729                 if (r)
2730                         goto out;
2731                 r = 0;
2732                 break;
2733         }
2734         default:
2735                 ;
2736         }
2737 out:
2738         return r;
2739 }
2740
2741 static long kvm_vm_ioctl(struct file *filp,
2742                            unsigned int ioctl, unsigned long arg)
2743 {
2744         struct kvm *kvm = filp->private_data;
2745         void __user *argp = (void __user *)arg;
2746         int r = -EINVAL;
2747
2748         switch (ioctl) {
2749         case KVM_CREATE_VCPU:
2750                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2751                 if (r < 0)
2752                         goto out;
2753                 break;
2754         case KVM_SET_MEMORY_REGION: {
2755                 struct kvm_memory_region kvm_mem;
2756
2757                 r = -EFAULT;
2758                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2759                         goto out;
2760                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2761                 if (r)
2762                         goto out;
2763                 break;
2764         }
2765         case KVM_GET_DIRTY_LOG: {
2766                 struct kvm_dirty_log log;
2767
2768                 r = -EFAULT;
2769                 if (copy_from_user(&log, argp, sizeof log))
2770                         goto out;
2771                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2772                 if (r)
2773                         goto out;
2774                 break;
2775         }
2776         case KVM_SET_MEMORY_ALIAS: {
2777                 struct kvm_memory_alias alias;
2778
2779                 r = -EFAULT;
2780                 if (copy_from_user(&alias, argp, sizeof alias))
2781                         goto out;
2782                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2783                 if (r)
2784                         goto out;
2785                 break;
2786         }
2787         default:
2788                 ;
2789         }
2790 out:
2791         return r;
2792 }
2793
2794 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2795                                   unsigned long address,
2796                                   int *type)
2797 {
2798         struct kvm *kvm = vma->vm_file->private_data;
2799         unsigned long pgoff;
2800         struct page *page;
2801
2802         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2803         page = gfn_to_page(kvm, pgoff);
2804         if (!page)
2805                 return NOPAGE_SIGBUS;
2806         get_page(page);
2807         if (type != NULL)
2808                 *type = VM_FAULT_MINOR;
2809
2810         return page;
2811 }
2812
2813 static struct vm_operations_struct kvm_vm_vm_ops = {
2814         .nopage = kvm_vm_nopage,
2815 };
2816
2817 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2818 {
2819         vma->vm_ops = &kvm_vm_vm_ops;
2820         return 0;
2821 }
2822
2823 static struct file_operations kvm_vm_fops = {
2824         .release        = kvm_vm_release,
2825         .unlocked_ioctl = kvm_vm_ioctl,
2826         .compat_ioctl   = kvm_vm_ioctl,
2827         .mmap           = kvm_vm_mmap,
2828 };
2829
2830 static int kvm_dev_ioctl_create_vm(void)
2831 {
2832         int fd, r;
2833         struct inode *inode;
2834         struct file *file;
2835         struct kvm *kvm;
2836
2837         kvm = kvm_create_vm();
2838         if (IS_ERR(kvm))
2839                 return PTR_ERR(kvm);
2840         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
2841         if (r) {
2842                 kvm_destroy_vm(kvm);
2843                 return r;
2844         }
2845
2846         kvm->filp = file;
2847
2848         return fd;
2849 }
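/*
 * Userspace sketch of the top-level handshake that reaches this point
 * (illustrative; error handling omitted):
 *
 *        int kvm_fd = open("/dev/kvm", O_RDWR);
 *
 *        if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *                errx(1, "kvm: api version mismatch");
 *        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);   (the new fd is
 *                                                        returned here)
 */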
2850
2851 static long kvm_dev_ioctl(struct file *filp,
2852                           unsigned int ioctl, unsigned long arg)
2853 {
2854         void __user *argp = (void __user *)arg;
2855         long r = -EINVAL;
2856
2857         switch (ioctl) {
2858         case KVM_GET_API_VERSION:
2859                 r = -EINVAL;
2860                 if (arg)
2861                         goto out;
2862                 r = KVM_API_VERSION;
2863                 break;
2864         case KVM_CREATE_VM:
2865                 r = -EINVAL;
2866                 if (arg)
2867                         goto out;
2868                 r = kvm_dev_ioctl_create_vm();
2869                 break;
2870         case KVM_GET_MSR_INDEX_LIST: {
2871                 struct kvm_msr_list __user *user_msr_list = argp;
2872                 struct kvm_msr_list msr_list;
2873                 unsigned n;
2874
2875                 r = -EFAULT;
2876                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2877                         goto out;
2878                 n = msr_list.nmsrs;
2879                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2880                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2881                         goto out;
2882                 r = -E2BIG;
2883                 if (n < num_msrs_to_save)
2884                         goto out;
2885                 r = -EFAULT;
2886                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2887                                  num_msrs_to_save * sizeof(u32)))
2888                         goto out;
2889                 if (copy_to_user(user_msr_list->indices
2890                                  + num_msrs_to_save * sizeof(u32),
2891                                  &emulated_msrs,
2892                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2893                         goto out;
2894                 r = 0;
2895                 break;
2896         }
2897         case KVM_CHECK_EXTENSION:
2898                 /*
2899                  * No extensions defined at present.
2900                  */
2901                 r = 0;
2902                 break;
2903         case KVM_GET_VCPU_MMAP_SIZE:
2904                 r = -EINVAL;
2905                 if (arg)
2906                         goto out;
2907                 r = 2 * PAGE_SIZE;
2908                 break;
2909         default:
2910                 ;
2911         }
2912 out:
2913         return r;
2914 }
2915
2916 static struct file_operations kvm_chardev_ops = {
2917         .open           = kvm_dev_open,
2918         .release        = kvm_dev_release,
2919         .unlocked_ioctl = kvm_dev_ioctl,
2920         .compat_ioctl   = kvm_dev_ioctl,
2921 };
2922
2923 static struct miscdevice kvm_dev = {
2924         KVM_MINOR,
2925         "kvm",
2926         &kvm_chardev_ops,
2927 };
2928
2929 /*
2930  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2931  * cached on it.
2932  */
2933 static void decache_vcpus_on_cpu(int cpu)
2934 {
2935         struct kvm *vm;
2936         struct kvm_vcpu *vcpu;
2937         int i;
2938
2939         spin_lock(&kvm_lock);
2940         list_for_each_entry(vm, &vm_list, vm_list) {
2941                 spin_lock(&vm->lock);
2942                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2943                         vcpu = vm->vcpus[i];
2944                         if (!vcpu)
2945                                 continue;
2946                         /*
2947                          * If the vcpu is locked, then it is running on some
2948                          * other cpu and therefore it is not cached on the
2949                          * cpu in question.
2950                          *
2951                          * If it's not locked, check the last cpu it executed
2952                          * on.
2953                          */
2954                         if (mutex_trylock(&vcpu->mutex)) {
2955                                 if (vcpu->cpu == cpu) {
2956                                         kvm_arch_ops->vcpu_decache(vcpu);
2957                                         vcpu->cpu = -1;
2958                                 }
2959                                 mutex_unlock(&vcpu->mutex);
2960                         }
2961                 }
2962                 spin_unlock(&vm->lock);
2963         }
2964         spin_unlock(&kvm_lock);
2965 }
2966
2967 static void hardware_enable(void *junk)
2968 {
2969         int cpu = raw_smp_processor_id();
2970
2971         if (cpu_isset(cpu, cpus_hardware_enabled))
2972                 return;
2973         cpu_set(cpu, cpus_hardware_enabled);
2974         kvm_arch_ops->hardware_enable(NULL);
2975 }
2976
2977 static void hardware_disable(void *junk)
2978 {
2979         int cpu = raw_smp_processor_id();
2980
2981         if (!cpu_isset(cpu, cpus_hardware_enabled))
2982                 return;
2983         cpu_clear(cpu, cpus_hardware_enabled);
2984         decache_vcpus_on_cpu(cpu);
2985         kvm_arch_ops->hardware_disable(NULL);
2986 }
2987
2988 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2989                            void *v)
2990 {
2991         int cpu = (long)v;
2992
2993         switch (val) {
2994         case CPU_DYING:
2995         case CPU_DYING_FROZEN:
2996                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2997                        cpu);
2998                 hardware_disable(NULL);
2999                 break;
3000         case CPU_UP_CANCELED:
3001         case CPU_UP_CANCELED_FROZEN:
3002                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3003                        cpu);
3004                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3005                 break;
3006         case CPU_ONLINE:
3007         case CPU_ONLINE_FROZEN:
3008                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3009                        cpu);
3010                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3011                 break;
3012         }
3013         return NOTIFY_OK;
3014 }
3015
3016 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3017                        void *v)
3018 {
3019         if (val == SYS_RESTART) {
3020                 /*
3021                  * Some (well, at least mine) BIOSes hang on reboot if
3022                  * in vmx root mode.
3023                  */
3024                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3025                 on_each_cpu(hardware_disable, NULL, 0, 1);
3026         }
3027         return NOTIFY_OK;
3028 }
3029
3030 static struct notifier_block kvm_reboot_notifier = {
3031         .notifier_call = kvm_reboot,
3032         .priority = 0,
3033 };
3034
3035 void kvm_io_bus_init(struct kvm_io_bus *bus)
3036 {
3037         memset(bus, 0, sizeof(*bus));
3038 }
3039
3040 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3041 {
3042         int i;
3043
3044         for (i = 0; i < bus->dev_count; i++) {
3045                 struct kvm_io_device *pos = bus->devs[i];
3046
3047                 kvm_iodevice_destructor(pos);
3048         }
3049 }
3050
3051 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3052 {
3053         int i;
3054
3055         for (i = 0; i < bus->dev_count; i++) {
3056                 struct kvm_io_device *pos = bus->devs[i];
3057
3058                 if (pos->in_range(pos, addr))
3059                         return pos;
3060         }
3061
3062         return NULL;
3063 }
3064
3065 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3066 {
3067         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3068
3069         bus->devs[bus->dev_count++] = dev;
3070 }
3071
3072 static struct notifier_block kvm_cpu_notifier = {
3073         .notifier_call = kvm_cpu_hotplug,
3074         .priority = 20, /* must be > scheduler priority */
3075 };
3076
3077 static u64 stat_get(void *_offset)
3078 {
3079         unsigned offset = (long)_offset;
3080         u64 total = 0;
3081         struct kvm *kvm;
3082         struct kvm_vcpu *vcpu;
3083         int i;
3084
3085         spin_lock(&kvm_lock);
3086         list_for_each_entry(kvm, &vm_list, vm_list)
3087                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3088                         vcpu = kvm->vcpus[i];
3089                         if (vcpu)
3090                                 total += *(u32 *)((void *)vcpu + offset);
3091                 }
3092         spin_unlock(&kvm_lock);
3093         return total;
3094 }
3095
3096 static void stat_set(void *offset, u64 val)
3097 {
3098 }
3099
3100 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3101
3102 static __init void kvm_init_debug(void)
3103 {
3104         struct kvm_stats_debugfs_item *p;
3105
3106         debugfs_dir = debugfs_create_dir("kvm", NULL);
3107         for (p = debugfs_entries; p->name; ++p)
3108                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3109                                                 (void *)(long)p->offset,
3110                                                 &stat_fops);
3111 }
3112
3113 static void kvm_exit_debug(void)
3114 {
3115         struct kvm_stats_debugfs_item *p;
3116
3117         for (p = debugfs_entries; p->name; ++p)
3118                 debugfs_remove(p->dentry);
3119         debugfs_remove(debugfs_dir);
3120 }
3121
3122 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3123 {
3124         hardware_disable(NULL);
3125         return 0;
3126 }
3127
3128 static int kvm_resume(struct sys_device *dev)
3129 {
3130         hardware_enable(NULL);
3131         return 0;
3132 }
3133
3134 static struct sysdev_class kvm_sysdev_class = {
3135         set_kset_name("kvm"),
3136         .suspend = kvm_suspend,
3137         .resume = kvm_resume,
3138 };
3139
3140 static struct sys_device kvm_sysdev = {
3141         .id = 0,
3142         .cls = &kvm_sysdev_class,
3143 };
3144
3145 hpa_t bad_page_address;
3146
3147 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3148 {
3149         int r;
3150
3151         if (kvm_arch_ops) {
3152                 printk(KERN_ERR "kvm: already loaded the other module\n");
3153                 return -EEXIST;
3154         }
3155
3156         if (!ops->cpu_has_kvm_support()) {
3157                 printk(KERN_ERR "kvm: no hardware support\n");
3158                 return -EOPNOTSUPP;
3159         }
3160         if (ops->disabled_by_bios()) {
3161                 printk(KERN_ERR "kvm: disabled by bios\n");
3162                 return -EOPNOTSUPP;
3163         }
3164
3165         kvm_arch_ops = ops;
3166
3167         r = kvm_arch_ops->hardware_setup();
3168         if (r < 0)
3169                 goto out;
3170
3171         on_each_cpu(hardware_enable, NULL, 0, 1);
3172         r = register_cpu_notifier(&kvm_cpu_notifier);
3173         if (r)
3174                 goto out_free_1;
3175         register_reboot_notifier(&kvm_reboot_notifier);
3176
3177         r = sysdev_class_register(&kvm_sysdev_class);
3178         if (r)
3179                 goto out_free_2;
3180
3181         r = sysdev_register(&kvm_sysdev);
3182         if (r)
3183                 goto out_free_3;
3184
3185         kvm_chardev_ops.owner = module;
3186
3187         r = misc_register(&kvm_dev);
3188         if (r) {
3189                 printk(KERN_ERR "kvm: misc device register failed\n");
3190                 goto out_free;
3191         }
3192
3193         return r;
3194
3195 out_free:
3196         sysdev_unregister(&kvm_sysdev);
3197 out_free_3:
3198         sysdev_class_unregister(&kvm_sysdev_class);
3199 out_free_2:
3200         unregister_reboot_notifier(&kvm_reboot_notifier);
3201         unregister_cpu_notifier(&kvm_cpu_notifier);
3202 out_free_1:
3203         on_each_cpu(hardware_disable, NULL, 0, 1);
3204         kvm_arch_ops->hardware_unsetup();
3205 out:
3206         kvm_arch_ops = NULL;
3207         return r;
3208 }
3209
3210 void kvm_exit_arch(void)
3211 {
3212         misc_deregister(&kvm_dev);
3213         sysdev_unregister(&kvm_sysdev);
3214         sysdev_class_unregister(&kvm_sysdev_class);
3215         unregister_reboot_notifier(&kvm_reboot_notifier);
3216         unregister_cpu_notifier(&kvm_cpu_notifier);
3217         on_each_cpu(hardware_disable, NULL, 0, 1);
3218         kvm_arch_ops->hardware_unsetup();
3219         kvm_arch_ops = NULL;
3220 }
3221
3222 static __init int kvm_init(void)
3223 {
3224         static struct page *bad_page;
3225         int r;
3226
3227         r = kvm_mmu_module_init();
3228         if (r)
3229                 goto out4;
3230
3231         kvm_init_debug();
3232
3233         kvm_init_msr_list();
3234
3235         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3236                 r = -ENOMEM;
3237                 goto out;
3238         }
3239
3240         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3241         memset(__va(bad_page_address), 0, PAGE_SIZE);
3242
3243         return 0;
3244
3245 out:
3246         kvm_exit_debug();
3247         kvm_mmu_module_exit();
3248 out4:
3249         return r;
3250 }
3251
3252 static __exit void kvm_exit(void)
3253 {
3254         kvm_exit_debug();
3255         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3256         kvm_mmu_module_exit();
3257 }
3258
3259 module_init(kvm_init)
3260 module_exit(kvm_exit)
3261
3262 EXPORT_SYMBOL_GPL(kvm_init_arch);
3263 EXPORT_SYMBOL_GPL(kvm_exit_arch);