Merge branch 'upstream' of git://git.infradead.org/~dedekind/ubi-2.6
[linux-2.6] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86_emulate.h"
20 #include "segment_descriptor.h"
21
22 #include <linux/kvm.h>
23 #include <linux/module.h>
24 #include <linux/errno.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <linux/reboot.h>
31 #include <linux/debugfs.h>
32 #include <linux/highmem.h>
33 #include <linux/file.h>
34 #include <linux/sysdev.h>
35 #include <linux/cpu.h>
36 #include <linux/sched.h>
37 #include <linux/cpumask.h>
38 #include <linux/smp.h>
39 #include <linux/anon_inodes.h>
40
41 #include <asm/processor.h>
42 #include <asm/msr.h>
43 #include <asm/io.h>
44 #include <asm/uaccess.h>
45 #include <asm/desc.h>
46
47 MODULE_AUTHOR("Qumranet");
48 MODULE_LICENSE("GPL");
49
50 static DEFINE_SPINLOCK(kvm_lock);
51 static LIST_HEAD(vm_list);
52
53 static cpumask_t cpus_hardware_enabled;
54
55 struct kvm_arch_ops *kvm_arch_ops;
56
57 static void hardware_disable(void *ignored);
58
59 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
60
61 static struct kvm_stats_debugfs_item {
62         const char *name;
63         int offset;
64         struct dentry *dentry;
65 } debugfs_entries[] = {
66         { "pf_fixed", STAT_OFFSET(pf_fixed) },
67         { "pf_guest", STAT_OFFSET(pf_guest) },
68         { "tlb_flush", STAT_OFFSET(tlb_flush) },
69         { "invlpg", STAT_OFFSET(invlpg) },
70         { "exits", STAT_OFFSET(exits) },
71         { "io_exits", STAT_OFFSET(io_exits) },
72         { "mmio_exits", STAT_OFFSET(mmio_exits) },
73         { "signal_exits", STAT_OFFSET(signal_exits) },
74         { "irq_window", STAT_OFFSET(irq_window_exits) },
75         { "halt_exits", STAT_OFFSET(halt_exits) },
76         { "request_irq", STAT_OFFSET(request_irq_exits) },
77         { "irq_exits", STAT_OFFSET(irq_exits) },
78         { "light_exits", STAT_OFFSET(light_exits) },
79         { "efer_reload", STAT_OFFSET(efer_reload) },
80         { NULL }
81 };
82
83 static struct dentry *debugfs_dir;
84
#define MAX_IO_MSRS 256

/*
 * Masks of bits the guest is not allowed to set in the corresponding
 * register; the set_crN() helpers in this file inject #GP when any of them
 * is set.  All masks are explicitly 64-bit (ULL) so that they have the same
 * type and value on 32-bit and 64-bit builds.
 *
 * NOTE: the "RESEVED" spelling is kept on purpose - these names are
 * referenced by other code.
 */
#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
#define LMSW_GUEST_MASK 0x0eULL
/* Everything from bit 11 upwards. */
#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
/* Everything above the low four bits. */
#define CR8_RESEVED_BITS (~0x0fULL)
#define EFER_RESERVED_BITS 0xfffffffffffff2feULL
92
#ifdef CONFIG_X86_64
/*
 * LDT or TSS descriptor in the GDT (16 bytes): the regular descriptor
 * followed by the extra words used on 64-bit.
 */
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;	/* shifted into bits 63:32 by segment_base() */
	u32 pad_zero;
};

#endif
102
103 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
104                            unsigned long arg);
105
106 unsigned long segment_base(u16 selector)
107 {
108         struct descriptor_table gdt;
109         struct segment_descriptor *d;
110         unsigned long table_base;
111         typedef unsigned long ul;
112         unsigned long v;
113
114         if (selector == 0)
115                 return 0;
116
117         asm ("sgdt %0" : "=m"(gdt));
118         table_base = gdt.base;
119
120         if (selector & 4) {           /* from ldt */
121                 u16 ldt_selector;
122
123                 asm ("sldt %0" : "=g"(ldt_selector));
124                 table_base = segment_base(ldt_selector);
125         }
126         d = (struct segment_descriptor *)(table_base + (selector & ~7));
127         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
128 #ifdef CONFIG_X86_64
129         if (d->system == 0
130             && (d->type == 2 || d->type == 9 || d->type == 11))
131                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
132 #endif
133         return v;
134 }
135 EXPORT_SYMBOL_GPL(segment_base);
136
137 static inline int valid_vcpu(int n)
138 {
139         return likely(n >= 0 && n < KVM_MAX_VCPUS);
140 }
141
142 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
143                    void *dest)
144 {
145         unsigned char *host_buf = dest;
146         unsigned long req_size = size;
147
148         while (size) {
149                 hpa_t paddr;
150                 unsigned now;
151                 unsigned offset;
152                 hva_t guest_buf;
153
154                 paddr = gva_to_hpa(vcpu, addr);
155
156                 if (is_error_hpa(paddr))
157                         break;
158
159                 guest_buf = (hva_t)kmap_atomic(
160                                         pfn_to_page(paddr >> PAGE_SHIFT),
161                                         KM_USER0);
162                 offset = addr & ~PAGE_MASK;
163                 guest_buf |= offset;
164                 now = min(size, PAGE_SIZE - offset);
165                 memcpy(host_buf, (void*)guest_buf, now);
166                 host_buf += now;
167                 addr += now;
168                 size -= now;
169                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
170         }
171         return req_size - size;
172 }
173 EXPORT_SYMBOL_GPL(kvm_read_guest);
174
175 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
176                     void *data)
177 {
178         unsigned char *host_buf = data;
179         unsigned long req_size = size;
180
181         while (size) {
182                 hpa_t paddr;
183                 unsigned now;
184                 unsigned offset;
185                 hva_t guest_buf;
186                 gfn_t gfn;
187
188                 paddr = gva_to_hpa(vcpu, addr);
189
190                 if (is_error_hpa(paddr))
191                         break;
192
193                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
194                 mark_page_dirty(vcpu->kvm, gfn);
195                 guest_buf = (hva_t)kmap_atomic(
196                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
197                 offset = addr & ~PAGE_MASK;
198                 guest_buf |= offset;
199                 now = min(size, PAGE_SIZE - offset);
200                 memcpy((void*)guest_buf, host_buf, now);
201                 host_buf += now;
202                 addr += now;
203                 size -= now;
204                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
205         }
206         return req_size - size;
207 }
208 EXPORT_SYMBOL_GPL(kvm_write_guest);
209
210 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
211 {
212         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
213                 return;
214
215         vcpu->guest_fpu_loaded = 1;
216         fx_save(vcpu->host_fx_image);
217         fx_restore(vcpu->guest_fx_image);
218 }
219 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
220
221 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
222 {
223         if (!vcpu->guest_fpu_loaded)
224                 return;
225
226         vcpu->guest_fpu_loaded = 0;
227         fx_save(vcpu->guest_fx_image);
228         fx_restore(vcpu->host_fx_image);
229 }
230 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
231
232 /*
233  * Switches to specified vcpu, until a matching vcpu_put()
234  */
235 static void vcpu_load(struct kvm_vcpu *vcpu)
236 {
237         mutex_lock(&vcpu->mutex);
238         kvm_arch_ops->vcpu_load(vcpu);
239 }
240
241 /*
242  * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
243  * if the slot is not populated.
244  */
245 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
246 {
247         struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
248
249         mutex_lock(&vcpu->mutex);
250         if (!vcpu->vmcs) {
251                 mutex_unlock(&vcpu->mutex);
252                 return NULL;
253         }
254         kvm_arch_ops->vcpu_load(vcpu);
255         return vcpu;
256 }
257
258 static void vcpu_put(struct kvm_vcpu *vcpu)
259 {
260         kvm_arch_ops->vcpu_put(vcpu);
261         mutex_unlock(&vcpu->mutex);
262 }
263
264 static void ack_flush(void *_completed)
265 {
266         atomic_t *completed = _completed;
267
268         atomic_inc(completed);
269 }
270
271 void kvm_flush_remote_tlbs(struct kvm *kvm)
272 {
273         int i, cpu, needed;
274         cpumask_t cpus;
275         struct kvm_vcpu *vcpu;
276         atomic_t completed;
277
278         atomic_set(&completed, 0);
279         cpus_clear(cpus);
280         needed = 0;
281         for (i = 0; i < kvm->nvcpus; ++i) {
282                 vcpu = &kvm->vcpus[i];
283                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
284                         continue;
285                 cpu = vcpu->cpu;
286                 if (cpu != -1 && cpu != raw_smp_processor_id())
287                         if (!cpu_isset(cpu, cpus)) {
288                                 cpu_set(cpu, cpus);
289                                 ++needed;
290                         }
291         }
292
293         /*
294          * We really want smp_call_function_mask() here.  But that's not
295          * available, so ipi all cpus in parallel and wait for them
296          * to complete.
297          */
298         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
299                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
300         while (atomic_read(&completed) != needed) {
301                 cpu_relax();
302                 barrier();
303         }
304 }
305
306 static struct kvm *kvm_create_vm(void)
307 {
308         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
309         int i;
310
311         if (!kvm)
312                 return ERR_PTR(-ENOMEM);
313
314         kvm_io_bus_init(&kvm->pio_bus);
315         spin_lock_init(&kvm->lock);
316         INIT_LIST_HEAD(&kvm->active_mmu_pages);
317         spin_lock(&kvm_lock);
318         list_add(&kvm->vm_list, &vm_list);
319         spin_unlock(&kvm_lock);
320         kvm_io_bus_init(&kvm->mmio_bus);
321         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
322                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
323
324                 mutex_init(&vcpu->mutex);
325                 vcpu->cpu = -1;
326                 vcpu->kvm = kvm;
327                 vcpu->mmu.root_hpa = INVALID_PAGE;
328         }
329         return kvm;
330 }
331
/* Open handler: nothing to set up, always succeeds. */
static int kvm_dev_open(struct inode *inode, struct file *filp)
{
	const int status = 0;

	return status;
}
336
337 /*
338  * Free any memory in @free but not in @dont.
339  */
340 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
341                                   struct kvm_memory_slot *dont)
342 {
343         int i;
344
345         if (!dont || free->phys_mem != dont->phys_mem)
346                 if (free->phys_mem) {
347                         for (i = 0; i < free->npages; ++i)
348                                 if (free->phys_mem[i])
349                                         __free_page(free->phys_mem[i]);
350                         vfree(free->phys_mem);
351                 }
352
353         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
354                 vfree(free->dirty_bitmap);
355
356         free->phys_mem = NULL;
357         free->npages = 0;
358         free->dirty_bitmap = NULL;
359 }
360
361 static void kvm_free_physmem(struct kvm *kvm)
362 {
363         int i;
364
365         for (i = 0; i < kvm->nmemslots; ++i)
366                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
367 }
368
369 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
370 {
371         int i;
372
373         for (i = 0; i < 2; ++i)
374                 if (vcpu->pio.guest_pages[i]) {
375                         __free_page(vcpu->pio.guest_pages[i]);
376                         vcpu->pio.guest_pages[i] = NULL;
377                 }
378 }
379
380 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
381 {
382         if (!vcpu->vmcs)
383                 return;
384
385         vcpu_load(vcpu);
386         kvm_mmu_unload(vcpu);
387         vcpu_put(vcpu);
388 }
389
390 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
391 {
392         if (!vcpu->vmcs)
393                 return;
394
395         vcpu_load(vcpu);
396         kvm_mmu_destroy(vcpu);
397         vcpu_put(vcpu);
398         kvm_arch_ops->vcpu_free(vcpu);
399         free_page((unsigned long)vcpu->run);
400         vcpu->run = NULL;
401         free_page((unsigned long)vcpu->pio_data);
402         vcpu->pio_data = NULL;
403         free_pio_guest_pages(vcpu);
404 }
405
406 static void kvm_free_vcpus(struct kvm *kvm)
407 {
408         unsigned int i;
409
410         /*
411          * Unpin any mmu pages first.
412          */
413         for (i = 0; i < KVM_MAX_VCPUS; ++i)
414                 kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
415         for (i = 0; i < KVM_MAX_VCPUS; ++i)
416                 kvm_free_vcpu(&kvm->vcpus[i]);
417 }
418
/* Release handler: nothing to tear down, always succeeds. */
static int kvm_dev_release(struct inode *inode, struct file *filp)
{
	const int status = 0;

	return status;
}
423
424 static void kvm_destroy_vm(struct kvm *kvm)
425 {
426         spin_lock(&kvm_lock);
427         list_del(&kvm->vm_list);
428         spin_unlock(&kvm_lock);
429         kvm_io_bus_destroy(&kvm->pio_bus);
430         kvm_io_bus_destroy(&kvm->mmio_bus);
431         kvm_free_vcpus(kvm);
432         kvm_free_physmem(kvm);
433         kfree(kvm);
434 }
435
436 static int kvm_vm_release(struct inode *inode, struct file *filp)
437 {
438         struct kvm *kvm = filp->private_data;
439
440         kvm_destroy_vm(kvm);
441         return 0;
442 }
443
444 static void inject_gp(struct kvm_vcpu *vcpu)
445 {
446         kvm_arch_ops->inject_gp(vcpu, 0);
447 }
448
449 /*
450  * Load the pae pdptrs.  Return true is they are all valid.
451  */
452 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
453 {
454         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
455         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
456         int i;
457         u64 pdpte;
458         u64 *pdpt;
459         int ret;
460         struct page *page;
461
462         spin_lock(&vcpu->kvm->lock);
463         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
464         /* FIXME: !page - emulate? 0xff? */
465         pdpt = kmap_atomic(page, KM_USER0);
466
467         ret = 1;
468         for (i = 0; i < 4; ++i) {
469                 pdpte = pdpt[offset + i];
470                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
471                         ret = 0;
472                         goto out;
473                 }
474         }
475
476         for (i = 0; i < 4; ++i)
477                 vcpu->pdptrs[i] = pdpt[offset + i];
478
479 out:
480         kunmap_atomic(pdpt, KM_USER0);
481         spin_unlock(&vcpu->kvm->lock);
482
483         return ret;
484 }
485
486 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
487 {
488         if (cr0 & CR0_RESEVED_BITS) {
489                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
490                        cr0, vcpu->cr0);
491                 inject_gp(vcpu);
492                 return;
493         }
494
495         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
496                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
497                 inject_gp(vcpu);
498                 return;
499         }
500
501         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
502                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
503                        "and a clear PE flag\n");
504                 inject_gp(vcpu);
505                 return;
506         }
507
508         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
509 #ifdef CONFIG_X86_64
510                 if ((vcpu->shadow_efer & EFER_LME)) {
511                         int cs_db, cs_l;
512
513                         if (!is_pae(vcpu)) {
514                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
515                                        "in long mode while PAE is disabled\n");
516                                 inject_gp(vcpu);
517                                 return;
518                         }
519                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
520                         if (cs_l) {
521                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
522                                        "in long mode while CS.L == 1\n");
523                                 inject_gp(vcpu);
524                                 return;
525
526                         }
527                 } else
528 #endif
529                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
530                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
531                                "reserved bits\n");
532                         inject_gp(vcpu);
533                         return;
534                 }
535
536         }
537
538         kvm_arch_ops->set_cr0(vcpu, cr0);
539         vcpu->cr0 = cr0;
540
541         spin_lock(&vcpu->kvm->lock);
542         kvm_mmu_reset_context(vcpu);
543         spin_unlock(&vcpu->kvm->lock);
544         return;
545 }
546 EXPORT_SYMBOL_GPL(set_cr0);
547
548 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
549 {
550         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
551 }
552 EXPORT_SYMBOL_GPL(lmsw);
553
554 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
555 {
556         if (cr4 & CR4_RESEVED_BITS) {
557                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
558                 inject_gp(vcpu);
559                 return;
560         }
561
562         if (is_long_mode(vcpu)) {
563                 if (!(cr4 & CR4_PAE_MASK)) {
564                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
565                                "in long mode\n");
566                         inject_gp(vcpu);
567                         return;
568                 }
569         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
570                    && !load_pdptrs(vcpu, vcpu->cr3)) {
571                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
572                 inject_gp(vcpu);
573         }
574
575         if (cr4 & CR4_VMXE_MASK) {
576                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
577                 inject_gp(vcpu);
578                 return;
579         }
580         kvm_arch_ops->set_cr4(vcpu, cr4);
581         spin_lock(&vcpu->kvm->lock);
582         kvm_mmu_reset_context(vcpu);
583         spin_unlock(&vcpu->kvm->lock);
584 }
585 EXPORT_SYMBOL_GPL(set_cr4);
586
587 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
588 {
589         if (is_long_mode(vcpu)) {
590                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
591                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
592                         inject_gp(vcpu);
593                         return;
594                 }
595         } else {
596                 if (cr3 & CR3_RESEVED_BITS) {
597                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
598                         inject_gp(vcpu);
599                         return;
600                 }
601                 if (is_paging(vcpu) && is_pae(vcpu) &&
602                     !load_pdptrs(vcpu, cr3)) {
603                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
604                                "reserved bits\n");
605                         inject_gp(vcpu);
606                         return;
607                 }
608         }
609
610         vcpu->cr3 = cr3;
611         spin_lock(&vcpu->kvm->lock);
612         /*
613          * Does the new cr3 value map to physical memory? (Note, we
614          * catch an invalid cr3 even in real-mode, because it would
615          * cause trouble later on when we turn on paging anyway.)
616          *
617          * A real CPU would silently accept an invalid cr3 and would
618          * attempt to use it - with largely undefined (and often hard
619          * to debug) behavior on the guest side.
620          */
621         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
622                 inject_gp(vcpu);
623         else
624                 vcpu->mmu.new_cr3(vcpu);
625         spin_unlock(&vcpu->kvm->lock);
626 }
627 EXPORT_SYMBOL_GPL(set_cr3);
628
629 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
630 {
631         if ( cr8 & CR8_RESEVED_BITS) {
632                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
633                 inject_gp(vcpu);
634                 return;
635         }
636         vcpu->cr8 = cr8;
637 }
638 EXPORT_SYMBOL_GPL(set_cr8);
639
640 void fx_init(struct kvm_vcpu *vcpu)
641 {
642         struct __attribute__ ((__packed__)) fx_image_s {
643                 u16 control; //fcw
644                 u16 status; //fsw
645                 u16 tag; // ftw
646                 u16 opcode; //fop
647                 u64 ip; // fpu ip
648                 u64 operand;// fpu dp
649                 u32 mxcsr;
650                 u32 mxcsr_mask;
651
652         } *fx_image;
653
654         fx_save(vcpu->host_fx_image);
655         fpu_init();
656         fx_save(vcpu->guest_fx_image);
657         fx_restore(vcpu->host_fx_image);
658
659         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
660         fx_image->mxcsr = 0x1f80;
661         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
662                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
663 }
664 EXPORT_SYMBOL_GPL(fx_init);
665
666 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
667 {
668         spin_lock(&vcpu->kvm->lock);
669         kvm_mmu_slot_remove_write_access(vcpu, slot);
670         spin_unlock(&vcpu->kvm->lock);
671 }
672
673 /*
674  * Allocate some memory and give it an address in the guest physical address
675  * space.
676  *
677  * Discontiguous memory is allowed, mostly for framebuffers.
678  */
679 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
680                                           struct kvm_memory_region *mem)
681 {
682         int r;
683         gfn_t base_gfn;
684         unsigned long npages;
685         unsigned long i;
686         struct kvm_memory_slot *memslot;
687         struct kvm_memory_slot old, new;
688         int memory_config_version;
689
690         r = -EINVAL;
691         /* General sanity checks */
692         if (mem->memory_size & (PAGE_SIZE - 1))
693                 goto out;
694         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
695                 goto out;
696         if (mem->slot >= KVM_MEMORY_SLOTS)
697                 goto out;
698         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
699                 goto out;
700
701         memslot = &kvm->memslots[mem->slot];
702         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
703         npages = mem->memory_size >> PAGE_SHIFT;
704
705         if (!npages)
706                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
707
708 raced:
709         spin_lock(&kvm->lock);
710
711         memory_config_version = kvm->memory_config_version;
712         new = old = *memslot;
713
714         new.base_gfn = base_gfn;
715         new.npages = npages;
716         new.flags = mem->flags;
717
718         /* Disallow changing a memory slot's size. */
719         r = -EINVAL;
720         if (npages && old.npages && npages != old.npages)
721                 goto out_unlock;
722
723         /* Check for overlaps */
724         r = -EEXIST;
725         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
726                 struct kvm_memory_slot *s = &kvm->memslots[i];
727
728                 if (s == memslot)
729                         continue;
730                 if (!((base_gfn + npages <= s->base_gfn) ||
731                       (base_gfn >= s->base_gfn + s->npages)))
732                         goto out_unlock;
733         }
734         /*
735          * Do memory allocations outside lock.  memory_config_version will
736          * detect any races.
737          */
738         spin_unlock(&kvm->lock);
739
740         /* Deallocate if slot is being removed */
741         if (!npages)
742                 new.phys_mem = NULL;
743
744         /* Free page dirty bitmap if unneeded */
745         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
746                 new.dirty_bitmap = NULL;
747
748         r = -ENOMEM;
749
750         /* Allocate if a slot is being created */
751         if (npages && !new.phys_mem) {
752                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
753
754                 if (!new.phys_mem)
755                         goto out_free;
756
757                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
758                 for (i = 0; i < npages; ++i) {
759                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
760                                                      | __GFP_ZERO);
761                         if (!new.phys_mem[i])
762                                 goto out_free;
763                         set_page_private(new.phys_mem[i],0);
764                 }
765         }
766
767         /* Allocate page dirty bitmap if needed */
768         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
769                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
770
771                 new.dirty_bitmap = vmalloc(dirty_bytes);
772                 if (!new.dirty_bitmap)
773                         goto out_free;
774                 memset(new.dirty_bitmap, 0, dirty_bytes);
775         }
776
777         spin_lock(&kvm->lock);
778
779         if (memory_config_version != kvm->memory_config_version) {
780                 spin_unlock(&kvm->lock);
781                 kvm_free_physmem_slot(&new, &old);
782                 goto raced;
783         }
784
785         r = -EAGAIN;
786         if (kvm->busy)
787                 goto out_unlock;
788
789         if (mem->slot >= kvm->nmemslots)
790                 kvm->nmemslots = mem->slot + 1;
791
792         *memslot = new;
793         ++kvm->memory_config_version;
794
795         spin_unlock(&kvm->lock);
796
797         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
798                 struct kvm_vcpu *vcpu;
799
800                 vcpu = vcpu_load_slot(kvm, i);
801                 if (!vcpu)
802                         continue;
803                 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
804                         do_remove_write_access(vcpu, mem->slot);
805                 kvm_mmu_reset_context(vcpu);
806                 vcpu_put(vcpu);
807         }
808
809         kvm_free_physmem_slot(&old, &new);
810         return 0;
811
812 out_unlock:
813         spin_unlock(&kvm->lock);
814 out_free:
815         kvm_free_physmem_slot(&new, &old);
816 out:
817         return r;
818 }
819
820 /*
821  * Get (and clear) the dirty memory log for a memory slot.
822  */
823 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
824                                       struct kvm_dirty_log *log)
825 {
826         struct kvm_memory_slot *memslot;
827         int r, i;
828         int n;
829         int cleared;
830         unsigned long any = 0;
831
832         spin_lock(&kvm->lock);
833
834         /*
835          * Prevent changes to guest memory configuration even while the lock
836          * is not taken.
837          */
838         ++kvm->busy;
839         spin_unlock(&kvm->lock);
840         r = -EINVAL;
841         if (log->slot >= KVM_MEMORY_SLOTS)
842                 goto out;
843
844         memslot = &kvm->memslots[log->slot];
845         r = -ENOENT;
846         if (!memslot->dirty_bitmap)
847                 goto out;
848
849         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
850
851         for (i = 0; !any && i < n/sizeof(long); ++i)
852                 any = memslot->dirty_bitmap[i];
853
854         r = -EFAULT;
855         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
856                 goto out;
857
858         if (any) {
859                 cleared = 0;
860                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
861                         struct kvm_vcpu *vcpu;
862
863                         vcpu = vcpu_load_slot(kvm, i);
864                         if (!vcpu)
865                                 continue;
866                         if (!cleared) {
867                                 do_remove_write_access(vcpu, log->slot);
868                                 memset(memslot->dirty_bitmap, 0, n);
869                                 cleared = 1;
870                         }
871                         kvm_arch_ops->tlb_flush(vcpu);
872                         vcpu_put(vcpu);
873                 }
874         }
875
876         r = 0;
877
878 out:
879         spin_lock(&kvm->lock);
880         --kvm->busy;
881         spin_unlock(&kvm->lock);
882         return r;
883 }
884
885 /*
886  * Set a new alias region.  Aliases map a portion of physical memory into
887  * another portion.  This is useful for memory windows, for example the PC
888  * VGA region.
889  */
890 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
891                                          struct kvm_memory_alias *alias)
892 {
893         int r, n;
894         struct kvm_mem_alias *p;
895
896         r = -EINVAL;
897         /* General sanity checks */
898         if (alias->memory_size & (PAGE_SIZE - 1))
899                 goto out;
900         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
901                 goto out;
902         if (alias->slot >= KVM_ALIAS_SLOTS)
903                 goto out;
904         if (alias->guest_phys_addr + alias->memory_size
905             < alias->guest_phys_addr)
906                 goto out;
907         if (alias->target_phys_addr + alias->memory_size
908             < alias->target_phys_addr)
909                 goto out;
910
911         spin_lock(&kvm->lock);
912
913         p = &kvm->aliases[alias->slot];
914         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
915         p->npages = alias->memory_size >> PAGE_SHIFT;
916         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
917
918         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
919                 if (kvm->aliases[n - 1].npages)
920                         break;
921         kvm->naliases = n;
922
923         spin_unlock(&kvm->lock);
924
925         vcpu_load(&kvm->vcpus[0]);
926         spin_lock(&kvm->lock);
927         kvm_mmu_zap_all(&kvm->vcpus[0]);
928         spin_unlock(&kvm->lock);
929         vcpu_put(&kvm->vcpus[0]);
930
931         return 0;
932
933 out:
934         return r;
935 }
936
937 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
938 {
939         int i;
940         struct kvm_mem_alias *alias;
941
942         for (i = 0; i < kvm->naliases; ++i) {
943                 alias = &kvm->aliases[i];
944                 if (gfn >= alias->base_gfn
945                     && gfn < alias->base_gfn + alias->npages)
946                         return alias->target_gfn + gfn - alias->base_gfn;
947         }
948         return gfn;
949 }
950
951 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
952 {
953         int i;
954
955         for (i = 0; i < kvm->nmemslots; ++i) {
956                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
957
958                 if (gfn >= memslot->base_gfn
959                     && gfn < memslot->base_gfn + memslot->npages)
960                         return memslot;
961         }
962         return NULL;
963 }
964
965 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
966 {
967         gfn = unalias_gfn(kvm, gfn);
968         return __gfn_to_memslot(kvm, gfn);
969 }
970
971 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
972 {
973         struct kvm_memory_slot *slot;
974
975         gfn = unalias_gfn(kvm, gfn);
976         slot = __gfn_to_memslot(kvm, gfn);
977         if (!slot)
978                 return NULL;
979         return slot->phys_mem[gfn - slot->base_gfn];
980 }
981 EXPORT_SYMBOL_GPL(gfn_to_page);
982
983 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
984 {
985         int i;
986         struct kvm_memory_slot *memslot;
987         unsigned long rel_gfn;
988
989         for (i = 0; i < kvm->nmemslots; ++i) {
990                 memslot = &kvm->memslots[i];
991
992                 if (gfn >= memslot->base_gfn
993                     && gfn < memslot->base_gfn + memslot->npages) {
994
995                         if (!memslot->dirty_bitmap)
996                                 return;
997
998                         rel_gfn = gfn - memslot->base_gfn;
999
1000                         /* avoid RMW */
1001                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1002                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1003                         return;
1004                 }
1005         }
1006 }
1007
1008 static int emulator_read_std(unsigned long addr,
1009                              void *val,
1010                              unsigned int bytes,
1011                              struct x86_emulate_ctxt *ctxt)
1012 {
1013         struct kvm_vcpu *vcpu = ctxt->vcpu;
1014         void *data = val;
1015
1016         while (bytes) {
1017                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1018                 unsigned offset = addr & (PAGE_SIZE-1);
1019                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1020                 unsigned long pfn;
1021                 struct page *page;
1022                 void *page_virt;
1023
1024                 if (gpa == UNMAPPED_GVA)
1025                         return X86EMUL_PROPAGATE_FAULT;
1026                 pfn = gpa >> PAGE_SHIFT;
1027                 page = gfn_to_page(vcpu->kvm, pfn);
1028                 if (!page)
1029                         return X86EMUL_UNHANDLEABLE;
1030                 page_virt = kmap_atomic(page, KM_USER0);
1031
1032                 memcpy(data, page_virt + offset, tocopy);
1033
1034                 kunmap_atomic(page_virt, KM_USER0);
1035
1036                 bytes -= tocopy;
1037                 data += tocopy;
1038                 addr += tocopy;
1039         }
1040
1041         return X86EMUL_CONTINUE;
1042 }
1043
1044 static int emulator_write_std(unsigned long addr,
1045                               const void *val,
1046                               unsigned int bytes,
1047                               struct x86_emulate_ctxt *ctxt)
1048 {
1049         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1050                addr, bytes);
1051         return X86EMUL_UNHANDLEABLE;
1052 }
1053
1054 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1055                                                 gpa_t addr)
1056 {
1057         /*
1058          * Note that its important to have this wrapper function because
1059          * in the very near future we will be checking for MMIOs against
1060          * the LAPIC as well as the general MMIO bus
1061          */
1062         return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1063 }
1064
1065 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1066                                                gpa_t addr)
1067 {
1068         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1069 }
1070
1071 static int emulator_read_emulated(unsigned long addr,
1072                                   void *val,
1073                                   unsigned int bytes,
1074                                   struct x86_emulate_ctxt *ctxt)
1075 {
1076         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1077         struct kvm_io_device *mmio_dev;
1078         gpa_t                 gpa;
1079
1080         if (vcpu->mmio_read_completed) {
1081                 memcpy(val, vcpu->mmio_data, bytes);
1082                 vcpu->mmio_read_completed = 0;
1083                 return X86EMUL_CONTINUE;
1084         } else if (emulator_read_std(addr, val, bytes, ctxt)
1085                    == X86EMUL_CONTINUE)
1086                 return X86EMUL_CONTINUE;
1087
1088         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1089         if (gpa == UNMAPPED_GVA)
1090                 return X86EMUL_PROPAGATE_FAULT;
1091
1092         /*
1093          * Is this MMIO handled locally?
1094          */
1095         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1096         if (mmio_dev) {
1097                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1098                 return X86EMUL_CONTINUE;
1099         }
1100
1101         vcpu->mmio_needed = 1;
1102         vcpu->mmio_phys_addr = gpa;
1103         vcpu->mmio_size = bytes;
1104         vcpu->mmio_is_write = 0;
1105
1106         return X86EMUL_UNHANDLEABLE;
1107 }
1108
1109 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1110                                const void *val, int bytes)
1111 {
1112         struct page *page;
1113         void *virt;
1114         unsigned offset = offset_in_page(gpa);
1115
1116         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1117                 return 0;
1118         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1119         if (!page)
1120                 return 0;
1121         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1122         virt = kmap_atomic(page, KM_USER0);
1123         if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
1124                 kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
1125                 memcpy(virt + offset_in_page(gpa), val, bytes);
1126         }
1127         kunmap_atomic(virt, KM_USER0);
1128         return 1;
1129 }
1130
1131 static int emulator_write_emulated(unsigned long addr,
1132                                    const void *val,
1133                                    unsigned int bytes,
1134                                    struct x86_emulate_ctxt *ctxt)
1135 {
1136         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1137         struct kvm_io_device *mmio_dev;
1138         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1139
1140         if (gpa == UNMAPPED_GVA) {
1141                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1142                 return X86EMUL_PROPAGATE_FAULT;
1143         }
1144
1145         if (emulator_write_phys(vcpu, gpa, val, bytes))
1146                 return X86EMUL_CONTINUE;
1147
1148         /*
1149          * Is this MMIO handled locally?
1150          */
1151         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1152         if (mmio_dev) {
1153                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1154                 return X86EMUL_CONTINUE;
1155         }
1156
1157         vcpu->mmio_needed = 1;
1158         vcpu->mmio_phys_addr = gpa;
1159         vcpu->mmio_size = bytes;
1160         vcpu->mmio_is_write = 1;
1161         memcpy(vcpu->mmio_data, val, bytes);
1162
1163         return X86EMUL_CONTINUE;
1164 }
1165
1166 static int emulator_cmpxchg_emulated(unsigned long addr,
1167                                      const void *old,
1168                                      const void *new,
1169                                      unsigned int bytes,
1170                                      struct x86_emulate_ctxt *ctxt)
1171 {
1172         static int reported;
1173
1174         if (!reported) {
1175                 reported = 1;
1176                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1177         }
1178         return emulator_write_emulated(addr, new, bytes, ctxt);
1179 }
1180
1181 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1182 {
1183         return kvm_arch_ops->get_segment_base(vcpu, seg);
1184 }
1185
1186 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1187 {
1188         return X86EMUL_CONTINUE;
1189 }
1190
1191 int emulate_clts(struct kvm_vcpu *vcpu)
1192 {
1193         unsigned long cr0;
1194
1195         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1196         kvm_arch_ops->set_cr0(vcpu, cr0);
1197         return X86EMUL_CONTINUE;
1198 }
1199
1200 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1201 {
1202         struct kvm_vcpu *vcpu = ctxt->vcpu;
1203
1204         switch (dr) {
1205         case 0 ... 3:
1206                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1207                 return X86EMUL_CONTINUE;
1208         default:
1209                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1210                        __FUNCTION__, dr);
1211                 return X86EMUL_UNHANDLEABLE;
1212         }
1213 }
1214
1215 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1216 {
1217         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1218         int exception;
1219
1220         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1221         if (exception) {
1222                 /* FIXME: better handling */
1223                 return X86EMUL_UNHANDLEABLE;
1224         }
1225         return X86EMUL_CONTINUE;
1226 }
1227
1228 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1229 {
1230         static int reported;
1231         u8 opcodes[4];
1232         unsigned long rip = ctxt->vcpu->rip;
1233         unsigned long rip_linear;
1234
1235         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1236
1237         if (reported)
1238                 return;
1239
1240         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1241
1242         printk(KERN_ERR "emulation failed but !mmio_needed?"
1243                " rip %lx %02x %02x %02x %02x\n",
1244                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1245         reported = 1;
1246 }
1247
1248 struct x86_emulate_ops emulate_ops = {
1249         .read_std            = emulator_read_std,
1250         .write_std           = emulator_write_std,
1251         .read_emulated       = emulator_read_emulated,
1252         .write_emulated      = emulator_write_emulated,
1253         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1254 };
1255
1256 int emulate_instruction(struct kvm_vcpu *vcpu,
1257                         struct kvm_run *run,
1258                         unsigned long cr2,
1259                         u16 error_code)
1260 {
1261         struct x86_emulate_ctxt emulate_ctxt;
1262         int r;
1263         int cs_db, cs_l;
1264
1265         vcpu->mmio_fault_cr2 = cr2;
1266         kvm_arch_ops->cache_regs(vcpu);
1267
1268         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1269
1270         emulate_ctxt.vcpu = vcpu;
1271         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1272         emulate_ctxt.cr2 = cr2;
1273         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1274                 ? X86EMUL_MODE_REAL : cs_l
1275                 ? X86EMUL_MODE_PROT64 : cs_db
1276                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1277
1278         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1279                 emulate_ctxt.cs_base = 0;
1280                 emulate_ctxt.ds_base = 0;
1281                 emulate_ctxt.es_base = 0;
1282                 emulate_ctxt.ss_base = 0;
1283         } else {
1284                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1285                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1286                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1287                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1288         }
1289
1290         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1291         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1292
1293         vcpu->mmio_is_write = 0;
1294         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1295
1296         if ((r || vcpu->mmio_is_write) && run) {
1297                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1298                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1299                 run->mmio.len = vcpu->mmio_size;
1300                 run->mmio.is_write = vcpu->mmio_is_write;
1301         }
1302
1303         if (r) {
1304                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1305                         return EMULATE_DONE;
1306                 if (!vcpu->mmio_needed) {
1307                         report_emulation_failure(&emulate_ctxt);
1308                         return EMULATE_FAIL;
1309                 }
1310                 return EMULATE_DO_MMIO;
1311         }
1312
1313         kvm_arch_ops->decache_regs(vcpu);
1314         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1315
1316         if (vcpu->mmio_is_write) {
1317                 vcpu->mmio_needed = 0;
1318                 return EMULATE_DO_MMIO;
1319         }
1320
1321         return EMULATE_DONE;
1322 }
1323 EXPORT_SYMBOL_GPL(emulate_instruction);
1324
1325 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1326 {
1327         if (vcpu->irq_summary)
1328                 return 1;
1329
1330         vcpu->run->exit_reason = KVM_EXIT_HLT;
1331         ++vcpu->stat.halt_exits;
1332         return 0;
1333 }
1334 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1335
1336 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1337 {
1338         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1339
1340         kvm_arch_ops->cache_regs(vcpu);
1341         ret = -KVM_EINVAL;
1342 #ifdef CONFIG_X86_64
1343         if (is_long_mode(vcpu)) {
1344                 nr = vcpu->regs[VCPU_REGS_RAX];
1345                 a0 = vcpu->regs[VCPU_REGS_RDI];
1346                 a1 = vcpu->regs[VCPU_REGS_RSI];
1347                 a2 = vcpu->regs[VCPU_REGS_RDX];
1348                 a3 = vcpu->regs[VCPU_REGS_RCX];
1349                 a4 = vcpu->regs[VCPU_REGS_R8];
1350                 a5 = vcpu->regs[VCPU_REGS_R9];
1351         } else
1352 #endif
1353         {
1354                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1355                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1356                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1357                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1358                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1359                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1360                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1361         }
1362         switch (nr) {
1363         default:
1364                 run->hypercall.args[0] = a0;
1365                 run->hypercall.args[1] = a1;
1366                 run->hypercall.args[2] = a2;
1367                 run->hypercall.args[3] = a3;
1368                 run->hypercall.args[4] = a4;
1369                 run->hypercall.args[5] = a5;
1370                 run->hypercall.ret = ret;
1371                 run->hypercall.longmode = is_long_mode(vcpu);
1372                 kvm_arch_ops->decache_regs(vcpu);
1373                 return 0;
1374         }
1375         vcpu->regs[VCPU_REGS_RAX] = ret;
1376         kvm_arch_ops->decache_regs(vcpu);
1377         return 1;
1378 }
1379 EXPORT_SYMBOL_GPL(kvm_hypercall);
1380
1381 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1382 {
1383         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1384 }
1385
1386 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1387 {
1388         struct descriptor_table dt = { limit, base };
1389
1390         kvm_arch_ops->set_gdt(vcpu, &dt);
1391 }
1392
1393 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1394 {
1395         struct descriptor_table dt = { limit, base };
1396
1397         kvm_arch_ops->set_idt(vcpu, &dt);
1398 }
1399
1400 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1401                    unsigned long *rflags)
1402 {
1403         lmsw(vcpu, msw);
1404         *rflags = kvm_arch_ops->get_rflags(vcpu);
1405 }
1406
1407 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1408 {
1409         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1410         switch (cr) {
1411         case 0:
1412                 return vcpu->cr0;
1413         case 2:
1414                 return vcpu->cr2;
1415         case 3:
1416                 return vcpu->cr3;
1417         case 4:
1418                 return vcpu->cr4;
1419         default:
1420                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1421                 return 0;
1422         }
1423 }
1424
1425 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1426                      unsigned long *rflags)
1427 {
1428         switch (cr) {
1429         case 0:
1430                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1431                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1432                 break;
1433         case 2:
1434                 vcpu->cr2 = val;
1435                 break;
1436         case 3:
1437                 set_cr3(vcpu, val);
1438                 break;
1439         case 4:
1440                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1441                 break;
1442         default:
1443                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1444         }
1445 }
1446
1447 /*
1448  * Register the para guest with the host:
1449  */
1450 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1451 {
1452         struct kvm_vcpu_para_state *para_state;
1453         hpa_t para_state_hpa, hypercall_hpa;
1454         struct page *para_state_page;
1455         unsigned char *hypercall;
1456         gpa_t hypercall_gpa;
1457
1458         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1459         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1460
1461         /*
1462          * Needs to be page aligned:
1463          */
1464         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1465                 goto err_gp;
1466
1467         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1468         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1469         if (is_error_hpa(para_state_hpa))
1470                 goto err_gp;
1471
1472         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1473         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1474         para_state = kmap_atomic(para_state_page, KM_USER0);
1475
1476         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1477         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1478
1479         para_state->host_version = KVM_PARA_API_VERSION;
1480         /*
1481          * We cannot support guests that try to register themselves
1482          * with a newer API version than the host supports:
1483          */
1484         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1485                 para_state->ret = -KVM_EINVAL;
1486                 goto err_kunmap_skip;
1487         }
1488
1489         hypercall_gpa = para_state->hypercall_gpa;
1490         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1491         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1492         if (is_error_hpa(hypercall_hpa)) {
1493                 para_state->ret = -KVM_EINVAL;
1494                 goto err_kunmap_skip;
1495         }
1496
1497         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1498         vcpu->para_state_page = para_state_page;
1499         vcpu->para_state_gpa = para_state_gpa;
1500         vcpu->hypercall_gpa = hypercall_gpa;
1501
1502         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1503         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1504                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1505         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1506         kunmap_atomic(hypercall, KM_USER1);
1507
1508         para_state->ret = 0;
1509 err_kunmap_skip:
1510         kunmap_atomic(para_state, KM_USER0);
1511         return 0;
1512 err_gp:
1513         return 1;
1514 }
1515
1516 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1517 {
1518         u64 data;
1519
1520         switch (msr) {
1521         case 0xc0010010: /* SYSCFG */
1522         case 0xc0010015: /* HWCR */
1523         case MSR_IA32_PLATFORM_ID:
1524         case MSR_IA32_P5_MC_ADDR:
1525         case MSR_IA32_P5_MC_TYPE:
1526         case MSR_IA32_MC0_CTL:
1527         case MSR_IA32_MCG_STATUS:
1528         case MSR_IA32_MCG_CAP:
1529         case MSR_IA32_MC0_MISC:
1530         case MSR_IA32_MC0_MISC+4:
1531         case MSR_IA32_MC0_MISC+8:
1532         case MSR_IA32_MC0_MISC+12:
1533         case MSR_IA32_MC0_MISC+16:
1534         case MSR_IA32_UCODE_REV:
1535         case MSR_IA32_PERF_STATUS:
1536         case MSR_IA32_EBL_CR_POWERON:
1537                 /* MTRR registers */
1538         case 0xfe:
1539         case 0x200 ... 0x2ff:
1540                 data = 0;
1541                 break;
1542         case 0xcd: /* fsb frequency */
1543                 data = 3;
1544                 break;
1545         case MSR_IA32_APICBASE:
1546                 data = vcpu->apic_base;
1547                 break;
1548         case MSR_IA32_MISC_ENABLE:
1549                 data = vcpu->ia32_misc_enable_msr;
1550                 break;
1551 #ifdef CONFIG_X86_64
1552         case MSR_EFER:
1553                 data = vcpu->shadow_efer;
1554                 break;
1555 #endif
1556         default:
1557                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1558                 return 1;
1559         }
1560         *pdata = data;
1561         return 0;
1562 }
1563 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1564
1565 /*
1566  * Reads an msr value (of 'msr_index') into 'pdata'.
1567  * Returns 0 on success, non-0 otherwise.
1568  * Assumes vcpu_load() was already called.
1569  */
1570 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1571 {
1572         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1573 }
1574
1575 #ifdef CONFIG_X86_64
1576
1577 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1578 {
1579         if (efer & EFER_RESERVED_BITS) {
1580                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1581                        efer);
1582                 inject_gp(vcpu);
1583                 return;
1584         }
1585
1586         if (is_paging(vcpu)
1587             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1588                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1589                 inject_gp(vcpu);
1590                 return;
1591         }
1592
1593         kvm_arch_ops->set_efer(vcpu, efer);
1594
1595         efer &= ~EFER_LMA;
1596         efer |= vcpu->shadow_efer & EFER_LMA;
1597
1598         vcpu->shadow_efer = efer;
1599 }
1600
1601 #endif
1602
1603 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1604 {
1605         switch (msr) {
1606 #ifdef CONFIG_X86_64
1607         case MSR_EFER:
1608                 set_efer(vcpu, data);
1609                 break;
1610 #endif
1611         case MSR_IA32_MC0_STATUS:
1612                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1613                        __FUNCTION__, data);
1614                 break;
1615         case MSR_IA32_MCG_STATUS:
1616                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1617                         __FUNCTION__, data);
1618                 break;
1619         case MSR_IA32_UCODE_REV:
1620         case MSR_IA32_UCODE_WRITE:
1621         case 0x200 ... 0x2ff: /* MTRRs */
1622                 break;
1623         case MSR_IA32_APICBASE:
1624                 vcpu->apic_base = data;
1625                 break;
1626         case MSR_IA32_MISC_ENABLE:
1627                 vcpu->ia32_misc_enable_msr = data;
1628                 break;
1629         /*
1630          * This is the 'probe whether the host is KVM' logic:
1631          */
1632         case MSR_KVM_API_MAGIC:
1633                 return vcpu_register_para(vcpu, data);
1634
1635         default:
1636                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1637                 return 1;
1638         }
1639         return 0;
1640 }
1641 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1642
1643 /*
1644  * Writes msr value into into the appropriate "register".
1645  * Returns 0 on success, non-0 otherwise.
1646  * Assumes vcpu_load() was already called.
1647  */
1648 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1649 {
1650         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1651 }
1652
/*
 * Give up the CPU if a reschedule is pending.
 *
 * When need_resched() is true the vcpu is released with vcpu_put(),
 * cond_resched() is called, and the vcpu is reacquired with vcpu_load().
 * Otherwise nothing happens.
 */
void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (need_resched()) {
		vcpu_put(vcpu);
		cond_resched();
		vcpu_load(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_resched);
1662
1663 void load_msrs(struct vmx_msr_entry *e, int n)
1664 {
1665         int i;
1666
1667         for (i = 0; i < n; ++i)
1668                 wrmsrl(e[i].index, e[i].data);
1669 }
1670 EXPORT_SYMBOL_GPL(load_msrs);
1671
1672 void save_msrs(struct vmx_msr_entry *e, int n)
1673 {
1674         int i;
1675
1676         for (i = 0; i < n; ++i)
1677                 rdmsrl(e[i].index, e[i].data);
1678 }
1679 EXPORT_SYMBOL_GPL(save_msrs);
1680
1681 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1682 {
1683         int i;
1684         u32 function;
1685         struct kvm_cpuid_entry *e, *best;
1686
1687         kvm_arch_ops->cache_regs(vcpu);
1688         function = vcpu->regs[VCPU_REGS_RAX];
1689         vcpu->regs[VCPU_REGS_RAX] = 0;
1690         vcpu->regs[VCPU_REGS_RBX] = 0;
1691         vcpu->regs[VCPU_REGS_RCX] = 0;
1692         vcpu->regs[VCPU_REGS_RDX] = 0;
1693         best = NULL;
1694         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1695                 e = &vcpu->cpuid_entries[i];
1696                 if (e->function == function) {
1697                         best = e;
1698                         break;
1699                 }
1700                 /*
1701                  * Both basic or both extended?
1702                  */
1703                 if (((e->function ^ function) & 0x80000000) == 0)
1704                         if (!best || e->function > best->function)
1705                                 best = e;
1706         }
1707         if (best) {
1708                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1709                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1710                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1711                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1712         }
1713         kvm_arch_ops->decache_regs(vcpu);
1714         kvm_arch_ops->skip_emulated_instruction(vcpu);
1715 }
1716 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1717
1718 static int pio_copy_data(struct kvm_vcpu *vcpu)
1719 {
1720         void *p = vcpu->pio_data;
1721         void *q;
1722         unsigned bytes;
1723         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1724
1725         kvm_arch_ops->vcpu_put(vcpu);
1726         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1727                  PAGE_KERNEL);
1728         if (!q) {
1729                 kvm_arch_ops->vcpu_load(vcpu);
1730                 free_pio_guest_pages(vcpu);
1731                 return -ENOMEM;
1732         }
1733         q += vcpu->pio.guest_page_offset;
1734         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1735         if (vcpu->pio.in)
1736                 memcpy(q, p, bytes);
1737         else
1738                 memcpy(p, q, bytes);
1739         q -= vcpu->pio.guest_page_offset;
1740         vunmap(q);
1741         kvm_arch_ops->vcpu_load(vcpu);
1742         free_pio_guest_pages(vcpu);
1743         return 0;
1744 }
1745
1746 static int complete_pio(struct kvm_vcpu *vcpu)
1747 {
1748         struct kvm_pio_request *io = &vcpu->pio;
1749         long delta;
1750         int r;
1751
1752         kvm_arch_ops->cache_regs(vcpu);
1753
1754         if (!io->string) {
1755                 if (io->in)
1756                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1757                                io->size);
1758         } else {
1759                 if (io->in) {
1760                         r = pio_copy_data(vcpu);
1761                         if (r) {
1762                                 kvm_arch_ops->cache_regs(vcpu);
1763                                 return r;
1764                         }
1765                 }
1766
1767                 delta = 1;
1768                 if (io->rep) {
1769                         delta *= io->cur_count;
1770                         /*
1771                          * The size of the register should really depend on
1772                          * current address size.
1773                          */
1774                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1775                 }
1776                 if (io->down)
1777                         delta = -delta;
1778                 delta *= io->size;
1779                 if (io->in)
1780                         vcpu->regs[VCPU_REGS_RDI] += delta;
1781                 else
1782                         vcpu->regs[VCPU_REGS_RSI] += delta;
1783         }
1784
1785         kvm_arch_ops->decache_regs(vcpu);
1786
1787         io->count -= io->cur_count;
1788         io->cur_count = 0;
1789
1790         if (!io->count)
1791                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1792         return 0;
1793 }
1794
1795 void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
1796 {
1797         /* TODO: String I/O for in kernel device */
1798
1799         if (vcpu->pio.in)
1800                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1801                                   vcpu->pio.size,
1802                                   vcpu->pio_data);
1803         else
1804                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1805                                    vcpu->pio.size,
1806                                    vcpu->pio_data);
1807 }
1808
1809 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1810                   int size, unsigned long count, int string, int down,
1811                   gva_t address, int rep, unsigned port)
1812 {
1813         unsigned now, in_page;
1814         int i;
1815         int nr_pages = 1;
1816         struct page *page;
1817         struct kvm_io_device *pio_dev;
1818
1819         vcpu->run->exit_reason = KVM_EXIT_IO;
1820         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1821         vcpu->run->io.size = size;
1822         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1823         vcpu->run->io.count = count;
1824         vcpu->run->io.port = port;
1825         vcpu->pio.count = count;
1826         vcpu->pio.cur_count = count;
1827         vcpu->pio.size = size;
1828         vcpu->pio.in = in;
1829         vcpu->pio.port = port;
1830         vcpu->pio.string = string;
1831         vcpu->pio.down = down;
1832         vcpu->pio.guest_page_offset = offset_in_page(address);
1833         vcpu->pio.rep = rep;
1834
1835         pio_dev = vcpu_find_pio_dev(vcpu, port);
1836         if (!string) {
1837                 kvm_arch_ops->cache_regs(vcpu);
1838                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1839                 kvm_arch_ops->decache_regs(vcpu);
1840                 if (pio_dev) {
1841                         kernel_pio(pio_dev, vcpu);
1842                         complete_pio(vcpu);
1843                         return 1;
1844                 }
1845                 return 0;
1846         }
1847         /* TODO: String I/O for in kernel device */
1848         if (pio_dev)
1849                 printk(KERN_ERR "kvm_setup_pio: no string io support\n");
1850
1851         if (!count) {
1852                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1853                 return 1;
1854         }
1855
1856         now = min(count, PAGE_SIZE / size);
1857
1858         if (!down)
1859                 in_page = PAGE_SIZE - offset_in_page(address);
1860         else
1861                 in_page = offset_in_page(address) + size;
1862         now = min(count, (unsigned long)in_page / size);
1863         if (!now) {
1864                 /*
1865                  * String I/O straddles page boundary.  Pin two guest pages
1866                  * so that we satisfy atomicity constraints.  Do just one
1867                  * transaction to avoid complexity.
1868                  */
1869                 nr_pages = 2;
1870                 now = 1;
1871         }
1872         if (down) {
1873                 /*
1874                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1875                  */
1876                 printk(KERN_ERR "kvm: guest string pio down\n");
1877                 inject_gp(vcpu);
1878                 return 1;
1879         }
1880         vcpu->run->io.count = now;
1881         vcpu->pio.cur_count = now;
1882
1883         for (i = 0; i < nr_pages; ++i) {
1884                 spin_lock(&vcpu->kvm->lock);
1885                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1886                 if (page)
1887                         get_page(page);
1888                 vcpu->pio.guest_pages[i] = page;
1889                 spin_unlock(&vcpu->kvm->lock);
1890                 if (!page) {
1891                         inject_gp(vcpu);
1892                         free_pio_guest_pages(vcpu);
1893                         return 1;
1894                 }
1895         }
1896
1897         if (!vcpu->pio.in)
1898                 return pio_copy_data(vcpu);
1899         return 0;
1900 }
1901 EXPORT_SYMBOL_GPL(kvm_setup_pio);
1902
1903 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1904 {
1905         int r;
1906         sigset_t sigsaved;
1907
1908         vcpu_load(vcpu);
1909
1910         if (vcpu->sigset_active)
1911                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1912
1913         /* re-sync apic's tpr */
1914         vcpu->cr8 = kvm_run->cr8;
1915
1916         if (vcpu->pio.cur_count) {
1917                 r = complete_pio(vcpu);
1918                 if (r)
1919                         goto out;
1920         }
1921
1922         if (vcpu->mmio_needed) {
1923                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1924                 vcpu->mmio_read_completed = 1;
1925                 vcpu->mmio_needed = 0;
1926                 r = emulate_instruction(vcpu, kvm_run,
1927                                         vcpu->mmio_fault_cr2, 0);
1928                 if (r == EMULATE_DO_MMIO) {
1929                         /*
1930                          * Read-modify-write.  Back to userspace.
1931                          */
1932                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1933                         r = 0;
1934                         goto out;
1935                 }
1936         }
1937
1938         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1939                 kvm_arch_ops->cache_regs(vcpu);
1940                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1941                 kvm_arch_ops->decache_regs(vcpu);
1942         }
1943
1944         r = kvm_arch_ops->run(vcpu, kvm_run);
1945
1946 out:
1947         if (vcpu->sigset_active)
1948                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1949
1950         vcpu_put(vcpu);
1951         return r;
1952 }
1953
1954 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1955                                    struct kvm_regs *regs)
1956 {
1957         vcpu_load(vcpu);
1958
1959         kvm_arch_ops->cache_regs(vcpu);
1960
1961         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1962         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1963         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1964         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1965         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1966         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1967         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1968         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1969 #ifdef CONFIG_X86_64
1970         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1971         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1972         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1973         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1974         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1975         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1976         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1977         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1978 #endif
1979
1980         regs->rip = vcpu->rip;
1981         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1982
1983         /*
1984          * Don't leak debug flags in case they were set for guest debugging
1985          */
1986         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1987                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1988
1989         vcpu_put(vcpu);
1990
1991         return 0;
1992 }
1993
1994 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1995                                    struct kvm_regs *regs)
1996 {
1997         vcpu_load(vcpu);
1998
1999         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2000         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2001         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2002         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2003         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2004         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2005         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2006         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2007 #ifdef CONFIG_X86_64
2008         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2009         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2010         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2011         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2012         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2013         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2014         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2015         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2016 #endif
2017
2018         vcpu->rip = regs->rip;
2019         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2020
2021         kvm_arch_ops->decache_regs(vcpu);
2022
2023         vcpu_put(vcpu);
2024
2025         return 0;
2026 }
2027
2028 static void get_segment(struct kvm_vcpu *vcpu,
2029                         struct kvm_segment *var, int seg)
2030 {
2031         return kvm_arch_ops->get_segment(vcpu, var, seg);
2032 }
2033
2034 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2035                                     struct kvm_sregs *sregs)
2036 {
2037         struct descriptor_table dt;
2038
2039         vcpu_load(vcpu);
2040
2041         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2042         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2043         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2044         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2045         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2046         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2047
2048         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2049         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2050
2051         kvm_arch_ops->get_idt(vcpu, &dt);
2052         sregs->idt.limit = dt.limit;
2053         sregs->idt.base = dt.base;
2054         kvm_arch_ops->get_gdt(vcpu, &dt);
2055         sregs->gdt.limit = dt.limit;
2056         sregs->gdt.base = dt.base;
2057
2058         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2059         sregs->cr0 = vcpu->cr0;
2060         sregs->cr2 = vcpu->cr2;
2061         sregs->cr3 = vcpu->cr3;
2062         sregs->cr4 = vcpu->cr4;
2063         sregs->cr8 = vcpu->cr8;
2064         sregs->efer = vcpu->shadow_efer;
2065         sregs->apic_base = vcpu->apic_base;
2066
2067         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2068                sizeof sregs->interrupt_bitmap);
2069
2070         vcpu_put(vcpu);
2071
2072         return 0;
2073 }
2074
2075 static void set_segment(struct kvm_vcpu *vcpu,
2076                         struct kvm_segment *var, int seg)
2077 {
2078         return kvm_arch_ops->set_segment(vcpu, var, seg);
2079 }
2080
2081 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2082                                     struct kvm_sregs *sregs)
2083 {
2084         int mmu_reset_needed = 0;
2085         int i;
2086         struct descriptor_table dt;
2087
2088         vcpu_load(vcpu);
2089
2090         dt.limit = sregs->idt.limit;
2091         dt.base = sregs->idt.base;
2092         kvm_arch_ops->set_idt(vcpu, &dt);
2093         dt.limit = sregs->gdt.limit;
2094         dt.base = sregs->gdt.base;
2095         kvm_arch_ops->set_gdt(vcpu, &dt);
2096
2097         vcpu->cr2 = sregs->cr2;
2098         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2099         vcpu->cr3 = sregs->cr3;
2100
2101         vcpu->cr8 = sregs->cr8;
2102
2103         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2104 #ifdef CONFIG_X86_64
2105         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2106 #endif
2107         vcpu->apic_base = sregs->apic_base;
2108
2109         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2110
2111         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2112         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2113
2114         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2115         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2116         if (!is_long_mode(vcpu) && is_pae(vcpu))
2117                 load_pdptrs(vcpu, vcpu->cr3);
2118
2119         if (mmu_reset_needed)
2120                 kvm_mmu_reset_context(vcpu);
2121
2122         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2123                sizeof vcpu->irq_pending);
2124         vcpu->irq_summary = 0;
2125         for (i = 0; i < NR_IRQ_WORDS; ++i)
2126                 if (vcpu->irq_pending[i])
2127                         __set_bit(i, &vcpu->irq_summary);
2128
2129         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2130         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2131         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2132         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2133         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2134         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2135
2136         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2137         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2138
2139         vcpu_put(vcpu);
2140
2141         return 0;
2142 }
2143
2144 /*
2145  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2146  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2147  *
2148  * This list is modified at module load time to reflect the
2149  * capabilities of the host cpu.
2150  */
2151 static u32 msrs_to_save[] = {
2152         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2153         MSR_K6_STAR,
2154 #ifdef CONFIG_X86_64
2155         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2156 #endif
2157         MSR_IA32_TIME_STAMP_COUNTER,
2158 };
2159
2160 static unsigned num_msrs_to_save;
2161
2162 static u32 emulated_msrs[] = {
2163         MSR_IA32_MISC_ENABLE,
2164 };
2165
2166 static __init void kvm_init_msr_list(void)
2167 {
2168         u32 dummy[2];
2169         unsigned i, j;
2170
2171         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2172                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2173                         continue;
2174                 if (j < i)
2175                         msrs_to_save[j] = msrs_to_save[i];
2176                 j++;
2177         }
2178         num_msrs_to_save = j;
2179 }
2180
2181 /*
2182  * Adapt set_msr() to msr_io()'s calling convention
2183  */
2184 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2185 {
2186         return set_msr(vcpu, index, *data);
2187 }
2188
2189 /*
2190  * Read or write a bunch of msrs. All parameters are kernel addresses.
2191  *
2192  * @return number of msrs set successfully.
2193  */
2194 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2195                     struct kvm_msr_entry *entries,
2196                     int (*do_msr)(struct kvm_vcpu *vcpu,
2197                                   unsigned index, u64 *data))
2198 {
2199         int i;
2200
2201         vcpu_load(vcpu);
2202
2203         for (i = 0; i < msrs->nmsrs; ++i)
2204                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2205                         break;
2206
2207         vcpu_put(vcpu);
2208
2209         return i;
2210 }
2211
2212 /*
2213  * Read or write a bunch of msrs. Parameters are user addresses.
2214  *
2215  * @return number of msrs set successfully.
2216  */
2217 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2218                   int (*do_msr)(struct kvm_vcpu *vcpu,
2219                                 unsigned index, u64 *data),
2220                   int writeback)
2221 {
2222         struct kvm_msrs msrs;
2223         struct kvm_msr_entry *entries;
2224         int r, n;
2225         unsigned size;
2226
2227         r = -EFAULT;
2228         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2229                 goto out;
2230
2231         r = -E2BIG;
2232         if (msrs.nmsrs >= MAX_IO_MSRS)
2233                 goto out;
2234
2235         r = -ENOMEM;
2236         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2237         entries = vmalloc(size);
2238         if (!entries)
2239                 goto out;
2240
2241         r = -EFAULT;
2242         if (copy_from_user(entries, user_msrs->entries, size))
2243                 goto out_free;
2244
2245         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2246         if (r < 0)
2247                 goto out_free;
2248
2249         r = -EFAULT;
2250         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2251                 goto out_free;
2252
2253         r = n;
2254
2255 out_free:
2256         vfree(entries);
2257 out:
2258         return r;
2259 }
2260
2261 /*
2262  * Translate a guest virtual address to a guest physical address.
2263  */
2264 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2265                                     struct kvm_translation *tr)
2266 {
2267         unsigned long vaddr = tr->linear_address;
2268         gpa_t gpa;
2269
2270         vcpu_load(vcpu);
2271         spin_lock(&vcpu->kvm->lock);
2272         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2273         tr->physical_address = gpa;
2274         tr->valid = gpa != UNMAPPED_GVA;
2275         tr->writeable = 1;
2276         tr->usermode = 0;
2277         spin_unlock(&vcpu->kvm->lock);
2278         vcpu_put(vcpu);
2279
2280         return 0;
2281 }
2282
2283 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2284                                     struct kvm_interrupt *irq)
2285 {
2286         if (irq->irq < 0 || irq->irq >= 256)
2287                 return -EINVAL;
2288         vcpu_load(vcpu);
2289
2290         set_bit(irq->irq, vcpu->irq_pending);
2291         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2292
2293         vcpu_put(vcpu);
2294
2295         return 0;
2296 }
2297
2298 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2299                                       struct kvm_debug_guest *dbg)
2300 {
2301         int r;
2302
2303         vcpu_load(vcpu);
2304
2305         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2306
2307         vcpu_put(vcpu);
2308
2309         return r;
2310 }
2311
2312 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2313                                     unsigned long address,
2314                                     int *type)
2315 {
2316         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2317         unsigned long pgoff;
2318         struct page *page;
2319
2320         *type = VM_FAULT_MINOR;
2321         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2322         if (pgoff == 0)
2323                 page = virt_to_page(vcpu->run);
2324         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2325                 page = virt_to_page(vcpu->pio_data);
2326         else
2327                 return NOPAGE_SIGBUS;
2328         get_page(page);
2329         return page;
2330 }
2331
2332 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2333         .nopage = kvm_vcpu_nopage,
2334 };
2335
2336 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2337 {
2338         vma->vm_ops = &kvm_vcpu_vm_ops;
2339         return 0;
2340 }
2341
2342 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2343 {
2344         struct kvm_vcpu *vcpu = filp->private_data;
2345
2346         fput(vcpu->kvm->filp);
2347         return 0;
2348 }
2349
2350 static struct file_operations kvm_vcpu_fops = {
2351         .release        = kvm_vcpu_release,
2352         .unlocked_ioctl = kvm_vcpu_ioctl,
2353         .compat_ioctl   = kvm_vcpu_ioctl,
2354         .mmap           = kvm_vcpu_mmap,
2355 };
2356
2357 /*
2358  * Allocates an inode for the vcpu.
2359  */
2360 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2361 {
2362         int fd, r;
2363         struct inode *inode;
2364         struct file *file;
2365
2366         r = anon_inode_getfd(&fd, &inode, &file,
2367                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2368         if (r)
2369                 return r;
2370         atomic_inc(&vcpu->kvm->filp->f_count);
2371         return fd;
2372 }
2373
2374 /*
2375  * Creates some virtual cpus.  Good luck creating more than one.
2376  */
2377 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2378 {
2379         int r;
2380         struct kvm_vcpu *vcpu;
2381         struct page *page;
2382
2383         r = -EINVAL;
2384         if (!valid_vcpu(n))
2385                 goto out;
2386
2387         vcpu = &kvm->vcpus[n];
2388
2389         mutex_lock(&vcpu->mutex);
2390
2391         if (vcpu->vmcs) {
2392                 mutex_unlock(&vcpu->mutex);
2393                 return -EEXIST;
2394         }
2395
2396         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2397         r = -ENOMEM;
2398         if (!page)
2399                 goto out_unlock;
2400         vcpu->run = page_address(page);
2401
2402         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2403         r = -ENOMEM;
2404         if (!page)
2405                 goto out_free_run;
2406         vcpu->pio_data = page_address(page);
2407
2408         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
2409                                            FX_IMAGE_ALIGN);
2410         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2411         vcpu->cr0 = 0x10;
2412
2413         r = kvm_arch_ops->vcpu_create(vcpu);
2414         if (r < 0)
2415                 goto out_free_vcpus;
2416
2417         r = kvm_mmu_create(vcpu);
2418         if (r < 0)
2419                 goto out_free_vcpus;
2420
2421         kvm_arch_ops->vcpu_load(vcpu);
2422         r = kvm_mmu_setup(vcpu);
2423         if (r >= 0)
2424                 r = kvm_arch_ops->vcpu_setup(vcpu);
2425         vcpu_put(vcpu);
2426
2427         if (r < 0)
2428                 goto out_free_vcpus;
2429
2430         r = create_vcpu_fd(vcpu);
2431         if (r < 0)
2432                 goto out_free_vcpus;
2433
2434         spin_lock(&kvm_lock);
2435         if (n >= kvm->nvcpus)
2436                 kvm->nvcpus = n + 1;
2437         spin_unlock(&kvm_lock);
2438
2439         return r;
2440
2441 out_free_vcpus:
2442         kvm_free_vcpu(vcpu);
2443 out_free_run:
2444         free_page((unsigned long)vcpu->run);
2445         vcpu->run = NULL;
2446 out_unlock:
2447         mutex_unlock(&vcpu->mutex);
2448 out:
2449         return r;
2450 }
2451
2452 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2453 {
2454         u64 efer;
2455         int i;
2456         struct kvm_cpuid_entry *e, *entry;
2457
2458         rdmsrl(MSR_EFER, efer);
2459         entry = NULL;
2460         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2461                 e = &vcpu->cpuid_entries[i];
2462                 if (e->function == 0x80000001) {
2463                         entry = e;
2464                         break;
2465                 }
2466         }
2467         if (entry && (entry->edx & EFER_NX) && !(efer & EFER_NX)) {
2468                 entry->edx &= ~(1 << 20);
2469                 printk(KERN_INFO ": guest NX capability removed\n");
2470         }
2471 }
2472
2473 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2474                                     struct kvm_cpuid *cpuid,
2475                                     struct kvm_cpuid_entry __user *entries)
2476 {
2477         int r;
2478
2479         r = -E2BIG;
2480         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2481                 goto out;
2482         r = -EFAULT;
2483         if (copy_from_user(&vcpu->cpuid_entries, entries,
2484                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2485                 goto out;
2486         vcpu->cpuid_nent = cpuid->nent;
2487         cpuid_fix_nx_cap(vcpu);
2488         return 0;
2489
2490 out:
2491         return r;
2492 }
2493
2494 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2495 {
2496         if (sigset) {
2497                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2498                 vcpu->sigset_active = 1;
2499                 vcpu->sigset = *sigset;
2500         } else
2501                 vcpu->sigset_active = 0;
2502         return 0;
2503 }
2504
2505 /*
2506  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2507  * we have asm/x86/processor.h
2508  */
2509 struct fxsave {
2510         u16     cwd;
2511         u16     swd;
2512         u16     twd;
2513         u16     fop;
2514         u64     rip;
2515         u64     rdp;
2516         u32     mxcsr;
2517         u32     mxcsr_mask;
2518         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2519 #ifdef CONFIG_X86_64
2520         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2521 #else
2522         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2523 #endif
2524 };
2525
2526 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2527 {
2528         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2529
2530         vcpu_load(vcpu);
2531
2532         memcpy(fpu->fpr, fxsave->st_space, 128);
2533         fpu->fcw = fxsave->cwd;
2534         fpu->fsw = fxsave->swd;
2535         fpu->ftwx = fxsave->twd;
2536         fpu->last_opcode = fxsave->fop;
2537         fpu->last_ip = fxsave->rip;
2538         fpu->last_dp = fxsave->rdp;
2539         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2540
2541         vcpu_put(vcpu);
2542
2543         return 0;
2544 }
2545
2546 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2547 {
2548         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2549
2550         vcpu_load(vcpu);
2551
2552         memcpy(fxsave->st_space, fpu->fpr, 128);
2553         fxsave->cwd = fpu->fcw;
2554         fxsave->swd = fpu->fsw;
2555         fxsave->twd = fpu->ftwx;
2556         fxsave->fop = fpu->last_opcode;
2557         fxsave->rip = fpu->last_ip;
2558         fxsave->rdp = fpu->last_dp;
2559         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2560
2561         vcpu_put(vcpu);
2562
2563         return 0;
2564 }
2565
2566 static long kvm_vcpu_ioctl(struct file *filp,
2567                            unsigned int ioctl, unsigned long arg)
2568 {
2569         struct kvm_vcpu *vcpu = filp->private_data;
2570         void __user *argp = (void __user *)arg;
2571         int r = -EINVAL;
2572
2573         switch (ioctl) {
2574         case KVM_RUN:
2575                 r = -EINVAL;
2576                 if (arg)
2577                         goto out;
2578                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2579                 break;
2580         case KVM_GET_REGS: {
2581                 struct kvm_regs kvm_regs;
2582
2583                 memset(&kvm_regs, 0, sizeof kvm_regs);
2584                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2585                 if (r)
2586                         goto out;
2587                 r = -EFAULT;
2588                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2589                         goto out;
2590                 r = 0;
2591                 break;
2592         }
2593         case KVM_SET_REGS: {
2594                 struct kvm_regs kvm_regs;
2595
2596                 r = -EFAULT;
2597                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2598                         goto out;
2599                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2600                 if (r)
2601                         goto out;
2602                 r = 0;
2603                 break;
2604         }
2605         case KVM_GET_SREGS: {
2606                 struct kvm_sregs kvm_sregs;
2607
2608                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2609                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2610                 if (r)
2611                         goto out;
2612                 r = -EFAULT;
2613                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2614                         goto out;
2615                 r = 0;
2616                 break;
2617         }
2618         case KVM_SET_SREGS: {
2619                 struct kvm_sregs kvm_sregs;
2620
2621                 r = -EFAULT;
2622                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2623                         goto out;
2624                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2625                 if (r)
2626                         goto out;
2627                 r = 0;
2628                 break;
2629         }
2630         case KVM_TRANSLATE: {
2631                 struct kvm_translation tr;
2632
2633                 r = -EFAULT;
2634                 if (copy_from_user(&tr, argp, sizeof tr))
2635                         goto out;
2636                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2637                 if (r)
2638                         goto out;
2639                 r = -EFAULT;
2640                 if (copy_to_user(argp, &tr, sizeof tr))
2641                         goto out;
2642                 r = 0;
2643                 break;
2644         }
2645         case KVM_INTERRUPT: {
2646                 struct kvm_interrupt irq;
2647
2648                 r = -EFAULT;
2649                 if (copy_from_user(&irq, argp, sizeof irq))
2650                         goto out;
2651                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2652                 if (r)
2653                         goto out;
2654                 r = 0;
2655                 break;
2656         }
2657         case KVM_DEBUG_GUEST: {
2658                 struct kvm_debug_guest dbg;
2659
2660                 r = -EFAULT;
2661                 if (copy_from_user(&dbg, argp, sizeof dbg))
2662                         goto out;
2663                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2664                 if (r)
2665                         goto out;
2666                 r = 0;
2667                 break;
2668         }
2669         case KVM_GET_MSRS:
2670                 r = msr_io(vcpu, argp, get_msr, 1);
2671                 break;
2672         case KVM_SET_MSRS:
2673                 r = msr_io(vcpu, argp, do_set_msr, 0);
2674                 break;
2675         case KVM_SET_CPUID: {
2676                 struct kvm_cpuid __user *cpuid_arg = argp;
2677                 struct kvm_cpuid cpuid;
2678
2679                 r = -EFAULT;
2680                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2681                         goto out;
2682                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2683                 if (r)
2684                         goto out;
2685                 break;
2686         }
2687         case KVM_SET_SIGNAL_MASK: {
2688                 struct kvm_signal_mask __user *sigmask_arg = argp;
2689                 struct kvm_signal_mask kvm_sigmask;
2690                 sigset_t sigset, *p;
2691
2692                 p = NULL;
2693                 if (argp) {
2694                         r = -EFAULT;
2695                         if (copy_from_user(&kvm_sigmask, argp,
2696                                            sizeof kvm_sigmask))
2697                                 goto out;
2698                         r = -EINVAL;
2699                         if (kvm_sigmask.len != sizeof sigset)
2700                                 goto out;
2701                         r = -EFAULT;
2702                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2703                                            sizeof sigset))
2704                                 goto out;
2705                         p = &sigset;
2706                 }
2707                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2708                 break;
2709         }
2710         case KVM_GET_FPU: {
2711                 struct kvm_fpu fpu;
2712
2713                 memset(&fpu, 0, sizeof fpu);
2714                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2715                 if (r)
2716                         goto out;
2717                 r = -EFAULT;
2718                 if (copy_to_user(argp, &fpu, sizeof fpu))
2719                         goto out;
2720                 r = 0;
2721                 break;
2722         }
2723         case KVM_SET_FPU: {
2724                 struct kvm_fpu fpu;
2725
2726                 r = -EFAULT;
2727                 if (copy_from_user(&fpu, argp, sizeof fpu))
2728                         goto out;
2729                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2730                 if (r)
2731                         goto out;
2732                 r = 0;
2733                 break;
2734         }
2735         default:
2736                 ;
2737         }
2738 out:
2739         return r;
2740 }
2741
2742 static long kvm_vm_ioctl(struct file *filp,
2743                            unsigned int ioctl, unsigned long arg)
2744 {
2745         struct kvm *kvm = filp->private_data;
2746         void __user *argp = (void __user *)arg;
2747         int r = -EINVAL;
2748
2749         switch (ioctl) {
2750         case KVM_CREATE_VCPU:
2751                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2752                 if (r < 0)
2753                         goto out;
2754                 break;
2755         case KVM_SET_MEMORY_REGION: {
2756                 struct kvm_memory_region kvm_mem;
2757
2758                 r = -EFAULT;
2759                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2760                         goto out;
2761                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2762                 if (r)
2763                         goto out;
2764                 break;
2765         }
2766         case KVM_GET_DIRTY_LOG: {
2767                 struct kvm_dirty_log log;
2768
2769                 r = -EFAULT;
2770                 if (copy_from_user(&log, argp, sizeof log))
2771                         goto out;
2772                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2773                 if (r)
2774                         goto out;
2775                 break;
2776         }
2777         case KVM_SET_MEMORY_ALIAS: {
2778                 struct kvm_memory_alias alias;
2779
2780                 r = -EFAULT;
2781                 if (copy_from_user(&alias, argp, sizeof alias))
2782                         goto out;
2783                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2784                 if (r)
2785                         goto out;
2786                 break;
2787         }
2788         default:
2789                 ;
2790         }
2791 out:
2792         return r;
2793 }
2794
2795 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2796                                   unsigned long address,
2797                                   int *type)
2798 {
2799         struct kvm *kvm = vma->vm_file->private_data;
2800         unsigned long pgoff;
2801         struct page *page;
2802
2803         *type = VM_FAULT_MINOR;
2804         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2805         page = gfn_to_page(kvm, pgoff);
2806         if (!page)
2807                 return NOPAGE_SIGBUS;
2808         get_page(page);
2809         return page;
2810 }
2811
2812 static struct vm_operations_struct kvm_vm_vm_ops = {
2813         .nopage = kvm_vm_nopage,
2814 };
2815
2816 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2817 {
2818         vma->vm_ops = &kvm_vm_vm_ops;
2819         return 0;
2820 }
2821
2822 static struct file_operations kvm_vm_fops = {
2823         .release        = kvm_vm_release,
2824         .unlocked_ioctl = kvm_vm_ioctl,
2825         .compat_ioctl   = kvm_vm_ioctl,
2826         .mmap           = kvm_vm_mmap,
2827 };
2828
2829 static int kvm_dev_ioctl_create_vm(void)
2830 {
2831         int fd, r;
2832         struct inode *inode;
2833         struct file *file;
2834         struct kvm *kvm;
2835
2836         kvm = kvm_create_vm();
2837         if (IS_ERR(kvm))
2838                 return PTR_ERR(kvm);
2839         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
2840         if (r) {
2841                 kvm_destroy_vm(kvm);
2842                 return r;
2843         }
2844
2845         kvm->filp = file;
2846
2847         return fd;
2848 }
2849
2850 static long kvm_dev_ioctl(struct file *filp,
2851                           unsigned int ioctl, unsigned long arg)
2852 {
2853         void __user *argp = (void __user *)arg;
2854         long r = -EINVAL;
2855
2856         switch (ioctl) {
2857         case KVM_GET_API_VERSION:
2858                 r = -EINVAL;
2859                 if (arg)
2860                         goto out;
2861                 r = KVM_API_VERSION;
2862                 break;
2863         case KVM_CREATE_VM:
2864                 r = -EINVAL;
2865                 if (arg)
2866                         goto out;
2867                 r = kvm_dev_ioctl_create_vm();
2868                 break;
2869         case KVM_GET_MSR_INDEX_LIST: {
2870                 struct kvm_msr_list __user *user_msr_list = argp;
2871                 struct kvm_msr_list msr_list;
2872                 unsigned n;
2873
2874                 r = -EFAULT;
2875                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2876                         goto out;
2877                 n = msr_list.nmsrs;
2878                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2879                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2880                         goto out;
2881                 r = -E2BIG;
2882                 if (n < num_msrs_to_save)
2883                         goto out;
2884                 r = -EFAULT;
2885                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2886                                  num_msrs_to_save * sizeof(u32)))
2887                         goto out;
2888                 if (copy_to_user(user_msr_list->indices
2889                                  + num_msrs_to_save * sizeof(u32),
2890                                  &emulated_msrs,
2891                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2892                         goto out;
2893                 r = 0;
2894                 break;
2895         }
2896         case KVM_CHECK_EXTENSION:
2897                 /*
2898                  * No extensions defined at present.
2899                  */
2900                 r = 0;
2901                 break;
2902         case KVM_GET_VCPU_MMAP_SIZE:
2903                 r = -EINVAL;
2904                 if (arg)
2905                         goto out;
2906                 r = 2 * PAGE_SIZE;
2907                 break;
2908         default:
2909                 ;
2910         }
2911 out:
2912         return r;
2913 }
2914
2915 static struct file_operations kvm_chardev_ops = {
2916         .open           = kvm_dev_open,
2917         .release        = kvm_dev_release,
2918         .unlocked_ioctl = kvm_dev_ioctl,
2919         .compat_ioctl   = kvm_dev_ioctl,
2920 };
2921
2922 static struct miscdevice kvm_dev = {
2923         KVM_MINOR,
2924         "kvm",
2925         &kvm_chardev_ops,
2926 };
2927
2928 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2929                        void *v)
2930 {
2931         if (val == SYS_RESTART) {
2932                 /*
2933                  * Some (well, at least mine) BIOSes hang on reboot if
2934                  * in vmx root mode.
2935                  */
2936                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2937                 on_each_cpu(hardware_disable, NULL, 0, 1);
2938         }
2939         return NOTIFY_OK;
2940 }
2941
2942 static struct notifier_block kvm_reboot_notifier = {
2943         .notifier_call = kvm_reboot,
2944         .priority = 0,
2945 };
2946
2947 /*
2948  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2949  * cached on it.
2950  */
2951 static void decache_vcpus_on_cpu(int cpu)
2952 {
2953         struct kvm *vm;
2954         struct kvm_vcpu *vcpu;
2955         int i;
2956
2957         spin_lock(&kvm_lock);
2958         list_for_each_entry(vm, &vm_list, vm_list)
2959                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2960                         vcpu = &vm->vcpus[i];
2961                         /*
2962                          * If the vcpu is locked, then it is running on some
2963                          * other cpu and therefore it is not cached on the
2964                          * cpu in question.
2965                          *
2966                          * If it's not locked, check the last cpu it executed
2967                          * on.
2968                          */
2969                         if (mutex_trylock(&vcpu->mutex)) {
2970                                 if (vcpu->cpu == cpu) {
2971                                         kvm_arch_ops->vcpu_decache(vcpu);
2972                                         vcpu->cpu = -1;
2973                                 }
2974                                 mutex_unlock(&vcpu->mutex);
2975                         }
2976                 }
2977         spin_unlock(&kvm_lock);
2978 }
2979
2980 static void hardware_enable(void *junk)
2981 {
2982         int cpu = raw_smp_processor_id();
2983
2984         if (cpu_isset(cpu, cpus_hardware_enabled))
2985                 return;
2986         cpu_set(cpu, cpus_hardware_enabled);
2987         kvm_arch_ops->hardware_enable(NULL);
2988 }
2989
2990 static void hardware_disable(void *junk)
2991 {
2992         int cpu = raw_smp_processor_id();
2993
2994         if (!cpu_isset(cpu, cpus_hardware_enabled))
2995                 return;
2996         cpu_clear(cpu, cpus_hardware_enabled);
2997         decache_vcpus_on_cpu(cpu);
2998         kvm_arch_ops->hardware_disable(NULL);
2999 }
3000
3001 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3002                            void *v)
3003 {
3004         int cpu = (long)v;
3005
3006         switch (val) {
3007         case CPU_DYING:
3008         case CPU_DYING_FROZEN:
3009         case CPU_UP_CANCELED:
3010         case CPU_UP_CANCELED_FROZEN:
3011                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3012                        cpu);
3013                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3014                 break;
3015         case CPU_ONLINE:
3016         case CPU_ONLINE_FROZEN:
3017                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3018                        cpu);
3019                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3020                 break;
3021         }
3022         return NOTIFY_OK;
3023 }
3024
3025 void kvm_io_bus_init(struct kvm_io_bus *bus)
3026 {
3027         memset(bus, 0, sizeof(*bus));
3028 }
3029
3030 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3031 {
3032         int i;
3033
3034         for (i = 0; i < bus->dev_count; i++) {
3035                 struct kvm_io_device *pos = bus->devs[i];
3036
3037                 kvm_iodevice_destructor(pos);
3038         }
3039 }
3040
3041 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3042 {
3043         int i;
3044
3045         for (i = 0; i < bus->dev_count; i++) {
3046                 struct kvm_io_device *pos = bus->devs[i];
3047
3048                 if (pos->in_range(pos, addr))
3049                         return pos;
3050         }
3051
3052         return NULL;
3053 }
3054
3055 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3056 {
3057         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3058
3059         bus->devs[bus->dev_count++] = dev;
3060 }
3061
3062 static struct notifier_block kvm_cpu_notifier = {
3063         .notifier_call = kvm_cpu_hotplug,
3064         .priority = 20, /* must be > scheduler priority */
3065 };
3066
3067 static u64 stat_get(void *_offset)
3068 {
3069         unsigned offset = (long)_offset;
3070         u64 total = 0;
3071         struct kvm *kvm;
3072         struct kvm_vcpu *vcpu;
3073         int i;
3074
3075         spin_lock(&kvm_lock);
3076         list_for_each_entry(kvm, &vm_list, vm_list)
3077                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3078                         vcpu = &kvm->vcpus[i];
3079                         total += *(u32 *)((void *)vcpu + offset);
3080                 }
3081         spin_unlock(&kvm_lock);
3082         return total;
3083 }
3084
3085 static void stat_set(void *offset, u64 val)
3086 {
3087 }
3088
3089 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3090
3091 static __init void kvm_init_debug(void)
3092 {
3093         struct kvm_stats_debugfs_item *p;
3094
3095         debugfs_dir = debugfs_create_dir("kvm", NULL);
3096         for (p = debugfs_entries; p->name; ++p)
3097                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3098                                                 (void *)(long)p->offset,
3099                                                 &stat_fops);
3100 }
3101
3102 static void kvm_exit_debug(void)
3103 {
3104         struct kvm_stats_debugfs_item *p;
3105
3106         for (p = debugfs_entries; p->name; ++p)
3107                 debugfs_remove(p->dentry);
3108         debugfs_remove(debugfs_dir);
3109 }
3110
3111 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3112 {
3113         hardware_disable(NULL);
3114         return 0;
3115 }
3116
3117 static int kvm_resume(struct sys_device *dev)
3118 {
3119         hardware_enable(NULL);
3120         return 0;
3121 }
3122
3123 static struct sysdev_class kvm_sysdev_class = {
3124         set_kset_name("kvm"),
3125         .suspend = kvm_suspend,
3126         .resume = kvm_resume,
3127 };
3128
3129 static struct sys_device kvm_sysdev = {
3130         .id = 0,
3131         .cls = &kvm_sysdev_class,
3132 };
3133
3134 hpa_t bad_page_address;
3135
3136 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3137 {
3138         int r;
3139
3140         if (kvm_arch_ops) {
3141                 printk(KERN_ERR "kvm: already loaded the other module\n");
3142                 return -EEXIST;
3143         }
3144
3145         if (!ops->cpu_has_kvm_support()) {
3146                 printk(KERN_ERR "kvm: no hardware support\n");
3147                 return -EOPNOTSUPP;
3148         }
3149         if (ops->disabled_by_bios()) {
3150                 printk(KERN_ERR "kvm: disabled by bios\n");
3151                 return -EOPNOTSUPP;
3152         }
3153
3154         kvm_arch_ops = ops;
3155
3156         r = kvm_arch_ops->hardware_setup();
3157         if (r < 0)
3158                 goto out;
3159
3160         on_each_cpu(hardware_enable, NULL, 0, 1);
3161         r = register_cpu_notifier(&kvm_cpu_notifier);
3162         if (r)
3163                 goto out_free_1;
3164         register_reboot_notifier(&kvm_reboot_notifier);
3165
3166         r = sysdev_class_register(&kvm_sysdev_class);
3167         if (r)
3168                 goto out_free_2;
3169
3170         r = sysdev_register(&kvm_sysdev);
3171         if (r)
3172                 goto out_free_3;
3173
3174         kvm_chardev_ops.owner = module;
3175
3176         r = misc_register(&kvm_dev);
3177         if (r) {
3178                 printk (KERN_ERR "kvm: misc device register failed\n");
3179                 goto out_free;
3180         }
3181
3182         return r;
3183
3184 out_free:
3185         sysdev_unregister(&kvm_sysdev);
3186 out_free_3:
3187         sysdev_class_unregister(&kvm_sysdev_class);
3188 out_free_2:
3189         unregister_reboot_notifier(&kvm_reboot_notifier);
3190         unregister_cpu_notifier(&kvm_cpu_notifier);
3191 out_free_1:
3192         on_each_cpu(hardware_disable, NULL, 0, 1);
3193         kvm_arch_ops->hardware_unsetup();
3194 out:
3195         kvm_arch_ops = NULL;
3196         return r;
3197 }
3198
3199 void kvm_exit_arch(void)
3200 {
3201         misc_deregister(&kvm_dev);
3202         sysdev_unregister(&kvm_sysdev);
3203         sysdev_class_unregister(&kvm_sysdev_class);
3204         unregister_reboot_notifier(&kvm_reboot_notifier);
3205         unregister_cpu_notifier(&kvm_cpu_notifier);
3206         on_each_cpu(hardware_disable, NULL, 0, 1);
3207         kvm_arch_ops->hardware_unsetup();
3208         kvm_arch_ops = NULL;
3209 }
3210
3211 static __init int kvm_init(void)
3212 {
3213         static struct page *bad_page;
3214         int r;
3215
3216         r = kvm_mmu_module_init();
3217         if (r)
3218                 goto out4;
3219
3220         kvm_init_debug();
3221
3222         kvm_init_msr_list();
3223
3224         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3225                 r = -ENOMEM;
3226                 goto out;
3227         }
3228
3229         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3230         memset(__va(bad_page_address), 0, PAGE_SIZE);
3231
3232         return 0;
3233
3234 out:
3235         kvm_exit_debug();
3236         kvm_mmu_module_exit();
3237 out4:
3238         return r;
3239 }
3240
3241 static __exit void kvm_exit(void)
3242 {
3243         kvm_exit_debug();
3244         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3245         kvm_mmu_module_exit();
3246 }
3247
3248 module_init(kvm_init)
3249 module_exit(kvm_exit)
3250
3251 EXPORT_SYMBOL_GPL(kvm_init_arch);
3252 EXPORT_SYMBOL_GPL(kvm_exit_arch);