linux-2.6: arch/x86/kvm/svm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Yaniv Kamay  <yaniv@qumranet.com>
10  *   Avi Kivity   <avi@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16 #include <linux/kvm_host.h>
17
18 #include "kvm_svm.h"
19 #include "irq.h"
20 #include "mmu.h"
21 #include "kvm_cache_regs.h"
22
23 #include <linux/module.h>
24 #include <linux/kernel.h>
25 #include <linux/vmalloc.h>
26 #include <linux/highmem.h>
27 #include <linux/sched.h>
28
29 #include <asm/desc.h>
30
31 #include <asm/virtext.h>
32
33 #define __ex(x) __kvm_handle_fault_on_reboot(x)
34
35 MODULE_AUTHOR("Qumranet");
36 MODULE_LICENSE("GPL");
37
38 #define IOPM_ALLOC_ORDER 2
39 #define MSRPM_ALLOC_ORDER 1
40
41 #define SEG_TYPE_LDT 2
42 #define SEG_TYPE_BUSY_TSS16 3
43
44 #define SVM_FEATURE_NPT  (1 << 0)
45 #define SVM_FEATURE_LBRV (1 << 1)
46 #define SVM_FEATURE_SVML (1 << 2)
47
48 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
49
50 /* Turn on to get debugging output */
51 /* #define NESTED_DEBUG */
52
53 #ifdef NESTED_DEBUG
54 #define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
55 #else
56 #define nsvm_printk(fmt, args...) do {} while (0)
57 #endif
58
59 /* enable NPT for AMD64 and X86 with PAE */
60 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 static bool npt_enabled = true;
62 #else
63 static bool npt_enabled = false;
64 #endif
65 static int npt = 1;
66
67 module_param(npt, int, S_IRUGO);
68
69 static int nested = 0;
70 module_param(nested, int, S_IRUGO);
71
72 static void kvm_reput_irq(struct vcpu_svm *svm);
73 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
74
75 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
76 static int nested_svm_vmexit(struct vcpu_svm *svm);
77 static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
78                              void *arg2, void *opaque);
79 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
80                                       bool has_error_code, u32 error_code);
81
82 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
83 {
84         return container_of(vcpu, struct vcpu_svm, vcpu);
85 }
86
87 static inline bool is_nested(struct vcpu_svm *svm)
88 {
89         return svm->nested_vmcb;
90 }
91
92 static unsigned long iopm_base;
93
94 struct kvm_ldttss_desc {
95         u16 limit0;
96         u16 base0;
97         unsigned base1 : 8, type : 5, dpl : 2, p : 1;
98         unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
99         u32 base3;
100         u32 zero1;
101 } __attribute__((packed));
102
103 struct svm_cpu_data {
104         int cpu;
105
106         u64 asid_generation;
107         u32 max_asid;
108         u32 next_asid;
109         struct kvm_ldttss_desc *tss_desc;
110
111         struct page *save_area;
112 };
113
114 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
115 static uint32_t svm_features;
116
117 struct svm_init_data {
118         int cpu;
119         int r;
120 };
121
122 static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
123
124 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
125 #define MSRS_RANGE_SIZE 2048
126 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
127
128 #define MAX_INST_SIZE 15
129
130 static inline u32 svm_has(u32 feat)
131 {
132         return svm_features & feat;
133 }
134
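/*
 * Dequeue the lowest-numbered pending interrupt: clear its bit in
 * irq_pending and drop the summary bit once that word runs empty.
 */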
135 static inline u8 pop_irq(struct kvm_vcpu *vcpu)
136 {
137         int word_index = __ffs(vcpu->arch.irq_summary);
138         int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
139         int irq = word_index * BITS_PER_LONG + bit_index;
140
141         clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
142         if (!vcpu->arch.irq_pending[word_index])
143                 clear_bit(word_index, &vcpu->arch.irq_summary);
144         return irq;
145 }
146
147 static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
148 {
149         set_bit(irq, vcpu->arch.irq_pending);
150         set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
151 }
152
153 static inline void clgi(void)
154 {
155         asm volatile (__ex(SVM_CLGI));
156 }
157
158 static inline void stgi(void)
159 {
160         asm volatile (__ex(SVM_STGI));
161 }
162
163 static inline void invlpga(unsigned long addr, u32 asid)
164 {
165         asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
166 }
167
168 static inline unsigned long kvm_read_cr2(void)
169 {
170         unsigned long cr2;
171
172         asm volatile ("mov %%cr2, %0" : "=r" (cr2));
173         return cr2;
174 }
175
176 static inline void kvm_write_cr2(unsigned long val)
177 {
178         asm volatile ("mov %0, %%cr2" :: "r" (val));
179 }
180
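/*
 * Knocking the vcpu's ASID generation out of sync with the per-cpu
 * generation forces a fresh ASID to be assigned before the next guest
 * entry, which is how the guest TLB gets flushed.
 */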
181 static inline void force_new_asid(struct kvm_vcpu *vcpu)
182 {
183         to_svm(vcpu)->asid_generation--;
184 }
185
186 static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
187 {
188         force_new_asid(vcpu);
189 }
190
191 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
192 {
193         if (!npt_enabled && !(efer & EFER_LMA))
194                 efer &= ~EFER_LME;
195
196         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
197         vcpu->arch.shadow_efer = efer;
198 }
199
200 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
201                                 bool has_error_code, u32 error_code)
202 {
203         struct vcpu_svm *svm = to_svm(vcpu);
204
205         /* If we are within a nested VM we'd better #VMEXIT and let the
206            guest handle the exception */
207         if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
208                 return;
209
210         svm->vmcb->control.event_inj = nr
211                 | SVM_EVTINJ_VALID
212                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
213                 | SVM_EVTINJ_TYPE_EXEPT;
214         svm->vmcb->control.event_inj_err = error_code;
215 }
216
217 static bool svm_exception_injected(struct kvm_vcpu *vcpu)
218 {
219         struct vcpu_svm *svm = to_svm(vcpu);
220
221         return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
222 }
223
224 static int is_external_interrupt(u32 info)
225 {
226         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
227         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
228 }
229
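/*
 * Advance the guest rip past the intercepted instruction using the
 * next_rip value filled in by the exit handlers, and clear the
 * interrupt shadow so pending interrupts can be delivered again.
 */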
230 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
231 {
232         struct vcpu_svm *svm = to_svm(vcpu);
233
234         if (!svm->next_rip) {
235                 printk(KERN_DEBUG "%s: NOP\n", __func__);
236                 return;
237         }
238         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
239                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
240                        __func__, kvm_rip_read(vcpu), svm->next_rip);
241
242         kvm_rip_write(vcpu, svm->next_rip);
243         svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
244
245         vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246 }
247
248 static int has_svm(void)
249 {
250         const char *msg;
251
252         if (!cpu_has_svm(&msg)) {
253                 printk(KERN_INFO "has_svm: %s\n", msg);
254                 return 0;
255         }
256
257         return 1;
258 }
259
260 static void svm_hardware_disable(void *garbage)
261 {
262         cpu_svm_disable();
263 }
264
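/*
 * Enable SVM on this CPU: initialize the per-cpu ASID bookkeeping,
 * remember the TSS descriptor from the GDT, set EFER.SVME and point
 * MSR_VM_HSAVE_PA at the per-cpu host save area.
 */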
265 static void svm_hardware_enable(void *garbage)
266 {
267
268         struct svm_cpu_data *svm_data;
269         uint64_t efer;
270         struct desc_ptr gdt_descr;
271         struct desc_struct *gdt;
272         int me = raw_smp_processor_id();
273
274         if (!has_svm()) {
275                 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
276                 return;
277         }
278         svm_data = per_cpu(svm_data, me);
279
280         if (!svm_data) {
281                 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
282                        me);
283                 return;
284         }
285
286         svm_data->asid_generation = 1;
287         svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
288         svm_data->next_asid = svm_data->max_asid + 1;
289
290         asm volatile ("sgdt %0" : "=m"(gdt_descr));
291         gdt = (struct desc_struct *)gdt_descr.address;
292         svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
293
294         rdmsrl(MSR_EFER, efer);
295         wrmsrl(MSR_EFER, efer | EFER_SVME);
296
297         wrmsrl(MSR_VM_HSAVE_PA,
298                page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
299 }
300
301 static void svm_cpu_uninit(int cpu)
302 {
303         struct svm_cpu_data *svm_data
304                 = per_cpu(svm_data, raw_smp_processor_id());
305
306         if (!svm_data)
307                 return;
308
309         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
310         __free_page(svm_data->save_area);
311         kfree(svm_data);
312 }
313
314 static int svm_cpu_init(int cpu)
315 {
316         struct svm_cpu_data *svm_data;
317         int r;
318
319         svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
320         if (!svm_data)
321                 return -ENOMEM;
322         svm_data->cpu = cpu;
323         svm_data->save_area = alloc_page(GFP_KERNEL);
324         r = -ENOMEM;
325         if (!svm_data->save_area)
326                 goto err_1;
327
328         per_cpu(svm_data, cpu) = svm_data;
329
330         return 0;
331
332 err_1:
333         kfree(svm_data);
334         return r;
335
336 }
337
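/*
 * Each MSR gets two consecutive bits in the permission map: the low
 * bit intercepts reads, the high bit intercepts writes.  Passing
 * read=1/write=1 clears both bits so the guest accesses the MSR
 * directly without a #VMEXIT.
 */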
338 static void set_msr_interception(u32 *msrpm, unsigned msr,
339                                  int read, int write)
340 {
341         int i;
342
343         for (i = 0; i < NUM_MSR_MAPS; i++) {
344                 if (msr >= msrpm_ranges[i] &&
345                     msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
346                         u32 msr_offset = (i * MSRS_IN_RANGE + msr -
347                                           msrpm_ranges[i]) * 2;
348
349                         u32 *base = msrpm + (msr_offset / 32);
350                         u32 msr_shift = msr_offset % 32;
351                         u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
352                         *base = (*base & ~(0x3 << msr_shift)) |
353                                 (mask << msr_shift);
354                         return;
355                 }
356         }
357         BUG();
358 }
359
360 static void svm_vcpu_init_msrpm(u32 *msrpm)
361 {
362         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
363
364 #ifdef CONFIG_X86_64
365         set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
366         set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
367         set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
368         set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
369         set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
370         set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
371 #endif
372         set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
373         set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
374         set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
375         set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
376 }
377
378 static void svm_enable_lbrv(struct vcpu_svm *svm)
379 {
380         u32 *msrpm = svm->msrpm;
381
382         svm->vmcb->control.lbr_ctl = 1;
383         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
384         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
385         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
386         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
387 }
388
389 static void svm_disable_lbrv(struct vcpu_svm *svm)
390 {
391         u32 *msrpm = svm->msrpm;
392
393         svm->vmcb->control.lbr_ctl = 0;
394         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
395         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
396         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
397         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
398 }
399
400 static __init int svm_hardware_setup(void)
401 {
402         int cpu;
403         struct page *iopm_pages;
404         void *iopm_va;
405         int r;
406
407         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
408
409         if (!iopm_pages)
410                 return -ENOMEM;
411
412         iopm_va = page_address(iopm_pages);
413         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
414         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
415
416         if (boot_cpu_has(X86_FEATURE_NX))
417                 kvm_enable_efer_bits(EFER_NX);
418
419         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
420                 kvm_enable_efer_bits(EFER_FFXSR);
421
422         if (nested) {
423                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
424                 kvm_enable_efer_bits(EFER_SVME);
425         }
426
427         for_each_online_cpu(cpu) {
428                 r = svm_cpu_init(cpu);
429                 if (r)
430                         goto err;
431         }
432
433         svm_features = cpuid_edx(SVM_CPUID_FUNC);
434
435         if (!svm_has(SVM_FEATURE_NPT))
436                 npt_enabled = false;
437
438         if (npt_enabled && !npt) {
439                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
440                 npt_enabled = false;
441         }
442
443         if (npt_enabled) {
444                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
445                 kvm_enable_tdp();
446         } else
447                 kvm_disable_tdp();
448
449         return 0;
450
451 err:
452         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
453         iopm_base = 0;
454         return r;
455 }
456
457 static __exit void svm_hardware_unsetup(void)
458 {
459         int cpu;
460
461         for_each_online_cpu(cpu)
462                 svm_cpu_uninit(cpu);
463
464         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
465         iopm_base = 0;
466 }
467
468 static void init_seg(struct vmcb_seg *seg)
469 {
470         seg->selector = 0;
471         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
472                 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
473         seg->limit = 0xffff;
474         seg->base = 0;
475 }
476
477 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
478 {
479         seg->selector = 0;
480         seg->attrib = SVM_SELECTOR_P_MASK | type;
481         seg->limit = 0xffff;
482         seg->base = 0;
483 }
484
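/*
 * Build the initial guest state: program the CR/DR/exception and
 * instruction intercepts, point the VMCB at the I/O and MSR permission
 * maps, and put the segment and control registers into their reset
 * values.  With nested paging enabled, the intercepts that only exist
 * to support shadow paging (#PF, INVLPG, CR0/CR3 accesses) are dropped
 * again.
 */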
485 static void init_vmcb(struct vcpu_svm *svm)
486 {
487         struct vmcb_control_area *control = &svm->vmcb->control;
488         struct vmcb_save_area *save = &svm->vmcb->save;
489
490         control->intercept_cr_read =    INTERCEPT_CR0_MASK |
491                                         INTERCEPT_CR3_MASK |
492                                         INTERCEPT_CR4_MASK;
493
494         control->intercept_cr_write =   INTERCEPT_CR0_MASK |
495                                         INTERCEPT_CR3_MASK |
496                                         INTERCEPT_CR4_MASK |
497                                         INTERCEPT_CR8_MASK;
498
499         control->intercept_dr_read =    INTERCEPT_DR0_MASK |
500                                         INTERCEPT_DR1_MASK |
501                                         INTERCEPT_DR2_MASK |
502                                         INTERCEPT_DR3_MASK;
503
504         control->intercept_dr_write =   INTERCEPT_DR0_MASK |
505                                         INTERCEPT_DR1_MASK |
506                                         INTERCEPT_DR2_MASK |
507                                         INTERCEPT_DR3_MASK |
508                                         INTERCEPT_DR5_MASK |
509                                         INTERCEPT_DR7_MASK;
510
511         control->intercept_exceptions = (1 << PF_VECTOR) |
512                                         (1 << UD_VECTOR) |
513                                         (1 << MC_VECTOR);
514
515
516         control->intercept =    (1ULL << INTERCEPT_INTR) |
517                                 (1ULL << INTERCEPT_NMI) |
518                                 (1ULL << INTERCEPT_SMI) |
519                                 (1ULL << INTERCEPT_CPUID) |
520                                 (1ULL << INTERCEPT_INVD) |
521                                 (1ULL << INTERCEPT_HLT) |
522                                 (1ULL << INTERCEPT_INVLPG) |
523                                 (1ULL << INTERCEPT_INVLPGA) |
524                                 (1ULL << INTERCEPT_IOIO_PROT) |
525                                 (1ULL << INTERCEPT_MSR_PROT) |
526                                 (1ULL << INTERCEPT_TASK_SWITCH) |
527                                 (1ULL << INTERCEPT_SHUTDOWN) |
528                                 (1ULL << INTERCEPT_VMRUN) |
529                                 (1ULL << INTERCEPT_VMMCALL) |
530                                 (1ULL << INTERCEPT_VMLOAD) |
531                                 (1ULL << INTERCEPT_VMSAVE) |
532                                 (1ULL << INTERCEPT_STGI) |
533                                 (1ULL << INTERCEPT_CLGI) |
534                                 (1ULL << INTERCEPT_SKINIT) |
535                                 (1ULL << INTERCEPT_WBINVD) |
536                                 (1ULL << INTERCEPT_MONITOR) |
537                                 (1ULL << INTERCEPT_MWAIT);
538
539         control->iopm_base_pa = iopm_base;
540         control->msrpm_base_pa = __pa(svm->msrpm);
541         control->tsc_offset = 0;
542         control->int_ctl = V_INTR_MASKING_MASK;
543
544         init_seg(&save->es);
545         init_seg(&save->ss);
546         init_seg(&save->ds);
547         init_seg(&save->fs);
548         init_seg(&save->gs);
549
550         save->cs.selector = 0xf000;
551         /* Executable/Readable Code Segment */
552         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
553                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
554         save->cs.limit = 0xffff;
555         /*
556          * cs.base should really be 0xffff0000, but vmx can't handle that, so
557          * be consistent with it.
558          *
559          * Replace when we have real mode working for vmx.
560          */
561         save->cs.base = 0xf0000;
562
563         save->gdtr.limit = 0xffff;
564         save->idtr.limit = 0xffff;
565
566         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
567         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
568
569         save->efer = EFER_SVME;
570         save->dr6 = 0xffff0ff0;
571         save->dr7 = 0x400;
572         save->rflags = 2;
573         save->rip = 0x0000fff0;
574         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
575
576         /*
577          * The cr0 value at cpu init should be 0x60000010; we enable the cpu
578          * cache by default. The orderly way is to enable the cache in the BIOS.
579          */
580         save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
581         save->cr4 = X86_CR4_PAE;
582         /* rdx = ?? */
583
584         if (npt_enabled) {
585                 /* Setup VMCB for Nested Paging */
586                 control->nested_ctl = 1;
587                 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
588                                         (1ULL << INTERCEPT_INVLPG));
589                 control->intercept_exceptions &= ~(1 << PF_VECTOR);
590                 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
591                                                 INTERCEPT_CR3_MASK);
592                 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
593                                                  INTERCEPT_CR3_MASK);
594                 save->g_pat = 0x0007040600070406ULL;
595                 /* enable caching because the QEMU BIOS doesn't enable it */
596                 save->cr0 = X86_CR0_ET;
597                 save->cr3 = 0;
598                 save->cr4 = 0;
599         }
600         force_new_asid(&svm->vcpu);
601
602         svm->nested_vmcb = 0;
603         svm->vcpu.arch.hflags = HF_GIF_MASK;
604 }
605
606 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
607 {
608         struct vcpu_svm *svm = to_svm(vcpu);
609
610         init_vmcb(svm);
611
612         if (vcpu->vcpu_id != 0) {
613                 kvm_rip_write(vcpu, 0);
614                 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
615                 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
616         }
617         vcpu->arch.regs_avail = ~0;
618         vcpu->arch.regs_dirty = ~0;
619
620         return 0;
621 }
622
623 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
624 {
625         struct vcpu_svm *svm;
626         struct page *page;
627         struct page *msrpm_pages;
628         struct page *hsave_page;
629         struct page *nested_msrpm_pages;
630         int err;
631
632         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
633         if (!svm) {
634                 err = -ENOMEM;
635                 goto out;
636         }
637
638         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
639         if (err)
640                 goto free_svm;
641
642         page = alloc_page(GFP_KERNEL);
643         if (!page) {
644                 err = -ENOMEM;
645                 goto uninit;
646         }
647
648         err = -ENOMEM;
649         msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
650         if (!msrpm_pages)
651                 goto uninit;
652
653         nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
654         if (!nested_msrpm_pages)
655                 goto uninit;
656
657         svm->msrpm = page_address(msrpm_pages);
658         svm_vcpu_init_msrpm(svm->msrpm);
659
660         hsave_page = alloc_page(GFP_KERNEL);
661         if (!hsave_page)
662                 goto uninit;
663         svm->hsave = page_address(hsave_page);
664
665         svm->nested_msrpm = page_address(nested_msrpm_pages);
666
667         svm->vmcb = page_address(page);
668         clear_page(svm->vmcb);
669         svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
670         svm->asid_generation = 0;
671         init_vmcb(svm);
672
673         fx_init(&svm->vcpu);
674         svm->vcpu.fpu_active = 1;
675         svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
676         if (svm->vcpu.vcpu_id == 0)
677                 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
678
679         return &svm->vcpu;
680
681 uninit:
682         kvm_vcpu_uninit(&svm->vcpu);
683 free_svm:
684         kmem_cache_free(kvm_vcpu_cache, svm);
685 out:
686         return ERR_PTR(err);
687 }
688
689 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
690 {
691         struct vcpu_svm *svm = to_svm(vcpu);
692
693         __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
694         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
695         __free_page(virt_to_page(svm->hsave));
696         __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
697         kvm_vcpu_uninit(vcpu);
698         kmem_cache_free(kvm_vcpu_cache, svm);
699 }
700
701 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
702 {
703         struct vcpu_svm *svm = to_svm(vcpu);
704         int i;
705
706         if (unlikely(cpu != vcpu->cpu)) {
707                 u64 tsc_this, delta;
708
709                 /*
710                  * Make sure that the guest sees a monotonically
711                  * increasing TSC.
712                  */
713                 rdtscll(tsc_this);
714                 delta = vcpu->arch.host_tsc - tsc_this;
715                 svm->vmcb->control.tsc_offset += delta;
716                 vcpu->cpu = cpu;
717                 kvm_migrate_timers(vcpu);
718         }
719
720         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
721                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
722 }
723
724 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
725 {
726         struct vcpu_svm *svm = to_svm(vcpu);
727         int i;
728
729         ++vcpu->stat.host_state_reload;
730         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
731                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
732
733         rdtscll(vcpu->arch.host_tsc);
734 }
735
736 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
737 {
738         return to_svm(vcpu)->vmcb->save.rflags;
739 }
740
741 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
742 {
743         to_svm(vcpu)->vmcb->save.rflags = rflags;
744 }
745
746 static void svm_set_vintr(struct vcpu_svm *svm)
747 {
748         svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
749 }
750
751 static void svm_clear_vintr(struct vcpu_svm *svm)
752 {
753         svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
754 }
755
756 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
757 {
758         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
759
760         switch (seg) {
761         case VCPU_SREG_CS: return &save->cs;
762         case VCPU_SREG_DS: return &save->ds;
763         case VCPU_SREG_ES: return &save->es;
764         case VCPU_SREG_FS: return &save->fs;
765         case VCPU_SREG_GS: return &save->gs;
766         case VCPU_SREG_SS: return &save->ss;
767         case VCPU_SREG_TR: return &save->tr;
768         case VCPU_SREG_LDTR: return &save->ldtr;
769         }
770         BUG();
771         return NULL;
772 }
773
774 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
775 {
776         struct vmcb_seg *s = svm_seg(vcpu, seg);
777
778         return s->base;
779 }
780
781 static void svm_get_segment(struct kvm_vcpu *vcpu,
782                             struct kvm_segment *var, int seg)
783 {
784         struct vmcb_seg *s = svm_seg(vcpu, seg);
785
786         var->base = s->base;
787         var->limit = s->limit;
788         var->selector = s->selector;
789         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
790         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
791         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
792         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
793         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
794         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
795         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
796         var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
797
798         /* AMD's VMCB does not have an explicit unusable field, so emulate it
799          * for cross-vendor migration purposes by treating "not present" as unusable.
800          */
801         var->unusable = !var->present || (var->type == 0);
802
803         switch (seg) {
804         case VCPU_SREG_CS:
805                 /*
806                  * SVM always stores 0 for the 'G' bit in the CS selector in
807                  * the VMCB on a VMEXIT. This hurts cross-vendor migration:
808                  * Intel's VMENTRY has a check on the 'G' bit.
809                  */
810                 var->g = s->limit > 0xfffff;
811                 break;
812         case VCPU_SREG_TR:
813                 /*
814                  * Work around a bug where the busy flag in the tr selector
815                  * isn't exposed
816                  */
817                 var->type |= 0x2;
818                 break;
819         case VCPU_SREG_DS:
820         case VCPU_SREG_ES:
821         case VCPU_SREG_FS:
822         case VCPU_SREG_GS:
823                 /*
824                  * The accessed bit must always be set in the segment
825                  * descriptor cache: although it can be cleared in the
826                  * descriptor itself, the cached bit always remains 1. Since
827                  * Intel has a check on this, set it here to support
828                  * cross-vendor migration.
829                  */
830                 if (!var->unusable)
831                         var->type |= 0x1;
832                 break;
833         }
834 }
835
836 static int svm_get_cpl(struct kvm_vcpu *vcpu)
837 {
838         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
839
840         return save->cpl;
841 }
842
843 static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
844 {
845         struct vcpu_svm *svm = to_svm(vcpu);
846
847         dt->limit = svm->vmcb->save.idtr.limit;
848         dt->base = svm->vmcb->save.idtr.base;
849 }
850
851 static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
852 {
853         struct vcpu_svm *svm = to_svm(vcpu);
854
855         svm->vmcb->save.idtr.limit = dt->limit;
856         svm->vmcb->save.idtr.base = dt->base;
857 }
858
859 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
860 {
861         struct vcpu_svm *svm = to_svm(vcpu);
862
863         dt->limit = svm->vmcb->save.gdtr.limit;
864         dt->base = svm->vmcb->save.gdtr.base;
865 }
866
867 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
868 {
869         struct vcpu_svm *svm = to_svm(vcpu);
870
871         svm->vmcb->save.gdtr.limit = dt->limit;
872         svm->vmcb->save.gdtr.base = dt->base;
873 }
874
875 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
876 {
877 }
878
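/*
 * Propagate a guest CR0 write into the VMCB.  In long mode EFER.LMA is
 * toggled as paging is switched on or off; without nested paging PG/WP
 * are forced on and the #NM intercept is used for lazy FPU switching.
 */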
879 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
880 {
881         struct vcpu_svm *svm = to_svm(vcpu);
882
883 #ifdef CONFIG_X86_64
884         if (vcpu->arch.shadow_efer & EFER_LME) {
885                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
886                         vcpu->arch.shadow_efer |= EFER_LMA;
887                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
888                 }
889
890                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
891                         vcpu->arch.shadow_efer &= ~EFER_LMA;
892                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
893                 }
894         }
895 #endif
896         if (npt_enabled)
897                 goto set;
898
899         if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
900                 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
901                 vcpu->fpu_active = 1;
902         }
903
904         vcpu->arch.cr0 = cr0;
905         cr0 |= X86_CR0_PG | X86_CR0_WP;
906         if (!vcpu->fpu_active) {
907                 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
908                 cr0 |= X86_CR0_TS;
909         }
910 set:
911         /*
912          * Re-enable caching here because the QEMU BIOS
913          * does not do it; leaving it disabled results in
914          * some delay at reboot.
915          */
916         cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
917         svm->vmcb->save.cr0 = cr0;
918 }
919
920 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
921 {
922         unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
923         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
924
925         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
926                 force_new_asid(vcpu);
927
928         vcpu->arch.cr4 = cr4;
929         if (!npt_enabled)
930                 cr4 |= X86_CR4_PAE;
931         cr4 |= host_cr4_mce;
932         to_svm(vcpu)->vmcb->save.cr4 = cr4;
933 }
934
935 static void svm_set_segment(struct kvm_vcpu *vcpu,
936                             struct kvm_segment *var, int seg)
937 {
938         struct vcpu_svm *svm = to_svm(vcpu);
939         struct vmcb_seg *s = svm_seg(vcpu, seg);
940
941         s->base = var->base;
942         s->limit = var->limit;
943         s->selector = var->selector;
944         if (var->unusable)
945                 s->attrib = 0;
946         else {
947                 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
948                 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
949                 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
950                 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
951                 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
952                 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
953                 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
954                 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
955         }
956         if (seg == VCPU_SREG_CS)
957                 svm->vmcb->save.cpl
958                         = (svm->vmcb->save.cs.attrib
959                            >> SVM_SELECTOR_DPL_SHIFT) & 3;
960
961 }
962
963 static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
964 {
965         int old_debug = vcpu->guest_debug;
966         struct vcpu_svm *svm = to_svm(vcpu);
967
968         vcpu->guest_debug = dbg->control;
969
970         svm->vmcb->control.intercept_exceptions &=
971                 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
972         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
973                 if (vcpu->guest_debug &
974                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
975                         svm->vmcb->control.intercept_exceptions |=
976                                 1 << DB_VECTOR;
977                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
978                         svm->vmcb->control.intercept_exceptions |=
979                                 1 << BP_VECTOR;
980         } else
981                 vcpu->guest_debug = 0;
982
983         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
984                 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
985         else
986                 svm->vmcb->save.dr7 = vcpu->arch.dr7;
987
988         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
989                 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
990         else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
991                 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
992
993         return 0;
994 }
995
996 static int svm_get_irq(struct kvm_vcpu *vcpu)
997 {
998         struct vcpu_svm *svm = to_svm(vcpu);
999         u32 exit_int_info = svm->vmcb->control.exit_int_info;
1000
1001         if (is_external_interrupt(exit_int_info))
1002                 return exit_int_info & SVM_EVTINJ_VEC_MASK;
1003         return -1;
1004 }
1005
1006 static void load_host_msrs(struct kvm_vcpu *vcpu)
1007 {
1008 #ifdef CONFIG_X86_64
1009         wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1010 #endif
1011 }
1012
1013 static void save_host_msrs(struct kvm_vcpu *vcpu)
1014 {
1015 #ifdef CONFIG_X86_64
1016         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1017 #endif
1018 }
1019
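/*
 * Hand out the next ASID on this CPU.  When the ASID space is
 * exhausted, start a new generation and ask the hardware to flush all
 * ASIDs on the next VMRUN.
 */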
1020 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
1021 {
1022         if (svm_data->next_asid > svm_data->max_asid) {
1023                 ++svm_data->asid_generation;
1024                 svm_data->next_asid = 1;
1025                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1026         }
1027
1028         svm->vcpu.cpu = svm_data->cpu;
1029         svm->asid_generation = svm_data->asid_generation;
1030         svm->vmcb->control.asid = svm_data->next_asid++;
1031 }
1032
1033 static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
1034 {
1035         struct vcpu_svm *svm = to_svm(vcpu);
1036         unsigned long val;
1037
1038         switch (dr) {
1039         case 0 ... 3:
1040                 val = vcpu->arch.db[dr];
1041                 break;
1042         case 6:
1043                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1044                         val = vcpu->arch.dr6;
1045                 else
1046                         val = svm->vmcb->save.dr6;
1047                 break;
1048         case 7:
1049                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1050                         val = vcpu->arch.dr7;
1051                 else
1052                         val = svm->vmcb->save.dr7;
1053                 break;
1054         default:
1055                 val = 0;
1056         }
1057
1058         KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
1059         return val;
1060 }
1061
1062 static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1063                        int *exception)
1064 {
1065         struct vcpu_svm *svm = to_svm(vcpu);
1066
1067         KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
1068
1069         *exception = 0;
1070
1071         switch (dr) {
1072         case 0 ... 3:
1073                 vcpu->arch.db[dr] = value;
1074                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1075                         vcpu->arch.eff_db[dr] = value;
1076                 return;
1077         case 4 ... 5:
1078                 if (vcpu->arch.cr4 & X86_CR4_DE)
1079                         *exception = UD_VECTOR;
1080                 return;
1081         case 6:
1082                 if (value & 0xffffffff00000000ULL) {
1083                         *exception = GP_VECTOR;
1084                         return;
1085                 }
1086                 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1087                 return;
1088         case 7:
1089                 if (value & 0xffffffff00000000ULL) {
1090                         *exception = GP_VECTOR;
1091                         return;
1092                 }
1093                 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1094                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1095                         svm->vmcb->save.dr7 = vcpu->arch.dr7;
1096                         vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1097                 }
1098                 return;
1099         default:
1100                 /* FIXME: Possible case? */
1101                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1102                        __func__, dr);
1103                 *exception = UD_VECTOR;
1104                 return;
1105         }
1106 }
1107
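/*
 * Handle an intercepted #PF (or a nested page fault when NPT is
 * enabled): the faulting address and error code are delivered in
 * exit_info_2/exit_info_1 and handed to the KVM MMU.
 */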
1108 static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1109 {
1110         u32 exit_int_info = svm->vmcb->control.exit_int_info;
1111         struct kvm *kvm = svm->vcpu.kvm;
1112         u64 fault_address;
1113         u32 error_code;
1114         bool event_injection = false;
1115
1116         if (!irqchip_in_kernel(kvm) &&
1117             is_external_interrupt(exit_int_info)) {
1118                 event_injection = true;
1119                 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1120         }
1121
1122         fault_address  = svm->vmcb->control.exit_info_2;
1123         error_code = svm->vmcb->control.exit_info_1;
1124
1125         if (!npt_enabled)
1126                 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1127                             (u32)fault_address, (u32)(fault_address >> 32),
1128                             handler);
1129         else
1130                 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1131                             (u32)fault_address, (u32)(fault_address >> 32),
1132                             handler);
1133         /*
1134          * FIXME: This shouldn't be necessary here, but there is a flush
1135          * missing in the MMU code. Until we find this bug, flush the
1136          * complete TLB here on an NPF
1137          */
1138         if (npt_enabled)
1139                 svm_flush_tlb(&svm->vcpu);
1140
1141         if (!npt_enabled && event_injection)
1142                 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1143         return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1144 }
1145
1146 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1147 {
1148         if (!(svm->vcpu.guest_debug &
1149               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
1150                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1151                 return 1;
1152         }
1153         kvm_run->exit_reason = KVM_EXIT_DEBUG;
1154         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1155         kvm_run->debug.arch.exception = DB_VECTOR;
1156         return 0;
1157 }
1158
1159 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1160 {
1161         kvm_run->exit_reason = KVM_EXIT_DEBUG;
1162         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1163         kvm_run->debug.arch.exception = BP_VECTOR;
1164         return 0;
1165 }
1166
1167 static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1168 {
1169         int er;
1170
1171         er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1172         if (er != EMULATE_DONE)
1173                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1174         return 1;
1175 }
1176
1177 static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1178 {
1179         svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1180         if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1181                 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1182         svm->vcpu.fpu_active = 1;
1183
1184         return 1;
1185 }
1186
1187 static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1188 {
1189         /*
1190          * On an #MC intercept the MCE handler is not called automatically in
1191          * the host. So do it by hand here.
1192          */
1193         asm volatile (
1194                 "int $0x12\n");
1195         /* not sure if we ever come back to this point */
1196
1197         return 1;
1198 }
1199
1200 static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1201 {
1202         /*
1203          * VMCB is undefined after a SHUTDOWN intercept
1204          * so reinitialize it.
1205          */
1206         clear_page(svm->vmcb);
1207         init_vmcb(svm);
1208
1209         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1210         return 0;
1211 }
1212
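/*
 * Decode an IOIO intercept: string instructions go through the
 * emulator, plain IN/OUT is forwarded to kvm_emulate_pio after
 * skipping the instruction (exit_info_2 holds the next rip).
 */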
1213 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1214 {
1215         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1216         int size, in, string;
1217         unsigned port;
1218
1219         ++svm->vcpu.stat.io_exits;
1220
1221         svm->next_rip = svm->vmcb->control.exit_info_2;
1222
1223         string = (io_info & SVM_IOIO_STR_MASK) != 0;
1224
1225         if (string) {
1226                 if (emulate_instruction(&svm->vcpu,
1227                                         kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1228                         return 0;
1229                 return 1;
1230         }
1231
1232         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1233         port = io_info >> 16;
1234         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1235
1236         skip_emulated_instruction(&svm->vcpu);
1237         return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1238 }
1239
1240 static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1241 {
1242         KVMTRACE_0D(NMI, &svm->vcpu, handler);
1243         return 1;
1244 }
1245
1246 static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1247 {
1248         ++svm->vcpu.stat.irq_exits;
1249         KVMTRACE_0D(INTR, &svm->vcpu, handler);
1250         return 1;
1251 }
1252
1253 static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1254 {
1255         return 1;
1256 }
1257
1258 static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1259 {
1260         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1261         skip_emulated_instruction(&svm->vcpu);
1262         return kvm_emulate_halt(&svm->vcpu);
1263 }
1264
1265 static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1266 {
1267         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1268         skip_emulated_instruction(&svm->vcpu);
1269         kvm_emulate_hypercall(&svm->vcpu);
1270         return 1;
1271 }
1272
1273 static int nested_svm_check_permissions(struct vcpu_svm *svm)
1274 {
1275         if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
1276             || !is_paging(&svm->vcpu)) {
1277                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1278                 return 1;
1279         }
1280
1281         if (svm->vmcb->save.cpl) {
1282                 kvm_inject_gp(&svm->vcpu, 0);
1283                 return 1;
1284         }
1285
1286         return 0;
1287 }
1288
1289 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1290                                       bool has_error_code, u32 error_code)
1291 {
1292         if (is_nested(svm)) {
1293                 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1294                 svm->vmcb->control.exit_code_hi = 0;
1295                 svm->vmcb->control.exit_info_1 = error_code;
1296                 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1297                 if (nested_svm_exit_handled(svm, false)) {
1298                         nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
1299
1300                         nested_svm_vmexit(svm);
1301                         return 1;
1302                 }
1303         }
1304
1305         return 0;
1306 }
1307
1308 static inline int nested_svm_intr(struct vcpu_svm *svm)
1309 {
1310         if (is_nested(svm)) {
1311                 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1312                         return 0;
1313
1314                 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1315                         return 0;
1316
1317                 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1318
1319                 if (nested_svm_exit_handled(svm, false)) {
1320                         nsvm_printk("VMexit -> INTR\n");
1321                         nested_svm_vmexit(svm);
1322                         return 1;
1323                 }
1324         }
1325
1326         return 0;
1327 }
1328
1329 static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
1330 {
1331         struct page *page;
1332
1333         down_read(&current->mm->mmap_sem);
1334         page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1335         up_read(&current->mm->mmap_sem);
1336
1337         if (is_error_page(page)) {
1338                 printk(KERN_INFO "%s: could not find page at 0x%llx\n",
1339                        __func__, gpa);
1340                 kvm_release_page_clean(page);
1341                 kvm_inject_gp(&svm->vcpu, 0);
1342                 return NULL;
1343         }
1344         return page;
1345 }
1346
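/*
 * Map the one or two guest-physical pages passed in, run the handler
 * on the mapped arguments and release the pages (dirty) afterwards.
 */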
1347 static int nested_svm_do(struct vcpu_svm *svm,
1348                          u64 arg1_gpa, u64 arg2_gpa, void *opaque,
1349                          int (*handler)(struct vcpu_svm *svm,
1350                                         void *arg1,
1351                                         void *arg2,
1352                                         void *opaque))
1353 {
1354         struct page *arg1_page;
1355         struct page *arg2_page = NULL;
1356         void *arg1;
1357         void *arg2 = NULL;
1358         int retval;
1359
1360         arg1_page = nested_svm_get_page(svm, arg1_gpa);
1361         if (arg1_page == NULL)
1362                 return 1;
1363
1364         if (arg2_gpa) {
1365                 arg2_page = nested_svm_get_page(svm, arg2_gpa);
1366                 if (arg2_page == NULL) {
1367                         kvm_release_page_clean(arg1_page);
1368                         return 1;
1369                 }
1370         }
1371
1372         arg1 = kmap_atomic(arg1_page, KM_USER0);
1373         if (arg2_gpa)
1374                 arg2 = kmap_atomic(arg2_page, KM_USER1);
1375
1376         retval = handler(svm, arg1, arg2, opaque);
1377
1378         kunmap_atomic(arg1, KM_USER0);
1379         if (arg2_gpa)
1380                 kunmap_atomic(arg2, KM_USER1);
1381
1382         kvm_release_page_dirty(arg1_page);
1383         if (arg2_gpa)
1384                 kvm_release_page_dirty(arg2_page);
1385
1386         return retval;
1387 }
1388
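/*
 * Decide whether the current #VMEXIT has to be reflected to the nested
 * hypervisor by checking the exit code against the intercept bits of
 * the nested VMCB.  With kvm_overrides set, INTR, NMI and the page
 * fault exit for the current paging mode always stay with the host.
 */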
1389 static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
1390                                         void *arg1,
1391                                         void *arg2,
1392                                         void *opaque)
1393 {
1394         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1395         bool kvm_overrides = *(bool *)opaque;
1396         u32 exit_code = svm->vmcb->control.exit_code;
1397
1398         if (kvm_overrides) {
1399                 switch (exit_code) {
1400                 case SVM_EXIT_INTR:
1401                 case SVM_EXIT_NMI:
1402                         return 0;
1403                 /* For now we are always handling NPFs when using them */
1404                 case SVM_EXIT_NPF:
1405                         if (npt_enabled)
1406                                 return 0;
1407                         break;
1408                 /* When we're shadowing, trap PFs */
1409                 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1410                         if (!npt_enabled)
1411                                 return 0;
1412                         break;
1413                 default:
1414                         break;
1415                 }
1416         }
1417
1418         switch (exit_code) {
1419         case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1420                 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1421                 if (nested_vmcb->control.intercept_cr_read & cr_bits)
1422                         return 1;
1423                 break;
1424         }
1425         case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1426                 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1427                 if (nested_vmcb->control.intercept_cr_write & cr_bits)
1428                         return 1;
1429                 break;
1430         }
1431         case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1432                 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1433                 if (nested_vmcb->control.intercept_dr_read & dr_bits)
1434                         return 1;
1435                 break;
1436         }
1437         case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1438                 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1439                 if (nested_vmcb->control.intercept_dr_write & dr_bits)
1440                         return 1;
1441                 break;
1442         }
1443         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1444                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1445                 if (nested_vmcb->control.intercept_exceptions & excp_bits)
1446                         return 1;
1447                 break;
1448         }
1449         default: {
1450                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1451                 nsvm_printk("exit code: 0x%x\n", exit_code);
1452                 if (nested_vmcb->control.intercept & exit_bits)
1453                         return 1;
1454         }
1455         }
1456
1457         return 0;
1458 }
1459
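/*
 * For an MSR intercept, consult the nested guest's own MSR permission
 * map to see whether the access should be reflected; exit_info_1 bit 0
 * distinguishes reads (0) from writes (1).
 */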
1460 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
1461                                        void *arg1, void *arg2,
1462                                        void *opaque)
1463 {
1464         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1465         u8 *msrpm = (u8 *)arg2;
1466         u32 t0, t1;
1467         u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1468         u32 param = svm->vmcb->control.exit_info_1 & 1;
1469
1470         if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1471                 return 0;
1472
1473         switch (msr) {
1474         case 0 ... 0x1fff:
1475                 t0 = (msr * 2) % 8;
1476                 t1 = msr / 8;
1477                 break;
1478         case 0xc0000000 ... 0xc0001fff:
1479                 t0 = (8192 + msr - 0xc0000000) * 2;
1480                 t1 = (t0 / 8);
1481                 t0 %= 8;
1482                 break;
1483         case 0xc0010000 ... 0xc0011fff:
1484                 t0 = (16384 + msr - 0xc0010000) * 2;
1485                 t1 = (t0 / 8);
1486                 t0 %= 8;
1487                 break;
1488         default:
1489                 return 1;
1490                 break;
1491         }
1492         if (msrpm[t1] & ((1 << param) << t0))
1493                 return 1;
1494
1495         return 0;
1496 }
1497
1498 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
1499 {
1500         bool k = kvm_override;
1501
1502         switch (svm->vmcb->control.exit_code) {
1503         case SVM_EXIT_MSR:
1504                 return nested_svm_do(svm, svm->nested_vmcb,
1505                                      svm->nested_vmcb_msrpm, NULL,
1506                                      nested_svm_exit_handled_msr);
1507         default: break;
1508         }
1509
1510         return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
1511                              nested_svm_exit_handled_real);
1512 }
1513
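/*
 * Emulate #VMEXIT: copy the current VMCB back into the guest's nested
 * VMCB (keeping the control fields the guest set up), restore the host
 * state saved in hsave at VMRUN time and leave nested mode.
 */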
1514 static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1515                                   void *arg2, void *opaque)
1516 {
1517         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1518         struct vmcb *hsave = svm->hsave;
1519         u64 nested_save[] = { nested_vmcb->save.cr0,
1520                               nested_vmcb->save.cr3,
1521                               nested_vmcb->save.cr4,
1522                               nested_vmcb->save.efer,
1523                               nested_vmcb->control.intercept_cr_read,
1524                               nested_vmcb->control.intercept_cr_write,
1525                               nested_vmcb->control.intercept_dr_read,
1526                               nested_vmcb->control.intercept_dr_write,
1527                               nested_vmcb->control.intercept_exceptions,
1528                               nested_vmcb->control.intercept,
1529                               nested_vmcb->control.msrpm_base_pa,
1530                               nested_vmcb->control.iopm_base_pa,
1531                               nested_vmcb->control.tsc_offset };
1532
1533         /* Give the current vmcb to the guest */
1534         memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
1535         nested_vmcb->save.cr0 = nested_save[0];
1536         if (!npt_enabled)
1537                 nested_vmcb->save.cr3 = nested_save[1];
1538         nested_vmcb->save.cr4 = nested_save[2];
1539         nested_vmcb->save.efer = nested_save[3];
1540         nested_vmcb->control.intercept_cr_read = nested_save[4];
1541         nested_vmcb->control.intercept_cr_write = nested_save[5];
1542         nested_vmcb->control.intercept_dr_read = nested_save[6];
1543         nested_vmcb->control.intercept_dr_write = nested_save[7];
1544         nested_vmcb->control.intercept_exceptions = nested_save[8];
1545         nested_vmcb->control.intercept = nested_save[9];
1546         nested_vmcb->control.msrpm_base_pa = nested_save[10];
1547         nested_vmcb->control.iopm_base_pa = nested_save[11];
1548         nested_vmcb->control.tsc_offset = nested_save[12];
1549
1550         /* We always set V_INTR_MASKING and remember the old value in hflags */
1551         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1552                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
1553
1554         if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
1555             (nested_vmcb->control.int_vector)) {
1556                 nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
1557                                 nested_vmcb->control.int_vector);
1558         }
1559
1560         /* Restore the original control entries */
1561         svm->vmcb->control = hsave->control;
1562
1563         /* Kill any pending exceptions */
1564         if (svm->vcpu.arch.exception.pending)
1565                 nsvm_printk("WARNING: Pending Exception\n");
1566         svm->vcpu.arch.exception.pending = false;
1567
1568         /* Restore selected save entries */
1569         svm->vmcb->save.es = hsave->save.es;
1570         svm->vmcb->save.cs = hsave->save.cs;
1571         svm->vmcb->save.ss = hsave->save.ss;
1572         svm->vmcb->save.ds = hsave->save.ds;
1573         svm->vmcb->save.gdtr = hsave->save.gdtr;
1574         svm->vmcb->save.idtr = hsave->save.idtr;
1575         svm->vmcb->save.rflags = hsave->save.rflags;
1576         svm_set_efer(&svm->vcpu, hsave->save.efer);
1577         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
1578         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
1579         if (npt_enabled) {
1580                 svm->vmcb->save.cr3 = hsave->save.cr3;
1581                 svm->vcpu.arch.cr3 = hsave->save.cr3;
1582         } else {
1583                 kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1584         }
1585         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1586         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
1587         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
1588         svm->vmcb->save.dr7 = 0;
1589         svm->vmcb->save.cpl = 0;
1590         svm->vmcb->control.exit_int_info = 0;
1591
1592         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1593         /* Exit nested SVM mode */
1594         svm->nested_vmcb = 0;
1595
1596         return 0;
1597 }
1598
1599 static int nested_svm_vmexit(struct vcpu_svm *svm)
1600 {
1601         nsvm_printk("VMexit\n");
1602         if (nested_svm_do(svm, svm->nested_vmcb, 0,
1603                           NULL, nested_svm_vmexit_real))
1604                 return 1;
1605
1606         kvm_mmu_reset_context(&svm->vcpu);
1607         kvm_mmu_load(&svm->vcpu);
1608
1609         return 0;
1610 }
1611
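/*
 * Merge the guest hypervisor's MSR permission bitmap with our own: a set bit
 * in either bitmap means the access is intercepted, so ORing them keeps
 * KVM's intercepts in place while honoring the nested guest's.
 */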
1612 static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
1613                                   void *arg2, void *opaque)
1614 {
1615         int i;
1616         u32 *nested_msrpm = (u32 *)arg1;
1617         for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
1618                 svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
1619         svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
1620
1621         return 0;
1622 }
1623
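/*
 * World switch into the nested guest: remember the current vmcb in hsave,
 * load the state from the guest hypervisor's vmcb, OR its intercepts into
 * our own and finally set GIF so the nested guest can run.
 */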
1624 static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1625                             void *arg2, void *opaque)
1626 {
1627         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1628         struct vmcb *hsave = svm->hsave;
1629
1630         /* nested_vmcb is our indicator whether nested SVM is activated */
1631         svm->nested_vmcb = svm->vmcb->save.rax;
1632
1633         /* Clear internal status */
1634         svm->vcpu.arch.exception.pending = false;
1635
1636         /* Save the old vmcb, so we don't need to pick which fields to
1637            save, but can restore everything when a #VMEXIT occurs */
1638         memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
1639         /* We need to remember the original CR3 in the SPT case */
1640         if (!npt_enabled)
1641                 hsave->save.cr3 = svm->vcpu.arch.cr3;
1642         hsave->save.cr4 = svm->vcpu.arch.cr4;
1643         hsave->save.rip = svm->next_rip;
1644
1645         if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
1646                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
1647         else
1648                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
1649
1650         /* Load the nested guest state */
1651         svm->vmcb->save.es = nested_vmcb->save.es;
1652         svm->vmcb->save.cs = nested_vmcb->save.cs;
1653         svm->vmcb->save.ss = nested_vmcb->save.ss;
1654         svm->vmcb->save.ds = nested_vmcb->save.ds;
1655         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
1656         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
1657         svm->vmcb->save.rflags = nested_vmcb->save.rflags;
1658         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
1659         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
1660         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
1661         if (npt_enabled) {
1662                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1663                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1664         } else {
1665                 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1666                 kvm_mmu_reset_context(&svm->vcpu);
1667         }
1668         svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
1669         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1670         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1671         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
1672         /* In case we don't even reach vcpu_run, the vmcb fields would otherwise not be updated */
1673         svm->vmcb->save.rax = nested_vmcb->save.rax;
1674         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
1675         svm->vmcb->save.rip = nested_vmcb->save.rip;
1676         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
1677         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1678         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1679
1680         /* The nested guest must not be able to bypass any intercept that KVM
1681            relies on for the guest itself, so all intercepts are ORed together */
1682         svm->vmcb->control.intercept_cr_read |=
1683                 nested_vmcb->control.intercept_cr_read;
1684         svm->vmcb->control.intercept_cr_write |=
1685                 nested_vmcb->control.intercept_cr_write;
1686         svm->vmcb->control.intercept_dr_read |=
1687                 nested_vmcb->control.intercept_dr_read;
1688         svm->vmcb->control.intercept_dr_write |=
1689                 nested_vmcb->control.intercept_dr_write;
1690         svm->vmcb->control.intercept_exceptions |=
1691                 nested_vmcb->control.intercept_exceptions;
1692
1693         svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1694
1695         svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1696
1697         force_new_asid(&svm->vcpu);
1698         svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1699         svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1700         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1701         if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1702                 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1703                                 nested_vmcb->control.int_ctl);
1704         }
1705         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1706                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1707         else
1708                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1709
1710         nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1711                         nested_vmcb->control.exit_int_info,
1712                         nested_vmcb->control.int_state);
1713
1714         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1715         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1716         svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1717         if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1718                 nsvm_printk("Injecting Event: 0x%x\n",
1719                                 nested_vmcb->control.event_inj);
1720         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1721         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1722
1723         svm->vcpu.arch.hflags |= HF_GIF_MASK;
1724
1725         return 0;
1726 }
1727
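/*
 * Copy the state handled by VMLOAD/VMSAVE (FS, GS, TR, LDTR and the
 * SYSCALL/SYSENTER MSR state) between two vmcbs.
 */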
1728 static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1729 {
1730         to_vmcb->save.fs = from_vmcb->save.fs;
1731         to_vmcb->save.gs = from_vmcb->save.gs;
1732         to_vmcb->save.tr = from_vmcb->save.tr;
1733         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
1734         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
1735         to_vmcb->save.star = from_vmcb->save.star;
1736         to_vmcb->save.lstar = from_vmcb->save.lstar;
1737         to_vmcb->save.cstar = from_vmcb->save.cstar;
1738         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
1739         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1740         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1741         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1742
1743         return 1;
1744 }
1745
1746 static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
1747                              void *arg2, void *opaque)
1748 {
1749         return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
1750 }
1751
1752 static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
1753                              void *arg2, void *opaque)
1754 {
1755         return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
1756 }
1757
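/*
 * VMLOAD/VMSAVE take the physical address of a vmcb in RAX; emulate them by
 * copying between that vmcb and our own via nested_svm_do().
 */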
1758 static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1759 {
1760         if (nested_svm_check_permissions(svm))
1761                 return 1;
1762
1763         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1764         skip_emulated_instruction(&svm->vcpu);
1765
1766         nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
1767
1768         return 1;
1769 }
1770
1771 static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1772 {
1773         if (nested_svm_check_permissions(svm))
1774                 return 1;
1775
1776         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1777         skip_emulated_instruction(&svm->vcpu);
1778
1779         nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
1780
1781         return 1;
1782 }
1783
1784 static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1785 {
1786         nsvm_printk("VMrun\n");
1787         if (nested_svm_check_permissions(svm))
1788                 return 1;
1789
1790         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1791         skip_emulated_instruction(&svm->vcpu);
1792
1793         if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
1794                           NULL, nested_svm_vmrun))
1795                 return 1;
1796
1797         if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
1798                       NULL, nested_svm_vmrun_msrpm))
1799                 return 1;
1800
1801         return 1;
1802 }
1803
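/*
 * STGI/CLGI toggle the guest's global interrupt flag, which is tracked in
 * HF_GIF_MASK since the hardware GIF is used by the host itself around VMRUN.
 */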
1804 static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1805 {
1806         if (nested_svm_check_permissions(svm))
1807                 return 1;
1808
1809         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1810         skip_emulated_instruction(&svm->vcpu);
1811
1812         svm->vcpu.arch.hflags |= HF_GIF_MASK;
1813
1814         return 1;
1815 }
1816
1817 static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1818 {
1819         if (nested_svm_check_permissions(svm))
1820                 return 1;
1821
1822         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1823         skip_emulated_instruction(&svm->vcpu);
1824
1825         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1826
1827         /* After a CLGI no interrupts should be delivered to the guest */
1828         svm_clear_vintr(svm);
1829         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1830
1831         return 1;
1832 }
1833
1834 static int invalid_op_interception(struct vcpu_svm *svm,
1835                                    struct kvm_run *kvm_run)
1836 {
1837         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1838         return 1;
1839 }
1840
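/*
 * exit_info_1 carries the target TSS selector; exit_info_2 tells us whether
 * the task switch was caused by an IRET or a direct jump, which determines
 * the reason passed to kvm_task_switch().
 */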
1841 static int task_switch_interception(struct vcpu_svm *svm,
1842                                     struct kvm_run *kvm_run)
1843 {
1844         u16 tss_selector;
1845
1846         tss_selector = (u16)svm->vmcb->control.exit_info_1;
1847         if (svm->vmcb->control.exit_info_2 &
1848             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1849                 return kvm_task_switch(&svm->vcpu, tss_selector,
1850                                        TASK_SWITCH_IRET);
1851         if (svm->vmcb->control.exit_info_2 &
1852             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1853                 return kvm_task_switch(&svm->vcpu, tss_selector,
1854                                        TASK_SWITCH_JMP);
1855         return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
1856 }
1857
1858 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1859 {
1860         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1861         kvm_emulate_cpuid(&svm->vcpu);
1862         return 1;
1863 }
1864
1865 static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1866 {
1867         if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1868                 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1869         return 1;
1870 }
1871
1872 static int emulate_on_interception(struct vcpu_svm *svm,
1873                                    struct kvm_run *kvm_run)
1874 {
1875         if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1876                 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1877         return 1;
1878 }
1879
1880 static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1881 {
1882         emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1883         if (irqchip_in_kernel(svm->vcpu.kvm))
1884                 return 1;
1885         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1886         return 0;
1887 }
1888
1889 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1890 {
1891         struct vcpu_svm *svm = to_svm(vcpu);
1892
1893         switch (ecx) {
1894         case MSR_IA32_TIME_STAMP_COUNTER: {
1895                 u64 tsc;
1896
1897                 rdtscll(tsc);
1898                 *data = svm->vmcb->control.tsc_offset + tsc;
1899                 break;
1900         }
1901         case MSR_K6_STAR:
1902                 *data = svm->vmcb->save.star;
1903                 break;
1904 #ifdef CONFIG_X86_64
1905         case MSR_LSTAR:
1906                 *data = svm->vmcb->save.lstar;
1907                 break;
1908         case MSR_CSTAR:
1909                 *data = svm->vmcb->save.cstar;
1910                 break;
1911         case MSR_KERNEL_GS_BASE:
1912                 *data = svm->vmcb->save.kernel_gs_base;
1913                 break;
1914         case MSR_SYSCALL_MASK:
1915                 *data = svm->vmcb->save.sfmask;
1916                 break;
1917 #endif
1918         case MSR_IA32_SYSENTER_CS:
1919                 *data = svm->vmcb->save.sysenter_cs;
1920                 break;
1921         case MSR_IA32_SYSENTER_EIP:
1922                 *data = svm->vmcb->save.sysenter_eip;
1923                 break;
1924         case MSR_IA32_SYSENTER_ESP:
1925                 *data = svm->vmcb->save.sysenter_esp;
1926                 break;
1927         /* Nobody will change the following 5 values in the VMCB so
1928            we can safely return them on rdmsr. They will always be 0
1929            until LBRV is implemented. */
1930         case MSR_IA32_DEBUGCTLMSR:
1931                 *data = svm->vmcb->save.dbgctl;
1932                 break;
1933         case MSR_IA32_LASTBRANCHFROMIP:
1934                 *data = svm->vmcb->save.br_from;
1935                 break;
1936         case MSR_IA32_LASTBRANCHTOIP:
1937                 *data = svm->vmcb->save.br_to;
1938                 break;
1939         case MSR_IA32_LASTINTFROMIP:
1940                 *data = svm->vmcb->save.last_excp_from;
1941                 break;
1942         case MSR_IA32_LASTINTTOIP:
1943                 *data = svm->vmcb->save.last_excp_to;
1944                 break;
1945         case MSR_VM_HSAVE_PA:
1946                 *data = svm->hsave_msr;
1947                 break;
1948         case MSR_VM_CR:
1949                 *data = 0;
1950                 break;
1951         case MSR_IA32_UCODE_REV:
1952                 *data = 0x01000065;
1953                 break;
1954         default:
1955                 return kvm_get_msr_common(vcpu, ecx, data);
1956         }
1957         return 0;
1958 }
1959
1960 static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1961 {
1962         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1963         u64 data;
1964
1965         if (svm_get_msr(&svm->vcpu, ecx, &data))
1966                 kvm_inject_gp(&svm->vcpu, 0);
1967         else {
1968                 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1969                             (u32)(data >> 32), handler);
1970
1971                 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1972                 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1973                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1974                 skip_emulated_instruction(&svm->vcpu);
1975         }
1976         return 1;
1977 }
1978
1979 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1980 {
1981         struct vcpu_svm *svm = to_svm(vcpu);
1982
1983         switch (ecx) {
1984         case MSR_IA32_TIME_STAMP_COUNTER: {
1985                 u64 tsc;
1986
1987                 rdtscll(tsc);
1988                 svm->vmcb->control.tsc_offset = data - tsc;
1989                 break;
1990         }
1991         case MSR_K6_STAR:
1992                 svm->vmcb->save.star = data;
1993                 break;
1994 #ifdef CONFIG_X86_64
1995         case MSR_LSTAR:
1996                 svm->vmcb->save.lstar = data;
1997                 break;
1998         case MSR_CSTAR:
1999                 svm->vmcb->save.cstar = data;
2000                 break;
2001         case MSR_KERNEL_GS_BASE:
2002                 svm->vmcb->save.kernel_gs_base = data;
2003                 break;
2004         case MSR_SYSCALL_MASK:
2005                 svm->vmcb->save.sfmask = data;
2006                 break;
2007 #endif
2008         case MSR_IA32_SYSENTER_CS:
2009                 svm->vmcb->save.sysenter_cs = data;
2010                 break;
2011         case MSR_IA32_SYSENTER_EIP:
2012                 svm->vmcb->save.sysenter_eip = data;
2013                 break;
2014         case MSR_IA32_SYSENTER_ESP:
2015                 svm->vmcb->save.sysenter_esp = data;
2016                 break;
2017         case MSR_IA32_DEBUGCTLMSR:
2018                 if (!svm_has(SVM_FEATURE_LBRV)) {
2019                         pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2020                                         __func__, data);
2021                         break;
2022                 }
2023                 if (data & DEBUGCTL_RESERVED_BITS)
2024                         return 1;
2025
2026                 svm->vmcb->save.dbgctl = data;
2027                 if (data & (1ULL<<0))
2028                         svm_enable_lbrv(svm);
2029                 else
2030                         svm_disable_lbrv(svm);
2031                 break;
2032         case MSR_K7_EVNTSEL0:
2033         case MSR_K7_EVNTSEL1:
2034         case MSR_K7_EVNTSEL2:
2035         case MSR_K7_EVNTSEL3:
2036         case MSR_K7_PERFCTR0:
2037         case MSR_K7_PERFCTR1:
2038         case MSR_K7_PERFCTR2:
2039         case MSR_K7_PERFCTR3:
2040                 /*
2041                  * Just discard all writes to the performance counters; this
2042                  * should keep both older Linux and Windows 64-bit guests
2043                  * happy
2044                  */
2045                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
2046
2047                 break;
2048         case MSR_VM_HSAVE_PA:
2049                 svm->hsave_msr = data;
2050                 break;
2051         default:
2052                 return kvm_set_msr_common(vcpu, ecx, data);
2053         }
2054         return 0;
2055 }
2056
2057 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2058 {
2059         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2060         u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2061                 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2062
2063         KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
2064                     handler);
2065
2066         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2067         if (svm_set_msr(&svm->vcpu, ecx, data))
2068                 kvm_inject_gp(&svm->vcpu, 0);
2069         else
2070                 skip_emulated_instruction(&svm->vcpu);
2071         return 1;
2072 }
2073
2074 static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2075 {
2076         if (svm->vmcb->control.exit_info_1)
2077                 return wrmsr_interception(svm, kvm_run);
2078         else
2079                 return rdmsr_interception(svm, kvm_run);
2080 }
2081
2082 static int interrupt_window_interception(struct vcpu_svm *svm,
2083                                    struct kvm_run *kvm_run)
2084 {
2085         KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
2086
2087         svm_clear_vintr(svm);
2088         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2089         /*
2090          * If user space is waiting to inject interrupts, exit as soon
2091          * as possible
2092          */
2093         if (kvm_run->request_interrupt_window &&
2094             !svm->vcpu.arch.irq_summary) {
2095                 ++svm->vcpu.stat.irq_window_exits;
2096                 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2097                 return 0;
2098         }
2099
2100         return 1;
2101 }
2102
2103 static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2104                                       struct kvm_run *kvm_run) = {
2105         [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
2106         [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
2107         [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
2108         [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
2109         /* for now: */
2110         [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
2111         [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
2112         [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
2113         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
2114         [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
2115         [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
2116         [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
2117         [SVM_EXIT_READ_DR3]                     = emulate_on_interception,
2118         [SVM_EXIT_WRITE_DR0]                    = emulate_on_interception,
2119         [SVM_EXIT_WRITE_DR1]                    = emulate_on_interception,
2120         [SVM_EXIT_WRITE_DR2]                    = emulate_on_interception,
2121         [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
2122         [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
2123         [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
2124         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
2125         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
2126         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
2127         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
2128         [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
2129         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
2130         [SVM_EXIT_INTR]                         = intr_interception,
2131         [SVM_EXIT_NMI]                          = nmi_interception,
2132         [SVM_EXIT_SMI]                          = nop_on_interception,
2133         [SVM_EXIT_INIT]                         = nop_on_interception,
2134         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
2135         /* [SVM_EXIT_CR0_SEL_WRITE]             = emulate_on_interception, */
2136         [SVM_EXIT_CPUID]                        = cpuid_interception,
2137         [SVM_EXIT_INVD]                         = emulate_on_interception,
2138         [SVM_EXIT_HLT]                          = halt_interception,
2139         [SVM_EXIT_INVLPG]                       = invlpg_interception,
2140         [SVM_EXIT_INVLPGA]                      = invalid_op_interception,
2141         [SVM_EXIT_IOIO]                         = io_interception,
2142         [SVM_EXIT_MSR]                          = msr_interception,
2143         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
2144         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
2145         [SVM_EXIT_VMRUN]                        = vmrun_interception,
2146         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
2147         [SVM_EXIT_VMLOAD]                       = vmload_interception,
2148         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
2149         [SVM_EXIT_STGI]                         = stgi_interception,
2150         [SVM_EXIT_CLGI]                         = clgi_interception,
2151         [SVM_EXIT_SKINIT]                       = invalid_op_interception,
2152         [SVM_EXIT_WBINVD]                       = emulate_on_interception,
2153         [SVM_EXIT_MONITOR]                      = invalid_op_interception,
2154         [SVM_EXIT_MWAIT]                        = invalid_op_interception,
2155         [SVM_EXIT_NPF]                          = pf_interception,
2156 };
2157
2158 static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2159 {
2160         struct vcpu_svm *svm = to_svm(vcpu);
2161         u32 exit_code = svm->vmcb->control.exit_code;
2162
2163         KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
2164                     (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
2165
2166         if (is_nested(svm)) {
2167                 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
2168                             exit_code, svm->vmcb->control.exit_info_1,
2169                             svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
2170                 if (nested_svm_exit_handled(svm, true)) {
2171                         nested_svm_vmexit(svm);
2172                         nsvm_printk("-> #VMEXIT\n");
2173                         return 1;
2174                 }
2175         }
2176
2177         if (npt_enabled) {
2178                 int mmu_reload = 0;
2179                 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2180                         svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2181                         mmu_reload = 1;
2182                 }
2183                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2184                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2185                 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2186                         if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
2187                                 kvm_inject_gp(vcpu, 0);
2188                                 return 1;
2189                         }
2190                 }
2191                 if (mmu_reload) {
2192                         kvm_mmu_reset_context(vcpu);
2193                         kvm_mmu_load(vcpu);
2194                 }
2195         }
2196
2197         kvm_reput_irq(svm);
2198
2199         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2200                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2201                 kvm_run->fail_entry.hardware_entry_failure_reason
2202                         = svm->vmcb->control.exit_code;
2203                 return 0;
2204         }
2205
2206         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2207             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2208             exit_code != SVM_EXIT_NPF)
2209                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
2210                        "exit_code 0x%x\n",
2211                        __func__, svm->vmcb->control.exit_int_info,
2212                        exit_code);
2213
2214         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
2215             || !svm_exit_handlers[exit_code]) {
2216                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2217                 kvm_run->hw.hardware_exit_reason = exit_code;
2218                 return 0;
2219         }
2220
2221         return svm_exit_handlers[exit_code](svm, kvm_run);
2222 }
2223
2224 static void reload_tss(struct kvm_vcpu *vcpu)
2225 {
2226         int cpu = raw_smp_processor_id();
2227
2228         struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
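        /* ltr faults on a busy TSS, so mark the descriptor available again */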
2229         svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
2230         load_TR_desc();
2231 }
2232
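/*
 * Allocate a fresh ASID when the vcpu has migrated to another physical CPU
 * or the per-CPU ASID generation has changed, so stale TLB entries are never
 * reused for this guest.
 */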
2233 static void pre_svm_run(struct vcpu_svm *svm)
2234 {
2235         int cpu = raw_smp_processor_id();
2236
2237         struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
2238
2239         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2240         if (svm->vcpu.cpu != cpu ||
2241             svm->asid_generation != svm_data->asid_generation)
2242                 new_asid(svm, svm_data);
2243 }
2244
2245
2246 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2247 {
2248         struct vmcb_control_area *control;
2249
2250         KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
2251
2252         ++svm->vcpu.stat.irq_injections;
2253         control = &svm->vmcb->control;
2254         control->int_vector = irq;
2255         control->int_ctl &= ~V_INTR_PRIO_MASK;
2256         control->int_ctl |= V_IRQ_MASK |
2257                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2258 }
2259
2260 static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
2261 {
2262         struct vcpu_svm *svm = to_svm(vcpu);
2263
2264         nested_svm_intr(svm);
2265
2266         svm_inject_irq(svm, irq);
2267 }
2268
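/*
 * Only intercept CR8 writes while the current TPR would mask the highest
 * pending interrupt; otherwise the guest can update its TPR without exiting.
 */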
2269 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
2270 {
2271         struct vcpu_svm *svm = to_svm(vcpu);
2272         struct vmcb *vmcb = svm->vmcb;
2273         int max_irr, tpr;
2274
2275         if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
2276                 return;
2277
2278         vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2279
2280         max_irr = kvm_lapic_find_highest_irr(vcpu);
2281         if (max_irr == -1)
2282                 return;
2283
2284         tpr = kvm_lapic_get_cr8(vcpu) << 4;
2285
2286         if (tpr >= (max_irr & 0xf0))
2287                 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2288 }
2289
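/*
 * Decide what interrupt, if any, to inject before entering the guest:
 * re-inject an interrupt that was cut short by the exit, or pull a new
 * vector from the in-kernel interrupt controller.  If the guest cannot take
 * it right now, request a virtual interrupt window instead.
 */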
2290 static void svm_intr_assist(struct kvm_vcpu *vcpu)
2291 {
2292         struct vcpu_svm *svm = to_svm(vcpu);
2293         struct vmcb *vmcb = svm->vmcb;
2294         int intr_vector = -1;
2295
2296         if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
2297             ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
2298                 intr_vector = vmcb->control.exit_int_info &
2299                               SVM_EVTINJ_VEC_MASK;
2300                 vmcb->control.exit_int_info = 0;
2301                 svm_inject_irq(svm, intr_vector);
2302                 goto out;
2303         }
2304
2305         if (vmcb->control.int_ctl & V_IRQ_MASK)
2306                 goto out;
2307
2308         if (!kvm_cpu_has_interrupt(vcpu))
2309                 goto out;
2310
2311         if (nested_svm_intr(svm))
2312                 goto out;
2313
2314         if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2315                 goto out;
2316
2317         if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
2318             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
2319             (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
2320                 /* unable to deliver irq, set pending irq */
2321                 svm_set_vintr(svm);
2322                 svm_inject_irq(svm, 0x0);
2323                 goto out;
2324         }
2325         /* Okay, we can deliver the interrupt: grab it and update PIC state. */
2326         intr_vector = kvm_cpu_get_interrupt(vcpu);
2327         svm_inject_irq(svm, intr_vector);
2328 out:
2329         update_cr8_intercept(vcpu);
2330 }
2331
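/*
 * If a virtual interrupt was still pending in the vmcb when the guest exited
 * and the interrupt controller lives in user space, push the vector back so
 * it is not lost, and recompute the interrupt window state.
 */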
2332 static void kvm_reput_irq(struct vcpu_svm *svm)
2333 {
2334         struct vmcb_control_area *control = &svm->vmcb->control;
2335
2336         if ((control->int_ctl & V_IRQ_MASK)
2337             && !irqchip_in_kernel(svm->vcpu.kvm)) {
2338                 control->int_ctl &= ~V_IRQ_MASK;
2339                 push_irq(&svm->vcpu, control->int_vector);
2340         }
2341
2342         svm->vcpu.arch.interrupt_window_open =
2343                 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2344                  (svm->vcpu.arch.hflags & HF_GIF_MASK);
2345 }
2346
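/*
 * Pick the lowest pending vector from the irq_pending bitmap, clear it and
 * inject it into the guest.
 */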
2347 static void svm_do_inject_vector(struct vcpu_svm *svm)
2348 {
2349         struct kvm_vcpu *vcpu = &svm->vcpu;
2350         int word_index = __ffs(vcpu->arch.irq_summary);
2351         int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2352         int irq = word_index * BITS_PER_LONG + bit_index;
2353
2354         clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2355         if (!vcpu->arch.irq_pending[word_index])
2356                 clear_bit(word_index, &vcpu->arch.irq_summary);
2357         svm_inject_irq(svm, irq);
2358 }
2359
2360 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2361                                        struct kvm_run *kvm_run)
2362 {
2363         struct vcpu_svm *svm = to_svm(vcpu);
2364         struct vmcb_control_area *control = &svm->vmcb->control;
2365
2366         if (nested_svm_intr(svm))
2367                 return;
2368
2369         svm->vcpu.arch.interrupt_window_open =
2370                 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2371                  (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
2372                  (svm->vcpu.arch.hflags & HF_GIF_MASK));
2373
2374         if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
2375                 /*
2376                  * Interrupts are enabled and not blocked by sti or mov ss; inject now.
2377                  */
2378                 svm_do_inject_vector(svm);
2379
2380         /*
2381          * Interrupts blocked.  Wait for unblock.
2382          */
2383         if (!svm->vcpu.arch.interrupt_window_open &&
2384             (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
2385                 svm_set_vintr(svm);
2386         else
2387                 svm_clear_vintr(svm);
2388 }
2389
2390 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
2391 {
2392         return 0;
2393 }
2394
2395 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
2396 {
2397         force_new_asid(vcpu);
2398 }
2399
2400 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
2401 {
2402 }
2403
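/*
 * While CR8 writes are not intercepted the guest updates V_TPR directly, so
 * propagate that value back into the local APIC's TPR after the guest ran.
 */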
2404 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2405 {
2406         struct vcpu_svm *svm = to_svm(vcpu);
2407
2408         if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2409                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2410                 kvm_lapic_set_tpr(vcpu, cr8);
2411         }
2412 }
2413
2414 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2415 {
2416         struct vcpu_svm *svm = to_svm(vcpu);
2417         u64 cr8;
2418
2419         if (!irqchip_in_kernel(vcpu->kvm))
2420                 return;
2421
2422         cr8 = kvm_get_cr8(vcpu);
2423         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2424         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2425 }
2426
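/*
 * R selects the register-name prefix for the inline assembly below:
 * "r" (rax, rbx, ...) on 64-bit, "e" (eax, ebx, ...) on 32-bit.
 */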
2427 #ifdef CONFIG_X86_64
2428 #define R "r"
2429 #else
2430 #define R "e"
2431 #endif
2432
2433 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2434 {
2435         struct vcpu_svm *svm = to_svm(vcpu);
2436         u16 fs_selector;
2437         u16 gs_selector;
2438         u16 ldt_selector;
2439
2440         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2441         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2442         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2443
2444         pre_svm_run(svm);
2445
2446         sync_lapic_to_cr8(vcpu);
2447
2448         save_host_msrs(vcpu);
2449         fs_selector = kvm_read_fs();
2450         gs_selector = kvm_read_gs();
2451         ldt_selector = kvm_read_ldt();
2452         svm->host_cr2 = kvm_read_cr2();
2453         if (!is_nested(svm))
2454                 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2455         /* required for live migration with NPT */
2456         if (npt_enabled)
2457                 svm->vmcb->save.cr3 = vcpu->arch.cr3;
2458
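        /*
         * Clear GIF: with GIF clear, physical interrupts are held pending,
         * so it is safe to enable host IRQs around VMRUN; VMRUN and #VMEXIT
         * set and clear GIF for the guest, and stgi() below re-enables it.
         */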
2459         clgi();
2460
2461         local_irq_enable();
2462
2463         asm volatile (
2464                 "push %%"R"bp; \n\t"
2465                 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
2466                 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
2467                 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
2468                 "mov %c[rsi](%[svm]), %%"R"si \n\t"
2469                 "mov %c[rdi](%[svm]), %%"R"di \n\t"
2470                 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
2471 #ifdef CONFIG_X86_64
2472                 "mov %c[r8](%[svm]),  %%r8  \n\t"
2473                 "mov %c[r9](%[svm]),  %%r9  \n\t"
2474                 "mov %c[r10](%[svm]), %%r10 \n\t"
2475                 "mov %c[r11](%[svm]), %%r11 \n\t"
2476                 "mov %c[r12](%[svm]), %%r12 \n\t"
2477                 "mov %c[r13](%[svm]), %%r13 \n\t"
2478                 "mov %c[r14](%[svm]), %%r14 \n\t"
2479                 "mov %c[r15](%[svm]), %%r15 \n\t"
2480 #endif
2481
2482                 /* Enter guest mode */
2483                 "push %%"R"ax \n\t"
2484                 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
2485                 __ex(SVM_VMLOAD) "\n\t"
2486                 __ex(SVM_VMRUN) "\n\t"
2487                 __ex(SVM_VMSAVE) "\n\t"
2488                 "pop %%"R"ax \n\t"
2489
2490                 /* Save guest registers, load host registers */
2491                 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
2492                 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
2493                 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
2494                 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
2495                 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
2496                 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
2497 #ifdef CONFIG_X86_64
2498                 "mov %%r8,  %c[r8](%[svm]) \n\t"
2499                 "mov %%r9,  %c[r9](%[svm]) \n\t"
2500                 "mov %%r10, %c[r10](%[svm]) \n\t"
2501                 "mov %%r11, %c[r11](%[svm]) \n\t"
2502                 "mov %%r12, %c[r12](%[svm]) \n\t"
2503                 "mov %%r13, %c[r13](%[svm]) \n\t"
2504                 "mov %%r14, %c[r14](%[svm]) \n\t"
2505                 "mov %%r15, %c[r15](%[svm]) \n\t"
2506 #endif
2507                 "pop %%"R"bp"
2508                 :
2509                 : [svm]"a"(svm),
2510                   [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
2511                   [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
2512                   [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
2513                   [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
2514                   [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
2515                   [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
2516                   [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
2517 #ifdef CONFIG_X86_64
2518                   , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
2519                   [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
2520                   [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
2521                   [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
2522                   [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
2523                   [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
2524                   [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
2525                   [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
2526 #endif
2527                 : "cc", "memory"
2528                 , R"bx", R"cx", R"dx", R"si", R"di"
2529 #ifdef CONFIG_X86_64
2530                 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
2531 #endif
2532                 );
2533
2534         vcpu->arch.cr2 = svm->vmcb->save.cr2;
2535         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
2536         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
2537         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
2538
2539         kvm_write_cr2(svm->host_cr2);
2540
2541         kvm_load_fs(fs_selector);
2542         kvm_load_gs(gs_selector);
2543         kvm_load_ldt(ldt_selector);
2544         load_host_msrs(vcpu);
2545
2546         reload_tss(vcpu);
2547
2548         local_irq_disable();
2549
2550         stgi();
2551
2552         sync_cr8_to_lapic(vcpu);
2553
2554         svm->next_rip = 0;
2555 }
2556
2557 #undef R
2558
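/*
 * With NPT the guest manages its own CR3 and we only program the nested
 * page table root; without NPT the shadow root goes into save.cr3 and the
 * guest FPU is deactivated (TS set, #NM intercepted), presumably so FPU
 * state is reloaded lazily on next use.
 */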
2559 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2560 {
2561         struct vcpu_svm *svm = to_svm(vcpu);
2562
2563         if (npt_enabled) {
2564                 svm->vmcb->control.nested_cr3 = root;
2565                 force_new_asid(vcpu);
2566                 return;
2567         }
2568
2569         svm->vmcb->save.cr3 = root;
2570         force_new_asid(vcpu);
2571
2572         if (vcpu->fpu_active) {
2573                 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2574                 svm->vmcb->save.cr0 |= X86_CR0_TS;
2575                 vcpu->fpu_active = 0;
2576         }
2577 }
2578
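/* The BIOS can disable SVM via the SVM_VM_CR_SVM_DISABLE bit in VM_CR. */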
2579 static int is_disabled(void)
2580 {
2581         u64 vm_cr;
2582
2583         rdmsrl(MSR_VM_CR, vm_cr);
2584         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
2585                 return 1;
2586
2587         return 0;
2588 }
2589
2590 static void
2591 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2592 {
2593         /*
2594          * Patch in the VMMCALL instruction:
2595          */
2596         hypercall[0] = 0x0f;
2597         hypercall[1] = 0x01;
2598         hypercall[2] = 0xd9;
2599 }
2600
2601 static void svm_check_processor_compat(void *rtn)
2602 {
2603         *(int *)rtn = 0;
2604 }
2605
2606 static bool svm_cpu_has_accelerated_tpr(void)
2607 {
2608         return false;
2609 }
2610
2611 static int get_npt_level(void)
2612 {
2613 #ifdef CONFIG_X86_64
2614         return PT64_ROOT_LEVEL;
2615 #else
2616         return PT32E_ROOT_LEVEL;
2617 #endif
2618 }
2619
2620 static int svm_get_mt_mask_shift(void)
2621 {
2622         return 0;
2623 }
2624
2625 static struct kvm_x86_ops svm_x86_ops = {
2626         .cpu_has_kvm_support = has_svm,
2627         .disabled_by_bios = is_disabled,
2628         .hardware_setup = svm_hardware_setup,
2629         .hardware_unsetup = svm_hardware_unsetup,
2630         .check_processor_compatibility = svm_check_processor_compat,
2631         .hardware_enable = svm_hardware_enable,
2632         .hardware_disable = svm_hardware_disable,
2633         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
2634
2635         .vcpu_create = svm_create_vcpu,
2636         .vcpu_free = svm_free_vcpu,
2637         .vcpu_reset = svm_vcpu_reset,
2638
2639         .prepare_guest_switch = svm_prepare_guest_switch,
2640         .vcpu_load = svm_vcpu_load,
2641         .vcpu_put = svm_vcpu_put,
2642
2643         .set_guest_debug = svm_guest_debug,
2644         .get_msr = svm_get_msr,
2645         .set_msr = svm_set_msr,
2646         .get_segment_base = svm_get_segment_base,
2647         .get_segment = svm_get_segment,
2648         .set_segment = svm_set_segment,
2649         .get_cpl = svm_get_cpl,
2650         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2651         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2652         .set_cr0 = svm_set_cr0,
2653         .set_cr3 = svm_set_cr3,
2654         .set_cr4 = svm_set_cr4,
2655         .set_efer = svm_set_efer,
2656         .get_idt = svm_get_idt,
2657         .set_idt = svm_set_idt,
2658         .get_gdt = svm_get_gdt,
2659         .set_gdt = svm_set_gdt,
2660         .get_dr = svm_get_dr,
2661         .set_dr = svm_set_dr,
2662         .get_rflags = svm_get_rflags,
2663         .set_rflags = svm_set_rflags,
2664
2665         .tlb_flush = svm_flush_tlb,
2666
2667         .run = svm_vcpu_run,
2668         .handle_exit = handle_exit,
2669         .skip_emulated_instruction = skip_emulated_instruction,
2670         .patch_hypercall = svm_patch_hypercall,
2671         .get_irq = svm_get_irq,
2672         .set_irq = svm_set_irq,
2673         .queue_exception = svm_queue_exception,
2674         .exception_injected = svm_exception_injected,
2675         .inject_pending_irq = svm_intr_assist,
2676         .inject_pending_vectors = do_interrupt_requests,
2677
2678         .set_tss_addr = svm_set_tss_addr,
2679         .get_tdp_level = get_npt_level,
2680         .get_mt_mask_shift = svm_get_mt_mask_shift,
2681 };
2682
2683 static int __init svm_init(void)
2684 {
2685         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
2686                               THIS_MODULE);
2687 }
2688
2689 static void __exit svm_exit(void)
2690 {
2691         kvm_exit();
2692 }
2693
2694 module_init(svm_init)
2695 module_exit(svm_exit)