/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include "vmx.h"
#include "kvm.h"

#define pgprintk(x...) do { } while (0)

#define ASSERT(x)						\
	if (!(x)) {						\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);			\
	}
#define PT64_ENT_PER_PAGE 512
#define PT32_ENT_PER_PAGE 1024

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

#define PT32_PTE_COPY_MASK \
	(PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)

#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)

#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))

#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))


#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
	return pte & PT_SHADOW_IO_MARK;
}
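
/*
 * Return a shadow page to the vcpu's free list so it can be reused for a
 * later shadow page table allocation.
 */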
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
	struct kvm_mmu_page *page_head = page_header(page_hpa);

	list_del(&page_head->link);
	page_head->page_hpa = page_hpa;
	list_add(&page_head->link, &vcpu->free_pages);
}

static int is_empty_shadow_page(hpa_t page_hpa)
{
	u32 *pos;
	u32 *end;

	for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
	     pos != end; pos++)
		if (*pos != 0)
			return 0;
	return 1;
}
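
/*
 * Take a zeroed shadow page from the vcpu's free list, move it to the VM's
 * active list and remember which shadow pte points at it.  Returns
 * INVALID_PAGE when the per-vcpu pool is exhausted.
 */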
static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
{
	struct kvm_mmu_page *page;

	if (list_empty(&vcpu->free_pages))
		return INVALID_PAGE;

	page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
	list_del(&page->link);
	list_add(&page->link, &vcpu->kvm->active_mmu_pages);
	ASSERT(is_empty_shadow_page(page->page_hpa));
	page->slot_bitmap = 0;
	page->global = 1;
	page->parent_pte = parent_pte;
	return page->page_hpa;
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
	struct kvm_mmu_page *page_head = page_header(__pa(pte));

	__set_bit(slot, &page_head->slot_bitmap);
}

hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	hpa_t hpa = gpa_to_hpa(vcpu, gpa);

	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
}
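
/*
 * Translate a guest physical address to a host physical address by looking
 * up the memory slot that contains it.  Addresses with no backing slot are
 * flagged with HPA_ERR_MASK so callers can detect unmapped/MMIO ranges.
 */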
hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct kvm_memory_slot *slot;
	struct page *page;

	ASSERT((gpa & HPA_ERR_MASK) == 0);
	slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!slot)
		return gpa | HPA_ERR_MASK;
	page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
	return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
		| (gpa & (PAGE_SIZE-1));
}
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return UNMAPPED_GVA;
	return gpa_to_hpa(vcpu, gpa);
}
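
/*
 * Recursively tear down a shadow page table page: clear its entries, free
 * any lower-level shadow pages they reference, and put the page itself back
 * on the vcpu's free list.
 */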
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
			       int level)
{
	ASSERT(VALID_PAGE(page_hpa));
	ASSERT(level <= PT64_ROOT_LEVEL && level > 0);

	if (level == 1)
		memset(__va(page_hpa), 0, PAGE_SIZE);
	else {
		u64 *pos;
		u64 *end;

		for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
		     pos != end; pos++) {
			u64 current_ent = *pos;

			*pos = 0;
			if (is_present_pte(current_ent))
				release_pt_page_64(vcpu,
						   current_ent &
						   PT64_BASE_ADDR_MASK,
						   level - 1);
		}
	}
	kvm_mmu_free_page(vcpu, page_hpa);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
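
/*
 * Map a guest page while the guest runs with paging disabled: walk the
 * shadow page table from the root, allocating intermediate levels on
 * demand, and install the host physical address in the leaf entry.
 */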
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
	int level = PT32E_ROOT_LEVEL;
	hpa_t table_addr = vcpu->mmu.root_hpa;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
			page_header_update_slot(vcpu->kvm, table, v);
			table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
				       PT_USER_MASK;
			return 0;
		}

		if (table[index] == 0) {
			hpa_t new_table = kvm_mmu_alloc_page(vcpu,
							     &table[index]);

			if (!VALID_PAGE(new_table)) {
				pgprintk("nonpaging_map: ENOMEM\n");
				return -ENOMEM;
			}

			if (level == PT32E_ROOT_LEVEL)
				table[index] = new_table | PT_PRESENT_MASK;
			else
				table[index] = new_table | PT_PRESENT_MASK |
					       PT_WRITABLE_MASK | PT_USER_MASK;
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}
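
/*
 * Throw away the whole shadow tree and start over with a freshly allocated
 * root, then reload CR3 and flush the hardware TLB.  Used to reclaim shadow
 * pages when the non-paging mapper runs out of them.
 */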
static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
	hpa_t root = vcpu->mmu.root_hpa;

	++kvm_stat.tlb_flush;
	pgprintk("nonpaging_flush\n");
	ASSERT(VALID_PAGE(root));
	release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
	root = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(root));
	vcpu->mmu.root_hpa = root;
	if (is_paging(vcpu))
		root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
	kvm_arch_ops->set_cr3(vcpu, root);
	kvm_arch_ops->tlb_flush(vcpu);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code)
{
	int ret;
	gpa_t addr = gva;

	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

	for (;;) {
		hpa_t paddr;

		paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);

		if (is_error_hpa(paddr))
			return 1;

		ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
		if (ret) {
			nonpaging_flush(vcpu);
			continue;
		}
		break;
	}
	return ret;
}

static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	hpa_t root;

	root = vcpu->mmu.root_hpa;
	if (VALID_PAGE(root))
		release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
	vcpu->mmu.root_hpa = INVALID_PAGE;
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->inval_page = nonpaging_inval_page;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->root_level = PT32E_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
	return 0;
}
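
/*
 * Emulate a guest TLB flush: unlink and release the non-global shadow pages
 * whose translations may now be stale, then flush the hardware TLB so they
 * are rebuilt on the next page fault.
 */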
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *page, *npage;

	list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
				 link) {
		if (page->global)
			continue;

		if (!page->parent_pte)
			continue;

		*page->parent_pte = 0;
		release_pt_page_64(vcpu, page->page_hpa, 1);
	}
	++kvm_stat.tlb_flush;
	kvm_arch_ops->tlb_flush(vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	kvm_mmu_flush_tlb(vcpu);
}

static void mark_pagetable_nonglobal(void *shadow_pte)
{
	page_header(__pa(shadow_pte))->global = 0;
}
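
/*
 * Common leaf shadow-pte setup shared by the 32-bit and 64-bit walkers:
 * stash the guest access bits in the shadow-only bit range, mark the guest
 * page dirty when a writable mapping is installed, and point the shadow pte
 * at the host page, or mark it as an I/O pte when the guest address has no
 * memory slot behind it.
 */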
static inline void set_pte_common(struct kvm_vcpu *vcpu,
				  u64 *shadow_pte,
				  gpa_t gaddr,
				  int dirty,
				  u64 access_bits)
{
	hpa_t paddr;

	*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
	if (!dirty)
		access_bits &= ~PT_WRITABLE_MASK;

	if (access_bits & PT_WRITABLE_MASK)
		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);

	*shadow_pte |= access_bits;

	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);

	if (!(*shadow_pte & PT_GLOBAL_MASK))
		mark_pagetable_nonglobal(shadow_pte);

	if (is_error_hpa(paddr)) {
		*shadow_pte |= gaddr;
		*shadow_pte |= PT_SHADOW_IO_MARK;
		*shadow_pte &= ~PT_PRESENT_MASK;
	} else {
		*shadow_pte |= paddr;
		page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
	}
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}
static inline int fix_read_pf(u64 *shadow_ent)
{
	if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
	    !(*shadow_ent & PT_USER_MASK)) {
		/*
		 * If supervisor write protect is disabled, we shadow kernel
		 * pages as user pages so we can trap the write access.
		 */
		*shadow_ent |= PT_USER_MASK;
		*shadow_ent &= ~PT_WRITABLE_MASK;

		return 1;
	}
	return 0;
}
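
/*
 * Check a guest pte's permission bits against the type of access that
 * caused the fault (write and/or user mode).
 */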
static int may_access(u64 pte, int write, int user)
{
	if (user && !(pte & PT_USER_MASK))
		return 0;
	if (write && !(pte & PT_WRITABLE_MASK))
		return 0;
	return 1;
}
/*
 * Remove a shadow pte.
 */
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
	hpa_t page_addr = vcpu->mmu.root_hpa;
	int level = vcpu->mmu.shadow_root_level;

	++kvm_stat.invlpg;

	for (; ; level--) {
		u32 index = PT64_INDEX(addr, level);
		u64 *table = __va(page_addr);

		if (level == PT_PAGE_TABLE_LEVEL) {
			table[index] = 0;
			return;
		}

		if (!is_present_pte(table[index]))
			return;

		page_addr = table[index] & PT64_BASE_ADDR_MASK;

		if (level == PT_DIRECTORY_LEVEL &&
		    (table[index] & PT_SHADOW_PS_MARK)) {
			table[index] = 0;
			release_pt_page_64(vcpu, page_addr,
					   PT_PAGE_TABLE_LEVEL);
			kvm_arch_ops->tlb_flush(vcpu);
			return;
		}
	}
}
static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}
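
/*
 * paging_tmpl.h is included twice, once per guest page table entry width,
 * to generate the paging64_* and paging32_* page fault and gva_to_gpa
 * walkers referenced by the init functions below.
 */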
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->inval_page = paging_inval_page;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->free = paging_free;
	context->root_level = PT64_ROOT_LEVEL;
	context->shadow_root_level = PT64_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
			      (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
	return 0;
}
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->inval_page = paging_inval_page;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
	ASSERT(VALID_PAGE(context->root_hpa));
	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
			      (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
	return 0;
}
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	int ret;

	if ((ret = paging64_init_context(vcpu)))
		return ret;

	vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
	vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
	return 0;
}
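
/*
 * Pick the shadow MMU flavor that matches the guest's current paging mode:
 * no paging, 64-bit long mode, PAE, or plain 32-bit paging.
 */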
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
	else if (kvm_arch_ops->is_long_mode(vcpu))
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	if (VALID_PAGE(vcpu->mmu.root_hpa)) {
		vcpu->mmu.free(vcpu);
		vcpu->mmu.root_hpa = INVALID_PAGE;
	}
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	while (!list_empty(&vcpu->free_pages)) {
		struct kvm_mmu_page *page;

		page = list_entry(vcpu->free_pages.next,
				  struct kvm_mmu_page, link);
		list_del(&page->link);
		__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
		page->page_hpa = INVALID_PAGE;
	}
}
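
/*
 * Preallocate the fixed pool of shadow pages this vcpu will draw from, and
 * link each page to its kvm_mmu_page header.
 */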
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
		struct page *page;
		struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];

		INIT_LIST_HEAD(&page_header->link);
		if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
			goto error_1;
		page->private = (unsigned long)page_header;
		page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
		memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
		list_add(&page_header->link, &vcpu->free_pages);
	}
	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}
int kvm_mmu_init(struct kvm_vcpu *vcpu)
{
	int r;

	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
	ASSERT(list_empty(&vcpu->free_pages));

	if ((r = alloc_mmu_pages(vcpu)))
		return r;

	if ((r = init_kvm_mmu(vcpu))) {
		free_mmu_pages(vcpu);
		return r;
	}
	return 0;
}
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
}
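
/*
 * Remove write access from every shadow pte that maps pages belonging to
 * the given memory slot, so that subsequent guest writes fault and can be
 * tracked (e.g. for dirty page logging).
 */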
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *page;

	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, &page->slot_bitmap))
			continue;

		pt = __va(page->page_hpa);
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
			if (pt[i] & PT_WRITABLE_MASK)
				pt[i] &= ~PT_WRITABLE_MASK;
	}
}