4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
46 #include <asm/pgtable.h>
47 #include <asm/tlbflush.h>
48 #include <asm/fixmap.h>
49 #include <asm/mmu_context.h>
50 #include <asm/paravirt.h>
51 #include <asm/linkage.h>
53 #include <asm/xen/hypercall.h>
54 #include <asm/xen/hypervisor.h>
57 #include <xen/interface/xen.h>
59 #include "multicalls.h"
/* Number of buckets in the mmu_update_histo[] counter array below. */
63 #define MMU_UPDATE_HISTO 30
65 #ifdef CONFIG_XEN_DEBUG_FS
/*
 * Debug counters, built only when CONFIG_XEN_DEBUG_FS is set.  They are
 * incremented through ADD_STATS() and referenced as mmu_stats.<field>.
 * NOTE(review): the lines opening and closing the enclosing declaration
 * (and some of the plain "*_update" fields) are not visible in this view.
 */
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
85 u32 mmu_update_extended;
/* Histogram indexed by the number of entries in an mmu_update batch. */
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
89 u32 prot_commit_batched;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
/*
 * If the zero_stats flag is set, wipe every counter in mmu_stats.
 * NOTE(review): whether the flag is cleared afterwards is not visible in
 * this view of the function.
 */
100 static inline void check_zero(void)
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
/*
 * ADD_STATS(elem, val): add val to the counter mmu_stats.elem, honouring a
 * pending zero_stats request first (via check_zero()).
 */
108 #define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
111 #else /* !CONFIG_XEN_DEBUG_FS */
/* Stats disabled: val is still evaluated (and discarded); elem is unused. */
113 #define ADD_STATS(elem, val) do { (void)(val); } while(0)
115 #endif /* CONFIG_XEN_DEBUG_FS */
/* USER_LIMIT is STACK_TOP_MAX rounded up to a multiple of PGDIR_SIZE. */
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary.
121 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
/* How many unsigned long p2m entries fit in one page. */
124 #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
/* How many such pages are needed to cover MAX_DOMAIN_PAGES pfns. */
125 #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
/*
 * One page worth of p2m entries, every slot initialised to ~0UL.  Used as
 * the shared stand-in for p2m pages that have not been populated.
 */
127 /* Placeholder for holes in the address space */
128 static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
129 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
/*
 * Top level of the two-level pfn->mfn table.  Each slot points at a page of
 * P2M_ENTRIES_PER_PAGE entries; all slots start out pointing at p2m_missing.
 */
131 /* Array of pointers to pages containing p2m entries */
132 static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
133 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
/*
 * p2m_top_mfn[i] holds the mfn of the page p2m_top[i] points at;
 * p2m_top_mfn_list[] holds the mfns of the pages making up p2m_top_mfn[]
 * (both are filled in by xen_setup_mfn_list_list()).
 * NOTE(review): the tail of the p2m_top_mfn_list declaration is not visible
 * in this view.
 */
135 /* Arrays of p2m arrays expressed in mfns used for save/restore */
136 static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
138 static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
/*
 * Index into p2m_top[] for the given pfn.  BUGs if pfn is outside the
 * range the table can describe (>= MAX_DOMAIN_PAGES).
 */
141 static inline unsigned p2m_top_index(unsigned long pfn)
143 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
144 return pfn / P2M_ENTRIES_PER_PAGE;
/* Index of pfn's entry within its second-level p2m page. */
147 static inline unsigned p2m_index(unsigned long pfn)
149 return pfn % P2M_ENTRIES_PER_PAGE;
/*
 * Fill p2m_top_mfn[] and p2m_top_mfn_list[] from the current p2m_top[] and
 * publish them, together with the page count, in the shared info page.
 */
152 /* Build the parallel p2m_top_mfn structures */
153 void xen_setup_mfn_list_list(void)
/* One p2m_top slot per P2M_ENTRIES_PER_PAGE pfns: record that page's mfn. */
157 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn);
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
/* Record the mfn of each page-sized chunk of p2m_top_mfn[]. */
163 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
/* The real shared info page must be in place before we write to it. */
168 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
170 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
171 virt_to_mfn(p2m_top_mfn_list);
172 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
/*
 * Point each p2m_top[] slot covering pfns below
 * min(MAX_DOMAIN_PAGES, nr_pages) at the corresponding
 * P2M_ENTRIES_PER_PAGE-sized slice of xen_start_info->mfn_list.
 */
175 /* Set up p2m_top to point to the domain-builder provided p2m pages */
176 void __init xen_build_dynamic_phys_to_machine(void)
178 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
182 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn);
185 p2m_top[topidx] = &mfn_list[pfn];
/*
 * Look up the p2m entry for pfn.  Returns INVALID_P2M_ENTRY for pfns at or
 * beyond MAX_DOMAIN_PAGES, otherwise whatever the two-level table holds.
 */
189 unsigned long get_phys_to_machine(unsigned long pfn)
191 unsigned topidx, idx;
/* Checked here so p2m_top_index() below cannot hit its BUG_ON. */
193 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
194 return INVALID_P2M_ENTRY;
196 topidx = p2m_top_index(pfn);
197 idx = p2m_index(pfn);
198 return p2m_top[topidx][idx];
200 EXPORT_SYMBOL_GPL(get_phys_to_machine);
/*
 * Allocate a fresh p2m page, fill it with INVALID_P2M_ENTRY and try to
 * install it in *pp in place of p2m_missing.
 * @pp:   slot in the top-level table to populate
 * @mfnp: where the mfn of the new page is stored
 * NOTE(review): the line between free_page() and the *mfnp store is not
 * visible here; presumably the store only happens when the cmpxchg won.
 */
202 static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
210 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY;
/* Someone else populated the slot first: drop our page. */
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
214 free_page((unsigned long)p);
216 *mfnp = virt_to_mfn(p);
/*
 * Record that pfn maps to mfn in the p2m table, allocating the
 * second-level page on demand.
 * NOTE(review): the early-exit statements of the guarded branches below are
 * not visible in this view.
 */
219 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
221 unsigned topidx, idx;
/* Auto-translated guests: only identity or invalid entries are legal. */
223 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
224 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
/* Out-of-range pfns may only ever be set to "invalid". */
228 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
229 BUG_ON(mfn != INVALID_P2M_ENTRY);
233 topidx = p2m_top_index(pfn);
234 if (p2m_top[topidx] == p2m_missing) {
235 /* no need to allocate a page to store an invalid entry */
236 if (mfn == INVALID_P2M_ENTRY)
238 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
241 idx = p2m_index(pfn);
242 p2m_top[topidx][idx] = mfn;
/*
 * Translate any kernel virtual address to a machine address.  Addresses
 * accepted by virt_addr_valid() use virt_to_machine(); everything else is
 * resolved by looking up the pte and combining its mfn with the in-page
 * offset.
 * NOTE(review): any check of the lookup_address() result is not visible in
 * this view.
 */
245 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
247 unsigned long address = (unsigned long)vaddr;
253 * if the PFN is in the linear mapped vaddr range, we can just use
254 * the (quick) virt_to_machine() p2m lookup
256 if (virt_addr_valid(vaddr))
257 return virt_to_machine(vaddr);
259 /* otherwise we have to do a (slower) full page-table walk */
261 pte = lookup_address(address, &level);
263 offset = address & ~PAGE_MASK;
264 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
/*
 * Remap the page containing vaddr read-only: look up its pte, clear the
 * write permission on a copy and hand that to Xen via
 * HYPERVISOR_update_va_mapping().
 * NOTE(review): the action taken when the hypercall fails is not visible in
 * this view.
 */
267 void make_lowmem_page_readonly(void *vaddr)
270 unsigned long address = (unsigned long)vaddr;
273 pte = lookup_address(address, &level);
276 ptev = pte_wrprotect(*pte);
278 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
/*
 * Counterpart of make_lowmem_page_readonly(): set the write permission on
 * a copy of the pte mapping vaddr and install it via
 * HYPERVISOR_update_va_mapping().
 * NOTE(review): the action taken when the hypercall fails is not visible in
 * this view.
 */
282 void make_lowmem_page_readwrite(void *vaddr)
285 unsigned long address = (unsigned long)vaddr;
288 pte = lookup_address(address, &level);
291 ptev = pte_mkwrite(*pte);
293 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
/* True if the page backing the pagetable pointer ptr has PagePinned set. */
298 static bool xen_page_pinned(void *ptr)
300 struct page *page = virt_to_page(ptr);
302 return PagePinned(page);
/*
 * Queue one mmu_update request.  First try to append it to an mmu_update
 * multicall that is already pending (xen_mc_extend_args); if that is not
 * possible, start a new multicall entry carrying a single update.
 * @update: the request to queue.
 * NOTE(review): several lines of this function (the count increment, the
 * else branches and the copy of *update into the argument space) are not
 * visible in this view.
 */
305 static void xen_extend_mmu_update(const struct mmu_update *update)
307 struct multicall_space mcs;
308 struct mmu_update *u;
310 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
312 if (mcs.mc != NULL) {
313 ADD_STATS(mmu_update_extended, 1);
/*
 * NOTE(review): this decrement indexes mmu_update_histo[] with args[1]
 * without the "< MMU_UPDATE_HISTO" check applied to the increment below;
 * confirm the batch size can never reach MMU_UPDATE_HISTO here.
 */
314 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
/* args[1] is the batch's entry count; it selects the histogram bucket. */
318 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
319 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
321 ADD_STATS(mmu_update_histo[0], 1);
323 ADD_STATS(mmu_update, 1);
324 mcs = __xen_mc_entry(sizeof(*u));
325 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
326 ADD_STATS(mmu_update_histo[1], 1);
/*
 * Write a pmd entry through the hypervisor: build an mmu_update whose
 * target is the machine address of ptr and whose value is the raw
 * (machine) form of val, queue it, then issue the batch unless in lazy MMU
 * mode.
 */
333 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
341 /* ptr may be ioremapped for 64-bit pagetable setup */
342 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
343 u.val = pmd_val_ma(val);
344 xen_extend_mmu_update(&u);
346 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
348 xen_mc_issue(PARAVIRT_LAZY_MMU);
/*
 * Set a pmd entry.  The pinned state of the page holding ptr decides
 * between the unpinned path and the hypercall path (xen_set_pmd_hyper).
 * NOTE(review): the body of the unpinned branch is not visible in this view.
 */
353 void xen_set_pmd(pmd_t *ptr, pmd_t val)
355 ADD_STATS(pmd_update, 1);
357 /* If page is not pinned, we can just update the entry
359 if (!xen_page_pinned(ptr)) {
364 ADD_STATS(pmd_update_pinned, 1);
366 xen_set_pmd_hyper(ptr, val);
/* Map vaddr to machine frame mfn with protection flags via set_pte_vaddr(). */
370 * Associate a virtual page frame with a given physical page frame
371 * and protection flags for that frame.
373 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
375 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
/*
 * Install pteval at ptep, which maps addr in mm.  For the current mm or
 * init_mm the update is attempted with an update_va_mapping hypercall
 * (batched as a multicall in lazy MMU mode); xen_set_pte() is the plain
 * fallback.
 * NOTE(review): the jumps that skip the fallback after a successful
 * hypercall are not visible in this view.
 */
378 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
379 pte_t *ptep, pte_t pteval)
381 /* updates to init_mm may be done without lock */
385 ADD_STATS(set_pte_at, 1);
386 // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
387 ADD_STATS(set_pte_at_current, mm == current->mm);
388 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
390 if (mm == current->mm || mm == &init_mm) {
391 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
392 struct multicall_space mcs;
393 mcs = xen_mc_entry(0);
395 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
396 ADD_STATS(set_pte_at_batched, 1);
397 xen_mc_issue(PARAVIRT_LAZY_MMU);
400 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
403 xen_set_pte(ptep, pteval);
/*
 * Start of a pte protection-change transaction.
 * NOTE(review): the return statement is not visible in this view; per the
 * comment below the pte is presumably returned unmodified.
 */
410 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
411 unsigned long addr, pte_t *ptep)
413 /* Just return the pte as-is. We preserve the bits on commit */
/*
 * Commit a pte protection change: queue an mmu_update for ptep with the
 * MMU_PT_UPDATE_PRESERVE_AD flag or'ed into the target address, then issue
 * the batch unless in lazy MMU mode.
 */
417 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
418 pte_t *ptep, pte_t pte)
424 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
425 u.val = pte_val_ma(pte);
426 xen_extend_mmu_update(&u);
428 ADD_STATS(prot_commit, 1);
429 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
431 xen_mc_issue(PARAVIRT_LAZY_MMU);
/*
 * Convert a pagetable entry value from machine-frame form to pfn form.
 * Only entries with _PAGE_PRESENT set are translated: the frame number is
 * extracted, passed through mfn_to_pfn() and recombined with the flags.
 * NOTE(review): the return statement is not visible in this view.
 */
434 /* Assume pteval_t is equivalent to all the other *val_t types. */
435 static pteval_t pte_mfn_to_pfn(pteval_t val)
437 if (val & _PAGE_PRESENT) {
438 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
439 pteval_t flags = val & PTE_FLAGS_MASK;
440 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
/*
 * Inverse of pte_mfn_to_pfn(): for present entries, replace the pfn in val
 * with pfn_to_mfn(pfn) while keeping the flag bits.
 * NOTE(review): the return statement is not visible in this view.
 */
446 static pteval_t pte_pfn_to_mfn(pteval_t val)
448 if (val & _PAGE_PRESENT) {
449 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
450 pteval_t flags = val & PTE_FLAGS_MASK;
451 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
/* Read a pte's value, translating its frame number from mfn to pfn. */
457 pteval_t xen_pte_val(pte_t pte)
459 return pte_mfn_to_pfn(pte.pte);
/* Read a pgd's value, translating its frame number from mfn to pfn. */
462 pgdval_t xen_pgd_val(pgd_t pgd)
464 return pte_mfn_to_pfn(pgd.pgd);
/* Build a pte from a pfn-based value by first translating pfn to mfn. */
467 pte_t xen_make_pte(pteval_t pte)
469 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte);
/* Build a pgd from a pfn-based value by first translating pfn to mfn. */
473 pgd_t xen_make_pgd(pgdval_t pgd)
475 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd);
/* Read a pmd's value, translating its frame number from mfn to pfn. */
479 pmdval_t xen_pmd_val(pmd_t pmd)
481 return pte_mfn_to_pfn(pmd.pmd);
/*
 * Write a pud entry through the hypervisor: queue an mmu_update targeting
 * the machine address of ptr with the raw (machine) form of val, then issue
 * the batch unless in lazy MMU mode.
 */
484 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
492 /* ptr may be ioremapped for 64-bit pagetable setup */
493 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
494 u.val = pud_val_ma(val);
495 xen_extend_mmu_update(&u);
497 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
499 xen_mc_issue(PARAVIRT_LAZY_MMU);
/*
 * Set a pud entry.  The pinned state of the page holding ptr decides
 * between the unpinned path and the hypercall path (xen_set_pud_hyper).
 * NOTE(review): the body of the unpinned branch is not visible in this view.
 */
504 void xen_set_pud(pud_t *ptr, pud_t val)
506 ADD_STATS(pud_update, 1);
508 /* If page is not pinned, we can just update the entry
510 if (!xen_page_pinned(ptr)) {
515 ADD_STATS(pud_update_pinned, 1);
517 xen_set_pud_hyper(ptr, val);
/*
 * Store pte into *ptep with plain memory writes.  Under CONFIG_X86_PAE the
 * high word is written before the low word.
 * NOTE(review): the non-PAE branch and any barrier between the two stores
 * are not visible in this view.
 */
520 void xen_set_pte(pte_t *ptep, pte_t pte)
522 ADD_STATS(pte_update, 1);
523 // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
524 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
526 #ifdef CONFIG_X86_PAE
527 ptep->pte_high = pte.pte_high;
529 ptep->pte_low = pte.pte_low;
535 #ifdef CONFIG_X86_PAE
/* PAE only: store the whole 64-bit pte in one set_64bit() operation. */
536 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
538 set_64bit((u64 *)ptep, native_pte_val(pte));
/*
 * PAE only: clear a pte.
 * NOTE(review): the stores to the pte halves are not visible in this view;
 * only the write barrier that orders them is.
 */
541 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
544 smp_wmb(); /* make sure low gets written first */
/* PAE only: clear a pmd entry by setting it to zero through set_pmd(). */
548 void xen_pmd_clear(pmd_t *pmdp)
550 set_pmd(pmdp, __pmd(0));
552 #endif /* CONFIG_X86_PAE */
/* Build a pmd from a pfn-based value by first translating pfn to mfn. */
554 pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd);
560 #if PAGETABLE_LEVELS == 4
/* Read a pud's value, translating its frame number from mfn to pfn. */
561 pudval_t xen_pud_val(pud_t pud)
563 return pte_mfn_to_pfn(pud.pud);
/* Build a pud from a pfn-based value by first translating pfn to mfn. */
566 pud_t xen_make_pud(pudval_t pud)
568 pud = pte_pfn_to_mfn(pud);
570 return native_make_pud(pud);
/*
 * Given a pointer to a pgd entry, find the user pagetable recorded in the
 * private field of the page holding that pgd.  Only entries whose index is
 * below pgd_index(USER_LIMIT) have a counterpart; otherwise user_ptr stays
 * NULL.
 * NOTE(review): the remainder of the function (any offset adjustment and
 * the return) is not visible in this view.
 */
573 pgd_t *xen_get_user_pgd(pgd_t *pgd)
575 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
576 unsigned offset = pgd - pgd_page;
577 pgd_t *user_ptr = NULL;
579 if (offset < pgd_index(USER_LIMIT)) {
580 struct page *page = virt_to_page(pgd_page);
581 user_ptr = (pgd_t *)page->private;
/*
 * Queue (but do not issue) an mmu_update that writes val to the pgd entry
 * at ptr.  Uses virt_to_machine(), not the arbitrary_virt_to_machine()
 * lookup used by the pmd/pud setters.
 */
589 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
593 u.ptr = virt_to_machine(ptr).maddr;
594 u.val = pgd_val_ma(val);
595 xen_extend_mmu_update(&u);
/* Queue a pgd update via __xen_set_pgd_hyper() and issue the batch. */
599 * Raw hypercall-based set_pgd, intended for in early boot before
600 * there's a page structure. This implies:
601 * 1. The only existing pagetable is the kernel's
602 * 2. It is always pinned
603 * 3. It has no user pagetable attached to it
605 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
611 __xen_set_pgd_hyper(ptr, val);
613 xen_mc_issue(PARAVIRT_LAZY_MMU);
/*
 * Set a pgd entry and, where one exists, the matching entry in the user
 * pagetable (see xen_get_user_pgd()).  For a pinned pgd both updates are
 * queued into one multicall batch before it is issued.
 * NOTE(review): the body of the unpinned branch and the condition guarding
 * the user_ptr update are not visible in this view.
 */
618 void xen_set_pgd(pgd_t *ptr, pgd_t val)
620 pgd_t *user_ptr = xen_get_user_pgd(ptr);
622 ADD_STATS(pgd_update, 1);
624 /* If page is not pinned, we can just update the entry
626 if (!xen_page_pinned(ptr)) {
/* An unpinned kernel pgd should not have a pinned user counterpart. */
629 WARN_ON(xen_page_pinned(user_ptr));
635 ADD_STATS(pgd_update_pinned, 1);
636 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
638 /* If it's pinned, then we can at least batch the kernel and
639 user updates together. */
642 __xen_set_pgd_hyper(ptr, val);
644 __xen_set_pgd_hyper(user_ptr, val);
646 xen_mc_issue(PARAVIRT_LAZY_MMU);
648 #endif /* PAGETABLE_LEVELS == 4 */
/*
 * Walk the pagetable rooted at pgd and call func on every page that makes
 * it up (pud, pmd and pte pages, then the pgd page itself last), or-ing the
 * callback results into a flush indicator.
 * NOTE(review): the loop "continue"/"break" statements, the early return
 * for auto-translated guests and the final return are not visible in this
 * view.
 */
651 * (Yet another) pagetable walker. This one is intended for pinning a
652 * pagetable. This means that it walks a pagetable and calls the
653 * callback function on each page it finds making up the page table,
654 * at every level. It walks the entire pagetable, but it only bothers
655 * pinning pte pages which are below limit. In the normal case this
656 * will be STACK_TOP_MAX, but at boot we need to pin up to
659 * For 32-bit the important bit is that we don't pin beyond there,
660 * because then we start getting into Xen's ptes.
662 * For 64-bit, we must skip the Xen hole in the middle of the address
663 * space, just after the big x86-64 virtual hole.
665 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
666 int (*func)(struct mm_struct *mm, struct page *,
671 unsigned hole_low, hole_high;
672 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
673 unsigned pgdidx, pudidx, pmdidx;
675 /* The limit is the last byte to be touched */
677 BUG_ON(limit >= FIXADDR_TOP);
679 if (xen_feature(XENFEAT_auto_translated_physmap))
683 * 64-bit has a great big hole in the middle of the address
684 * space, which contains the Xen mappings. On 32-bit these
685 * will end up making a zero-sized hole and so is a no-op.
687 hole_low = pgd_index(USER_LIMIT);
688 hole_high = pgd_index(PAGE_OFFSET);
/* Per-level indices of the last entry that may be visited. */
690 pgdidx_limit = pgd_index(limit);
692 pudidx_limit = pud_index(limit);
697 pmdidx_limit = pmd_index(limit);
702 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
/* pgd slots inside [hole_low, hole_high) are tested for here. */
705 if (pgdidx >= hole_low && pgdidx < hole_high)
708 if (!pgd_val(pgd[pgdidx]))
711 pud = pud_offset(&pgd[pgdidx], 0);
713 if (PTRS_PER_PUD > 1) /* not folded */
714 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
716 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
/* The pud limit only applies within the last pgd slot. */
719 if (pgdidx == pgdidx_limit &&
720 pudidx > pudidx_limit)
723 if (pud_none(pud[pudidx]))
726 pmd = pmd_offset(&pud[pudidx], 0);
728 if (PTRS_PER_PMD > 1) /* not folded */
729 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
731 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
/* Likewise, the pmd limit only applies on the very last pud. */
734 if (pgdidx == pgdidx_limit &&
735 pudidx == pudidx_limit &&
736 pmdidx > pmdidx_limit)
739 if (pmd_none(pmd[pmdidx]))
742 pte = pmd_page(pmd[pmdidx]);
743 flush |= (*func)(mm, pte, PT_PTE);
749 /* Do the top level last, so that the callbacks can use it as
750 a cue to do final things like tlb flushes. */
751 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
/* Convenience wrapper: walk mm's own pgd with __xen_pgd_walk(). */
756 static int xen_pgd_walk(struct mm_struct *mm,
757 int (*func)(struct mm_struct *mm, struct page *,
761 return __xen_pgd_walk(mm, mm->pgd, func, limit);
/*
 * With USE_SPLIT_PTLOCKS the per-page lock is taken nested under
 * mm->page_table_lock (spin_lock_nest_lock).
 * NOTE(review): the #endif and return are not visible in this view.
 */
764 /* If we're using split pte locks, then take the page's lock and
765 return a pointer to it. Otherwise return NULL. */
766 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
768 spinlock_t *ptl = NULL;
770 #if USE_SPLIT_PTLOCKS
771 ptl = __pte_lockptr(page);
772 spin_lock_nest_lock(ptl, &mm->page_table_lock);
/*
 * Multicall completion callback; it is registered with the lock returned by
 * xen_pte_lock() as its argument.
 * NOTE(review): the body is not visible in this view; presumably it
 * releases that lock.
 */
778 static void xen_pte_unlock(void *v)
/*
 * Queue a single mmuext_op multicall acting on the machine frame behind
 * pfn.  Callers pass one of the MMUEXT_PIN_*_TABLE or MMUEXT_UNPIN_TABLE
 * commands as level.
 * NOTE(review): the lines that point op at the multicall argument space and
 * store level in it are not visible in this view.
 */
784 static void xen_do_pin(unsigned level, unsigned long pfn)
786 struct mmuext_op *op;
787 struct multicall_space mcs;
789 mcs = __xen_mc_entry(sizeof(*op));
792 op->arg1.mfn = pfn_to_mfn(pfn);
793 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
/*
 * Walker callback used when pinning.  Marks the page pinned
 * (TestSetPagePinned) and, for a lowmem page that was not already pinned,
 * queues a multicall remapping it read-only (PAGE_KERNEL_RO); the pgd level
 * additionally requests UVMF_TLB_FLUSH.
 * NOTE(review): the conditions around xen_pte_lock(), the L1 pin and the
 * unlock callback, the highmem branch body and the return are not visible
 * in this view.
 */
796 static int xen_pin_page(struct mm_struct *mm, struct page *page,
799 unsigned pgfl = TestSetPagePinned(page);
803 flush = 0; /* already pinned */
804 else if (PageHighMem(page))
805 /* kmaps need flushing if we found an unpinned
809 void *pt = lowmem_page_address(page);
810 unsigned long pfn = page_to_pfn(page);
811 struct multicall_space mcs = __xen_mc_entry(0);
817 * We need to hold the pagetable lock between the time
818 * we make the pagetable RO and when we actually pin
819 * it. If we don't, then other users may come in and
820 * attempt to update the pagetable by writing it,
821 * which will fail because the memory is RO but not
822 * pinned, so Xen won't do the trap'n'emulate.
824 * If we're using split pte locks, we can't hold the
825 * entire pagetable's worth of locks during the
826 * traverse, because we may wrap the preempt count (8
827 * bits). The solution is to mark RO and pin each PTE
828 * page while holding the lock. This means the number
829 * of locks we end up holding is never more than a
830 * batch size (~32 entries, at present).
832 * If we're not using split pte locks, we needn't pin
833 * the PTE pages independently, because we're
834 * protected by the overall pagetable lock.
838 ptl = xen_pte_lock(page, mm);
840 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
841 pfn_pte(pfn, PAGE_KERNEL_RO),
842 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
845 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
847 /* Queue a deferred unlock for when this batch
849 xen_mc_callback(xen_pte_unlock, ptl);
/*
 * Pin the pagetable rooted at pgd: run xen_pin_page over every page up to
 * USER_LIMIT, then pin the top level (L4 on 64-bit, together with the user
 * pgd; L3 on 32-bit).
 * NOTE(review): the multicall batch begin/end, the flush handling and the
 * preprocessor conditionals separating the 64-bit and 32-bit paths are only
 * partly visible in this view.
 */
856 /* This is called just after a mm has been created, but it has not
857 been used yet. We need to make sure that its pagetable is all
858 read-only, and can be pinned. */
859 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
865 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
866 /* re-enable interrupts for flushing */
876 pgd_t *user_pgd = xen_get_user_pgd(pgd);
878 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
881 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
882 xen_do_pin(MMUEXT_PIN_L4_TABLE,
883 PFN_DOWN(__pa(user_pgd)));
886 #else /* CONFIG_X86_32 */
887 #ifdef CONFIG_X86_PAE
888 /* Need to make sure unshared kernel PMD is pinnable */
889 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
892 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
893 #endif /* CONFIG_X86_64 */
/* Pin mm's own pagetable (mm->pgd). */
897 static void xen_pgd_pin(struct mm_struct *mm)
899 __xen_pgd_pin(mm, mm->pgd);
/*
 * Under pgd_lock, pin every pgd on pgd_list that is not already pinned and
 * tag it SavePinned so xen_mm_unpin_all() can undo exactly those pins.
 */
903 * On save, we need to pin all pagetables to make sure they get their
904 * mfns turned into pfns. Search the list for any unpinned pgds and pin
905 * them (unpinned pgds are not currently in use, probably because the
906 * process is under construction or destruction).
908 * Expected to be called in stop_machine() ("equivalent to taking
909 * every spinlock in the system"), so the locking doesn't really
910 * matter all that much.
912 void xen_mm_pin_all(void)
917 spin_lock_irqsave(&pgd_lock, flags);
919 list_for_each_entry(page, &pgd_list, lru) {
920 if (!PagePinned(page)) {
/* init_mm is passed as the mm here, not the pgd's owning mm. */
921 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
922 SetPageSavePinned(page);
926 spin_unlock_irqrestore(&pgd_lock, flags);
930 * The init_mm pagetable is really pinned as soon as it's created, but
931 * that's before we have page structures to store the bits. So do all
932 * the book-keeping now.
/*
 * Walker callback passed to xen_pgd_walk() by xen_mark_init_mm_pinned().
 * NOTE(review): the body is not visible in this view; presumably it sets
 * the Pinned flag on the page and requests no flush.
 */
934 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
/* Run xen_mark_pinned over every init_mm pagetable page up to FIXADDR_TOP. */
941 void __init xen_mark_init_mm_pinned(void)
943 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
/*
 * Walker callback used when unpinning.  Clears the Pinned flag and, for a
 * lowmem page that was pinned, queues a multicall remapping it read-write
 * (PAGE_KERNEL); the pgd level additionally requests UVMF_TLB_FLUSH.  PTE
 * pages are locked first and unpinned with MMUEXT_UNPIN_TABLE.  Always
 * returns 0.
 * NOTE(review): the conditions guarding the unpin and the unlock callback
 * are only partly visible in this view.
 */
946 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
949 unsigned pgfl = TestClearPagePinned(page);
951 if (pgfl && !PageHighMem(page)) {
952 void *pt = lowmem_page_address(page);
953 unsigned long pfn = page_to_pfn(page);
954 spinlock_t *ptl = NULL;
955 struct multicall_space mcs;
958 * Do the converse to pin_page. If we're using split
959 * pte locks, we must be holding the lock for while
960 * the pte page is unpinned but still RO to prevent
961 * concurrent updates from seeing it in this
962 * partially-pinned state.
964 if (level == PT_PTE) {
965 ptl = xen_pte_lock(page, mm);
968 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
971 mcs = __xen_mc_entry(0);
973 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
974 pfn_pte(pfn, PAGE_KERNEL),
975 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
978 /* unlock when batch completed */
979 xen_mc_callback(xen_pte_unlock, ptl);
983 return 0; /* never need to flush on unpin */
/*
 * Unpin the pagetable rooted at pgd: unpin the top level first (and the
 * user pgd where one exists), then run xen_unpin_page over every page up to
 * USER_LIMIT to make them writable again.
 * NOTE(review): the multicall batch begin/end and the conditionals around
 * the user-pgd and PAE sections are not visible in this view.
 */
986 /* Release a pagetables pages back as normal RW */
987 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
991 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
995 pgd_t *user_pgd = xen_get_user_pgd(pgd);
998 xen_do_pin(MMUEXT_UNPIN_TABLE,
999 PFN_DOWN(__pa(user_pgd)));
1000 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1005 #ifdef CONFIG_X86_PAE
1006 /* Need to make sure unshared kernel PMD is unpinned */
1007 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1011 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
/* Unpin mm's own pagetable (mm->pgd). */
1016 static void xen_pgd_unpin(struct mm_struct *mm)
1018 __xen_pgd_unpin(mm, mm->pgd);
/*
 * Under pgd_lock, unpin every pgd on pgd_list that xen_mm_pin_all() tagged
 * SavePinned, and clear the tag.  A tagged page that is no longer pinned is
 * treated as a bug.
 */
1022 * On resume, undo any pinning done at save, so that the rest of the
1023 * kernel doesn't see any unexpected pinned pagetables.
1025 void xen_mm_unpin_all(void)
1027 unsigned long flags;
1030 spin_lock_irqsave(&pgd_lock, flags);
1032 list_for_each_entry(page, &pgd_list, lru) {
1033 if (PageSavePinned(page)) {
1034 BUG_ON(!PagePinned(page));
1035 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1036 ClearPageSavePinned(page);
1040 spin_unlock_irqrestore(&pgd_lock, flags);
/*
 * Hook run when switching to mm `next`; does its work with
 * next->page_table_lock held.
 * NOTE(review): the statement between lock and unlock is not visible in
 * this view; presumably it pins next's pagetable.
 */
1043 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1045 spin_lock(&next->page_table_lock);
1047 spin_unlock(&next->page_table_lock);
/*
 * Hook run when oldmm is duplicated into mm; does its work with
 * mm->page_table_lock held.
 * NOTE(review): the statement between lock and unlock is not visible in
 * this view; presumably it pins the new mm's pagetable.
 */
1050 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1052 spin_lock(&mm->page_table_lock);
1054 spin_unlock(&mm->page_table_lock);
/*
 * Cross-CPU callback (info is the mm being torn down).  If this CPU's
 * active_mm is that mm, leave it; if this CPU's recorded cr3 still refers
 * to the mm's pgd, switch to swapper_pg_dir and flush pending lazy work.
 * NOTE(review): the #else/#endif separating the two active_mm reads is not
 * visible in this view.
 */
1059 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1060 we need to repoint it somewhere else before we can unpin it. */
1061 static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm;
1066 #ifdef CONFIG_X86_64
1067 active_mm = read_pda(active_mm);
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1072 if (active_mm == mm)
1073 leave_mm(smp_processor_id());
1075 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode();
/*
 * Make sure no CPU is still using mm's pagetable: drop the local reference
 * first, then run drop_other_mm_ref() on every CPU in mm->cpu_vm_mask plus
 * any CPU whose recorded cr3 still points at mm->pgd.
 * NOTE(review): this appears to be the SMP variant (a second definition
 * follows); the surrounding #ifdef, the else line and the statement adding
 * a CPU to mask are not visible in this view.
 */
1083 static void xen_drop_mm_ref(struct mm_struct *mm)
1088 if (current->active_mm == mm) {
1089 if (current->mm == mm)
1090 load_cr3(swapper_pg_dir);
1092 leave_mm(smp_processor_id());
1093 arch_flush_lazy_cpu_mode();
1096 /* Get the "official" set of cpus referring to our pagetable. */
1097 mask = mm->cpu_vm_mask;
1099 /* It's possible that a vcpu may have a stale reference to our
1100 cr3, because its in lazy mode, and it hasn't yet flushed
1101 its set of pending hypercalls yet. In this case, we can
1102 look at its actual current cr3 value, and force it to flush
1104 for_each_online_cpu(cpu) {
1105 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1109 if (!cpus_empty(mask))
1110 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
/*
 * Second definition of xen_drop_mm_ref(): only the local CPU is handled, by
 * switching to swapper_pg_dir if mm is the current active_mm.
 * NOTE(review): presumably the !CONFIG_SMP variant; the selecting
 * preprocessor lines are not visible in this view.
 */
1113 static void xen_drop_mm_ref(struct mm_struct *mm)
1115 if (current->active_mm == mm)
1116 load_cr3(swapper_pg_dir);
1121 * While a process runs, Xen pins its pagetables, which means that the
1122 * hypervisor forces it to be read-only, and it controls all updates
1123 * to it. This means that all pagetable updates have to go via the
1124 * hypervisor, which is moderately expensive.
1126 * Since we're pulling the pagetable down, we switch to use init_mm,
1127 * unpin old process pagetable and mark it all read-write, which
1128 * allows further operations on it to be simple memory accesses.
1130 * The only subtle point is that another CPU may be still using the
1131 * pagetable because of lazy tlb flushing. This means we need to
1132 * switch all CPUs off this pagetable before we can unpin it.
/*
 * Called while mm is being torn down: move every CPU off mm's pagetable
 * (xen_drop_mm_ref), then, with mm->page_table_lock held, deal with the pgd
 * if it is still pinned.
 * NOTE(review): the matching put_cpu() and the statement guarded by the
 * pinned check are not visible in this view; presumably the latter unpins
 * the pagetable.
 */
1134 void xen_exit_mmap(struct mm_struct *mm)
1136 get_cpu(); /* make sure we don't move around */
1137 xen_drop_mm_ref(mm);
1140 spin_lock(&mm->page_table_lock);
1142 /* pgd may not be pinned in the error exit path of execve */
1143 if (xen_page_pinned(mm->pgd))
1146 spin_unlock(&mm->page_table_lock);
1149 #ifdef CONFIG_XEN_DEBUG_FS
/* The "mmu" directory created under the Xen debugfs root. */
1151 static struct dentry *d_mmu_debug;
/*
 * Create the "mmu" debugfs directory and one file per mmu_stats counter,
 * plus the writable "zero_stats" trigger.  Registered via fs_initcall().
 *
 * Each "*_update_batched" file is bound to its own *_update_batched
 * counter (these were previously bound to the *_update_pinned counters),
 * and the histogram file exposes the whole mmu_update_histo[] array
 * rather than a hard-coded 20 entries.
 * NOTE(review): the check of the xen_init_debugfs() result and the return
 * statement are not visible in this view.
 */
1153 static int __init xen_mmu_debugfs(void)
1155 struct dentry *d_xen = xen_init_debugfs();
1160 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1162 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1164 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1165 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1166 &mmu_stats.pgd_update_pinned);
1167 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1168 &mmu_stats.pgd_update_batched);
1170 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1171 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1172 &mmu_stats.pud_update_pinned);
1173 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1174 &mmu_stats.pud_update_batched);
1176 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1177 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1178 &mmu_stats.pmd_update_pinned);
1179 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1180 &mmu_stats.pmd_update_batched);
1182 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1183 // debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1184 // &mmu_stats.pte_update_pinned);
1185 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1186 &mmu_stats.pte_update_batched);
1188 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1189 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1190 &mmu_stats.mmu_update_extended);
1191 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1192 mmu_stats.mmu_update_histo, ARRAY_SIZE(mmu_stats.mmu_update_histo));
1194 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1195 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1196 &mmu_stats.set_pte_at_batched);
1197 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1198 &mmu_stats.set_pte_at_current);
1199 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1200 &mmu_stats.set_pte_at_kernel);
1202 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1203 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1204 &mmu_stats.prot_commit_batched);
1208 fs_initcall(xen_mmu_debugfs);
1210 #endif /* CONFIG_XEN_DEBUG_FS */