/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START (0xfee00000)
#define IOAPIC_RANGE_END (0xfeefffff)
#define IOVA_START_ADDR (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
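/*
 * Worked example of the macros above (illustrative, assuming 4KB pages,
 * i.e. PAGE_SHIFT == 12): DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1 ==
 * 0x0000ffffffffffff, IOVA_PFN(0x1000) == 1, and DMA_32BIT_PFN is the
 * frame number of the last page that still fits below the 4GB boundary.
 */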
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
67 * 12-63: Context Ptr (12 - (haw-1))
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
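/*
 * The root table occupies one VTD_PAGE_SIZE page and is indexed by PCI
 * bus number (device_to_context_entry() below does
 * iommu->root_entry[bus]).  A present root entry points to a context
 * table page, which is in turn indexed by devfn:
 *
 *   root_entry[bus]  -->  context table page  -->  context_entry[devfn]
 */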
75 static inline bool root_present(struct root_entry *root)
77 return (root->val & 1);
79 static inline void set_root_present(struct root_entry *root)
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
85 root->val |= value & VTD_PAGE_MASK;
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
91 return (struct context_entry *)
92 (root_present(root)?phys_to_virt(
93 root->val & VTD_PAGE_MASK) :
100 * 1: fault processing disable
101 * 2-3: translation type
102 * 12-63: address space root
108 struct context_entry {
113 static inline bool context_present(struct context_entry *context)
115 return (context->lo & 1);
117 static inline void context_set_present(struct context_entry *context)
122 static inline void context_set_fault_enable(struct context_entry *context)
124 context->lo &= (((u64)-1) << 2) | 1;
127 #define CONTEXT_TT_MULTI_LEVEL 0
129 static inline void context_set_translation_type(struct context_entry *context,
132 context->lo &= (((u64)-1) << 4) | 3;
133 context->lo |= (value & 3) << 2;
136 static inline void context_set_address_root(struct context_entry *context,
139 context->lo |= value & VTD_PAGE_MASK;
142 static inline void context_set_address_width(struct context_entry *context,
145 context->hi |= value & 7;
148 static inline void context_set_domain_id(struct context_entry *context,
151 context->hi |= (value & ((1 << 16) - 1)) << 8;
154 static inline void context_clear_entry(struct context_entry *context)
 * 12-63: Host physical address
172 static inline void dma_clear_pte(struct dma_pte *pte)
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
179 pte->val |= DMA_PTE_READ;
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
184 pte->val |= DMA_PTE_WRITE;
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
189 pte->val = (pte->val & ~3) | (prot & 3);
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
194 return (pte->val & VTD_PAGE_MASK);
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
199 pte->val |= (addr & VTD_PAGE_MASK);
202 static inline bool dma_pte_present(struct dma_pte *pte)
204 return (pte->val & 3) != 0;
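/*
 * Illustrative sketch (not called anywhere) of how the helpers above
 * compose a leaf PTE; domain_page_mapping() below does essentially this:
 *
 *	dma_set_pte_addr(pte, phys_addr);			// bits 12-63
 *	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);	// bits 0-1
 *	// dma_pte_present(pte) is now true
 */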
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
/* domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. a kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
216 int id; /* domain id */
217 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
219 struct list_head devices; /* all devices' list */
220 struct iova_domain iovad; /* iova's that belong to this domain */
222 struct dma_pte *pgd; /* virtual address */
223 spinlock_t mapping_lock; /* page table lock */
224 int gaw; /* max guest address width */
/* adjusted guest address width; 0 means 2-level, 30-bit */
229 int flags; /* flags to find out type of domain */
231 int iommu_coherency;/* indicate coherency of iommu access */
232 int iommu_count; /* reference count of iommu */
233 spinlock_t iommu_lock; /* protect iommu set in domain */
234 u64 max_addr; /* maximum mapped address */
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239 struct list_head link; /* link to domain siblings */
240 struct list_head global; /* link to global list */
u8 bus; /* PCI bus number */
242 u8 devfn; /* PCI devfn number */
243 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244 struct dmar_domain *domain; /* pointer to domain */
247 static void flush_unmaps_timeout(unsigned long data);
249 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
254 struct iova *iova[HIGH_WATER_MARK];
255 struct dmar_domain *domain[HIGH_WATER_MARK];
258 static struct deferred_flush_tables *deferred_flush;
260 /* bitmap for indexing intel_iommus */
261 static int g_num_of_iommus;
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
267 static long list_size;
269 static void domain_remove_dev_info(struct dmar_domain *domain);
271 #ifdef CONFIG_DMAR_DEFAULT_ON
272 int dmar_disabled = 0;
274 int dmar_disabled = 1;
275 #endif /*CONFIG_DMAR_DEFAULT_ON*/
277 static int __initdata dmar_map_gfx = 1;
278 static int dmar_forcedac;
279 static int intel_iommu_strict;
281 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
282 static DEFINE_SPINLOCK(device_domain_lock);
283 static LIST_HEAD(device_domain_list);
285 static struct iommu_ops intel_iommu_ops;
287 static int __init intel_iommu_setup(char *str)
292 if (!strncmp(str, "on", 2)) {
294 printk(KERN_INFO "Intel-IOMMU: enabled\n");
295 } else if (!strncmp(str, "off", 3)) {
297 printk(KERN_INFO "Intel-IOMMU: disabled\n");
298 } else if (!strncmp(str, "igfx_off", 8)) {
301 "Intel-IOMMU: disable GFX device mapping\n");
302 } else if (!strncmp(str, "forcedac", 8)) {
304 "Intel-IOMMU: Forcing DAC for PCI devices\n");
306 } else if (!strncmp(str, "strict", 6)) {
308 "Intel-IOMMU: disable batched IOTLB flush\n");
309 intel_iommu_strict = 1;
312 str += strcspn(str, ",");
318 __setup("intel_iommu=", intel_iommu_setup);
320 static struct kmem_cache *iommu_domain_cache;
321 static struct kmem_cache *iommu_devinfo_cache;
322 static struct kmem_cache *iommu_iova_cache;
324 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
329 /* trying to avoid low memory issues */
330 flags = current->flags & PF_MEMALLOC;
331 current->flags |= PF_MEMALLOC;
332 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
333 current->flags &= (~PF_MEMALLOC | flags);
338 static inline void *alloc_pgtable_page(void)
343 /* trying to avoid low memory issues */
344 flags = current->flags & PF_MEMALLOC;
345 current->flags |= PF_MEMALLOC;
346 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
347 current->flags &= (~PF_MEMALLOC | flags);
351 static inline void free_pgtable_page(void *vaddr)
353 free_page((unsigned long)vaddr);
356 static inline void *alloc_domain_mem(void)
358 return iommu_kmem_cache_alloc(iommu_domain_cache);
361 static void free_domain_mem(void *vaddr)
363 kmem_cache_free(iommu_domain_cache, vaddr);
366 static inline void * alloc_devinfo_mem(void)
368 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
371 static inline void free_devinfo_mem(void *vaddr)
373 kmem_cache_free(iommu_devinfo_cache, vaddr);
376 struct iova *alloc_iova_mem(void)
378 return iommu_kmem_cache_alloc(iommu_iova_cache);
381 void free_iova_mem(struct iova *iova)
383 kmem_cache_free(iommu_iova_cache, iova);
387 static inline int width_to_agaw(int width);
/* calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
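/*
 * For example (illustrative numbers): with DEFAULT_DOMAIN_ADDRESS_WIDTH
 * of 48, width_to_agaw(48) == (48 - 30) / 9 == 2; if that bit is not set
 * in the iommu's SAGAW field, the loop below steps down to the next
 * smaller agaw the hardware reports as supported.
 */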
393 int iommu_calculate_agaw(struct intel_iommu *iommu)
398 sagaw = cap_sagaw(iommu->cap);
399 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
401 if (test_bit(agaw, &sagaw))
408 /* in native case, each domain is related to only one iommu */
409 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
413 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
415 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
416 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
419 return g_iommus[iommu_id];
422 /* "Coherency" capability may be different across iommus */
423 static void domain_update_iommu_coherency(struct dmar_domain *domain)
427 domain->iommu_coherency = 1;
429 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
430 for (; i < g_num_of_iommus; ) {
431 if (!ecap_coherent(g_iommus[i]->ecap)) {
432 domain->iommu_coherency = 0;
435 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
439 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
441 struct dmar_drhd_unit *drhd = NULL;
444 for_each_drhd_unit(drhd) {
448 for (i = 0; i < drhd->devices_cnt; i++)
449 if (drhd->devices[i] &&
450 drhd->devices[i]->bus->number == bus &&
451 drhd->devices[i]->devfn == devfn)
454 if (drhd->include_all)
461 static void domain_flush_cache(struct dmar_domain *domain,
462 void *addr, int size)
464 if (!domain->iommu_coherency)
465 clflush_cache_range(addr, size);
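/*
 * If the iommu cannot access page tables coherently (ecap coherent bit
 * clear, see domain_update_iommu_coherency() above), page table and
 * context entry updates must be flushed out of the CPU cache with
 * clflush before the hardware can be expected to see them; that is all
 * domain_flush_cache() does.
 */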
468 /* Gets context entry for a given bus and devfn */
469 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
472 struct root_entry *root;
473 struct context_entry *context;
474 unsigned long phy_addr;
477 spin_lock_irqsave(&iommu->lock, flags);
478 root = &iommu->root_entry[bus];
479 context = get_context_addr_from_root(root);
481 context = (struct context_entry *)alloc_pgtable_page();
483 spin_unlock_irqrestore(&iommu->lock, flags);
486 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
487 phy_addr = virt_to_phys((void *)context);
488 set_root_value(root, phy_addr);
489 set_root_present(root);
490 __iommu_flush_cache(iommu, root, sizeof(*root));
492 spin_unlock_irqrestore(&iommu->lock, flags);
493 return &context[devfn];
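/*
 * Illustrative use of the lookup above, for a hypothetical device at
 * bus 0x03, devfn 0x08 (error handling omitted):
 *
 *	struct context_entry *ce;
 *
 *	ce = device_to_context_entry(iommu, 0x03, 0x08);
 *	if (ce && !context_present(ce))
 *		;	// entry exists but no translation programmed yet
 *
 * The context table page for a bus is allocated on first use and its
 * physical address is written into the root entry.
 */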
496 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
498 struct root_entry *root;
499 struct context_entry *context;
503 spin_lock_irqsave(&iommu->lock, flags);
504 root = &iommu->root_entry[bus];
505 context = get_context_addr_from_root(root);
510 ret = context_present(&context[devfn]);
512 spin_unlock_irqrestore(&iommu->lock, flags);
516 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
518 struct root_entry *root;
519 struct context_entry *context;
522 spin_lock_irqsave(&iommu->lock, flags);
523 root = &iommu->root_entry[bus];
524 context = get_context_addr_from_root(root);
526 context_clear_entry(&context[devfn]);
527 __iommu_flush_cache(iommu, &context[devfn], \
530 spin_unlock_irqrestore(&iommu->lock, flags);
533 static void free_context_table(struct intel_iommu *iommu)
535 struct root_entry *root;
538 struct context_entry *context;
540 spin_lock_irqsave(&iommu->lock, flags);
541 if (!iommu->root_entry) {
544 for (i = 0; i < ROOT_ENTRY_NR; i++) {
545 root = &iommu->root_entry[i];
546 context = get_context_addr_from_root(root);
548 free_pgtable_page(context);
550 free_pgtable_page(iommu->root_entry);
551 iommu->root_entry = NULL;
553 spin_unlock_irqrestore(&iommu->lock, flags);
556 /* page table handling */
557 #define LEVEL_STRIDE (9)
558 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
560 static inline int agaw_to_level(int agaw)
565 static inline int agaw_to_width(int agaw)
567 return 30 + agaw * LEVEL_STRIDE;
571 static inline int width_to_agaw(int width)
573 return (width - 30) / LEVEL_STRIDE;
576 static inline unsigned int level_to_offset_bits(int level)
578 return (12 + (level - 1) * LEVEL_STRIDE);
581 static inline int address_level_offset(u64 addr, int level)
583 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
586 static inline u64 level_mask(int level)
588 return ((u64)-1 << level_to_offset_bits(level));
591 static inline u64 level_size(int level)
593 return ((u64)1 << level_to_offset_bits(level));
596 static inline u64 align_to_level(u64 addr, int level)
598 return ((addr + level_size(level) - 1) & level_mask(level));
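/*
 * Worked example of the level arithmetic above (assuming 4KB pages): a
 * domain with gaw 48 has agaw 2, i.e. a 4-level table, so a DMA address
 * is split as
 *
 *	bits 47-39  index into the level-4 table (level_to_offset_bits(4) == 39)
 *	bits 38-30  index into the level-3 table (== 30)
 *	bits 29-21  index into the level-2 table (== 21)
 *	bits 20-12  index into the level-1 table (== 12)
 *	bits 11-0   offset within the 4KB page
 *
 * address_level_offset() extracts one 9-bit (LEVEL_STRIDE) index per level.
 */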
601 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
603 int addr_width = agaw_to_width(domain->agaw);
604 struct dma_pte *parent, *pte = NULL;
605 int level = agaw_to_level(domain->agaw);
609 BUG_ON(!domain->pgd);
611 addr &= (((u64)1) << addr_width) - 1;
612 parent = domain->pgd;
614 spin_lock_irqsave(&domain->mapping_lock, flags);
618 offset = address_level_offset(addr, level);
619 pte = &parent[offset];
623 if (!dma_pte_present(pte)) {
624 tmp_page = alloc_pgtable_page();
627 spin_unlock_irqrestore(&domain->mapping_lock,
631 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
632 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
 * the higher-level tables always set r/w; the last-level page
 * table controls read/write
637 dma_set_pte_readable(pte);
638 dma_set_pte_writable(pte);
639 domain_flush_cache(domain, pte, sizeof(*pte));
641 parent = phys_to_virt(dma_pte_addr(pte));
645 spin_unlock_irqrestore(&domain->mapping_lock, flags);
649 /* return address's pte at specific level */
650 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
653 struct dma_pte *parent, *pte = NULL;
654 int total = agaw_to_level(domain->agaw);
657 parent = domain->pgd;
658 while (level <= total) {
659 offset = address_level_offset(addr, total);
660 pte = &parent[offset];
664 if (!dma_pte_present(pte))
666 parent = phys_to_virt(dma_pte_addr(pte));
672 /* clear one page's page table */
673 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
675 struct dma_pte *pte = NULL;
677 /* get last level pte */
678 pte = dma_addr_level_pte(domain, addr, 1);
682 domain_flush_cache(domain, pte, sizeof(*pte));
/* clear last-level pte; a tlb flush should follow */
687 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
689 int addr_width = agaw_to_width(domain->agaw);
691 start &= (((u64)1) << addr_width) - 1;
692 end &= (((u64)1) << addr_width) - 1;
693 /* in case it's partial page */
694 start = PAGE_ALIGN(start);
697 /* we don't need lock here, nobody else touches the iova range */
698 while (start < end) {
699 dma_pte_clear_one(domain, start);
700 start += VTD_PAGE_SIZE;
704 /* free page table pages. last level pte should already be cleared */
705 static void dma_pte_free_pagetable(struct dmar_domain *domain,
708 int addr_width = agaw_to_width(domain->agaw);
710 int total = agaw_to_level(domain->agaw);
714 start &= (((u64)1) << addr_width) - 1;
715 end &= (((u64)1) << addr_width) - 1;
717 /* we don't need lock here, nobody else touches the iova range */
719 while (level <= total) {
720 tmp = align_to_level(start, level);
721 if (tmp >= end || (tmp + level_size(level) > end))
725 pte = dma_addr_level_pte(domain, tmp, level);
728 phys_to_virt(dma_pte_addr(pte)));
730 domain_flush_cache(domain, pte, sizeof(*pte));
732 tmp += level_size(level);
737 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
738 free_pgtable_page(domain->pgd);
744 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
746 struct root_entry *root;
749 root = (struct root_entry *)alloc_pgtable_page();
753 __iommu_flush_cache(iommu, root, ROOT_SIZE);
755 spin_lock_irqsave(&iommu->lock, flags);
756 iommu->root_entry = root;
757 spin_unlock_irqrestore(&iommu->lock, flags);
762 static void iommu_set_root_entry(struct intel_iommu *iommu)
768 addr = iommu->root_entry;
770 spin_lock_irqsave(&iommu->register_lock, flag);
771 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
773 cmd = iommu->gcmd | DMA_GCMD_SRTP;
774 writel(cmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
777 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
778 readl, (sts & DMA_GSTS_RTPS), sts);
780 spin_unlock_irqrestore(&iommu->register_lock, flag);
783 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
788 if (!cap_rwbf(iommu->cap))
790 val = iommu->gcmd | DMA_GCMD_WBF;
792 spin_lock_irqsave(&iommu->register_lock, flag);
793 writel(val, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
796 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
797 readl, (!(val & DMA_GSTS_WBFS)), val);
799 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
803 static int __iommu_flush_context(struct intel_iommu *iommu,
804 u16 did, u16 source_id, u8 function_mask, u64 type,
805 int non_present_entry_flush)
 * In the non-present entry flush case, if hardware doesn't cache
 * non-present entries we do nothing; if hardware does cache non-present
 * entries, we flush entries of domain 0 (the domain id is used to cache
 * any non-present entries)
816 if (non_present_entry_flush) {
817 if (!cap_caching_mode(iommu->cap))
824 case DMA_CCMD_GLOBAL_INVL:
825 val = DMA_CCMD_GLOBAL_INVL;
827 case DMA_CCMD_DOMAIN_INVL:
828 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
830 case DMA_CCMD_DEVICE_INVL:
831 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
832 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
839 spin_lock_irqsave(&iommu->register_lock, flag);
840 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
/* Make sure hardware completes it */
843 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
844 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
846 spin_unlock_irqrestore(&iommu->register_lock, flag);
848 /* flush context entry will implicitly flush write buffer */
/* return value determines if we need a write buffer flush */
853 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
854 u64 addr, unsigned int size_order, u64 type,
855 int non_present_entry_flush)
857 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
858 u64 val = 0, val_iva = 0;
 * In the non-present entry flush case, if hardware doesn't cache
 * non-present entries we do nothing; if hardware does cache non-present
 * entries, we flush entries of domain 0 (the domain id is used to cache
 * any non-present entries)
867 if (non_present_entry_flush) {
868 if (!cap_caching_mode(iommu->cap))
875 case DMA_TLB_GLOBAL_FLUSH:
876 /* global flush doesn't need set IVA_REG */
877 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
879 case DMA_TLB_DSI_FLUSH:
880 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
882 case DMA_TLB_PSI_FLUSH:
883 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
884 /* Note: always flush non-leaf currently */
885 val_iva = size_order | addr;
890 /* Note: set drain read/write */
 * This is probably there to be extra safe; it looks like we can
 * ignore it without any impact.
896 if (cap_read_drain(iommu->cap))
897 val |= DMA_TLB_READ_DRAIN;
899 if (cap_write_drain(iommu->cap))
900 val |= DMA_TLB_WRITE_DRAIN;
902 spin_lock_irqsave(&iommu->register_lock, flag);
903 /* Note: Only uses first TLB reg currently */
905 dmar_writeq(iommu->reg + tlb_offset, val_iva);
906 dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware completes it */
909 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
910 dmar_readq, (!(val & DMA_TLB_IVT)), val);
912 spin_unlock_irqrestore(&iommu->register_lock, flag);
914 /* check IOTLB invalidation granularity */
915 if (DMA_TLB_IAIG(val) == 0)
printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
917 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
918 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
919 (unsigned long long)DMA_TLB_IIRG(type),
920 (unsigned long long)DMA_TLB_IAIG(val));
921 /* flush iotlb entry will implicitly flush write buffer */
925 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
926 u64 addr, unsigned int pages, int non_present_entry_flush)
930 BUG_ON(addr & (~VTD_PAGE_MASK));
933 /* Fallback to domain selective flush if no PSI support */
934 if (!cap_pgsel_inv(iommu->cap))
935 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
937 non_present_entry_flush);
 * PSI requires the number of pages to be 2 ^ x, and the base address
 * to be naturally aligned to that size
943 mask = ilog2(__roundup_pow_of_two(pages));
944 /* Fallback to domain selective flush if size is too big */
945 if (mask > cap_max_amask_val(iommu->cap))
946 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
947 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
949 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
951 non_present_entry_flush);
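/*
 * Example of the mask computation above: for pages == 3,
 * __roundup_pow_of_two(3) == 4 and ilog2(4) == 2, so the hardware is
 * asked to invalidate 2^2 == 4 pages starting at the suitably aligned
 * addr.
 */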
954 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
959 spin_lock_irqsave(&iommu->register_lock, flags);
960 pmen = readl(iommu->reg + DMAR_PMEN_REG);
961 pmen &= ~DMA_PMEN_EPM;
962 writel(pmen, iommu->reg + DMAR_PMEN_REG);
964 /* wait for the protected region status bit to clear */
965 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
966 readl, !(pmen & DMA_PMEN_PRS), pmen);
968 spin_unlock_irqrestore(&iommu->register_lock, flags);
971 static int iommu_enable_translation(struct intel_iommu *iommu)
976 spin_lock_irqsave(&iommu->register_lock, flags);
977 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
980 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981 readl, (sts & DMA_GSTS_TES), sts);
983 iommu->gcmd |= DMA_GCMD_TE;
984 spin_unlock_irqrestore(&iommu->register_lock, flags);
988 static int iommu_disable_translation(struct intel_iommu *iommu)
993 spin_lock_irqsave(&iommu->register_lock, flag);
994 iommu->gcmd &= ~DMA_GCMD_TE;
995 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
998 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999 readl, (!(sts & DMA_GSTS_TES)), sts);
1001 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* iommu interrupt handling. Most of it is MSI-like. */
1007 static const char *fault_reason_strings[] =
1010 "Present bit in root entry is clear",
1011 "Present bit in context entry is clear",
1012 "Invalid context entry",
1013 "Access beyond MGAW",
1014 "PTE Write access is not set",
1015 "PTE Read access is not set",
1016 "Next page table ptr is invalid",
1017 "Root table address invalid",
1018 "Context table ptr is invalid",
1019 "non-zero reserved fields in RTP",
1020 "non-zero reserved fields in CTP",
1021 "non-zero reserved fields in PTE",
1023 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1025 const char *dmar_get_fault_reason(u8 fault_reason)
1027 if (fault_reason > MAX_FAULT_REASON_IDX)
1030 return fault_reason_strings[fault_reason];
1033 void dmar_msi_unmask(unsigned int irq)
1035 struct intel_iommu *iommu = get_irq_data(irq);
1039 spin_lock_irqsave(&iommu->register_lock, flag);
1040 writel(0, iommu->reg + DMAR_FECTL_REG);
1041 /* Read a reg to force flush the post write */
1042 readl(iommu->reg + DMAR_FECTL_REG);
1043 spin_unlock_irqrestore(&iommu->register_lock, flag);
1046 void dmar_msi_mask(unsigned int irq)
1049 struct intel_iommu *iommu = get_irq_data(irq);
1052 spin_lock_irqsave(&iommu->register_lock, flag);
1053 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1054 /* Read a reg to force flush the post write */
1055 readl(iommu->reg + DMAR_FECTL_REG);
1056 spin_unlock_irqrestore(&iommu->register_lock, flag);
1059 void dmar_msi_write(int irq, struct msi_msg *msg)
1061 struct intel_iommu *iommu = get_irq_data(irq);
1064 spin_lock_irqsave(&iommu->register_lock, flag);
1065 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1066 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1067 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1068 spin_unlock_irqrestore(&iommu->register_lock, flag);
1071 void dmar_msi_read(int irq, struct msi_msg *msg)
1073 struct intel_iommu *iommu = get_irq_data(irq);
1076 spin_lock_irqsave(&iommu->register_lock, flag);
1077 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1078 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1079 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1080 spin_unlock_irqrestore(&iommu->register_lock, flag);
1083 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1084 u8 fault_reason, u16 source_id, unsigned long long addr)
1088 reason = dmar_get_fault_reason(fault_reason);
1091 "DMAR:[%s] Request device [%02x:%02x.%d] "
1092 "fault addr %llx \n"
1093 "DMAR:[fault reason %02d] %s\n",
1094 (type ? "DMA Read" : "DMA Write"),
1095 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1096 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1100 #define PRIMARY_FAULT_REG_LEN (16)
1101 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1103 struct intel_iommu *iommu = dev_id;
1104 int reg, fault_index;
1108 spin_lock_irqsave(&iommu->register_lock, flag);
1109 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1111 /* TBD: ignore advanced fault log currently */
1112 if (!(fault_status & DMA_FSTS_PPF))
1113 goto clear_overflow;
1115 fault_index = dma_fsts_fault_record_index(fault_status);
1116 reg = cap_fault_reg_offset(iommu->cap);
1124 /* highest 32 bits */
1125 data = readl(iommu->reg + reg +
1126 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1127 if (!(data & DMA_FRCD_F))
1130 fault_reason = dma_frcd_fault_reason(data);
1131 type = dma_frcd_type(data);
1133 data = readl(iommu->reg + reg +
1134 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1135 source_id = dma_frcd_source_id(data);
1137 guest_addr = dmar_readq(iommu->reg + reg +
1138 fault_index * PRIMARY_FAULT_REG_LEN);
1139 guest_addr = dma_frcd_page_addr(guest_addr);
1140 /* clear the fault */
1141 writel(DMA_FRCD_F, iommu->reg + reg +
1142 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1144 spin_unlock_irqrestore(&iommu->register_lock, flag);
1146 iommu_page_fault_do_one(iommu, type, fault_reason,
1147 source_id, guest_addr);
1150 if (fault_index > cap_num_fault_regs(iommu->cap))
1152 spin_lock_irqsave(&iommu->register_lock, flag);
1155 /* clear primary fault overflow */
1156 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1157 if (fault_status & DMA_FSTS_PFO)
1158 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1160 spin_unlock_irqrestore(&iommu->register_lock, flag);
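/*
 * Fault record layout assumed by the loop above: each primary fault
 * record is PRIMARY_FAULT_REG_LEN (16) bytes.  The low 8 bytes hold the
 * faulting page address, the dword at offset 8 holds the source id, and
 * the dword at offset 12 holds the fault reason, the request type and
 * the F bit that is written back to clear the record.
 */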
1164 int dmar_set_interrupt(struct intel_iommu *iommu)
1170 printk(KERN_ERR "IOMMU: no free vectors\n");
1174 set_irq_data(irq, iommu);
1177 ret = arch_setup_dmar_msi(irq);
1179 set_irq_data(irq, NULL);
1185 /* Force fault register is cleared */
1186 iommu_page_fault(irq, iommu);
1188 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1190 printk(KERN_ERR "IOMMU: can't request irq\n");
1194 static int iommu_init_domains(struct intel_iommu *iommu)
1196 unsigned long ndomains;
1197 unsigned long nlongs;
1199 ndomains = cap_ndoms(iommu->cap);
pr_debug("Number of Domains supported <%ld>\n", ndomains);
1201 nlongs = BITS_TO_LONGS(ndomains);
/* TBD: there might be 64K domains;
 * consider a different allocation scheme for future chips
 */
1206 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1207 if (!iommu->domain_ids) {
1208 printk(KERN_ERR "Allocating domain id array failed\n");
1211 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1213 if (!iommu->domains) {
1214 printk(KERN_ERR "Allocating domain array failed\n");
1215 kfree(iommu->domain_ids);
1219 spin_lock_init(&iommu->lock);
 * if Caching mode is set, then invalid translations are tagged
 * with domain id 0. Hence we need to pre-allocate it.
1225 if (cap_caching_mode(iommu->cap))
1226 set_bit(0, iommu->domain_ids);
1231 static void domain_exit(struct dmar_domain *domain);
1232 static void vm_domain_exit(struct dmar_domain *domain);
1234 void free_dmar_iommu(struct intel_iommu *iommu)
1236 struct dmar_domain *domain;
1238 unsigned long flags;
1240 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1241 for (; i < cap_ndoms(iommu->cap); ) {
1242 domain = iommu->domains[i];
1243 clear_bit(i, iommu->domain_ids);
1245 spin_lock_irqsave(&domain->iommu_lock, flags);
1246 if (--domain->iommu_count == 0) {
1247 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1248 vm_domain_exit(domain);
1250 domain_exit(domain);
1252 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1254 i = find_next_bit(iommu->domain_ids,
1255 cap_ndoms(iommu->cap), i+1);
1258 if (iommu->gcmd & DMA_GCMD_TE)
1259 iommu_disable_translation(iommu);
1262 set_irq_data(iommu->irq, NULL);
1263 /* This will mask the irq */
1264 free_irq(iommu->irq, iommu);
1265 destroy_irq(iommu->irq);
1268 kfree(iommu->domains);
1269 kfree(iommu->domain_ids);
1271 g_iommus[iommu->seq_id] = NULL;
1273 /* if all iommus are freed, free g_iommus */
1274 for (i = 0; i < g_num_of_iommus; i++) {
1279 if (i == g_num_of_iommus)
1282 /* free context mapping */
1283 free_context_table(iommu);
1286 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1289 unsigned long ndomains;
1290 struct dmar_domain *domain;
1291 unsigned long flags;
1293 domain = alloc_domain_mem();
1297 ndomains = cap_ndoms(iommu->cap);
1299 spin_lock_irqsave(&iommu->lock, flags);
1300 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1301 if (num >= ndomains) {
1302 spin_unlock_irqrestore(&iommu->lock, flags);
1303 free_domain_mem(domain);
1304 printk(KERN_ERR "IOMMU: no free domain ids\n");
1308 set_bit(num, iommu->domain_ids);
1310 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1311 set_bit(iommu->seq_id, &domain->iommu_bmp);
1313 iommu->domains[num] = domain;
1314 spin_unlock_irqrestore(&iommu->lock, flags);
1319 static void iommu_free_domain(struct dmar_domain *domain)
1321 unsigned long flags;
1322 struct intel_iommu *iommu;
1324 iommu = domain_get_iommu(domain);
1326 spin_lock_irqsave(&iommu->lock, flags);
1327 clear_bit(domain->id, iommu->domain_ids);
1328 spin_unlock_irqrestore(&iommu->lock, flags);
1331 static struct iova_domain reserved_iova_list;
1332 static struct lock_class_key reserved_alloc_key;
1333 static struct lock_class_key reserved_rbtree_key;
1335 static void dmar_init_reserved_ranges(void)
1337 struct pci_dev *pdev = NULL;
1342 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1344 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1345 &reserved_alloc_key);
1346 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1347 &reserved_rbtree_key);
1349 /* IOAPIC ranges shouldn't be accessed by DMA */
1350 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1351 IOVA_PFN(IOAPIC_RANGE_END));
1353 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1355 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1356 for_each_pci_dev(pdev) {
1359 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1360 r = &pdev->resource[i];
1361 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1365 size = r->end - addr;
1366 size = PAGE_ALIGN(size);
1367 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1368 IOVA_PFN(size + addr) - 1);
1370 printk(KERN_ERR "Reserve iova failed\n");
1376 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1378 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1381 static inline int guestwidth_to_adjustwidth(int gaw)
1384 int r = (gaw - 12) % 9;
1395 static int domain_init(struct dmar_domain *domain, int guest_width)
1397 struct intel_iommu *iommu;
1398 int adjust_width, agaw;
1399 unsigned long sagaw;
1401 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1402 spin_lock_init(&domain->mapping_lock);
1403 spin_lock_init(&domain->iommu_lock);
1405 domain_reserve_special_ranges(domain);
1407 /* calculate AGAW */
1408 iommu = domain_get_iommu(domain);
1409 if (guest_width > cap_mgaw(iommu->cap))
1410 guest_width = cap_mgaw(iommu->cap);
1411 domain->gaw = guest_width;
1412 adjust_width = guestwidth_to_adjustwidth(guest_width);
1413 agaw = width_to_agaw(adjust_width);
1414 sagaw = cap_sagaw(iommu->cap);
1415 if (!test_bit(agaw, &sagaw)) {
1416 /* hardware doesn't support it, choose a bigger one */
1417 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1418 agaw = find_next_bit(&sagaw, 5, agaw);
1422 domain->agaw = agaw;
1423 INIT_LIST_HEAD(&domain->devices);
1425 if (ecap_coherent(iommu->ecap))
1426 domain->iommu_coherency = 1;
1428 domain->iommu_coherency = 0;
1430 domain->iommu_count = 1;
1432 /* always allocate the top pgd */
1433 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1436 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1440 static void domain_exit(struct dmar_domain *domain)
/* Domain 0 is reserved, so don't process it */
1448 domain_remove_dev_info(domain);
1450 put_iova_domain(&domain->iovad);
1451 end = DOMAIN_MAX_ADDR(domain->gaw);
1452 end = end & (~PAGE_MASK);
1455 dma_pte_clear_range(domain, 0, end);
1457 /* free page tables */
1458 dma_pte_free_pagetable(domain, 0, end);
1460 iommu_free_domain(domain);
1461 free_domain_mem(domain);
1464 static int domain_context_mapping_one(struct dmar_domain *domain,
1467 struct context_entry *context;
1468 unsigned long flags;
1469 struct intel_iommu *iommu;
1470 struct dma_pte *pgd;
1472 unsigned long ndomains;
1476 pr_debug("Set context mapping for %02x:%02x.%d\n",
1477 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1478 BUG_ON(!domain->pgd);
1480 iommu = device_to_iommu(bus, devfn);
1484 context = device_to_context_entry(iommu, bus, devfn);
1487 spin_lock_irqsave(&iommu->lock, flags);
1488 if (context_present(context)) {
1489 spin_unlock_irqrestore(&iommu->lock, flags);
1496 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1499 /* find an available domain id for this device in iommu */
1500 ndomains = cap_ndoms(iommu->cap);
1501 num = find_first_bit(iommu->domain_ids, ndomains);
1502 for (; num < ndomains; ) {
1503 if (iommu->domains[num] == domain) {
1508 num = find_next_bit(iommu->domain_ids,
1509 cap_ndoms(iommu->cap), num+1);
1513 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1514 if (num >= ndomains) {
1515 spin_unlock_irqrestore(&iommu->lock, flags);
1516 printk(KERN_ERR "IOMMU: no free domain ids\n");
1520 set_bit(num, iommu->domain_ids);
1521 iommu->domains[num] = domain;
/* Skip top levels of page tables for
 * iommus which have less agaw than the default.
 */
1528 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1529 pgd = phys_to_virt(dma_pte_addr(pgd));
1530 if (!dma_pte_present(pgd)) {
1531 spin_unlock_irqrestore(&iommu->lock, flags);
1537 context_set_domain_id(context, id);
1538 context_set_address_width(context, iommu->agaw);
1539 context_set_address_root(context, virt_to_phys(pgd));
1540 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1541 context_set_fault_enable(context);
1542 context_set_present(context);
1543 domain_flush_cache(domain, context, sizeof(*context));
1545 /* it's a non-present to present mapping */
1546 if (iommu->flush.flush_context(iommu, domain->id,
1547 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1548 DMA_CCMD_DEVICE_INVL, 1))
1549 iommu_flush_write_buffer(iommu);
1551 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1553 spin_unlock_irqrestore(&iommu->lock, flags);
1555 spin_lock_irqsave(&domain->iommu_lock, flags);
1556 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557 domain->iommu_count++;
1558 domain_update_iommu_coherency(domain);
1560 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1565 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1568 struct pci_dev *tmp, *parent;
1570 ret = domain_context_mapping_one(domain, pdev->bus->number,
1575 /* dependent device mapping */
1576 tmp = pci_find_upstream_pcie_bridge(pdev);
1579 /* Secondary interface's bus number and devfn 0 */
1580 parent = pdev->bus->self;
1581 while (parent != tmp) {
1582 ret = domain_context_mapping_one(domain, parent->bus->number,
1586 parent = parent->bus->self;
1588 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1589 return domain_context_mapping_one(domain,
1590 tmp->subordinate->number, 0);
1591 else /* this is a legacy PCI bridge */
1592 return domain_context_mapping_one(domain,
1593 tmp->bus->number, tmp->devfn);
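/*
 * Summary of the walk above: the device itself gets a context entry,
 * then every bridge between it and the upstream PCIe-to-PCI bridge, and
 * finally the bridge itself (secondary bus/devfn 0 for a PCIe-to-PCI
 * bridge, its own bus/devfn for a legacy PCI bridge), presumably because
 * requests forwarded by such bridges can show up with the bridge's
 * requester id rather than the device's.
 */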
1596 static int domain_context_mapped(struct pci_dev *pdev)
1599 struct pci_dev *tmp, *parent;
1600 struct intel_iommu *iommu;
1602 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1606 ret = device_context_mapped(iommu,
1607 pdev->bus->number, pdev->devfn);
1610 /* dependent device mapping */
1611 tmp = pci_find_upstream_pcie_bridge(pdev);
1614 /* Secondary interface's bus number and devfn 0 */
1615 parent = pdev->bus->self;
1616 while (parent != tmp) {
1617 ret = device_context_mapped(iommu, parent->bus->number,
1621 parent = parent->bus->self;
1624 return device_context_mapped(iommu,
1625 tmp->subordinate->number, 0);
1627 return device_context_mapped(iommu,
1628 tmp->bus->number, tmp->devfn);
1632 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1633 u64 hpa, size_t size, int prot)
1635 u64 start_pfn, end_pfn;
1636 struct dma_pte *pte;
1638 int addr_width = agaw_to_width(domain->agaw);
1640 hpa &= (((u64)1) << addr_width) - 1;
1642 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1645 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1646 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1648 while (start_pfn < end_pfn) {
1649 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1652 /* We don't need lock here, nobody else
1653 * touches the iova range
1655 BUG_ON(dma_pte_addr(pte));
1656 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1657 dma_set_pte_prot(pte, prot);
1658 domain_flush_cache(domain, pte, sizeof(*pte));
1665 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1670 clear_context_table(iommu, bus, devfn);
1671 iommu->flush.flush_context(iommu, 0, 0, 0,
1672 DMA_CCMD_GLOBAL_INVL, 0);
1673 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1674 DMA_TLB_GLOBAL_FLUSH, 0);
1677 static void domain_remove_dev_info(struct dmar_domain *domain)
1679 struct device_domain_info *info;
1680 unsigned long flags;
1681 struct intel_iommu *iommu;
1683 spin_lock_irqsave(&device_domain_lock, flags);
1684 while (!list_empty(&domain->devices)) {
1685 info = list_entry(domain->devices.next,
1686 struct device_domain_info, link);
1687 list_del(&info->link);
1688 list_del(&info->global);
1690 info->dev->dev.archdata.iommu = NULL;
1691 spin_unlock_irqrestore(&device_domain_lock, flags);
1693 iommu = device_to_iommu(info->bus, info->devfn);
1694 iommu_detach_dev(iommu, info->bus, info->devfn);
1695 free_devinfo_mem(info);
1697 spin_lock_irqsave(&device_domain_lock, flags);
1699 spin_unlock_irqrestore(&device_domain_lock, flags);
 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1706 static struct dmar_domain *
1707 find_domain(struct pci_dev *pdev)
1709 struct device_domain_info *info;
1711 /* No lock here, assumes no domain exit in normal case */
1712 info = pdev->dev.archdata.iommu;
1714 return info->domain;
1718 /* domain is initialized */
1719 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1721 struct dmar_domain *domain, *found = NULL;
1722 struct intel_iommu *iommu;
1723 struct dmar_drhd_unit *drhd;
1724 struct device_domain_info *info, *tmp;
1725 struct pci_dev *dev_tmp;
1726 unsigned long flags;
1727 int bus = 0, devfn = 0;
1729 domain = find_domain(pdev);
1733 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1735 if (dev_tmp->is_pcie) {
1736 bus = dev_tmp->subordinate->number;
1739 bus = dev_tmp->bus->number;
1740 devfn = dev_tmp->devfn;
1742 spin_lock_irqsave(&device_domain_lock, flags);
1743 list_for_each_entry(info, &device_domain_list, global) {
1744 if (info->bus == bus && info->devfn == devfn) {
1745 found = info->domain;
1749 spin_unlock_irqrestore(&device_domain_lock, flags);
/* pcie-pci bridge already has a domain, use it */
1757 /* Allocate new domain for the device */
1758 drhd = dmar_find_matched_drhd_unit(pdev);
1760 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1764 iommu = drhd->iommu;
1766 domain = iommu_alloc_domain(iommu);
1770 if (domain_init(domain, gaw)) {
1771 domain_exit(domain);
1775 /* register pcie-to-pci device */
1777 info = alloc_devinfo_mem();
1779 domain_exit(domain);
1783 info->devfn = devfn;
1785 info->domain = domain;
1786 /* This domain is shared by devices under p2p bridge */
1787 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
/* pcie-to-pci bridge already has a domain, use it */
1791 spin_lock_irqsave(&device_domain_lock, flags);
1792 list_for_each_entry(tmp, &device_domain_list, global) {
1793 if (tmp->bus == bus && tmp->devfn == devfn) {
1794 found = tmp->domain;
1799 free_devinfo_mem(info);
1800 domain_exit(domain);
1803 list_add(&info->link, &domain->devices);
1804 list_add(&info->global, &device_domain_list);
1806 spin_unlock_irqrestore(&device_domain_lock, flags);
1810 info = alloc_devinfo_mem();
1813 info->bus = pdev->bus->number;
1814 info->devfn = pdev->devfn;
1816 info->domain = domain;
1817 spin_lock_irqsave(&device_domain_lock, flags);
/* somebody else set the domain up first */
1819 found = find_domain(pdev);
1820 if (found != NULL) {
1821 spin_unlock_irqrestore(&device_domain_lock, flags);
1822 if (found != domain) {
1823 domain_exit(domain);
1826 free_devinfo_mem(info);
1829 list_add(&info->link, &domain->devices);
1830 list_add(&info->global, &device_domain_list);
1831 pdev->dev.archdata.iommu = info;
1832 spin_unlock_irqrestore(&device_domain_lock, flags);
1835 /* recheck it here, maybe others set it */
1836 return find_domain(pdev);
1839 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1840 unsigned long long start,
1841 unsigned long long end)
1843 struct dmar_domain *domain;
1845 unsigned long long base;
1849 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1850 pci_name(pdev), start, end);
1851 /* page table init */
1852 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1856 /* The address might not be aligned */
1857 base = start & PAGE_MASK;
1859 size = PAGE_ALIGN(size);
1860 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1861 IOVA_PFN(base + size) - 1)) {
1862 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1867 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1868 size, base, pci_name(pdev));
 * the RMRR range might overlap with a physical memory range;
 * clear any existing mappings first
1873 dma_pte_clear_range(domain, base, base + size);
1875 ret = domain_page_mapping(domain, base, base, size,
1876 DMA_PTE_READ|DMA_PTE_WRITE);
1880 /* context entry init */
1881 ret = domain_context_mapping(domain, pdev);
1885 domain_exit(domain);
1890 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1891 struct pci_dev *pdev)
1893 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1895 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1896 rmrr->end_address + 1);
1899 #ifdef CONFIG_DMAR_GFX_WA
1900 struct iommu_prepare_data {
1901 struct pci_dev *pdev;
1905 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1906 unsigned long end_pfn, void *datax)
1908 struct iommu_prepare_data *data;
1910 data = (struct iommu_prepare_data *)datax;
1912 data->ret = iommu_prepare_identity_map(data->pdev,
1913 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1918 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1921 struct iommu_prepare_data data;
1926 for_each_online_node(nid) {
1927 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1934 static void __init iommu_prepare_gfx_mapping(void)
1936 struct pci_dev *pdev = NULL;
1939 for_each_pci_dev(pdev) {
1940 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1941 !IS_GFX_DEVICE(pdev))
1943 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1945 ret = iommu_prepare_with_active_regions(pdev);
1947 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1950 #else /* !CONFIG_DMAR_GFX_WA */
1951 static inline void iommu_prepare_gfx_mapping(void)
1957 #ifdef CONFIG_DMAR_FLOPPY_WA
1958 static inline void iommu_prepare_isa(void)
1960 struct pci_dev *pdev;
1963 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1967 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1968 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
"floppy might not work\n");
1976 static inline void iommu_prepare_isa(void)
#endif /* !CONFIG_DMAR_FLOPPY_WA */
1982 static int __init init_dmars(void)
1984 struct dmar_drhd_unit *drhd;
1985 struct dmar_rmrr_unit *rmrr;
1986 struct pci_dev *pdev;
1987 struct intel_iommu *iommu;
1988 int i, ret, unit = 0;
1993 * initialize and program root entry to not present
1996 for_each_drhd_unit(drhd) {
 * lock not needed as this is only incremented in the single
 * threaded kernel __init code path; all other accesses are read only
2005 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2008 printk(KERN_ERR "Allocating global iommu array failed\n");
2013 deferred_flush = kzalloc(g_num_of_iommus *
2014 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2015 if (!deferred_flush) {
2021 for_each_drhd_unit(drhd) {
2025 iommu = drhd->iommu;
2026 g_iommus[iommu->seq_id] = iommu;
2028 ret = iommu_init_domains(iommu);
 * we could share the same root & context tables
 * among all IOMMUs. Need to split it later.
2037 ret = iommu_alloc_root_entry(iommu);
2039 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2044 for_each_drhd_unit(drhd) {
2048 iommu = drhd->iommu;
2049 if (dmar_enable_qi(iommu)) {
2051 * Queued Invalidate not enabled, use Register Based
2054 iommu->flush.flush_context = __iommu_flush_context;
2055 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2056 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2058 (unsigned long long)drhd->reg_base_addr);
2060 iommu->flush.flush_context = qi_flush_context;
2061 iommu->flush.flush_iotlb = qi_flush_iotlb;
2062 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2064 (unsigned long long)drhd->reg_base_addr);
2070 * for each dev attached to rmrr
2072 * locate drhd for dev, alloc domain for dev
2073 * allocate free domain
2074 * allocate page table entries for rmrr
2075 * if context not allocated for bus
2076 * allocate and init context
2077 * set present in root table for this bus
2078 * init context with domain, translation etc
2082 for_each_rmrr_units(rmrr) {
2083 for (i = 0; i < rmrr->devices_cnt; i++) {
2084 pdev = rmrr->devices[i];
/* some BIOSes list nonexistent devices in the DMAR table */
2088 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2091 "IOMMU: mapping reserved region failed\n");
2095 iommu_prepare_gfx_mapping();
2097 iommu_prepare_isa();
2102 * global invalidate context cache
2103 * global invalidate iotlb
2104 * enable translation
2106 for_each_drhd_unit(drhd) {
2109 iommu = drhd->iommu;
sprintf(iommu->name, "dmar%d", unit++);
2112 iommu_flush_write_buffer(iommu);
2114 ret = dmar_set_interrupt(iommu);
2118 iommu_set_root_entry(iommu);
2120 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2122 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2124 iommu_disable_protect_mem_regions(iommu);
2126 ret = iommu_enable_translation(iommu);
2133 for_each_drhd_unit(drhd) {
2136 iommu = drhd->iommu;
2143 static inline u64 aligned_size(u64 host_addr, size_t size)
2146 addr = (host_addr & (~PAGE_MASK)) + size;
2147 return PAGE_ALIGN(addr);
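/*
 * Example of aligned_size() above (assuming 4KB pages): host_addr ==
 * 0x12345678 and size == 0x100 gives 0x678 + 0x100 == 0x778, which
 * PAGE_ALIGN rounds up to 0x1000, i.e. one full page is mapped even
 * though only 256 bytes were requested.
 */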
2151 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2155 /* Make sure it's in range */
2156 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2157 if (!size || (IOVA_START_ADDR + size > end))
2160 piova = alloc_iova(&domain->iovad,
2161 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2165 static struct iova *
2166 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2167 size_t size, u64 dma_mask)
2169 struct pci_dev *pdev = to_pci_dev(dev);
2170 struct iova *iova = NULL;
2172 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2173 iova = iommu_alloc_iova(domain, size, dma_mask);
2176 * First try to allocate an io virtual address in
2177 * DMA_32BIT_MASK and if that fails then try allocating
2180 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2182 iova = iommu_alloc_iova(domain, size, dma_mask);
2186 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2193 static struct dmar_domain *
2194 get_valid_domain_for_dev(struct pci_dev *pdev)
2196 struct dmar_domain *domain;
2199 domain = get_domain_for_dev(pdev,
2200 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2203 "Allocating domain for %s failed", pci_name(pdev));
2207 /* make sure context mapping is ok */
2208 if (unlikely(!domain_context_mapped(pdev))) {
2209 ret = domain_context_mapping(domain, pdev);
2212 "Domain context map for %s failed",
2221 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2222 size_t size, int dir, u64 dma_mask)
2224 struct pci_dev *pdev = to_pci_dev(hwdev);
2225 struct dmar_domain *domain;
2226 phys_addr_t start_paddr;
2230 struct intel_iommu *iommu;
2232 BUG_ON(dir == DMA_NONE);
2233 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2236 domain = get_valid_domain_for_dev(pdev);
2240 iommu = domain_get_iommu(domain);
2241 size = aligned_size((u64)paddr, size);
2243 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2247 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2250 * Check if DMAR supports zero-length reads on write only
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2254 !cap_zlr(iommu->cap))
2255 prot |= DMA_PTE_READ;
2256 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2257 prot |= DMA_PTE_WRITE;
 * paddr ~ paddr + size might cover a partial page, so we should map
 * the whole page. Note: if two parts of one page are mapped separately,
 * we might have two guest_addr mappings to the same host paddr, but this
 * is not a big problem
2264 ret = domain_page_mapping(domain, start_paddr,
2265 ((u64)paddr) & PAGE_MASK, size, prot);
2269 /* it's a non-present to present mapping */
2270 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2271 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2273 iommu_flush_write_buffer(iommu);
2275 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2279 __free_iova(&domain->iovad, iova);
2280 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2281 pci_name(pdev), size, (unsigned long long)paddr, dir);
2285 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2286 size_t size, int dir)
2288 return __intel_map_single(hwdev, paddr, size, dir,
2289 to_pci_dev(hwdev)->dma_mask);
2292 static void flush_unmaps(void)
2298 /* just flush them all */
2299 for (i = 0; i < g_num_of_iommus; i++) {
2300 struct intel_iommu *iommu = g_iommus[i];
2304 if (deferred_flush[i].next) {
2305 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2306 DMA_TLB_GLOBAL_FLUSH, 0);
2307 for (j = 0; j < deferred_flush[i].next; j++) {
2308 __free_iova(&deferred_flush[i].domain[j]->iovad,
2309 deferred_flush[i].iova[j]);
2311 deferred_flush[i].next = 0;
2318 static void flush_unmaps_timeout(unsigned long data)
2320 unsigned long flags;
2322 spin_lock_irqsave(&async_umap_flush_lock, flags);
2324 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2327 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2329 unsigned long flags;
2331 struct intel_iommu *iommu;
2333 spin_lock_irqsave(&async_umap_flush_lock, flags);
2334 if (list_size == HIGH_WATER_MARK)
2337 iommu = domain_get_iommu(dom);
2338 iommu_id = iommu->seq_id;
2340 next = deferred_flush[iommu_id].next;
2341 deferred_flush[iommu_id].domain[next] = dom;
2342 deferred_flush[iommu_id].iova[next] = iova;
2343 deferred_flush[iommu_id].next++;
2346 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2350 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
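/*
 * Deferred unmap batching, as wired up above: add_unmap() parks the iova
 * in the per-iommu deferred_flush table and arms unmap_timer for ~10ms;
 * flush_unmaps() (run from the timer, or once HIGH_WATER_MARK entries
 * pile up) then issues one global IOTLB flush per iommu and frees the
 * queued iovas in a batch, rather than paying for a flush on every
 * intel_unmap_single() call.  Booting with intel_iommu=strict disables
 * the batching and flushes synchronously instead.
 */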
2353 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2356 struct pci_dev *pdev = to_pci_dev(dev);
2357 struct dmar_domain *domain;
2358 unsigned long start_addr;
2360 struct intel_iommu *iommu;
2362 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2364 domain = find_domain(pdev);
2367 iommu = domain_get_iommu(domain);
2369 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2373 start_addr = iova->pfn_lo << PAGE_SHIFT;
2374 size = aligned_size((u64)dev_addr, size);
2376 pr_debug("Device %s unmapping: %lx@%llx\n",
2377 pci_name(pdev), size, (unsigned long long)start_addr);
2379 /* clear the whole page */
2380 dma_pte_clear_range(domain, start_addr, start_addr + size);
2381 /* free page tables */
2382 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2383 if (intel_iommu_strict) {
2384 if (iommu_flush_iotlb_psi(iommu,
2385 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2386 iommu_flush_write_buffer(iommu);
2388 __free_iova(&domain->iovad, iova);
2390 add_unmap(domain, iova);
2392 * queue up the release of the unmap to save the 1/6th of the
2393 * cpu used up by the iotlb flush operation...
2398 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2399 dma_addr_t *dma_handle, gfp_t flags)
2404 size = PAGE_ALIGN(size);
2405 order = get_order(size);
2406 flags &= ~(GFP_DMA | GFP_DMA32);
2408 vaddr = (void *)__get_free_pages(flags, order);
2411 memset(vaddr, 0, size);
2413 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2415 hwdev->coherent_dma_mask);
2418 free_pages((unsigned long)vaddr, order);
2422 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2423 dma_addr_t dma_handle)
2427 size = PAGE_ALIGN(size);
2428 order = get_order(size);
2430 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2431 free_pages((unsigned long)vaddr, order);
2434 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2436 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2437 int nelems, int dir)
2440 struct pci_dev *pdev = to_pci_dev(hwdev);
2441 struct dmar_domain *domain;
2442 unsigned long start_addr;
2446 struct scatterlist *sg;
2447 struct intel_iommu *iommu;
2449 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2452 domain = find_domain(pdev);
2455 iommu = domain_get_iommu(domain);
2457 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2460 for_each_sg(sglist, sg, nelems, i) {
2461 addr = SG_ENT_VIRT_ADDRESS(sg);
2462 size += aligned_size((u64)addr, sg->length);
2465 start_addr = iova->pfn_lo << PAGE_SHIFT;
2467 /* clear the whole page */
2468 dma_pte_clear_range(domain, start_addr, start_addr + size);
2469 /* free page tables */
2470 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2472 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2473 size >> VTD_PAGE_SHIFT, 0))
2474 iommu_flush_write_buffer(iommu);
2477 __free_iova(&domain->iovad, iova);
2480 static int intel_nontranslate_map_sg(struct device *hddev,
2481 struct scatterlist *sglist, int nelems, int dir)
2484 struct scatterlist *sg;
2486 for_each_sg(sglist, sg, nelems, i) {
2487 BUG_ON(!sg_page(sg));
2488 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2489 sg->dma_length = sg->length;
2494 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2499 struct pci_dev *pdev = to_pci_dev(hwdev);
2500 struct dmar_domain *domain;
2504 struct iova *iova = NULL;
2506 struct scatterlist *sg;
2507 unsigned long start_addr;
2508 struct intel_iommu *iommu;
2510 BUG_ON(dir == DMA_NONE);
2511 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2512 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2514 domain = get_valid_domain_for_dev(pdev);
2518 iommu = domain_get_iommu(domain);
2520 for_each_sg(sglist, sg, nelems, i) {
2521 addr = SG_ENT_VIRT_ADDRESS(sg);
2522 addr = (void *)virt_to_phys(addr);
2523 size += aligned_size((u64)addr, sg->length);
2526 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2528 sglist->dma_length = 0;
2533 * Check if DMAR supports zero-length reads on write only
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2537 !cap_zlr(iommu->cap))
2538 prot |= DMA_PTE_READ;
2539 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2540 prot |= DMA_PTE_WRITE;
2542 start_addr = iova->pfn_lo << PAGE_SHIFT;
2544 for_each_sg(sglist, sg, nelems, i) {
2545 addr = SG_ENT_VIRT_ADDRESS(sg);
2546 addr = (void *)virt_to_phys(addr);
2547 size = aligned_size((u64)addr, sg->length);
2548 ret = domain_page_mapping(domain, start_addr + offset,
2549 ((u64)addr) & PAGE_MASK,
2552 /* clear the page */
2553 dma_pte_clear_range(domain, start_addr,
2554 start_addr + offset);
2555 /* free page tables */
2556 dma_pte_free_pagetable(domain, start_addr,
2557 start_addr + offset);
2559 __free_iova(&domain->iovad, iova);
2562 sg->dma_address = start_addr + offset +
2563 ((u64)addr & (~PAGE_MASK));
2564 sg->dma_length = sg->length;
2568 /* it's a non-present to present mapping */
2569 if (iommu_flush_iotlb_psi(iommu, domain->id,
2570 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2571 iommu_flush_write_buffer(iommu);
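/*
 * Illustrative sketch, not part of this driver: a scatter-gather mapping as a
 * driver would issue it.  dma_map_sg()/dma_unmap_sg() reach intel_map_sg()/
 * intel_unmap_sg() above; after mapping, only the first 'mapped' entries may
 * be handed to the device.
 */
static inline int example_map_sg(struct device *dev, struct scatterlist *sgl,
				 int nents)
{
	int mapped;

	mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
	if (!mapped)
		return -ENOMEM;

	/*
	 * A real driver would now walk the first 'mapped' entries with
	 * for_each_sg() and program sg_dma_address(sg)/sg_dma_len(sg)
	 * into the device before starting the transfer.
	 */

	dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
	return 0;
}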
2575 static struct dma_mapping_ops intel_dma_ops = {
2576 .alloc_coherent = intel_alloc_coherent,
2577 .free_coherent = intel_free_coherent,
2578 .map_single = intel_map_single,
2579 .unmap_single = intel_unmap_single,
2580 .map_sg = intel_map_sg,
2581 .unmap_sg = intel_unmap_sg,
2582 };
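/*
 * Simplified sketch of the dispatch path (the real glue lives in the x86
 * <asm/dma-mapping.h> helpers, not in this file): once intel_iommu_init()
 * points dma_ops at intel_dma_ops, each generic DMA API call is forwarded to
 * one of the callbacks above, roughly:
 *
 *	static inline dma_addr_t dma_map_single(struct device *dev, void *ptr,
 *						size_t size, int dir)
 *	{
 *		return dma_ops->map_single(dev, virt_to_phys(ptr), size, dir);
 *	}
 */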
2584 static inline int iommu_domain_cache_init(void)
2585 {
2586 int ret = 0;
2588 iommu_domain_cache = kmem_cache_create("iommu_domain",
2589 sizeof(struct dmar_domain),
2590 0,
2591 SLAB_HWCACHE_ALIGN,
2593 NULL);
2594 if (!iommu_domain_cache) {
2595 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2596 ret = -ENOMEM;
2597 }
2599 return ret;
2600 }
2602 static inline int iommu_devinfo_cache_init(void)
2603 {
2604 int ret = 0;
2606 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2607 sizeof(struct device_domain_info),
2608 0,
2609 SLAB_HWCACHE_ALIGN,
2610 NULL);
2611 if (!iommu_devinfo_cache) {
2612 printk(KERN_ERR "Couldn't create devinfo cache\n");
2613 ret = -ENOMEM;
2614 }
2616 return ret;
2617 }
2619 static inline int iommu_iova_cache_init(void)
2620 {
2621 int ret = 0;
2623 iommu_iova_cache = kmem_cache_create("iommu_iova",
2624 sizeof(struct iova),
2625 0,
2626 SLAB_HWCACHE_ALIGN,
2627 NULL);
2628 if (!iommu_iova_cache) {
2629 printk(KERN_ERR "Couldn't create iova cache\n");
2630 ret = -ENOMEM;
2631 }
2633 return ret;
2634 }
2636 static int __init iommu_init_mempool(void)
2637 {
2638 int ret;
2639 ret = iommu_iova_cache_init();
2640 if (ret)
2641 return ret;
2643 ret = iommu_domain_cache_init();
2644 if (ret)
2645 goto domain_error;
2647 ret = iommu_devinfo_cache_init();
2648 if (!ret)
2649 return ret;
2651 kmem_cache_destroy(iommu_domain_cache);
2652 domain_error:
2653 kmem_cache_destroy(iommu_iova_cache);
2655 return -ENOMEM;
2656 }
2658 static void __init iommu_exit_mempool(void)
2659 {
2660 kmem_cache_destroy(iommu_devinfo_cache);
2661 kmem_cache_destroy(iommu_domain_cache);
2662 kmem_cache_destroy(iommu_iova_cache);
2664 }
2666 static void __init init_no_remapping_devices(void)
2667 {
2668 struct dmar_drhd_unit *drhd;
2670 for_each_drhd_unit(drhd) {
2671 if (!drhd->include_all) {
2672 int i;
2673 for (i = 0; i < drhd->devices_cnt; i++)
2674 if (drhd->devices[i] != NULL)
2675 break;
2676 /* ignore DMAR unit if no pci devices exist */
2677 if (i == drhd->devices_cnt)
2678 drhd->ignored = 1;
2679 }
2680 }
2682 if (dmar_map_gfx)
2683 return;
2685 for_each_drhd_unit(drhd) {
2686 int i;
2687 if (drhd->ignored || drhd->include_all)
2688 continue;
2690 for (i = 0; i < drhd->devices_cnt; i++)
2691 if (drhd->devices[i] &&
2692 !IS_GFX_DEVICE(drhd->devices[i]))
2693 break;
2695 if (i < drhd->devices_cnt)
2696 continue;
2698 /* bypass IOMMU if it is just for gfx devices */
2699 drhd->ignored = 1;
2700 for (i = 0; i < drhd->devices_cnt; i++) {
2701 if (!drhd->devices[i])
2702 continue;
2703 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2704 }
2705 }
2706 }
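/*
 * Note on the graphics bypass above: dmar_map_gfx is cleared by the
 * "intel_iommu=igfx_off" boot option (parsed by intel_iommu_setup() earlier in
 * this file), so a DMAR unit that covers only graphics devices is marked
 * ignored and its devices take the DUMMY_DEVICE_DOMAIN_INFO identity path.
 */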
2708 int __init intel_iommu_init(void)
2709 {
2710 int ret = 0;
2712 if (dmar_table_init())
2713 return -ENODEV;
2715 if (dmar_dev_scope_init())
2716 return -ENODEV;
2718 /*
2719 * Check the need for DMA-remapping initialization now.
2720 * Above initialization will also be used by Interrupt-remapping.
2721 */
2722 if (no_iommu || swiotlb || dmar_disabled)
2723 return -ENODEV;
2725 iommu_init_mempool();
2726 dmar_init_reserved_ranges();
2728 init_no_remapping_devices();
2730 ret = init_dmars();
2731 if (ret) {
2732 printk(KERN_ERR "IOMMU: dmar init failed\n");
2733 put_iova_domain(&reserved_iova_list);
2734 iommu_exit_mempool();
2735 return ret;
2736 }
2737 printk(KERN_INFO
2738 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2740 init_timer(&unmap_timer);
2741 force_iommu = 1;
2742 dma_ops = &intel_dma_ops;
2744 register_iommu(&intel_iommu_ops);
2746 return 0;
2747 }
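/*
 * Context, simplified sketch (not code from this file): intel_iommu_init() is
 * reached from the x86 PCI/DMA initialization path, and register_iommu()
 * publishes intel_iommu_ops to the generic IOMMU layer (drivers/base/iommu.c),
 * which forwards API calls to these callbacks, roughly:
 *
 *	struct iommu_domain *iommu_domain_alloc(void)
 *	{
 *		struct iommu_domain *domain;
 *
 *		domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 *		if (domain && iommu_ops->domain_init(domain)) {
 *			kfree(domain);
 *			return NULL;
 *		}
 *		return domain;
 *	}
 */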
2749 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2750 struct pci_dev *pdev)
2751 {
2752 struct device_domain_info *info;
2753 unsigned long flags;
2755 info = alloc_devinfo_mem();
2756 if (!info)
2757 return -ENOMEM;
2759 info->bus = pdev->bus->number;
2760 info->devfn = pdev->devfn;
2761 info->dev = pdev;
2762 info->domain = domain;
2764 spin_lock_irqsave(&device_domain_lock, flags);
2765 list_add(&info->link, &domain->devices);
2766 list_add(&info->global, &device_domain_list);
2767 pdev->dev.archdata.iommu = info;
2768 spin_unlock_irqrestore(&device_domain_lock, flags);
2770 return 0;
2771 }
2773 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2774 struct pci_dev *pdev)
2775 {
2776 struct device_domain_info *info;
2777 struct intel_iommu *iommu;
2778 unsigned long flags;
2779 int found = 0;
2780 struct list_head *entry, *tmp;
2782 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2783 if (!iommu)
2784 return;
2786 spin_lock_irqsave(&device_domain_lock, flags);
2787 list_for_each_safe(entry, tmp, &domain->devices) {
2788 info = list_entry(entry, struct device_domain_info, link);
2789 if (info->bus == pdev->bus->number &&
2790 info->devfn == pdev->devfn) {
2791 list_del(&info->link);
2792 list_del(&info->global);
2793 if (info->dev)
2794 info->dev->dev.archdata.iommu = NULL;
2795 spin_unlock_irqrestore(&device_domain_lock, flags);
2797 iommu_detach_dev(iommu, info->bus, info->devfn);
2798 free_devinfo_mem(info);
2800 spin_lock_irqsave(&device_domain_lock, flags);
2802 if (found)
2803 break;
2804 else
2805 continue;
2806 }
2808 /* if there is no other devices under the same iommu
2809 * owned by this domain, clear this iommu in iommu_bmp
2810 * update iommu count and coherency
2811 */
2812 if (device_to_iommu(info->bus, info->devfn) == iommu)
2813 found = 1;
2814 }
2816 if (found == 0) {
2817 unsigned long tmp_flags;
2818 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2819 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2820 domain->iommu_count--;
2821 domain_update_iommu_coherency(domain);
2822 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2823 }
2825 spin_unlock_irqrestore(&device_domain_lock, flags);
2826 }
2828 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2829 {
2830 struct device_domain_info *info;
2831 struct intel_iommu *iommu;
2832 unsigned long flags1, flags2;
2834 spin_lock_irqsave(&device_domain_lock, flags1);
2835 while (!list_empty(&domain->devices)) {
2836 info = list_entry(domain->devices.next,
2837 struct device_domain_info, link);
2838 list_del(&info->link);
2839 list_del(&info->global);
2840 if (info->dev)
2841 info->dev->dev.archdata.iommu = NULL;
2843 spin_unlock_irqrestore(&device_domain_lock, flags1);
2845 iommu = device_to_iommu(info->bus, info->devfn);
2846 iommu_detach_dev(iommu, info->bus, info->devfn);
2848 /* clear this iommu in iommu_bmp, update iommu count
2849 * and coherency
2850 */
2851 spin_lock_irqsave(&domain->iommu_lock, flags2);
2852 if (test_and_clear_bit(iommu->seq_id,
2853 &domain->iommu_bmp)) {
2854 domain->iommu_count--;
2855 domain_update_iommu_coherency(domain);
2856 }
2857 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2859 free_devinfo_mem(info);
2860 spin_lock_irqsave(&device_domain_lock, flags1);
2861 }
2862 spin_unlock_irqrestore(&device_domain_lock, flags1);
2863 }
2865 /* domain id for virtual machine, it won't be set in context */
2866 static unsigned long vm_domid;
2868 static int vm_domain_min_agaw(struct dmar_domain *domain)
2869 {
2870 int i;
2871 int min_agaw = domain->agaw;
2873 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2874 for (; i < g_num_of_iommus; ) {
2875 if (min_agaw > g_iommus[i]->agaw)
2876 min_agaw = g_iommus[i]->agaw;
2878 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2879 }
2881 return min_agaw;
2882 }
2884 static struct dmar_domain *iommu_alloc_vm_domain(void)
2885 {
2886 struct dmar_domain *domain;
2888 domain = alloc_domain_mem();
2889 if (!domain)
2890 return NULL;
2892 domain->id = vm_domid++;
2893 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2894 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2896 return domain;
2897 }
2899 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2900 {
2901 int adjust_width;
2903 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2904 spin_lock_init(&domain->mapping_lock);
2905 spin_lock_init(&domain->iommu_lock);
2907 domain_reserve_special_ranges(domain);
2909 /* calculate AGAW */
2910 domain->gaw = guest_width;
2911 adjust_width = guestwidth_to_adjustwidth(guest_width);
2912 domain->agaw = width_to_agaw(adjust_width);
2914 INIT_LIST_HEAD(&domain->devices);
2916 domain->iommu_count = 0;
2917 domain->iommu_coherency = 0;
2918 domain->max_addr = 0;
2920 /* always allocate the top pgd */
2921 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2922 if (!domain->pgd)
2923 return -ENOMEM;
2924 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2925 return 0;
2926 }
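/*
 * Worked example for the AGAW calculation above, using the helpers defined
 * earlier in this file: with the default guest_width of 48 bits,
 * guestwidth_to_adjustwidth(48) returns 48 (48 - 12 is a multiple of the
 * 9-bit level stride), and width_to_agaw(48) = (48 - 30) / 9 = 2, i.e. a
 * 4-level page table, which matches AGAW encoding 2 in the VT-d spec.
 */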
2928 static void iommu_free_vm_domain(struct dmar_domain *domain)
2929 {
2930 unsigned long flags;
2931 struct dmar_drhd_unit *drhd;
2932 struct intel_iommu *iommu;
2933 unsigned long i;
2934 unsigned long ndomains;
2936 for_each_drhd_unit(drhd) {
2937 if (drhd->ignored)
2938 continue;
2939 iommu = drhd->iommu;
2941 ndomains = cap_ndoms(iommu->cap);
2942 i = find_first_bit(iommu->domain_ids, ndomains);
2943 for (; i < ndomains; ) {
2944 if (iommu->domains[i] == domain) {
2945 spin_lock_irqsave(&iommu->lock, flags);
2946 clear_bit(i, iommu->domain_ids);
2947 iommu->domains[i] = NULL;
2948 spin_unlock_irqrestore(&iommu->lock, flags);
2949 break;
2950 }
2951 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2952 }
2953 }
2954 }
2956 static void vm_domain_exit(struct dmar_domain *domain)
2957 {
2958 u64 end;
2960 /* Domain 0 is reserved, so dont process it */
2961 if (!domain)
2962 return;
2964 vm_domain_remove_all_dev_info(domain);
2965 /* destroy iovas */
2966 put_iova_domain(&domain->iovad);
2967 end = DOMAIN_MAX_ADDR(domain->gaw);
2968 end = end & (~VTD_PAGE_MASK);
2970 /* clear ptes */
2971 dma_pte_clear_range(domain, 0, end);
2973 /* free page tables */
2974 dma_pte_free_pagetable(domain, 0, end);
2976 iommu_free_vm_domain(domain);
2977 free_domain_mem(domain);
2978 }
2980 static int intel_iommu_domain_init(struct iommu_domain *domain)
2981 {
2982 struct dmar_domain *dmar_domain;
2984 dmar_domain = iommu_alloc_vm_domain();
2985 if (!dmar_domain) {
2986 printk(KERN_ERR
2987 "intel_iommu_domain_init: dmar_domain == NULL\n");
2988 return -ENOMEM;
2989 }
2990 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2991 printk(KERN_ERR
2992 "intel_iommu_domain_init() failed\n");
2993 vm_domain_exit(dmar_domain);
2994 return -ENOMEM;
2995 }
2996 domain->priv = dmar_domain;
2998 return 0;
2999 }
3001 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3002 {
3003 struct dmar_domain *dmar_domain = domain->priv;
3005 domain->priv = NULL;
3006 vm_domain_exit(dmar_domain);
3007 }
3009 static int intel_iommu_attach_device(struct iommu_domain *domain,
3010 struct device *dev)
3011 {
3012 struct dmar_domain *dmar_domain = domain->priv;
3013 struct pci_dev *pdev = to_pci_dev(dev);
3014 struct intel_iommu *iommu;
3015 int addr_width;
3016 u64 end;
3017 int ret;
3019 /* normally pdev is not mapped */
3020 if (unlikely(domain_context_mapped(pdev))) {
3021 struct dmar_domain *old_domain;
3023 old_domain = find_domain(pdev);
3024 if (old_domain) {
3025 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3026 vm_domain_remove_one_dev_info(old_domain, pdev);
3027 else
3028 domain_remove_dev_info(old_domain);
3029 }
3030 }
3032 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3033 if (!iommu)
3034 return -ENODEV;
3036 /* check if this iommu agaw is sufficient for max mapped address */
3037 addr_width = agaw_to_width(iommu->agaw);
3038 end = DOMAIN_MAX_ADDR(addr_width);
3039 end = end & VTD_PAGE_MASK;
3040 if (end < dmar_domain->max_addr) {
3041 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3042 "sufficient for the mapped address (%llx)\n",
3043 __func__, iommu->agaw, dmar_domain->max_addr);
3044 return -EFAULT;
3045 }
3047 ret = domain_context_mapping(dmar_domain, pdev);
3048 if (ret)
3049 return ret;
3051 ret = vm_domain_add_dev_info(dmar_domain, pdev);
3052 return ret;
3053 }
3055 static void intel_iommu_detach_device(struct iommu_domain *domain,
3056 struct device *dev)
3057 {
3058 struct dmar_domain *dmar_domain = domain->priv;
3059 struct pci_dev *pdev = to_pci_dev(dev);
3061 vm_domain_remove_one_dev_info(dmar_domain, pdev);
3062 }
3064 static int intel_iommu_map_range(struct iommu_domain *domain,
3065 unsigned long iova, phys_addr_t hpa,
3066 size_t size, int iommu_prot)
3067 {
3068 struct dmar_domain *dmar_domain = domain->priv;
3069 u64 max_addr;
3070 int addr_width;
3071 int prot = 0;
3072 int ret;
3074 if (iommu_prot & IOMMU_READ)
3075 prot |= DMA_PTE_READ;
3076 if (iommu_prot & IOMMU_WRITE)
3077 prot |= DMA_PTE_WRITE;
3079 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3080 if (dmar_domain->max_addr < max_addr) {
3081 int min_agaw;
3082 u64 end;
3084 /* check if minimum agaw is sufficient for mapped address */
3085 min_agaw = vm_domain_min_agaw(dmar_domain);
3086 addr_width = agaw_to_width(min_agaw);
3087 end = DOMAIN_MAX_ADDR(addr_width);
3088 end = end & VTD_PAGE_MASK;
3089 if (end < max_addr) {
3090 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3091 "sufficient for the mapped address (%llx)\n",
3092 __func__, min_agaw, max_addr);
3093 return -EFAULT;
3094 }
3095 dmar_domain->max_addr = max_addr;
3096 }
3098 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3099 return ret;
3100 }
3102 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3103 unsigned long iova, size_t size)
3104 {
3105 struct dmar_domain *dmar_domain = domain->priv;
3106 dma_addr_t base;
3108 /* The address might not be aligned */
3109 base = iova & VTD_PAGE_MASK;
3110 size = VTD_PAGE_ALIGN(size);
3111 dma_pte_clear_range(dmar_domain, base, base + size);
3113 if (dmar_domain->max_addr == base + size)
3114 dmar_domain->max_addr = base;
3115 }
3117 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3118 unsigned long iova)
3119 {
3120 struct dmar_domain *dmar_domain = domain->priv;
3121 struct dma_pte *pte;
3122 u64 phys = 0;
3124 pte = addr_to_dma_pte(dmar_domain, iova);
3125 if (pte)
3126 phys = dma_pte_addr(pte);
3128 return phys;
3129 }
3131 static struct iommu_ops intel_iommu_ops = {
3132 .domain_init = intel_iommu_domain_init,
3133 .domain_destroy = intel_iommu_domain_destroy,
3134 .attach_dev = intel_iommu_attach_device,
3135 .detach_dev = intel_iommu_detach_device,
3136 .map = intel_iommu_map_range,
3137 .unmap = intel_iommu_unmap_range,
3138 .iova_to_phys = intel_iommu_iova_to_phys,
3139 };
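/*
 * Illustrative sketch, not part of this driver: how a consumer such as KVM
 * device assignment exercises these callbacks through the generic IOMMU API
 * in <linux/iommu.h>.  The wrapper names (iommu_map_range() etc.) and the
 * gpa/hpa parameters are assumptions about this kernel's generic IOMMU layer,
 * shown for illustration only.
 */
static inline int example_assign_device(struct pci_dev *pdev,
					unsigned long gpa, phys_addr_t hpa,
					size_t size)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc();			/* -> intel_iommu_domain_init() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, &pdev->dev);	/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	ret = iommu_map_range(domain, gpa, hpa, size,	/* -> intel_iommu_map_range() */
			      IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ... guest runs, the device DMAs to gpa ... */

	iommu_unmap_range(domain, gpa, size);		/* -> intel_iommu_unmap_range() */
out_detach:
	iommu_detach_device(domain, &pdev->dev);	/* -> intel_iommu_detach_device() */
out_free:
	iommu_domain_free(domain);			/* -> intel_iommu_domain_destroy() */
	return ret;
}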